In [26]:
import torch
import torchvision.models as models
import torchvision.transforms as transforms
from PIL import Image

# Load the pre-trained ResNet model
resnet = models.resnet50(pretrained=True)
resnet.eval()

# Function to extract features from an image
def extract_features(image_path):
    img = Image.open(image_path)
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])
    img_t = transform(img)
    img_t = img_t.unsqueeze(0)

    with torch.no_grad():
        features = resnet(img_t)
    return features


Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth
100%|██████████| 97.8M/97.8M [00:06<00:00, 16.9MB/s]


In [27]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

def generate_caption(features):
    input_ids = tokenizer.encode("Image features: " + str(features), return_tensors='pt')
    output = model.generate(input_ids, max_length=50, num_return_sequences=1)
    caption = tokenizer.decode(output[0], skip_special_tokens=True)
    return caption


The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.
0it [00:00, ?it/s]


In [28]:
from nltk.translate.bleu_score import sentence_bleu

# Actual and generated captions
reference = "A dog is running in the park"
hypothesis = generate_caption(features)

# Calculate BLEU score
score = sentence_bleu([reference.split()], hypothesis.split())
print(f"BLEU Score: {score}")


NameError: name 'features' is not defined

In [36]:
import torch
from torchvision.models import resnet50, ResNet50_Weights
import torchvision.transforms as transforms
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from PIL import Image
import requests
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# Load the pre-trained ResNet model
resnet = resnet50(weights=ResNet50_Weights.DEFAULT)
resnet.eval()

# Load the pre-trained GPT-2 model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Function to extract features from an image
def extract_features(image_path):
    img = Image.open(image_path)
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])
    img_t = transform(img)
    img_t = img_t.unsqueeze(0)

    with torch.no_grad():
        features = resnet(img_t)
    return features

# Function to generate a caption based on image features
def generate_caption(features):
    input_ids = tokenizer.encode("Image features: " + str(features[0][:10].tolist()), return_tensors='pt')
    attention_mask = torch.ones(input_ids.shape, dtype=torch.long)
    output = model.generate(input_ids, attention_mask=attention_mask, max_new_tokens=50, num_return_sequences=1, pad_token_id=tokenizer.eos_token_id)
    caption = tokenizer.decode(output[0], skip_special_tokens=True)
    return caption

# Example usage
def main():
    # Download an example image
    image_url = "https://images.unsplash.com/photo-1517423440428-a5a00ad493e8"
    image_path = "./example.jpg"
    img_data = requests.get(image_url).content
    with open(image_path, 'wb') as handler:
        handler.write(img_data)

    # Extract features from the image
    features = extract_features(image_path)

    # Generate a caption for the image
    generated_caption = generate_caption(features)
    print(f"Generated Caption: {generated_caption}")

    # Example reference caption for evaluation
    reference_caption = "A dog is sitting on the grass in a park."

    # Evaluate the generated caption using BLEU score with smoothing
    smoothing_function = SmoothingFunction().method1
    bleu_score = sentence_bleu([reference_caption.split()], generated_caption.split(), smoothing_function=smoothing_function)
    print(f"BLEU Score: {bleu_score}")

if __name__ == "__main__":
    main()


Generated Caption: Image features: [-0.13652336597442627, -0.09201764315366745, 0.5357312560081482, 0.39029809832572937, 0.5979483723640442, 0.652328372001648, 0.1887870877981186, 0.13419455289840698, 0.26435065269470215, -0.0016855307621881366]

The following table shows the average number of times the number of times the number of times the number of times the number of times the number of times the number of times the number of times the number of times the number of times the number of
BLEU Score: 0.003040349233039763
