# Image Captioning Evaluation

In [None]:
# Mount Google Drive into the Colab runtime; the dataset root and the model
# checkpoint paths used further down all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

# Setting up Dependencies

Install the required libraries and import necessary modules for the evaluation.

In [None]:
# Third-party packages used below: sentence-transformers (SentenceTransformer
# for caption embeddings), datasets (load_dataset) and transformers (model,
# tokenizer and feature extractor). `-q` only silences pip's output.
!pip install sentence-transformers
!pip install -q datasets
!pip install -q transformers



In [None]:
# Presumably provides the `Rouge` class that compute_metrics instantiates.
!pip install -q rouge

In [None]:
from tqdm import tqdm
import torch
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
from transformers import VisionEncoderDecoderModel
from datasets import load_dataset

# Loading the Test Dataset
Load the test dataset from the specified directory using the datasets library.

In [None]:
# Folder on the mounted Drive that holds the formatted Flickr30k data.
root = "/content/drive/MyDrive/MSC_PROJECT/Flickr30k/Formatted_Flickr30k/"

# Load only the 'test' split through the `imagefolder` builder. Items are
# later read as item["image"] and item["text"] by ImageCaptioningDataset.
test_dataset = load_dataset('imagefolder', data_dir = root, split='test')

# Defining the Image Captioning Dataset
Define a custom dataset class, ImageCaptioningDataset, to process and return the images and their corresponding captions in a format suitable for the model.

In [None]:
from PIL import Image
import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer, ViTFeatureExtractor

class ImageCaptioningDataset(Dataset):
    """Expose a raw image/caption dataset as model-ready tensors.

    Every item of ``dataset`` must be a mapping with an ``"image"`` entry
    (handed to ``feature_extractor``) and a ``"text"`` entry (handed to
    ``tokenizer``).

    Args:
        dataset: Indexable collection that supports ``len()``.
        feature_extractor: Callable invoked as
            ``feature_extractor(images=..., return_tensors="np")`` whose
            result exposes ``pixel_values`` with a leading batch axis.
        tokenizer: Callable invoked on the caption text whose result exposes
            ``input_ids``.
        max_target_length: Length of every returned label sequence; shorter
            captions are padded and longer ones are truncated to this size.
    """

    def __init__(self, dataset, feature_extractor, tokenizer, max_target_length=128):
        self.dataset = dataset
        self.feature_extractor = feature_extractor
        self.tokenizer = tokenizer
        self.max_length = max_target_length

    def __len__(self):
        """Return the number of examples in the wrapped dataset."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return ``{"pixel_values": Tensor, "labels": Tensor}`` for ``idx``."""
        item = self.dataset[idx]

        # The extractor returns a batch of one image.
        pixel_values = self.feature_extractor(
            images=item["image"], return_tensors="np"
        ).pixel_values

        # Pad AND truncate so every label sequence has exactly `max_length`
        # tokens; padding alone lets long captions exceed the limit.
        captions = self.tokenizer(
            text=item["text"],
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
        ).input_ids

        return {
            # Drop only the batch axis; a bare squeeze() would also remove
            # any other axis that happens to have size 1.
            "pixel_values": torch.from_numpy(pixel_values[0]),
            "labels": torch.tensor(captions),
        }


# Preparing Tokenizer and Feature Extractor
Load the necessary tokenizers and feature extractors for the vision and text models to be used. These are crucial for processing the input data correctly.

In [None]:

from transformers import VisionEncoderDecoderModel, AutoTokenizer, AutoFeatureExtractor

# Checkpoints that supply the image pre-processing and the caption tokenizer
# for the ViT + GPT-2 model evaluated in this notebook.
image_encoder_model = "google/vit-base-patch16-224-in21k"
text_decoder_model = "bipin/malayalam-gpt2"

# To evaluate the ViT + BERT model instead, switch the decoder checkpoint:
# text_decoder_model = "l3cube-pune/malayalam-bert"

# Image feature extractor.
feature_extractor = AutoFeatureExtractor.from_pretrained(image_encoder_model)
# Text tokenizer.
tokenizer = AutoTokenizer.from_pretrained(text_decoder_model)

# ImageCaptioningDataset pads every caption with padding="max_length", which
# fails when the tokenizer has no padding token (common for GPT-2 style
# vocabularies). Fall back to the end-of-sequence token in that case only.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Pre-processing the Test Data
Use the ImageCaptioningDataset class to process the raw test data, making it ready for evaluation.

In [None]:
# Wrap the raw test split so each item yields "pixel_values" and "labels"
# tensors (max_target_length keeps its default of 128).
test_data = ImageCaptioningDataset(test_dataset, feature_extractor, tokenizer)

In [None]:
# Echo the object so the notebook displays its repr (quick sanity check).
test_data

# Loading the Pre-trained Model
Load the pre-trained VisionEncoderDecoderModel from the specified path.

In [None]:

# ViT + GPT-2 checkpoint.
path = "/content/drive/MyDrive/Flickr30k_Models/vitgptlr1e-4epochs10"

# ViT + BERT checkpoint; to evaluate it, use this path instead and load the
# matching BERT tokenizer as well:
# path = "/content/drive/MyDrive/Flickr30k_Models/bert/lre-6epoch20/vitbertlr1e-6epochs7.96"

model = VisionEncoderDecoderModel.from_pretrained(path)

# Generating Predictions
Define a function that generates a caption for each image in the dataset and decodes both the generated caption and the reference caption back to text.

In [None]:

def generate_predictions(model, dataset, tokenizer, *, max_length=128, do_sample=True):
    """Generate a caption for every example and decode it alongside its reference.

    Args:
        model: Captioning model exposing ``eval()`` and ``generate()``.
        dataset: Iterable of mappings with ``"pixel_values"`` (one image,
            no batch axis) and ``"labels"`` (reference token ids).
        tokenizer: Tokenizer used to decode generated and reference ids.
        max_length: Upper bound on the generated sequence length.
        do_sample: Forwarded to ``generate``. With the default ``True`` the
            captions are sampled, so repeated runs can give different
            text; pass ``False`` for reproducible decoding.

    Returns:
        ``(predictions, actual)``: two lists of decoded strings in dataset
        order.
    """
    model.eval()  # Disable dropout and other training-only behaviour

    predictions = []
    actual = []

    with torch.no_grad():  # Inference only, no gradients needed
        for batch in tqdm(dataset):  # tqdm gives a progress bar
            # The model expects a batch axis, so add one for the single image.
            outputs = model.generate(
                pixel_values=batch["pixel_values"].unsqueeze(0),
                max_length=max_length,
                do_sample=do_sample,
            )

            # Special tokens (padding, EOS, ...) are stripped from both texts.
            predictions.append(tokenizer.decode(outputs[0], skip_special_tokens=True))
            actual.append(tokenizer.decode(batch["labels"], skip_special_tokens=True))

    return predictions, actual



In [None]:
# Decoded caption strings for the whole test set: model output and reference.
predictions, actual = generate_predictions(model, test_data, tokenizer )

# Embedding the Captions
Embed the generated predictions and the actual captions into vector representations using the sentence-transformers library.

In [None]:

from sentence_transformers import SentenceTransformer

# Sentence encoder shared by both caption lists.
# (Earlier variant: SentenceTransformer('bipin/malayalam-gpt2').)
transformer = SentenceTransformer('l3cube-pune/malayalam-bert')

# NOTE: despite the variable names, `malayalam_embeddings` holds the embedded
# model predictions and `english_embeddings` the embedded reference captions.
malayalam_embeddings = torch.tensor(transformer.encode(predictions))
english_embeddings = torch.tensor(transformer.encode(actual))


# Cosine Similarity Calculation

Compute the cosine similarity between the embeddings of the predictions and the actual captions. This measures the similarity between the generated and actual captions.

In [None]:
# Only matching (prediction, reference) pairs are of interest, so compute the
# row-wise cosine similarity directly instead of building the full N x N
# pairwise matrix and reading its diagonal. `similarities` therefore holds
# one value per test example.
similarities = torch.nn.functional.cosine_similarity(
    malayalam_embeddings, english_embeddings, dim=1
).numpy()
mean_similarity = np.mean(similarities)
print('Mean Cosine Similarity:', mean_similarity)


# Metrics Computation for Predictions
Generate predictions in a format suitable for metric computation, and define a function to compute various evaluation metrics such as BLEU, METEOR, and ROUGE.

In [None]:
def generate_predictions_for_metrics(model, dataset, *, max_length=128, do_sample=True):
    """Generate captions and return raw token ids for metric computation.

    Args:
        model: Captioning model exposing ``eval()`` and ``generate()``.
        dataset: Iterable of mappings with ``"pixel_values"`` (one image,
            no batch axis) and ``"labels"`` (reference token ids), both
            torch tensors.
        max_length: Upper bound on the generated sequence length.
        do_sample: Forwarded to ``generate``. With the default ``True`` the
            captions are sampled, so repeated runs can give different
            ids; pass ``False`` for reproducible decoding.

    Returns:
        ``(predictions, labels)``: two lists of numpy arrays of token ids,
        in dataset order.
    """
    model.eval()  # Disable dropout and other training-only behaviour

    predictions = []
    labels = []

    with torch.no_grad():  # Inference only, no gradients needed
        for batch in dataset:
            # The model expects a batch axis, so add one for the single image.
            outputs = model.generate(
                pixel_values=batch["pixel_values"].unsqueeze(0),
                max_length=max_length,
                do_sample=do_sample,
            )

            # Keep ids as CPU numpy arrays; decoding happens in the metrics.
            predictions.append(outputs[0].cpu().numpy())
            labels.append(batch["labels"].cpu().numpy())

    return predictions, labels

In [None]:
# Second generation pass that keeps raw token ids for compute_metrics.
# Generation samples by default, so these captions may differ from the
# decoded `predictions` produced earlier for the cosine-similarity step.
test_predictions, test_labels = generate_predictions_for_metrics(model, test_data)

In [None]:
def _rouge_f_scores(rouge, prediction, reference):
    """Return the ROUGE-1/2/L F-scores for one pair, or zeros if unscorable."""
    try:
        scores = rouge.get_scores(prediction, reference)[0]
    except ValueError:
        # `rouge` refuses empty text; an empty caption shares nothing with
        # its reference, so it is counted as a score of 0.
        return 0.0, 0.0, 0.0
    return scores['rouge-1']['f'], scores['rouge-2']['f'], scores['rouge-l']['f']


def compute_metrics(eval_preds):
    """Score generated captions against references with BLEU, METEOR and ROUGE.

    Uses the notebook-level ``tokenizer`` to decode and tokenize the text.

    Args:
        eval_preds: ``(preds, labels)`` pair. Each is a sequence of token-id
            sequences, one per example; ``preds`` may also be a tuple whose
            first element is that sequence. ``-100`` entries in ``labels``
            are treated as padding.

    Returns:
        dict with corpus-level ``bleu1``..``bleu4``, mean ``meteor``, mean
        ``rouge-1``/``rouge-2``/``rouge-l`` F-scores, and ``highest_bleu4``:
        a list of ``(index, bleu4, prediction, reference)`` tuples for every
        example, sorted by BLEU-4 in descending order.

    Raises:
        ValueError: If there are no predictions to score.
    """
    # Imported here so the function does not depend on an earlier cell.
    # NOTE: NLTK's METEOR needs the WordNet corpus; if it is missing, run
    # nltk.download('wordnet') once.
    from nltk.translate.bleu_score import SmoothingFunction, corpus_bleu
    from nltk.translate.meteor_score import meteor_score
    from rouge import Rouge

    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 marks positions ignored by the loss and cannot be decoded, so map
    # it to the padding id (which decoding then skips as a special token).
    pad_id = tokenizer.pad_token_id
    if pad_id is not None:
        labels = [
            np.where(np.asarray(row) != -100, np.asarray(row), pad_id)
            for row in labels
        ]

    # Decode and strip surrounding whitespace; both lists hold plain strings.
    decoded_preds = [
        text.strip()
        for text in tokenizer.batch_decode(preds, skip_special_tokens=True)
    ]
    decoded_labels = [
        text.strip()
        for text in tokenizer.batch_decode(labels, skip_special_tokens=True)
    ]
    if not decoded_preds:
        raise ValueError("compute_metrics needs at least one prediction")

    # ROUGE: score each prediction against its full reference caption.
    rouge = Rouge()
    rouge_scores = [
        _rouge_f_scores(rouge, pred, ref)
        for pred, ref in zip(decoded_preds, decoded_labels)
    ]
    rouge_1 = np.mean([score[0] for score in rouge_scores])
    rouge_2 = np.mean([score[1] for score in rouge_scores])
    rouge_l = np.mean([score[2] for score in rouge_scores])

    # BLEU: corpus_bleu expects a list of reference lists per hypothesis,
    # hence the extra nesting around each tokenized reference.
    tokenized_preds = [tokenizer.tokenize(pred) for pred in decoded_preds]
    tokenized_labels = [[tokenizer.tokenize(label)] for label in decoded_labels]
    smoothing = SmoothingFunction().method7
    bleu4_weights = (0.25, 0.25, 0.25, 0.25)
    bleu1 = corpus_bleu(tokenized_labels, tokenized_preds, weights=(1, 0, 0, 0), smoothing_function=smoothing)
    bleu2 = corpus_bleu(tokenized_labels, tokenized_preds, weights=(0.5, 0.5, 0, 0), smoothing_function=smoothing)
    # Exact thirds so the n-gram weights sum to 1.
    bleu3 = corpus_bleu(tokenized_labels, tokenized_preds, weights=(1 / 3, 1 / 3, 1 / 3, 0), smoothing_function=smoothing)
    bleu4 = corpus_bleu(tokenized_labels, tokenized_preds, weights=bleu4_weights, smoothing_function=smoothing)

    # METEOR works on whitespace-split tokens, one reference per prediction.
    meteor_scores = [
        meteor_score([ref.split()], pred.split())
        for ref, pred in zip(decoded_labels, decoded_preds)
    ]
    mean_meteor = sum(meteor_scores) / len(meteor_scores)

    # Per-example BLEU-4, keeping the index so examples can be looked up.
    individual_bleu4_scores = [
        (
            index,
            corpus_bleu([label_tokens], [pred_tokens], weights=bleu4_weights, smoothing_function=smoothing),
            pred,
            label,
        )
        for index, (pred_tokens, label_tokens, pred, label) in enumerate(
            zip(tokenized_preds, tokenized_labels, decoded_preds, decoded_labels)
        )
    ]
    # Best-scoring examples first.
    individual_bleu4_scores.sort(key=lambda entry: entry[1], reverse=True)

    return {
        "bleu1": bleu1,
        "bleu2": bleu2,
        "bleu3": bleu3,
        "bleu4": bleu4,
        "meteor": mean_meteor,
        "rouge-1": rouge_1,
        "rouge-2": rouge_2,
        "rouge-l": rouge_l,
        # Every example's BLEU-4 entry, sorted from highest to lowest.
        "highest_bleu4": individual_bleu4_scores,
    }

In [None]:
# Score the token-id predictions against the reference ids and print the
# resulting metrics dictionary.
test_metrics = compute_metrics((test_predictions, test_labels))
print(test_metrics)

# Visualization and Analysis
Visualize an example from the test dataset and its corresponding prediction. This provides a qualitative understanding of how well the model is performing.



In [None]:
# Single index shared by the image, its reference caption and the stored
# prediction, so the three always refer to the same example.
example_index = 939
processed_example = test_data[example_index]
image_tensor = processed_example["pixel_values"]

# The feature extractor normalises pixels, which pushes values outside the
# [0, 1] range imshow expects for floats. Undo the normalisation (when it
# was applied) before moving the channel axis last for plotting.
display_image = image_tensor
if getattr(feature_extractor, "do_normalize", False):
    mean = torch.tensor(feature_extractor.image_mean).view(-1, 1, 1)
    std = torch.tensor(feature_extractor.image_std).view(-1, 1, 1)
    display_image = (image_tensor * std + mean).clamp(0, 1)

plt.imshow(display_image.permute(1, 2, 0))
plt.axis('off')
plt.show()

caption = tokenizer.decode(processed_example["labels"], skip_special_tokens=True)
print("Caption:", caption)
text = tokenizer.decode(test_predictions[example_index], skip_special_tokens=True)
print("Predictions:", text)
