## Baseline (blip-vqa-base)

In [None]:
import os

import torch
import pandas as pd
from PIL import Image
from tqdm.auto import tqdm
from transformers import BlipProcessor, BlipForQuestionAnswering


In [None]:
# Inputs: the question/answer CSV and the folder that image paths are joined onto.
CSV_FILE_PATH = "/content/drive/MyDrive/qna_2.csv"
IMAGE_FOLDER = "/content/images"

# Hugging Face checkpoint evaluated in this section.
MODEL_NAME = "Salesforce/blip-vqa-base"

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# Number of samples sent through the model per forward pass.
BATCH_SIZE = 16

In [None]:
def exact_match(prediction, ground_truth):
    """Return True when both answers are strings that match exactly.

    The comparison ignores surrounding whitespace and letter case. If either
    argument is not a ``str`` the result is False.
    """
    both_are_text = isinstance(prediction, str) and isinstance(ground_truth, str)
    if not both_are_text:
        return False

    normalized_prediction = prediction.strip().lower()
    normalized_truth = ground_truth.strip().lower()
    return normalized_prediction == normalized_truth

In [None]:
# Report whether inference will run on "cuda" or "cpu".
print(f"Using device: {DEVICE}")

In [None]:
# Load the preprocessing pipeline and the model weights for the same checkpoint.
processor = BlipProcessor.from_pretrained(MODEL_NAME)
model = BlipForQuestionAnswering.from_pretrained(MODEL_NAME)

# Move the weights to the selected device and switch to inference mode.
model = model.to(DEVICE)
model.eval()

In [None]:
# Load the evaluation CSV and warn if an expected column is absent.
# Failures are reported with a message rather than raised.
try:
    df = pd.read_csv(CSV_FILE_PATH)
    # These are the columns the evaluation loop reads from every row; the
    # image column is accessed as row['path'], so that is the name to check.
    required_cols = ['path', 'question', 'answer']
    missing_cols = [col for col in required_cols if col not in df.columns]
    if missing_cols:
        print(f"CSV file must contain columns: {', '.join(required_cols)}")
        print(f"Found columns: {', '.join(df.columns)}")

    print(f"Found {len(df)} samples in the dataset.")
except FileNotFoundError:
    print(f"Error: CSV file not found at {CSV_FILE_PATH}")
except Exception as e:
    print(f"Error reading CSV file: {e}")



In [None]:
# Per-sample result rows, and the samples queued for the next forward pass.
all_results_data, batched_data = [], []

# Running exact-match tallies.
correct_em_predictions = total_processed_samples = 0

In [None]:
print("Processing dataset and evaluating...")

# Evaluate the dataset in mini-batches. Rows whose image cannot be opened are
# skipped with a warning and are not counted in any metric.
num_rows = len(df)
for position, (_, row) in enumerate(tqdm(df.iterrows(), total=num_rows, desc="Evaluating")):
    image_relative_path = str(row['path'])
    question = str(row['question'])
    ground_truth_answer = str(row['answer'])

    full_image_path = os.path.join(IMAGE_FOLDER, image_relative_path)

    try:
        # Convert inside the context manager so the file handle is released
        # immediately; convert() hands back a separate, fully loaded image.
        with Image.open(full_image_path) as image_file:
            raw_image = image_file.convert('RGB')
    except FileNotFoundError:
        print(f"Warning: Image not found at {full_image_path}. Skipping this sample.")
        raw_image = None
    except Exception as e:
        print(f"Warning: Could not open image {full_image_path}: {e}. Skipping.")
        raw_image = None

    if raw_image is not None:
        batched_data.append({
            "image": raw_image,
            "question": question,
            "ground_truth": ground_truth_answer,
            "image_path_original": image_relative_path
        })

    # Flush when the batch is full or the final row has been reached. This
    # check also runs when the current image failed to load, and it uses the
    # row's position rather than its index label, so a trailing partial batch
    # is never silently dropped.
    is_last_row = position == num_rows - 1
    if not batched_data:
        continue
    if len(batched_data) < BATCH_SIZE and not is_last_row:
        continue

    images_batch = [item["image"] for item in batched_data]
    questions_batch = [item["question"] for item in batched_data]
    inputs = processor(images=images_batch, text=questions_batch, return_tensors="pt", padding=True, truncation=True).to(DEVICE)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=100, num_beams=3, early_stopping=True)

    predicted_answers_decoded = processor.batch_decode(outputs, skip_special_tokens=True)

    # Pair each decoded answer with the sample at the same batch position.
    for current_item, pred_text in zip(batched_data, predicted_answers_decoded):
        gt_ans = current_item["ground_truth"]
        pred_ans_stripped = pred_text.strip()

        all_results_data.append({
            "image_path": current_item["image_path_original"],
            "question": current_item["question"],
            "ground_truth_answer": gt_ans,
            "predicted_answer": pred_ans_stripped
        })

        if exact_match(pred_ans_stripped, gt_ans):
            correct_em_predictions += 1
        total_processed_samples += 1

    batched_data = []

In [None]:
OUTPUT_CSV_FILE = "blip-vqa-base_pred.csv"

# Pin the column set: a frame built from an empty list would otherwise have
# no columns, giving a header-less CSV and a KeyError when the answer columns
# are selected later. With non-empty results this matches the dict keys.
results_df = pd.DataFrame(
    all_results_data,
    columns=["image_path", "question", "ground_truth_answer", "predicted_answer"],
)
results_df.to_csv(OUTPUT_CSV_FILE, index=False)

In [None]:
# Report exact-match accuracy as a percentage, unless nothing was evaluated.
if total_processed_samples <= 0:
    print("No samples available for EM Accuracy calculation.")
else:
    em_fraction = correct_em_predictions / total_processed_samples
    em_accuracy = em_fraction * 100
    print(f"Exact Match (EM) Accuracy: {em_accuracy:.2f}% ({correct_em_predictions}/{total_processed_samples})")

In [None]:
from bert_score import score as bert_score_calculator

# Checkpoint passed to BERTScore as its model_type.
BERTSCORE_MODEL = "distilbert-base-uncased"

# An empty results frame may have no columns at all, in which case selecting
# the answer columns raises KeyError. Fall back to empty lists so the
# "not enough data" branch below is actually reachable.
if results_df.empty:
    predictions_list, references_list = [], []
else:
    predictions_list = results_df['predicted_answer'].astype(str).tolist()
    references_list = results_df['ground_truth_answer'].astype(str).tolist()

if predictions_list and references_list:
    P, R, F1 = bert_score_calculator(
        predictions_list,
        references_list,
        model_type=BERTSCORE_MODEL,
        lang="en",
        verbose=True,
        device=DEVICE,
        batch_size=max(16, BATCH_SIZE * 2)
    )
    # Each returned tensor is averaged over all samples for the report.
    print(f"BERTScore Precision: {P.mean():.4f}")
    print(f"BERTScore Recall:    {R.mean():.4f}")
    print(f"BERTScore F1:        {F1.mean():.4f}")

else:
    print("Not enough data to calculate BERTScore.")


## Baseline (vilt-b32-finetuned-vqa)

In [None]:
import torch
from PIL import Image
import pandas as pd
from transformers import ViltProcessor, ViltForQuestionAnswering
import os
from tqdm import tqdm
from bert_score import score as bert_score_calculator

In [None]:

# Inputs: the question/answer CSV and the folder that image paths are joined onto.
CSV_FILE_PATH = "/content/drive/MyDrive/qna_2.csv"
IMAGE_FOLDER = "/content/images"

# Hugging Face checkpoint evaluated in this section.
MODEL_NAME = "dandelin/vilt-b32-finetuned-vqa"

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# Number of samples sent through the model per forward pass.
BATCH_SIZE = 8

# Where per-sample predictions are written, and the BERTScore model_type.
OUTPUT_CSV_FILE = "vilt_vqa_classification_predictions.csv"
BERTSCORE_MODEL = "distilbert-base-uncased"

In [None]:
def exact_match(prediction, ground_truth):
    """Compare two answers, ignoring case and surrounding whitespace.

    Returns False whenever either argument is not a ``str``; otherwise
    returns whether the stripped, lower-cased strings are equal.
    """
    for candidate in (prediction, ground_truth):
        if not isinstance(candidate, str):
            return False
    return prediction.strip().lower() == ground_truth.strip().lower()

In [None]:
# Load the preprocessing pipeline and the model weights for the same checkpoint.
processor = ViltProcessor.from_pretrained(MODEL_NAME)
model = ViltForQuestionAnswering.from_pretrained(MODEL_NAME)

# Move the weights to the selected device and switch to inference mode.
model = model.to(DEVICE)
model.eval()

In [None]:
# Load the evaluation CSV and warn if an expected column is absent.
# Failures are reported with a message rather than raised.
try:
    df = pd.read_csv(CSV_FILE_PATH)
    required_cols = ['path', 'question', 'answer']
    absent_cols = [col for col in required_cols if col not in df.columns]
    if absent_cols:
        print(f"CSV file must contain columns: {', '.join(required_cols)}. Found: {', '.join(df.columns)}")
    print(f"Found {len(df)} samples in the dataset.")
except FileNotFoundError:
    print(f"Error: CSV file not found at {CSV_FILE_PATH}")
except Exception as e:
    print(f"Error reading CSV file: {e}")

In [None]:
# Per-sample result rows, and the samples queued for the next forward pass.
all_results_data, batched_data = [], []

# Running exact-match tallies.
correct_em_predictions = total_processed_samples = 0

In [None]:
# Evaluate the dataset in mini-batches. Rows whose image cannot be opened are
# skipped with a warning and are not counted in any metric.
num_rows = len(df)
for position, (_, row) in enumerate(tqdm(df.iterrows(), total=num_rows, desc="Evaluating")):
    image_relative_path = str(row['path'])
    question = str(row['question'])
    ground_truth_answer = str(row['answer'])
    full_image_path = os.path.join(IMAGE_FOLDER, image_relative_path)

    try:
        # Convert inside the context manager so the file handle is released
        # immediately; convert() hands back a separate, fully loaded image.
        with Image.open(full_image_path) as image_file:
            raw_image = image_file.convert('RGB')
    except FileNotFoundError:
        print(f"Warning: Image not found at {full_image_path}. Skipping.")
        raw_image = None
    except Exception as e:
        print(f"Warning: Could not open image {full_image_path}: {e}. Skipping.")
        raw_image = None

    if raw_image is not None:
        batched_data.append({
            "image": raw_image,
            "question": question,
            "ground_truth": ground_truth_answer,
            "image_path_original": image_relative_path
        })

    # Flush when the batch is full or the final row has been reached. This
    # check also runs when the current image failed to load, and it uses the
    # row's position rather than its index label, so a trailing partial batch
    # is never silently dropped.
    is_last_row = position == num_rows - 1
    if not batched_data:
        continue
    if len(batched_data) < BATCH_SIZE and not is_last_row:
        continue

    images_batch = [item["image"] for item in batched_data]
    questions_batch = [item["question"] for item in batched_data]

    inputs = processor(images=images_batch, text=questions_batch, return_tensors="pt", padding=True, truncation=True).to(DEVICE)

    # Inference only: take the highest-scoring class for each sample.
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        predicted_class_idxs = logits.argmax(-1)

    # Translate class indices into answer strings via the model's label map.
    predicted_answers_text = [
        model.config.id2label[idx.item()] for idx in predicted_class_idxs
    ]

    # Pair each predicted answer with the sample at the same batch position.
    for current_item, pred_text in zip(batched_data, predicted_answers_text):
        gt_ans = current_item["ground_truth"]
        pred_ans_stripped = pred_text.strip()

        all_results_data.append({
            "image_path": current_item["image_path_original"],
            "question": current_item["question"],
            "ground_truth_answer": gt_ans,
            "predicted_answer": pred_ans_stripped
        })

        if exact_match(pred_ans_stripped, gt_ans):
            correct_em_predictions += 1
        total_processed_samples += 1

    batched_data = []

In [None]:
# Pin the column set: a frame built from an empty list would otherwise have
# no columns, giving a header-less CSV and a KeyError when the answer columns
# are selected later. With non-empty results this matches the dict keys.
results_df = pd.DataFrame(
    all_results_data,
    columns=["image_path", "question", "ground_truth_answer", "predicted_answer"],
)
results_df.to_csv(OUTPUT_CSV_FILE, index=False)

In [None]:
# Report exact-match accuracy as a percentage, unless nothing was evaluated.
if total_processed_samples <= 0:
    print("No samples available for EM Accuracy calculation.")
else:
    em_fraction = correct_em_predictions / total_processed_samples
    em_accuracy = em_fraction * 100
    print(f"Exact Match (EM) Accuracy: {em_accuracy:.2f}% ({correct_em_predictions}/{total_processed_samples})")

In [None]:
# An empty results frame may have no columns at all, in which case selecting
# the answer columns raises KeyError. Fall back to empty lists so the
# "not enough data" branch below is actually reachable.
if results_df.empty:
    predictions_list, references_list = [], []
else:
    predictions_list = results_df['predicted_answer'].astype(str).tolist()
    references_list = results_df['ground_truth_answer'].astype(str).tolist()

if predictions_list and references_list:
    P, R, F1 = bert_score_calculator(
        predictions_list,
        references_list,
        model_type=BERTSCORE_MODEL,
        lang="en",
        verbose=True,
        device=DEVICE,
        batch_size=max(16, BATCH_SIZE * 2)
    )
    # Each returned tensor is averaged over all samples for the report.
    print(f"BERTScore Precision: {P.mean():.4f}")
    print(f"BERTScore Recall:    {R.mean():.4f}")
    print(f"BERTScore F1:        {F1.mean():.4f}")
else:
    print("Not enough data to calculate BERTScore.")