In [None]:
import torch
from PIL import Image
import pandas as pd
from transformers import BlipProcessor, BlipForQuestionAnswering
from peft import PeftModel, PeftConfig
import os
from tqdm import tqdm
import evaluate

In [None]:
# --- Model ---------------------------------------------------------------
# Identifier handed to `from_pretrained` for the base BLIP model and processor.
BASE_MODEL_NAME = "Salesforce/blip-vqa-base"
# Location handed to `PeftModel.from_pretrained` for the fine-tuned adapter.
ADAPTER_MODEL_PATH = "/kaggle/input/dataset-mp/epoch_3/epoch_3"

# --- Data ----------------------------------------------------------------
# Directory that image filenames from the CSV are joined onto.
IMAGES_DIR = "/kaggle/input/dataset-mp/images/images"
# CSV with one question/answer row per sample.
CSV_FILE_PATH = "/kaggle/input/dataset-mp/qna_2.csv"
# Column names looked up in that CSV.
IMAGE_FILENAME_COL = "filename"
QUESTION_COL = "question"
ANSWER_COL = "answer"

# --- Evaluation ----------------------------------------------------------
# `model_type` forwarded to the BERTScore metric.
BERT_SCORE_MODEL_TYPE = "distilbert-base-uncased"
# Number of CSV rows grouped into one inference batch.
BATCH_SIZE = 8

In [None]:
# Prefer the GPU when CUDA is usable; otherwise run everything on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

In [None]:
# Load the pretrained BLIP VQA model and the processor that matches it.
base_model = BlipForQuestionAnswering.from_pretrained(BASE_MODEL_NAME)
processor = BlipProcessor.from_pretrained(BASE_MODEL_NAME)

# Wrap the base model with the fine-tuned adapter. No merge step is performed
# here (there is no `merge_and_unload()` call), so the adapter stays a wrapper
# around the base model and the log message below says exactly that.
model = PeftModel.from_pretrained(base_model, ADAPTER_MODEL_PATH)
model = model.to(device)
model.eval()  # Set to evaluation mode
print("PEFT adapter loaded on top of the base model.")


In [None]:
# Read the question/answer annotations.
# The text columns are forced to `str` and pandas' default NA-token parsing is
# switched off so that literal answers such as "None", "NA" or "null" are kept
# as written instead of becoming NaN (which `str()` later renders as "nan"),
# and numeric-looking answers such as "2" are not re-typed as 2.0. Either
# would silently corrupt the exact-match comparison.
df = pd.read_csv(
    CSV_FILE_PATH,
    dtype={IMAGE_FILENAME_COL: str, QUESTION_COL: str, ANSWER_COL: str},
    keep_default_na=False,
)
print(f"Found {len(df)} samples in the CSV.")

In [None]:
# Batched evaluation loop.
#
# For every slice of BATCH_SIZE rows in `df` this cell loads the images, asks
# the model each question, decodes the generated answers and accumulates:
#   predictions / references  - stripped answer strings, aligned by position
#   exact_matches             - case-insensitive exact matches
#   processed_samples         - samples that produced a prediction
#   skipped_samples           - samples dropped (unreadable image or failed batch)
# Failures are best-effort: they are reported and counted, never raised.

MAX_ANSWER_LENGTH = 50  # Forwarded to `generate` as `max_length`.
NUM_PREVIEW_SAMPLES = 5  # How many predictions are echoed for a sanity check.

predictions = []
references = []
exact_matches = 0
processed_samples = 0
skipped_samples = 0

for batch_start in tqdm(range(0, len(df), BATCH_SIZE), desc="Evaluating Batches"):
    batch_df = df.iloc[batch_start:batch_start + BATCH_SIZE]

    # Parallel lists: position k in each list describes the same sample.
    batch_images = []
    batch_filenames = []
    batch_questions = []
    batch_gt_answers = []

    for _, row in batch_df.iterrows():
        # `str()` keeps a non-string filename cell (e.g. NaN) from raising a
        # TypeError in `os.path.join`; such rows then fail to open and are
        # skipped with a warning like any other unreadable image.
        image_filename = str(row[IMAGE_FILENAME_COL])
        question = str(row[QUESTION_COL])
        gt_answer = str(row[ANSWER_COL])
        image_path = os.path.join(IMAGES_DIR, image_filename)

        try:
            # The context manager closes the underlying file handle as soon as
            # the pixel data has been copied into the converted RGB image.
            with Image.open(image_path) as image_file:
                raw_image = image_file.convert('RGB')
        except FileNotFoundError:
            print(f"Warning: Image not found at {image_path}. Skipping this sample.")
            skipped_samples += 1
            continue
        except Exception as e:
            print(f"Warning: Could not load image {image_path}: {e}. Skipping this sample.")
            skipped_samples += 1
            continue

        batch_images.append(raw_image)
        batch_filenames.append(image_filename)
        batch_questions.append(question)
        batch_gt_answers.append(gt_answer)

    if not batch_images:
        continue

    try:
        inputs = processor(images=batch_images, text=batch_questions, return_tensors="pt", padding=True, truncation=True)
        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            generated_ids = model.generate(**inputs, max_length=MAX_ANSWER_LENGTH)

        generated_answers = processor.batch_decode(generated_ids, skip_special_tokens=True)
    except Exception as e:
        # Keep going, but record that this whole batch is missing from the
        # metrics so the final numbers are not silently computed on a subset.
        print(f"Error during batch processing: {e}")
        skipped_samples += len(batch_images)
        continue

    for gen_ans, gt_ans, filename, question in zip(
        generated_answers, batch_gt_answers, batch_filenames, batch_questions
    ):
        pred_text = gen_ans.strip()
        ref_text = gt_ans.strip()

        predictions.append(pred_text)
        references.append(ref_text)

        if pred_text.lower() == ref_text.lower():
            exact_matches += 1

        if processed_samples < NUM_PREVIEW_SAMPLES:
            print(f"\nSample {processed_samples + 1}:")
            print(f"  Image: {filename}")
            print(f"  Question: {question}")
            print(f"  Ground Truth: {ref_text}")
            print(f"  Predicted: {pred_text}")

        processed_samples += 1

if skipped_samples:
    print(f"\nSkipped {skipped_samples} of {len(df)} samples; they are excluded from the metrics.")

In [None]:
# Save each prediction next to its reference answer.
# The frame gets its own name instead of reusing `df`: overwriting `df` would
# discard the input annotations, so re-running the evaluation loop cell
# afterwards would fail on the missing filename/question/answer columns.
results_df = pd.DataFrame({
    'pred': predictions,
    'ground_truth': references
})

results_df.to_csv('fintuned_pred.csv', index=False)

In [None]:
! pip install bert-score

In [None]:
# Report the two evaluation metrics over the collected predictions:
#   - Exact Match: share of predictions equal to the reference (as a percentage)
#   - BERTScore:   mean precision / recall / F1 over all prediction pairs
print("\nCalculating Exact Match (EM) Score...")
em_score = (exact_matches / processed_samples) * 100 if processed_samples > 0 else 0
print(f"Exact Match (EM) Score: {em_score:.2f}% ({exact_matches}/{processed_samples})")

print("\nCalculating BERTScore...")
if not predictions:
    # With nothing to score, the averages below would divide by zero and the
    # generic handler would misreport that as a missing-library problem.
    print("No predictions available; skipping BERTScore.")
else:
    try:
        bertscore = evaluate.load("bertscore")
        results = bertscore.compute(predictions=predictions, references=references, lang="en",
                                    model_type=BERT_SCORE_MODEL_TYPE, device=device)

        # The metric returns one score per sample; report the plain mean.
        avg_precision = sum(results['precision']) / len(results['precision'])
        avg_recall = sum(results['recall']) / len(results['recall'])
        avg_f1 = sum(results['f1']) / len(results['f1'])

        print(f"BERTScore Precision: {avg_precision:.4f}")
        print(f"BERTScore Recall:    {avg_recall:.4f}")
        print(f"BERTScore F1:        {avg_f1:.4f}")
    except Exception as e:
        # Best-effort: a BERTScore failure must not hide the EM result above.
        print(f"Could not calculate BERTScore: {e}")
        print("Make sure you have 'bert_score' and 'evaluate' libraries installed.")
        print("You might also need to download BERTScore models the first time you run it.")

print("\nEvaluation complete.")
