In [None]:
from transformers import Blip2Processor, Blip2ForConditionalGeneration
from PIL import Image
import torch

# Both the processor and the model weights come from the same checkpoint id.
CHECKPOINT = "Salesforce/blip2-opt-2.7b"

# Processor: prepares the image/text inputs for the model.
processor = Blip2Processor.from_pretrained(CHECKPOINT)

# Model: loaded in half precision, then moved onto the GPU.
model = Blip2ForConditionalGeneration.from_pretrained(
    CHECKPOINT,
    torch_dtype=torch.float16,
)
model = model.to("cuda")


In [None]:
import pandas as pd

# Read the question/answer table from the mounted Drive folder
qna_csv = "/content/drive/MyDrive/dataset/qna.csv"
df = pd.read_csv(qna_csv)

# Print a plain-text preview of the leading rows
preview = df.head()
print(preview)


In [None]:
import os

# Step 2: Fix image paths

def fix_path(rel_path, base_dir="/content/drive/MyDrive/dataset", prefix="../dataset/"):
    """Map an image path stored in the CSV onto the mounted dataset directory.

    Args:
        rel_path: Path string as stored in the CSV, e.g.
            "../dataset/images/a.jpg".
        base_dir: Directory the remaining sub-path is joined onto.
        prefix: Leading portion of ``rel_path`` to drop before joining.

    Returns:
        ``base_dir`` joined with ``rel_path`` minus its leading ``prefix``.
        Paths that do not start with ``prefix`` are joined unchanged.
    """
    # Strip the prefix only where it leads the path: str.replace would also
    # delete any later "../dataset/" segment and silently rewrite the path.
    if rel_path.startswith(prefix):
        rel_subpath = rel_path[len(prefix):]
    else:
        rel_subpath = rel_path
    return os.path.join(base_dir, rel_subpath)

# Resolve every CSV image path to its location on the mounted Drive.
df["fixed_path"] = df["image_path"].map(fix_path)


In [None]:
# Show the DataFrame as this cell's output for inspection; nothing is modified.
df

In [None]:
# Complete VQA evaluation with accuracy and BERTScore

import pandas as pd
from PIL import Image
from bert_score import score

def digit_to_word(d):
    """Return the English word for a single-digit string, else `d` itself.

    Only the exact keys "0" through "9" are translated. Anything else
    (for example multi-digit strings or ordinary words) is returned
    unchanged.
    """
    names = (
        "zero", "one", "two", "three", "four",
        "five", "six", "seven", "eight", "nine",
    )
    # A name's position in the tuple is the digit it spells out.
    table = {str(position): name for position, name in enumerate(names)}
    return table.get(d, d)

# Run the model over every row of `df` (loaded earlier in the notebook) and
# collect one record per row that was answered without an error.

correct = 0    # number of predictions that exactly match the ground truth
results = []   # one dict per successfully evaluated row

for i, row in df.iterrows():
    try:
        # Image.open keeps the underlying file open; the context manager
        # closes it once convert() has produced an in-memory RGB copy.
        with Image.open(row["fixed_path"]) as img:
            image = img.convert("RGB")
        question = row["question"]

        # pandas may have parsed the answer as a number (or NaN when blank),
        # so coerce to text instead of calling .strip() on a non-string.
        raw_answer = row["answer"]
        if pd.isna(raw_answer):
            raise ValueError("missing ground-truth answer")
        if isinstance(raw_answer, float) and raw_answer.is_integer():
            raw_answer = int(raw_answer)  # 2.0 -> 2, so it normalises like "2"
        # Normalise the ground truth exactly like the prediction below,
        # otherwise a "2" in the CSV can never match a predicted "two".
        gt_answer = digit_to_word(str(raw_answer).strip().lower())

        # Prepare prompt
        prompt = (
            "You are a visual question answering assistant. "
            f"Question: {question}\n"
            "give answer in word even if the answer is numerical \n"
            "Answer (one word):"
        )

        # Move the tensors to the GPU; pixel values are cast to half precision.
        inputs = processor(images=image, text=prompt, return_tensors="pt")
        inputs = {
            "input_ids": inputs["input_ids"].to("cuda"),
            "attention_mask": inputs["attention_mask"].to("cuda"),
            "pixel_values": inputs["pixel_values"].to("cuda").half()
        }

        out = model.generate(
            **inputs,
            max_new_tokens=16,
            num_beams=4,
            no_repeat_ngram_size=2,
            early_stopping=True
        )

        # Decode prediction.
        # NOTE(review): whether generate() returns the prompt tokens ahead of
        # the answer appears to depend on the installed transformers version -
        # confirm. Only slice the prompt off when it is really there, so the
        # answer is never truncated away.
        full_ids = out[0]
        prompt_ids = inputs["input_ids"][0]
        prompt_len = prompt_ids.shape[-1]
        echoes_prompt = (
            full_ids.shape[-1] >= prompt_len
            and torch.equal(full_ids[:prompt_len], prompt_ids)
        )
        gen_ids = full_ids[prompt_len:] if echoes_prompt else full_ids
        pred = processor.decode(gen_ids, skip_special_tokens=True).strip().lower()
        pred = digit_to_word(pred)

        # Compare (exact string match after normalisation)
        is_correct = (pred == gt_answer)
        if is_correct:
            correct += 1

        # Store result
        results.append({
            "image_path": row["fixed_path"],
            "question": question,
            "ground_truth": gt_answer,
            "prediction": pred,
            "correct": is_correct
        })

        print(f"Q: {question}\nGT: {gt_answer}\nPredicted: {pred}\n")

    except Exception as e:
        # Best effort: report the failing row and continue with the next one.
        # Failed rows add nothing to `results`.
        print(f"Error at row {i}: {e}")

# Compute accuracy over every row of `df`. Rows that raised during inference
# never reached `results`, so they count as incorrect in this figure.
total_rows = len(df)
accuracy = correct / total_rows if total_rows else 0.0  # empty frame would divide by zero
print(f"\nOverall Accuracy: {accuracy:.2%}\n")

skipped_rows = total_rows - len(results)
if skipped_rows:
    print(f"Note: {skipped_rows} of {total_rows} rows failed and are counted as incorrect.\n")

# Compute BERTScore over the rows that produced a prediction
preds = [r['prediction'] for r in results]
refs = [r['ground_truth'] for r in results]
if results:
    P, R, F1 = score(preds, refs, lang='en', verbose=True)

    print(f"BERTScore Precision: {P.mean().item():.4f}")
    print(f"BERTScore Recall:    {R.mean().item():.4f}")
    print(f"BERTScore F1:        {F1.mean().item():.4f}\n")

    # Attach the per-row BERT scores to the result records so they are
    # written out as columns alongside each prediction.
    for record, p, r, f in zip(results, P.tolist(), R.tolist(), F1.tolist()):
        record['bert_precision'] = p
        record['bert_recall'] = r
        record['bert_f1'] = f
else:
    # Nothing to score when every row failed (or the frame is empty).
    print("No predictions were produced; skipping BERTScore.\n")

# Save results. The target folder differs from the one the CSV was read from,
# so create it if needed rather than failing after the whole run.
output_csv = "/content/drive/MyDrive/dataset_unzipped/dataset/vqa_results_with_bert.csv"
os.makedirs(os.path.dirname(output_csv), exist_ok=True)
results_df = pd.DataFrame(results)
results_df.to_csv(
    output_csv,
    index=False
)
