In [None]:
from transformers import Blip2Processor, Blip2ForConditionalGeneration
from PIL import Image
import torch

# Fetch the pretrained BLIP-2 (OPT-2.7b) processor from the Hugging Face hub.
processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")

# Load the matching model weights in half precision, then move the model to the GPU.
model = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-opt-2.7b",
    torch_dtype=torch.float16,
)
model = model.to("cuda")


In [None]:
import pandas as pd

# Read the question/answer table into a DataFrame.
df = pd.read_csv(filepath_or_buffer="/content/drive/MyDrive/dataset/qna.csv")

# Print a plain-text preview of the first five rows.
print(df.head(n=5))


In [None]:
import os

# Fix image paths: map the CSV's "../dataset/..." paths onto /content/drive/MyDrive/dataset

def fix_path(rel_path, dataset_root="/content/drive/MyDrive/dataset", rel_prefix="../dataset/"):
    """Map a dataset-relative image path onto the dataset root directory.

    Args:
        rel_path: Path string to rewrite (for example "../dataset/images/a.jpg").
        dataset_root: Directory the remaining sub-path is joined onto. Defaults
            to the location this notebook originally hard-coded.
        rel_prefix: Substring removed from ``rel_path`` before joining. Every
            occurrence is removed (``str.replace`` semantics), not only a
            leading one.

    Returns:
        ``dataset_root`` joined with what is left of ``rel_path``.
    """
    rel_subpath = rel_path.replace(rel_prefix, "")
    # Gotcha: os.path.join discards dataset_root when rel_subpath is itself absolute.
    return os.path.join(dataset_root, rel_subpath)

# Store the rewritten path of every row's image in a new "fixed_path" column.
df["fixed_path"] = df["image_path"].map(fix_path)


In [None]:
# Display the DataFrame, which now carries the "fixed_path" column added in the previous cell.
df

In [None]:
# Run the model on every row of `df` and score each prediction by exact string
# match (whitespace-trimmed, case-insensitive) against the "answer" column.
# Per-row results are written to vqa_results.csv at the end.


def _normalize_answer(value):
    """Return `value` as a stripped, lower-cased string for exact-match comparison."""
    # str() first: pandas parses an all-numeric answer column as int/float,
    # and those have no .strip(), which previously made the whole row error out.
    return str(value).strip().lower()


# Track results
correct = 0
failed_rows = []  # index labels of rows whose processing raised an exception
results = []

# Iterate through each row
for i, row in df.iterrows():
    try:
        # The context manager closes the file handle; convert() returns an
        # independent RGB copy that stays valid after the file is closed.
        with Image.open(row["fixed_path"]) as raw_image:
            image = raw_image.convert("RGB")
        question = row["question"]
        gt_answer = _normalize_answer(row["answer"])

        # VQA inference. Inputs follow the model's own device/dtype instead of
        # a hard-coded "cuda"/float16, so this cell matches however the model was loaded.
        # NOTE(review): the BLIP-2 docs show OPT checkpoints prompted as
        # "Question: ... Answer:"; the raw question is passed here — confirm this is intended.
        inputs = processor(images=image, text=question, return_tensors="pt").to(model.device, model.dtype)
        out = model.generate(**inputs)
        pred = _normalize_answer(processor.decode(out[0], skip_special_tokens=True))

        is_correct = pred == gt_answer

        results.append({
            "image_path": row["image_path"],
            "question": question,
            "ground_truth": gt_answer,
            "predicted": pred,
            "correct": is_correct
        })

        if is_correct:
            correct += 1

        print(f"Q: {question}\nGT: {gt_answer}\nPredicted: {pred}\n")

    except Exception as e:
        # Deliberately best-effort: one bad image or row must not abort the whole
        # evaluation. The row is recorded so the failure count is visible below.
        failed_rows.append(i)
        print(f"Error at row {i}: {e}")

# Final accuracy
# Failed rows stay in the denominator (i.e. count as incorrect), as before.
# An empty DataFrame yields 0.0 instead of raising ZeroDivisionError.
accuracy = correct / len(df) if len(df) else 0.0
print(f"\nOverall Accuracy: {accuracy:.2%}")
if failed_rows:
    print(f"Rows skipped due to errors: {len(failed_rows)} of {len(df)}")

# Save results
results_df = pd.DataFrame(results)
results_df.to_csv("/content/drive/MyDrive/dataset/vqa_results.csv", index=False)