In [1]:
from transformers import BlipProcessor, BlipForQuestionAnswering
from PIL import Image
import pandas as pd
import torch
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score
from transformers import pipeline  # For BARTScore

# Install BERTScore (if not already installed)
!pip install bert-score

from bert_score import score as bert_score

# Pick the compute device once: CUDA when PyTorch can see a GPU, CPU otherwise,
# and report which one was chosen.
has_cuda = torch.cuda.is_available()
device = torch.device("cuda" if has_cuda else "cpu")
print(f"Using GPU: {torch.cuda.get_device_name(0)}" if has_cuda else "Using CPU")

# Load the pretrained BLIP VQA processor and model from the same checkpoint,
# then place the model on the selected device.
blip_checkpoint = "Salesforce/blip-vqa-base"
processor = BlipProcessor.from_pretrained(blip_checkpoint)
model = BlipForQuestionAnswering.from_pretrained(blip_checkpoint)
model = model.to(device)

import pandas as pd

# Read the full question/answer table from the Kaggle input directory
csv_path = "/kaggle/input/vr-final-project-baseline/VR_Final_Project/Dataset/qna_final.csv"
df = pd.read_csv(csv_path)

# Distinct Item_IDs in the table, and the first 20 of them
unique_ids = df['Item_ID'].unique()
first_20_unique_ids = unique_ids[:20]

# Small inspection sample: one row (the first occurrence) for each of those
# 20 Item_IDs, re-indexed from 0
df_20_unique_rows = (
    df[df['Item_ID'].isin(first_20_unique_ids)]
    .drop_duplicates(subset=['Item_ID'], keep='first')
    .head(20)
    .copy()
    .reset_index(drop=True)
)

# Preview the full table and the sample
print(df.head())
print(df_20_unique_rows.head())

# Row count and number of distinct Item_IDs in the sample
print(f"\nNumber of rows in df_20_unique_rows: {len(df_20_unique_rows)}")
print(f"Number of unique Item_IDs in df_20_unique_rows: {df_20_unique_rows['Item_ID'].nunique()}")

# To run the rest of the notebook on the small sample instead of the full
# table, rebind `df` to `df_20_unique_rows` at this point (currently disabled).

# Split at the Item_ID level (70% train, then the remaining 30% halved into
# validation and test) so that all rows of one item end up in the same split.
unique_ids = df["Item_ID"].unique()
print(unique_ids)
random_seed = 42
np.random.seed(random_seed)
train_ids, temp_ids = train_test_split(unique_ids, test_size=0.3, random_state=random_seed)
val_ids, test_ids = train_test_split(temp_ids, test_size=0.5, random_state=random_seed)
train_df, val_df, test_df = (
    df[df["Item_ID"].isin(split_ids)] for split_ids in (train_ids, val_ids, test_ids)
)

print(f"Train size: {len(train_df)}, Val size: {len(val_df)}, Test size: {len(test_df)}")
# Root directory of the extracted images
image_dir = "/kaggle/input/vr-final-project-baseline/VR_Final_Project/Dataset/extracted/images/small"


def get_relative_path(full_path):
    """Return the last two '/'-separated components of ``full_path``.

    For example ``"a/b/c3/img.jpg"`` becomes ``"c3/img.jpg"``. A path that
    contains no ``'/'`` is returned unchanged.
    """
    # Slicing the last two components also covers the single-component case:
    # joining a one-element list gives back the original string.
    return "/".join(full_path.split("/")[-2:])

# Run the BLIP VQA model over (image, question) pairs
def predict_answers(image_paths, questions):
    """Answer each question about its paired image with the loaded BLIP model.

    Args:
        image_paths: iterable of image path strings. Only the last two path
            components of each entry are used; they are resolved under
            ``image_dir``.
        questions: iterable of question strings, paired positionally with
            ``image_paths`` (pairing stops at the shorter iterable).

    Returns:
        A list with one entry per pair: the decoded, whitespace-stripped
        answer string, or ``None`` when the image file does not exist.
    """
    answers = []
    for source_path, prompt in zip(image_paths, questions):
        resolved_path = f"{image_dir}/{get_relative_path(source_path)}"
        try:
            rgb_image = Image.open(resolved_path).convert("RGB")
        except FileNotFoundError:
            # Keep the output aligned with the inputs: record a placeholder
            # for the missing image and move on to the next pair.
            print(f"Error: Image not found at {resolved_path} (from original: {source_path})")
            answers.append(None)
        else:
            # Tensors must live on the same device as the model
            model_inputs = processor(images=rgb_image, text=prompt, return_tensors="pt").to(device)
            with torch.no_grad():  # inference only, no gradients needed
                generated = model.generate(**model_inputs)
            answers.append(processor.decode(generated[0], skip_special_tokens=True).strip())
    return answers
# Build the inputs for the held-out test split and run the baseline model.
# Note: predict_answers() reduces each path to its last two components and
# prepends image_dir itself, so the prefix added here is redundant but harmless.
test_image_paths = []
for path in test_df['Image_Path']:
    test_image_paths.append(f"{image_dir}/{get_relative_path(path)}")

test_questions = test_df['Question'].tolist()
ground_truth_answers = test_df['Answer'].tolist()

predicted_answers = predict_answers(test_image_paths, test_questions)
# --- Evaluate Metrics ---

# 1. Accuracy (case-insensitive exact match)
# A prediction of None (the image could not be opened) is scored as the empty
# string, i.e. it counts as a wrong answer rather than being dropped.
predicted_answers_lower = [ans.lower() if ans is not None else "" for ans in predicted_answers]
ground_truth_answers_lower = [ans.lower() for ans in ground_truth_answers]
accuracy = accuracy_score(ground_truth_answers_lower, predicted_answers_lower)
print(f"Baseline Accuracy: {accuracy:.4f}")

# 2. F1 Score (micro-averaged, every distinct answer string is its own class)
# The encoder is fitted on the union of ground-truth and predicted answers.
# Fitting on the ground truth alone makes transform() raise ValueError as soon
# as the model produces an answer that never occurs in the ground truth, and
# substituting an all-zeros prediction vector in that case yields a score
# that says nothing about the model.
# NOTE: for single-label multiclass data the micro-averaged F1 equals the
# accuracy; use average='macro' if a class-balanced figure is wanted.
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
label_encoder.fit(ground_truth_answers_lower + predicted_answers_lower)
encoded_ground_truth = label_encoder.transform(ground_truth_answers_lower)
encoded_predicted = label_encoder.transform(predicted_answers_lower)
f1 = f1_score(encoded_ground_truth, encoded_predicted, average='micro')
print(f"Baseline F1 Score (Micro): {f1:.4f}")

# The embedding-based metrics below are computed only when there is at least
# one pair and every prediction and reference is truthy (not None, not "").
has_all_answers = bool(predicted_answers) and all(predicted_answers) and all(ground_truth_answers)

# 3. BERTScore with BART embeddings
# This calls bert_score() with the facebook/bart-large-cnn encoder; it is not
# the separate BARTScore metric. bert_score() returns a (precision, recall, F1)
# tuple, so index 0 is the per-pair precision, and the label says so.
if has_all_answers:
    bart_scores = bert_score(predicted_answers, ground_truth_answers, model_type='facebook/bart-large-cnn', device=device)
    avg_bart_score = bart_scores[0].mean().item()
    print(f"Baseline BERTScore Precision (facebook/bart-large-cnn): {avg_bart_score:.4f}")
else:
    print("Skipping BERTScore (facebook/bart-large-cnn) calculation due to missing predictions.")

# 4. BERTScore (default English model), mean F1 over all pairs
if has_all_answers:
    P, R, F1 = bert_score(predicted_answers, ground_truth_answers, lang="en", verbose=True, device=device)
    avg_bert_f1 = F1.mean().item()
    print(f"Baseline BERTScore F1: {avg_bert_f1:.4f}")
else:
    print("Skipping BERTScore calculation due to missing predictions.")


2025-05-14 05:57:06.422598: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747202226.444667      89 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747202226.451386      89 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered




Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Using GPU: Tesla T4
      Image_ID     Item_ID                   Question Answer       Image_Path
0  81mEuveXFVL  B07PW6XQY8  What is the bed's finish?  Wenge  c3/c339ab63.jpg
1  81mEuveXFVL  B07PW6XQY8  What is the bed's finish?  Wenge  a1/a19ec002.jpg
2  81mEuveXFVL  B07PW6XQY8  What is the bed's finish?  Wenge  b0/b022e37b.jpg
3  81mEuveXFVL  B07PW6XQY8  What is the bed's finish?  Wenge  23/23063c20.jpg
4  81mEuveXFVL  B07PW6XQY8  What is the bed's finish?  Wenge  15/15dd91bd.jpg
      Image_ID     Item_ID                                           Question  \
0  81mEuveXFVL  B07PW6XQY8                          What is the bed's finish?   
1  61mZ+WX+N8L  B074VDJ7KZ                          Is this product unsalted?   
2  51JKXD3XTJL  B00004SD6V                           Is the trowel rustproof?   
3  41ZZFRX9TXL  B00005041B             What is the shape of the light shades?   
4  51jNv-Fzw6L  B000GXF4UC  What is the color of the Strathwood Camano Sun...   

  Answer       Image_Path

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Baseline BARTScore: 0.6892


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/9 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/63 [00:00<?, ?it/s]

done in 1.38 seconds, 2882.65 sentences/sec
Baseline BERTScore F1: 0.9675


In [2]:
# Write the test, train and validation splits to CSV (without the index
# column) for future reference.
for split_frame, out_name in (
    (test_df, 'test_qna_final.csv'),
    (train_df, 'train_qna_final.csv'),
    (val_df, 'val_qna_final.csv'),
):
    split_frame.to_csv(out_name, index=False)