In [3]:
import pandas as pd
import torch
from PIL import Image
import os
from transformers import BlipProcessor, BlipForQuestionAnswering
from tqdm.notebook import tqdm
import evaluate
from sklearn.metrics import f1_score
import warnings
import re
import math

In [24]:
DATASET_CSV = '/kaggle/input/image-input/output.csv'
IMAGE_BASE_DIR = '/kaggle/working/images/small'
MODEL_NAME = "Salesforce/blip-vqa-base"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
BATCH_SIZE = 64
print(f"Using device: {DEVICE}")
print(f"Using Batch Size: {BATCH_SIZE}")

Using device: cuda
Using Batch Size: 64


In [12]:
print("Loading dataset...")
try:
    df = pd.read_csv(DATASET_CSV)
    # Optional: Sample the dataset for faster testing
    # df = df.sample(n=100, random_state=42).reset_index(drop=True)
    print(f"Loaded {len(df)} samples.")
except FileNotFoundError:
    print(f"Error: {DATASET_CSV} not found. Make sure it's in the correct directory.")
    exit()

# --- Load Model and Processor ---
print(f"Loading model: {MODEL_NAME}...")
processor = BlipProcessor.from_pretrained(MODEL_NAME, use_fast=True)
model = BlipForQuestionAnswering.from_pretrained(MODEL_NAME).to(DEVICE)
model.eval() # Set model to evaluation mode
print("Model loaded.")

Loading dataset...
Loaded 33866 samples.
Loading model: Salesforce/blip-vqa-base...


preprocessor_config.json:   0%|          | 0.00/445 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.56k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.54G [00:00<?, ?B/s]

Model loaded.


In [15]:
def get_vqa_prediction(image_path, question):
    """Gets a VQA prediction for a given image path and question."""
    try:
        raw_image = Image.open(image_path).convert('RGB')
    except FileNotFoundError:
        print(f"Warning: Image not found at {image_path}")
        return "[Image Load Error]"
    except Exception as e:
        print(f"Warning: Error loading image {image_path}: {e}")
        return "[Image Load Error]"

    # Prepare inputs
    inputs = processor(raw_image, question, return_tensors="pt").to(DEVICE)

    # Generate answer
    with torch.no_grad(): # Ensure no gradients are calculated during inference
        outputs = model.generate(**inputs, max_new_tokens=10) # Limit generated tokens for single-word answers

    # Decode answer
    answer = processor.decode(outputs[0], skip_special_tokens=True).strip()
    return answer


In [16]:
df.head()

Unnamed: 0,id,question,answer
0,718mYsQTQbL,What are the items in the image?,Bibs
1,718mYsQTQbL,What color is the solid bib?,Yellow
2,718mYsQTQbL,How many bibs are shown?,Six
3,718mYsQTQbL,What material are the bibs?,Cotton
4,718mYsQTQbL,Does one bib have a striped pattern?,Yes


In [None]:
# !gunzip /kaggle/working/images/metadata/images.csv.gz

In [21]:
directory = "/kaggle/working/listings/metadata"

df1 = pd.read_csv(r'/kaggle/working/images/metadata/images.csv')

In [23]:
for idx, row in df.iterrows():
    imageId = row['id']
    question = row['question']
    pt= df1[df1['image_id']==imageId]
    pt= pt['path'].values[0]
        
    true_answer = str(row['answer']).lower().strip()

4c/4c533ad7.jpg What are the items in the image? bibs


In [25]:
print("Running batched inference...")
predictions = []
ground_truths_normalized = [] # Store normalized ground truths for metrics
original_indices = []
num_batches = math.ceil(len(df) / BATCH_SIZE)

with torch.no_grad(): # Disable gradient calculations for inference
    for i in tqdm(range(0, len(df), BATCH_SIZE), total=num_batches, desc="Evaluating Batches"):
        batch_df = df[i:i+BATCH_SIZE]

        batch_images_pil = []
        batch_questions = []
        batch_ground_truths = []
        batch_valid_indices = [] # Store original indices of valid items in this batch

        # 1. Load images and collect data for the current batch
        for idx, row in batch_df.iterrows():
            imageId = row['id']
            question = row['question']
            pt= df1[df1['image_id']==imageId]
            pt= pt['path'].values[0]
            true_answer = str(row['answer']).lower().strip()
            img_path = os.path.join(IMAGE_BASE_DIR, pt)

            try:
                raw_image = Image.open(img_path).convert('RGB')
                batch_images_pil.append(raw_image)
                batch_questions.append(question)
                batch_ground_truths.append(true_answer)
                batch_valid_indices.append(idx) # Add original index if image loaded
            except FileNotFoundError:
                print(f"Warning: Image not found at {img_path}. Skipping row {idx}.")
                # Optionally store placeholders for missing images if needed later
            except Exception as e:
                print(f"Warning: Error loading image {img_path} for row {idx}: {e}. Skipping.")
                # Optionally store placeholders

        # 2. Process the batch if any valid images were loaded
        if not batch_images_pil:
            print(f"Warning: No valid images loaded for batch starting at index {i}. Skipping batch.")
            continue # Skip to the next batch

        # Use the processor for the entire batch
        inputs = processor(images=batch_images_pil, text=batch_questions, return_tensors="pt", padding=True, truncation=True).to(DEVICE)

        # 3. Generate answers for the batch
        outputs = model.generate(**inputs, max_new_tokens=10)

        # 4. Decode and store results for the batch
        batch_preds_decoded = processor.batch_decode(outputs, skip_special_tokens=True)

        for pred_idx, original_df_idx in enumerate(batch_valid_indices):
            # Normalize prediction
            predicted_answer = batch_preds_decoded[pred_idx].strip().lower()
            predicted_answer = re.sub(r'[^\w\s]', '', predicted_answer) # Basic cleanup

            # Normalize corresponding ground truth
            true_answer_normalized = batch_ground_truths[pred_idx] # Already lowercased/stripped
            true_answer_normalized = re.sub(r'[^\w\s]', '', true_answer_normalized) # Basic cleanup

            predictions.append(predicted_answer)
            ground_truths_normalized.append(true_answer_normalized)
            original_indices.append(original_df_idx) # Store the original index

    

Running batched inference...


Evaluating Batches:   0%|          | 0/530 [00:00<?, ?it/s]

In [None]:
# torch.cuda.empty_cache()
# import torch
# import gc

# # Delete unused variables
# del model, inputs, outputs  # or any other variables you created
# gc.collect()                # Run garbage collection
# torch.cuda.empty_cache()    # Release cached memory from PyTorch
# torch.cuda.ipc_collect()    # Additional cleanup for inter-process communication (optional)
# !nvidia-smi



In [28]:
results_df = pd.DataFrame({
    'original_index': original_indices,
    'predicted_answer': predictions,
    'ground_truth_normalized': ground_truths_normalized
})
# import pandas as pd
# results_df = pd.read_csv("./rs.csv")
# Merge results back with original dataframe (optional, but useful)
# Ensure the original df has a unique index if it was reset during sampling
df_with_results = df.merge(results_df, left_index=True, right_on='original_index', how='right') # 'right' join keeps only processed rows

# Save results
results_filename = 'vqa_results_baseline_batched.csv'
df_with_results.to_csv(results_filename, index=False)
print(f"Results saved to {results_filename}")

Results saved to vqa_results_baseline_batched.csv


In [1]:
# results_df.to_csv("./rs.csv")
import pandas as pd
results_df = pd.read_csv("../VR-mini-Proj-2/BLIP_vqa_results_baseline_batched.csv")

In [2]:
import evaluate
import bert_score

valid_predictions = results_df['predicted_answer'].to_list()
valid_ground_truths = results_df['ground_truth_normalized'].to_list()

if not valid_predictions:
    print("Error: No valid predictions available to calculate metrics.")
    exit()

# 1. Accuracy (Exact Match)
correct_predictions = sum(p == gt for p, gt in zip(valid_predictions, valid_ground_truths))
total_valid = len(valid_predictions)
accuracy = correct_predictions / total_valid if total_valid > 0 else 0


print(f"Accuracy (Exact Match): {accuracy:.4f}")


  from .autonotebook import tqdm as notebook_tqdm


Accuracy (Exact Match): 0.4248


In [3]:
f1_macro_simple = accuracy # As explained before, for binary match/no-match
print(f"F1 Score (Macro, based on Exact Match): {f1_macro_simple:.4f}")

F1 Score (Macro, based on Exact Match): 0.4248


In [4]:
from evaluate import load
bertscore = load("bertscore")
results = bertscore.compute(references=valid_ground_truths, predictions=valid_predictions,lang="en",model_type="distilbert-base-uncased",rescale_with_baseline=True)
print(results)

Downloading builder script: 100%|██████████| 7.95k/7.95k [00:00<00:00, 7.06MB/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


{'precision': [1.0, 1.0, 0.5916965007781982, 0.6486443877220154, 1.0000003576278687, 0.14919547736644745, 0.5461448431015015, 0.552427351474762, 0.6439481973648071, 1.0000003576278687, 1.0000003576278687, 0.10951437056064606, 1.0000003576278687, 1.0000003576278687, 1.0000003576278687, 0.5744565725326538, 0.7097427845001221, 0.1955794095993042, 0.21993836760520935, 0.2986866533756256, 0.509417712688446, 0.20546826720237732, 0.21071670949459076, 1.0000003576278687, 1.0000003576278687, 1.0000003576278687, 1.0000003576278687, 1.0000003576278687, 1.0000003576278687, 0.9999991059303284, 1.0, 0.033683400601148605, 0.40979093313217163, 0.3251575529575348, 0.12511059641838074, 0.571087121963501, 0.6439481973648071, 0.12792208790779114, 1.0000003576278687, 1.0000003576278687, 0.6734561920166016, 0.6439481973648071, 1.0, 1.0000003576278687, 0.9999991059303284, 1.0, 0.9999991059303284, -0.0924333706498146, 0.1541067510843277, 0.5090774893760681, 1.0000007152557373, 0.1868663877248764, 1.0000003576

In [5]:
import numpy as np
print(np.mean(results['precision']))
print(np.mean(results['recall']))
print(np.mean(results['f1']))

0.6075401236502055
0.6054488398469636
0.6065456961339067


In [48]:
from sklearn.feature_extraction.text import TfidfVectorizer
import torch
import torch.nn.functional as F

vectorizer = TfidfVectorizer()
all_sentences = valid_predictions + valid_ground_truths
vectorizer.fit(all_sentences)

vec1 = vectorizer.transform(valid_predictions).toarray()
vec2 = vectorizer.transform(valid_ground_truths).toarray()

pred_vec = torch.tensor(vec1, dtype=torch.float32)
gt_vec = torch.tensor(vec2, dtype=torch.float32)


cos_sim = F.cosine_similarity(pred_vec, gt_vec, dim=1)
print("Cosine similarity:", cos_sim)

Cosine similarity: tensor([1., 1., 0.,  ..., 1., 0., 0.])
