In [1]:
import pandas as pd
import torch
from PIL import Image
import os
from transformers import BlipProcessor, BlipForQuestionAnswering
from tqdm.notebook import tqdm
import evaluate
from sklearn.metrics import f1_score
import warnings
import re
import math
import bert_score

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
import os, math, re
import torch
from transformers import ViltProcessor, ViltForQuestionAnswering
from PIL import Image
from tqdm.auto import tqdm

# ——— Settings ———
IMAGE_BASE_DIR = rf"/content/images/small"
BATCH_SIZE      = 32   # you can bump this up—ViLT uses less memory
DEVICE          = "cuda" if torch.cuda.is_available() else "cpu"
df=pd.read_csv('a.csv')
# 1. Load processor & model
processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa").to(DEVICE)

# 2. Wrap for multi-GPU (optional)
if torch.cuda.device_count() > 1:
    model = torch.nn.DataParallel(model)

# 3. Unwrap for calls
real_model = model.module if isinstance(model, torch.nn.DataParallel) else model

print(f"Using {real_model.__class__.__name__} on {torch.cuda.device_count()} GPU(s)")

# ——— Inference Loop ———
predictions = []
ground_truths = []
original_indices = []
num_batches = math.ceil(len(df) / BATCH_SIZE)

with torch.no_grad():
    for start in tqdm(range(0, len(df), BATCH_SIZE), total=num_batches, desc="VILT Batches"):
        batch = df.iloc[start : start + BATCH_SIZE]

        imgs, qs, gts, idxs = [], [], [], []
        for idx, row in batch.iterrows():
            fp = os.path.join(IMAGE_BASE_DIR, row["filename"])
            try:
                imgs.append(Image.open(fp).convert("RGB"))
                qs.append(row["question"])
                gt = re.sub(r"[^\w\s]", "", str(row["answer"]).lower())
                gts.append(gt)
                idxs.append(idx)
            except:
                continue

        if not imgs:
            continue

        # preprocess
        inputs = processor(
            images=imgs,
            text=qs,
            return_tensors="pt",
            padding=True,
            truncation=True
        ).to(DEVICE)

        # forward + argmax
        outputs = real_model(**inputs)
        # logits shape: [batch_size, num_labels]
        preds = outputs.logits.argmax(-1).tolist()
        # map indices -> label strings
        labels = [real_model.config.id2label[i] for i in preds]

        # normalize & store
        for i, orig_idx in enumerate(idxs):
            pred = re.sub(r"[^\w\s]", "", labels[i].lower().strip())
            predictions.append(pred)
            ground_truths.append(gts[i])
            original_indices.append(orig_idx)

# now compute your metrics on predictions vs ground_truths


Using ViltForQuestionAnswering on 1 GPU(s)


VILT Batches:   0%|          | 0/1552 [00:00<?, ?it/s]

In [None]:
print(predictions[:5])

['shirts', 'blue', '0', 'cotton', 'yes']


In [None]:
gt=[]
for i in range(len(original_indices)):
  gt.append(df.iloc[original_indices[i]]['answer'])

In [None]:
def f(l):
  l1=[]
  for i in range(len(l)):
    l[i]=str(l[i])
    l1.append(re.sub(r'[^\w\s]', '', l[i]))
  return l1

In [None]:
pred=f(predictions)
gt=f(gt)

In [None]:
predictions=pred
ground_truths_normalized=gt
results_df = pd.DataFrame({
    'original_index': original_indices,
    'predicted_answer': predictions,
    'ground_truth_normalized': ground_truths_normalized
})
# import pandas as pd
# results_df = pd.read_csv("./rs.csv")
# Merge results back with original dataframe (optional, but useful)
# Ensure the original df has a unique index if it was reset during sampling
df_with_results = df.merge(results_df, left_index=True, right_on='original_index', how='right') # 'right' join keeps only processed rows

# Save results
results_filename = 'vilt.csv'
df_with_results.to_csv(results_filename, index=False)
print(f"Results saved to {results_filename}")

Results saved to vilt.csv


In [None]:
valid_predictions = results_df['predicted_answer'].to_list()
valid_ground_truths = results_df['ground_truth_normalized'].to_list()

if not valid_predictions:
    print("Error: No valid predictions available to calculate metrics.")
    exit()

# 1. Accuracy (Exact Match)
correct_predictions = sum(p == gt for p, gt in zip(valid_predictions, valid_ground_truths))
total_valid = len(valid_predictions)
accuracy = correct_predictions / total_valid if total_valid > 0 else 0
print(f"Accuracy (Exact Match): {accuracy:.4f}")

Accuracy (Exact Match): 0.0224


In [None]:
from evaluate import load
bertscore = load("bertscore")
results = bertscore.compute(references=gt, predictions=pred,lang="en",model_type="distilbert-base-uncased",rescale_with_baseline=True)
print(results)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]



{'precision': [0.23259402811527252, 0.643298864364624, -0.02828621305525303, 1.0, 1.0000003576278687, 0.26691246032714844, 0.23748669028282166, 0.5591428279876709, 0.6439485549926758, 1.0000003576278687, 0.9999996423721313, -0.4996555745601654, 0.6105254292488098, 1.0000003576278687, 1.0000003576278687, 0.4216839075088501, 0.7097427845001221, 0.1327856183052063, 0.21993926167488098, 0.29868701100349426, -0.4145849049091339, 0.3430083692073822, 0.06639620661735535, 1.0000003576278687, 1.0000003576278687, 1.0000003576278687, 1.0000003576278687, 0.6439485549926758, 1.0000003576278687, 0.6439485549926758, 1.0000003576278687, 0.004265869501978159, 0.1122463047504425, 0.2660362422466278, 0.1276625096797943, 0.5710873007774353, 1.0000003576278687, 0.12792208790779114, 1.0000003576278687, 1.0000003576278687, 0.5471804738044739, 0.6439485549926758, 0.9999996423721313, 1.0000003576278687, 1.0000003576278687, 0.5620726943016052, 0.21203236281871796, 0.0335734523832798, 0.15410728752613068, 0.5090

In [None]:
import numpy as np
print(np.mean(results['precision']))
print(np.mean(results['recall']))
print(np.mean(results['f1']))

0.5520453261868089
0.5427727528822738
0.5476276128425076
