In [1]:
"""Imports go here"""

from transformers import ViltProcessor, ViltForQuestionAnswering
import torch
from PIL import Image
import requests

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from torch.utils.data import Dataset
from torch.utils.data import DataLoader
#from word2number import w2n


2025-05-14 02:55:51.306136: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747191351.490545      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747191351.540763      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
""" Load CSV and split the data """

# Load the CSV file
csv_path = "/kaggle/input/qna-final/qna_final.csv"
df = pd.read_csv(csv_path)

# Get unique Item_IDs
unique_ids = df["Item_ID"].unique()

# Set random seed for reproducibility
random_seed = 42
np.random.seed(random_seed)

# Shuffle and split the unique IDs
train_ids, temp_ids = train_test_split(unique_ids, test_size=0.3, random_state=random_seed)  # 70% train
val_ids, test_ids = train_test_split(temp_ids, test_size=0.5, random_state=random_seed)      # 15% val, 15% test

# Create train, val, and test DataFrames
train_df = df[df["Item_ID"].isin(train_ids)]
val_df = df[df["Item_ID"].isin(val_ids)]
test_df = df[df["Item_ID"].isin(test_ids)]

print(f"Train size: {len(train_df)}, Val size: {len(val_df)}, Test size: {len(test_df)}")


Train size: 16666, Val size: 3663, Test size: 3558


In [3]:
"""Normalizing and mapping non-existing answers to semantically similar existing answers in label2id"""
# Load the model and processor
model_config_source = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa")

# Text-to-number mapping (same as before)
def get_text_to_num_mapping():
    text_to_num = {
        "zero": "0", "one": "1", "two": "2", "three": "3", "four": "4",
        "five": "5", "six": "6", "seven": "7", "eight": "8", "nine": "9",
        "ten": "10", "eleven": "11", "twelve": "12", "thirteen": "13",
        "fourteen": "14", "fifteen": "15", "sixteen": "16", "seventeen": "17",
        "eighteen": "18", "nineteen": "19", "twenty": "20",
    }
    for i in range(21, 1001):
        text_to_num[str(i)] = str(i)
    return text_to_num

text_to_num_map = get_text_to_num_mapping()

def normalize_answer(answer_str):
    normalized = str(answer_str).strip().lower()
    return text_to_num_map.get(normalized, normalized)

# Create DataFrames with .copy()
train_df = df[df["Item_ID"].isin(train_ids)].copy()
val_df = df[df["Item_ID"].isin(val_ids)].copy()
test_df = df[df["Item_ID"].isin(test_ids)].copy()

# Add normalized answers
train_df['normalized_answer'] = train_df['Answer'].apply(normalize_answer)
val_df['normalized_answer'] = val_df['Answer'].apply(normalize_answer)
test_df['normalized_answer'] = test_df['Answer'].apply(normalize_answer)

# Get the original label2id
original_label2id = model_config_source.config.label2id
original_answers = list(original_label2id.keys())

# Load sentence-transformers for semantic similarity
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Load a pre-trained model for semantic embeddings
print("Loading sentence transformer model for semantic matching...")
semantic_model = SentenceTransformer('all-MiniLM-L6-v2')  # A lightweight model that works well for semantic similarity

# Pre-compute embeddings for all original answers
print("Computing embeddings for original vocabulary...")
original_embeddings = semantic_model.encode(original_answers, show_progress_bar=False)

def find_semantically_similar_answer(new_answer, original_answers, original_embeddings):
    """Find the most semantically similar answer in original_answers to new_answer"""
    # Get embedding for the new answer
    new_embedding = semantic_model.encode([new_answer], show_progress_bar=False)
    
    # Calculate cosine similarity between new answer and all original answers
    similarities = cosine_similarity(new_embedding, original_embeddings)[0]
    
    # Get the index of the most similar answer
    most_similar_idx = np.argmax(similarities)
    similarity_score = similarities[most_similar_idx]
    
    return original_answers[most_similar_idx], similarity_score

# Create a mapping dictionary for unseen answers
answer_mapping = {}
similarity_scores = {}

print("Creating semantic mappings for unseen answers...")
# Process all datasets to create mappings
for dataset_name, df in [("train", train_df), ("val", val_df), ("test", test_df)]:
    mapped_count = 0
    for ans in df['normalized_answer'].unique():
        if ans not in original_label2id and ans not in answer_mapping:
            similar_ans, score = find_semantically_similar_answer(ans, original_answers, original_embeddings)
            answer_mapping[ans] = similar_ans
            similarity_scores[ans] = score
            mapped_count += 1
    
    print(f"Dataset {dataset_name}: Mapped {mapped_count} unseen answers to semantically similar existing answers")

# Apply mapping to create mapped_answer column
def map_to_similar_answer(answer):
    if answer in original_label2id:
        return answer  # Already in the vocabulary
    return answer_mapping.get(answer, answer)  # Map to similar answer if needed

train_df['mapped_answer'] = train_df['normalized_answer'].apply(map_to_similar_answer)
val_df['mapped_answer'] = val_df['normalized_answer'].apply(map_to_similar_answer)
test_df['mapped_answer'] = test_df['normalized_answer'].apply(map_to_similar_answer)

# Print some statistics about the mapping
print("\nAnswer mapping examples (with similarity scores):")
if answer_mapping:
    # Sort by similarity score for better examples display
    sorted_mappings = sorted([(k, v, similarity_scores[k]) for k, v in answer_mapping.items()], 
                            key=lambda x: x[2], reverse=True)
    
    for i, (new_ans, similar_ans, score) in enumerate(sorted_mappings[:10]):  # Show first 10 examples
        print(f"  '{new_ans}' -> '{similar_ans}' (similarity: {score:.3f})")

    if len(answer_mapping) > 10:
        print(f"  ... and {len(answer_mapping) - 10} more mappings")
else:
    print("  No mappings were created (all answers already in vocabulary)")

# --- Final Check ---
print("\nFinal sizes:")
print(f"Train: {len(train_df)}")
print(f"Val: {len(val_df)}")
print(f"Test: {len(test_df)}")
print(f"Original vocabulary size: {len(original_label2id)}")
print(f"Total answer mappings created: {len(answer_mapping)}")

config.json:   0%|          | 0.00/136k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/470M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/251 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/320 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/470M [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Loading sentence transformer model for semantic matching...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Computing embeddings for original vocabulary...
Creating semantic mappings for unseen answers...
Dataset train: Mapped 335 unseen answers to semantically similar existing answers
Dataset val: Mapped 40 unseen answers to semantically similar existing answers
Dataset test: Mapped 57 unseen answers to semantically similar existing answers

Answer mapping examples (with similarity scores):
  'multi-colored' -> 'multi colored' (similarity: 0.977)
  'grey' -> 'gray' (similarity: 0.968)
  'aluminium' -> 'aluminum' (similarity: 0.960)
  'multicolor' -> 'multicolored' (similarity: 0.950)
  'sunsets' -> 'sunset' (similarity: 0.941)
  '6-foot' -> '6 feet' (similarity: 0.921)
  'almond' -> 'almonds' (similarity: 0.913)
  '89' -> '88' (similarity: 0.913)
  'olive' -> 'olives' (similarity: 0.906)
  'letter' -> 'letters' (similarity: 0.902)
  ... and 422 more mappings

Final sizes:
Train: 16666
Val: 3663
Test: 3558
Original vocabulary size: 3129
Total answer mappings created: 432


In [5]:
"""Create a custom dataset class"""

class QnADataset(Dataset):
    def __init__(self, dataframe, image_dir, processor, label2id): # Processor is not strictly needed here anymore, but label2id is
        self.dataframe = dataframe
        self.image_dir = image_dir
        # self.processor = processor # Not used directly in __getitem__ anymore
        self.label2id = label2id
        self.text_to_num = self.generate_text_to_num_mapping()

    def generate_text_to_num_mapping(self):
        # (Your existing generate_text_to_num_mapping method - keep as is)
        text_to_num = {
            "zero": "0", "one": "1", "two": "2", "three": "3", "four": "4",
            "five": "5", "six": "6", "seven": "7", "eight": "8", "nine": "9",
            "ten": "10", "eleven": "11", "twelve": "12", "thirteen": "13",
            "fourteen": "14", "fifteen": "15", "sixteen": "16", "seventeen": "17",
            "eighteen": "18", "nineteen": "19", "twenty": "20",
        }
        for i in range(21, 1001):
            text_to_num[str(i)] = str(i)
        return text_to_num

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]
        image_path = f"{self.image_dir}/{row['Image_Path']}"
        question_text = row["Question"]  # Keep as raw text
        
        # Use mapped_answer instead of Answer
        mapped_answer = row["mapped_answer"].strip().lower()

        # Convert text-based numbers to numerical strings if needed
        if mapped_answer in self.text_to_num:
            processed_answer_str = self.text_to_num[mapped_answer]
        else:
            processed_answer_str = mapped_answer

        # Load PIL image
        try:
            pil_image = Image.open(image_path).convert("RGB")
        except FileNotFoundError:
            print(f"Error: Image not found at {image_path}")
            # Handle appropriately: skip, return None, or use a placeholder
            # For now, let's re-raise to make it obvious during debugging
            raise
        except Exception as e:
            print(f"Error loading image {image_path}: {e}")
            raise

        # Encode the answer string to an ID
        if processed_answer_str in self.label2id:
            answer_id = self.label2id[processed_answer_str]
        else:
            # This should be less common now since we're using mapped answers
            print(f"Warning: Mapped answer '{processed_answer_str}' not found in label2id mapping. Item index: {idx}, Image: {row['Image_Path']}")
            # We'll still include error handling for robustness
            raise ValueError(f"Mapped answer '{processed_answer_str}' (from original '{row.get('Answer', 'N/A')}') not found in label2id mapping for image {row['Image_Path']}.")

        return {
            "image": pil_image,          # Return the PIL Image object
            "question": question_text,   # Return the raw question string
            "labels": torch.tensor(answer_id, dtype=torch.long) # Return the label as a tensor
        }

In [6]:
""" Prepare dataloaders """
from functools import partial

# Use original_label2id instead of extended_label2id
num_labels = len(original_label2id)

# Directory containing the images
image_dir = "/kaggle/input/filtered-small-amazon-qna"

# Create datasets with ORIGINAL labels and dataframes containing mapped_answer column
train_dataset = QnADataset(train_df, image_dir, processor, original_label2id)
val_dataset = QnADataset(val_df, image_dir, processor, original_label2id)  # Use full val_df, not filtered
test_dataset = QnADataset(test_df, image_dir, processor, original_label2id)  # Use full test_df, not filtered

# Collate function with original num_labels
def collate_fn(batch, processor, num_classes=num_labels):
    """ViLT-compatible collate function with one-hot encoding"""
    # Filter out invalid entries
    valid_batch = [
        item for item in batch 
        if item is not None 
        and isinstance(item.get("image"), Image.Image)
        and item.get("question") 
        and item.get("labels") is not None
    ]
    
    if not valid_batch:
        return None
    
    # Process valid items
    images = [item["image"] for item in valid_batch]
    texts = [item["question"] for item in valid_batch]
    labels = [item["labels"] for item in valid_batch]  # Should be class indices

    # Process through processor
    try:
        encoding = processor(
            images=images,
            text=texts,
            return_tensors="pt",
            padding="longest",
            truncation=True,
            max_length=512
        )
    except Exception as e:
        print(f"Skipping batch: {str(e)}")
        return None

    # Convert labels to one-hot encoding
    batch_size = len(labels)
    one_hot_labels = torch.zeros(batch_size, num_classes)
    for i, label in enumerate(labels):
        one_hot_labels[i, label] = 1.0

    encoding["labels"] = one_hot_labels
    return encoding

# Create DataLoaders with proper partial binding
train_loader = DataLoader(
    train_dataset,
    batch_size=16,
    shuffle=True,
    collate_fn=partial(collate_fn, processor=processor),  # Keyword argument binding
    drop_last=True
)

val_loader = DataLoader(
    val_dataset,
    batch_size=16,
    shuffle=False,
    collate_fn=partial(collate_fn, processor=processor)  # Keyword argument binding
)

test_loader = DataLoader(
    test_dataset,
    batch_size=16,
    shuffle=False,
    collate_fn=partial(collate_fn, processor=processor)  # Keyword argument binding
)

# Fine tuning part

In [10]:
import os
import time
import torch
from tqdm import tqdm
from transformers import get_linear_schedule_with_warmup
import torch.optim as optim
from peft import LoraConfig, get_peft_model

# --- Config ---
NUM_EPOCHS    = 20
LEARNING_RATE = 1e-4
WEIGHT_DECAY  = 1e-2
WARMUP_RATIO  = 0.1       # 10% of total steps
MAX_GRAD_NORM = 1.0
OUTPUT_DIR    = "/kaggle/working/vilt-lora-manual-best"
os.makedirs(OUTPUT_DIR, exist_ok=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# --- Model + LoRA setup (UPDATED) ---
original_model = ViltForQuestionAnswering.from_pretrained(
    "dandelin/vilt-b32-finetuned-vqa",
    # Using original vocabulary from the pretrained model
    num_labels=len(original_label2id),
    id2label=model_config_source.config.id2label,
    label2id=original_label2id
    # Removed ignore_mismatched_sizes since we're using original sizes
)

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["query", "value"],
    lora_dropout=0.1,
    bias="none",
    modules_to_save=["classifier"]
)
model = get_peft_model(original_model, lora_config)
model.to(device)
model.print_trainable_parameters()

# --- Optimizer + Scheduler + AMP Scaler ---
optimizer = optim.AdamW(
    model.parameters(), 
    lr=LEARNING_RATE, 
    weight_decay=WEIGHT_DECAY
)

total_steps   = len(train_loader) * NUM_EPOCHS
warmup_steps  = int(WARMUP_RATIO * total_steps)
scheduler     = get_linear_schedule_with_warmup(
    optimizer, warmup_steps, total_steps
)

scaler = torch.cuda.amp.GradScaler()

best_val_loss = float('inf')
patience, patience_counter = 10, 0

for epoch in range(1, NUM_EPOCHS + 1):
    print(f"\n-- Epoch {epoch}/{NUM_EPOCHS} --")
    t0_epoch = time.time()

    # ---- TRAIN ----
    model.train()
    train_loss = 0.0
    train_batches = 0
    pbar = tqdm(train_loader, desc="Train", leave=False)
    for batch in pbar:
        # Skip None batches
        if batch is None:
            continue
            
        batch = {k: v.to(device) for k,v in batch.items() if v is not None}

        optimizer.zero_grad()
        with torch.amp.autocast('cuda'):  # Updated to new format
            outputs = model(**batch)
            loss = outputs.loss

        scaler.scale(loss).backward()
        # clip grads
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), MAX_GRAD_NORM)
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()

        train_loss += loss.item()
        train_batches += 1
        pbar.set_postfix(loss=loss.item())

    avg_train = train_loss / max(train_batches, 1)  # Avoid division by zero
    
    # ---- VALIDATION ----
    model.eval()
    val_loss = 0.0
    val_batches = 0
    with torch.no_grad():
        pbar = tqdm(val_loader, desc="Valid", leave=False)
        for batch in pbar:
            # Skip None batches
            if batch is None:
                continue
                
            batch = {k: v.to(device) for k,v in batch.items() if v is not None}
            with torch.amp.autocast('cuda'):  # Updated to new format
                loss = model(**batch).loss
            val_loss += loss.item()
            val_batches += 1
            pbar.set_postfix(loss=loss.item())

    avg_val = val_loss / max(val_batches, 1)  # Avoid division by zero
    print(f"Train Loss: {avg_train:.4f} | Val Loss: {avg_val:.4f} | Time: {(time.time()-t0_epoch):.1f}s")

    # ---- Early Stopping & Checkpointing ----
    if avg_val < best_val_loss:
        best_val_loss = avg_val
        patience_counter = 0
        print(f" New best! Saving to {OUTPUT_DIR}")
        model.save_pretrained(OUTPUT_DIR)
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print(f"Stopping early (no improvement for {patience} epochs).")
            break

print("\n=== Training Complete ===")
print(f"Best Validation Loss: {best_val_loss:.4f}")
print(f"Best model saved at: {OUTPUT_DIR}")

Using device: cuda


  scaler = torch.cuda.amp.GradScaler()


trainable params: 6,583,353 || all params: 124,171,890 || trainable%: 5.3018

-- Epoch 1/20 --


Valid:   6%|▌         | 14/229 [00:04<01:05,  3.27it/s, loss=2.63]    

Skipping batch: height and width must be > 0


Valid:  55%|█████▍    | 125/229 [00:36<00:31,  3.28it/s, loss=1.17] 

Skipping batch: height and width must be > 0


                                                                     

Train Loss: 2.9718 | Val Loss: 2.2320 | Time: 498.8s
 New best! Saving to /kaggle/working/vilt-lora-manual-best

-- Epoch 2/20 --


Valid:   6%|▌         | 14/229 [00:04<01:03,  3.38it/s, loss=2.69]    

Skipping batch: height and width must be > 0


Valid:  55%|█████▍    | 125/229 [00:34<00:31,  3.35it/s, loss=0.946] 

Skipping batch: height and width must be > 0


                                                                      

Train Loss: 1.8692 | Val Loss: 2.0703 | Time: 489.7s
 New best! Saving to /kaggle/working/vilt-lora-manual-best

-- Epoch 3/20 --


Valid:   6%|▌         | 14/229 [00:04<01:03,  3.37it/s, loss=2.82]    

Skipping batch: height and width must be > 0


Valid:  55%|█████▍    | 125/229 [00:34<00:31,  3.31it/s, loss=1.3]    

Skipping batch: height and width must be > 0


                                                                       

Train Loss: 1.3256 | Val Loss: 2.1360 | Time: 490.2s

-- Epoch 4/20 --


Valid:   6%|▌         | 14/229 [00:04<01:03,  3.41it/s, loss=4.44]    

Skipping batch: height and width must be > 0


Valid:  55%|█████▍    | 125/229 [00:34<00:30,  3.36it/s, loss=1.38]   

Skipping batch: height and width must be > 0


                                                                       

Train Loss: 0.9537 | Val Loss: 2.2327 | Time: 489.6s

-- Epoch 5/20 --


Valid:   6%|▌         | 14/229 [00:04<01:02,  3.42it/s, loss=4.17]    

Skipping batch: height and width must be > 0


Valid:  55%|█████▍    | 125/229 [00:34<00:31,  3.35it/s, loss=1.67]   

Skipping batch: height and width must be > 0


                                                                       

Train Loss: 0.7343 | Val Loss: 2.3835 | Time: 484.4s

-- Epoch 6/20 --


Valid:   6%|▌         | 14/229 [00:04<01:03,  3.38it/s, loss=4.99]     

Skipping batch: height and width must be > 0


Valid:  55%|█████▍    | 125/229 [00:34<00:31,  3.26it/s, loss=2.13]   

Skipping batch: height and width must be > 0


                                                                       

Train Loss: 0.5816 | Val Loss: 2.5106 | Time: 484.8s

-- Epoch 7/20 --


Valid:   6%|▌         | 14/229 [00:04<01:02,  3.46it/s, loss=5.47]     

Skipping batch: height and width must be > 0


Valid:  55%|█████▍    | 125/229 [00:34<00:31,  3.33it/s, loss=2.31]    

Skipping batch: height and width must be > 0


                                                                       

Train Loss: 0.4787 | Val Loss: 2.6536 | Time: 484.4s

-- Epoch 8/20 --


Valid:   6%|▌         | 14/229 [00:04<01:02,  3.42it/s, loss=5.89]     

Skipping batch: height and width must be > 0


Valid:  55%|█████▍    | 125/229 [00:34<00:31,  3.30it/s, loss=1.99]    

Skipping batch: height and width must be > 0


                                                                       

Train Loss: 0.4019 | Val Loss: 2.8437 | Time: 485.0s

-- Epoch 9/20 --


Valid:   6%|▌         | 14/229 [00:04<01:04,  3.36it/s, loss=7.05]     

Skipping batch: height and width must be > 0


Valid:  55%|█████▍    | 125/229 [00:34<00:31,  3.30it/s, loss=0.797]   

Skipping batch: height and width must be > 0


                                                                       

Train Loss: 0.3471 | Val Loss: 2.8858 | Time: 485.0s

-- Epoch 10/20 --


Valid:   6%|▌         | 14/229 [00:04<01:02,  3.44it/s, loss=6.65]     

Skipping batch: height and width must be > 0


Valid:  55%|█████▍    | 125/229 [00:34<00:31,  3.29it/s, loss=2.19]    

Skipping batch: height and width must be > 0


                                                                       

Train Loss: 0.2983 | Val Loss: 3.0493 | Time: 485.9s

-- Epoch 11/20 --


Valid:   6%|▌         | 14/229 [00:04<01:02,  3.46it/s, loss=7.5]      

Skipping batch: height and width must be > 0


Valid:  55%|█████▍    | 125/229 [00:34<00:31,  3.31it/s, loss=2.44]    

Skipping batch: height and width must be > 0


                                                                       

Train Loss: 0.2638 | Val Loss: 3.1226 | Time: 486.3s

-- Epoch 12/20 --


Valid:   6%|▌         | 14/229 [00:04<01:04,  3.33it/s, loss=7.8]      

Skipping batch: height and width must be > 0


Valid:  55%|█████▍    | 125/229 [00:34<00:31,  3.30it/s, loss=2.97]    

Skipping batch: height and width must be > 0


                                                                       

Train Loss: 0.2339 | Val Loss: 3.2616 | Time: 485.0s
Stopping early (no improvement for 10 epochs).

=== Training Complete ===
Best Validation Loss: 2.0703
Best model saved at: /kaggle/working/vilt-lora-manual-best




# Metrics

In [11]:
!pip install bert-score
!git clone https://github.com/neulab/BARTScore.git
import sys
sys.path.append("./BARTScore")
# Now import
from bart_score import BARTScorer

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch>=1.0.0->bert-score)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cusparse-cu12==12.3.1.170 (from torch>=1.0.0->bert-score)
  

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


remote: Enumerating objects: 220, done.[K
remote: Counting objects: 100% (26/26), done.[K
remote: Compressing objects: 100% (12/12), done.[K
remote: Total 220 (delta 18), reused 14 (delta 14), pack-reused 194 (from 1)[K
Receiving objects: 100% (220/220), 101.98 MiB | 21.64 MiB/s, done.
Resolving deltas: 100% (47/47), done.
Updating files: 100% (192/192), done.


In [13]:
import sys
import time
import torch
from tqdm import tqdm
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# 1. Add local BARTScore code into Python’s import path
sys.path.append("./BARTScore")

# 2. Semantic‐similarity imports
from bert_score import score as bert_score
from bart_score import BARTScorer

# 3. PEFT & model imports
from transformers import ViltForQuestionAnswering
from peft import PeftModel

# 4. Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# 5. Reload base + LoRA‐finetuned model
#    (assumes you previously saved to OUTPUT_DIR)
OUTPUT_DIR = "/kaggle/working/vilt-lora-manual-best"
base_model = ViltForQuestionAnswering.from_pretrained(
    "dandelin/vilt-b32-finetuned-vqa",
    num_labels=len(original_label2id),
    id2label=model_config_source.config.id2label,
    label2id=original_label2id,
    ignore_mismatched_sizes=True
)
model = PeftModel.from_pretrained(base_model, OUTPUT_DIR)
model.to(device)
model.eval()

# 6. Accumulators
all_pred_ids   = []
all_true_ids   = []
all_pred_texts = []
all_true_texts = []

# --- Start overall timer ---
t0_overall = time.time()

# 7. Inference + gather labels/texts with progress bar
t0_loop = time.time()
for batch in tqdm(test_loader, desc="Evaluating batches"):
    batch = {k: v.to(device) for k, v in batch.items()}
    outputs = model(**batch)
    logits  = outputs.logits

    # Predicted & true IDs
    pred_ids = logits.argmax(dim=-1)
    true_ids = batch["labels"].argmax(dim=-1)

    # Flatten for metrics
    pred_flat = pred_ids.view(-1).cpu().numpy()
    true_flat = true_ids.view(-1).cpu().numpy()
    all_pred_ids.extend(pred_flat)
    all_true_ids.extend(true_flat)

    # Convert to label strings
    all_pred_texts.extend([model.config.id2label[i] for i in pred_flat])
    all_true_texts.extend([model.config.id2label[i] for i in true_flat])
t1_loop = time.time()
print(f"\nInference & gathering took {t1_loop - t0_loop:.2f}s")

# 8. Classification metrics
t0_cls = time.time()
accuracy  = accuracy_score(all_true_ids, all_pred_ids)
precision = precision_score(all_true_ids, all_pred_ids, average="macro", zero_division=0)
recall    = recall_score(all_true_ids, all_pred_ids, average="macro", zero_division=0)
f1        = f1_score(all_true_ids, all_pred_ids, average="macro", zero_division=0)
t1_cls = time.time()

print(f"\nClassification metrics computed in {t1_cls - t0_cls:.2f}s")
print("=== Classification Metrics ===")
print(f"Accuracy      : {accuracy:.4f}")
print(f"Precision (M) : {precision:.4f}")
print(f"Recall    (M) : {recall:.4f}")
print(f"F1 Score  (M) : {f1:.4f}")

# 9. BERTScore (semantic similarity)
t0_bert = time.time()
bert_p, bert_r, bert_f1 = bert_score(
    all_pred_texts,
    all_true_texts,
    lang="en",
    model_type="bert-base-uncased",
    rescale_with_baseline=True
)
t1_bert = time.time()
print(f"\nBERTScore computed in {t1_bert - t0_bert:.2f}s")
print("=== BERTScore ===")
print(f"Precision : {bert_p.mean().item():.4f}")
print(f"Recall    : {bert_r.mean().item():.4f}")
print(f"F1        : {bert_f1.mean().item():.4f}")

# 10. BARTScore (semantic entailment)
t0_bart = time.time()
bart_scorer = BARTScorer(device=device.type, checkpoint="facebook/bart-large-cnn")
bart_scores = bart_scorer.score(
    all_pred_texts,
    all_true_texts,
    batch_size=8
)
t1_bart = time.time()
mean_bart = sum(bart_scores) / len(bart_scores)
print(f"\nBARTScore computed in {t1_bart - t0_bart:.2f}s")
print("=== BARTScore ===")
print(f"Mean score: {mean_bart:.4f}")

# --- End overall timer ---
t1_overall = time.time()
print(f"\nTotal evaluation time: {t1_overall - t0_overall:.2f}s")


Using device: cuda


Evaluating batches: 100%|██████████| 223/223 [01:53<00:00,  1.96it/s]



Inference & gathering took 113.86s

Classification metrics computed in 0.02s
=== Classification Metrics ===
Accuracy      : 0.6796
Precision (M) : 0.1208
Recall    (M) : 0.1220
F1 Score  (M) : 0.1090


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]


BERTScore computed in 7.16s
=== BERTScore ===
Precision : 0.8412
Recall    : 0.8376
F1        : 0.8382


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]


BARTScore computed in 21.09s
=== BARTScore ===
Mean score: -3.3844

Total evaluation time: 142.13s


In [14]:
import shutil
import os

folder_to_zip = "/kaggle/working/vilt-lora-manual-best"
output_zip_name = "/kaggle/working/vilt-lora-manual-best_model" # Name for the zip file (no .zip here)

try:
    shutil.make_archive(output_zip_name,  # The name of the file to create (e.g., /kaggle/working/my_model_archive)
                        'zip',             # The format (zip, tar, etc.)
                        root_dir=os.path.dirname(folder_to_zip), # The directory containing the folder to zip
                        base_dir=os.path.basename(folder_to_zip)) # The folder to zip

    print(f"Successfully created zip file: {output_zip_name}.zip")
    print(f"You can now find '{os.path.basename(output_zip_name)}.zip' in the Output section on the right sidebar (or under /kaggle/working/) and download it.")
except FileNotFoundError:
    print(f"Error: The folder {folder_to_zip} was not found. Please check the path.")
except Exception as e:
    print(f"An error occurred during zipping: {e}")



Successfully created zip file: /kaggle/working/vilt-lora-manual-best_model.zip
You can now find 'vilt-lora-manual-best_model.zip' in the Output section on the right sidebar (or under /kaggle/working/) and download it.


In [20]:
import zipfile
import os

# Define the output zip file name
zip_filename = '/kaggle/working/output.zip'

# List of files and directories to include
items_to_zip = [
    'vilt-lora-manual-best',
    'README.md',
    'adapter_config.json',
    'adapter_model.safetensors'
]

def zip_directory(path, ziph):
    # Walk through all files in the directory
    for root, dirs, files in os.walk(path):
        for file in files:
            file_path = os.path.join(root, file)
            # Add file to zip file, preserving directory structure
            ziph.write(file_path, os.path.relpath(file_path, os.path.dirname(path)))

with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zipf:
    for item in items_to_zip:
        if os.path.isdir(item):
            # Add directory recursively
            zip_directory(item, zipf)
        elif os.path.isfile(item):
            # Add single file
            zipf.write(item)
            
print(f"Zip file created at: {zip_filename}")

Zip file created at: /kaggle/working/output.zip


In [19]:
!ls -lh /kaggle/working/

total 25M
drwxr-xr-x 2 root root 4.0K May 10 18:08 vilt-lora-manual-best
-rw-r--r-- 1 root root  25M May 11 03:58 vilt-lora-manual-best_model.zip


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
exit()