# Fine-tuning Qwen2.5-VL-7B-Instruct on MathVerse Dataset

This notebook fine-tunes the Qwen2.5-VL-7B-Instruct vision-language model using Unsloth for efficient training.

**Important:** Enable a GPU runtime (T4, V100, or A100) in Colab before running any cell:
- Runtime → Change runtime type → GPU

## 1. Environment Setup

**⚠️ IMPORTANT - First Time Setup:**

If you see a `KeyError` mentioning `align_logprobs_with_mask`:
1. **Runtime → Factory reset runtime** (clears all packages)
2. Then run the installation cell below
3. After installation, manually restart: **Runtime → Restart runtime**
4. Continue from cell 2 (imports)

In [None]:
# Installation matching Unsloth official notebook.
# This cell relies on IPython "!" shell escapes, so it only runs inside a
# notebook (Colab/Jupyter), not as a plain Python script.
import os, re
import torch

# Detect torch version for xformers compatibility.
# `v` is the leading run of digits/dots in torch.__version__, so a suffix such
# as "+cu126" is dropped before the comparison below.
v = re.match(r"[0-9\.]{3,}", str(torch.__version__)).group(0)
# Pick the xformers pin: one build for torch 2.8.0, another for every other version.
xformers = "xformers==" + ("0.0.32.post2" if v == "2.8.0" else "0.0.29.post3")

# First upgrade bitsandbytes
!pip install -q -U bitsandbytes

# Install dependencies without deps to avoid conflicts.
# {xformers} is expanded by IPython to the pin string computed above.
!pip install --no-deps -q accelerate {xformers} peft trl triton cut_cross_entropy unsloth_zoo
!pip install -q sentencepiece protobuf "datasets>=3.4.1,<4.0.0" "huggingface_hub>=0.34.0" hf_transfer

# Install unsloth without deps
!pip install --no-deps -q unsloth

# Install compatible transformers and trl.
# trl was already installed unpinned above; this second install pins it to 0.22.2.
!pip install -q transformers
!pip install --no-deps -q trl==0.22.2

# Reminder only: the runtime is not restarted automatically by this cell.
print("\n✅ Installation complete!")
print("⚠️ Now restart runtime: Runtime → Restart session")
print("After restart, skip this cell and run from imports")

In [None]:
# Import libraries - IMPORTANT: Import unsloth FIRST
import warnings
warnings.filterwarnings('ignore')

# Import unsloth first for optimizations
from unsloth import FastVisionModel, is_bfloat16_supported

# Then import other libraries
import os
import json
import torch
from PIL import Image
from pathlib import Path
from datasets import Dataset, Features, Value, Image as DatasetImage
from transformers import (
    AutoProcessor,
    Qwen2VLForConditionalGeneration,
    TrainingArguments,
    BitsAndBytesConfig
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer, SFTConfig

# Report the PyTorch build and, when a GPU is visible, the name of device 0.
cuda_ready = torch.cuda.is_available()
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {cuda_ready}")
if cuda_ready:
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")

## 2. Upload Your Data to Colab

You need to upload:
1. `mathverse_testmini.jsonl`
2. `mathverse_testmini_images/` folder (or zip it first)

For large datasets, store the files in Google Drive and mount it (Option 1 below); for small ones, upload them directly (Option 2):

In [None]:
# Option 1: Mount Google Drive (recommended for large datasets)
# Only works inside Colab: google.colab is provided by the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

# Update these paths to match your Google Drive structure.
# JSONL_PATH and IMAGES_DIR are the two arguments later passed to
# load_mathverse_data(); both are derived from DATA_DIR.
DATA_DIR = "/content/drive/MyDrive/Colab Notebooks/data"  # Updated based on your structure
JSONL_PATH = f"{DATA_DIR}/mathverse_testmini.jsonl"
IMAGES_DIR = f"{DATA_DIR}/mathverse_testmini_images"

In [None]:
# Option 2: Upload files directly to Colab (for smaller datasets)
# Uncomment if you prefer direct upload
# from google.colab import files
# uploaded = files.upload()  # Upload your JSONL file

# # Then extract images if uploaded as zip
# !unzip -q mathverse_testmini_images.zip

# DATA_DIR = "/content"
# JSONL_PATH = f"{DATA_DIR}/mathverse_testmini.jsonl"
# IMAGES_DIR = f"{DATA_DIR}/mathverse_testmini_images"

## 3. Load and Prepare Data

In [None]:
# Load JSONL data
import json
import os
import re
from PIL import Image, ImageFile

# Allow loading of truncated images
ImageFile.LOAD_TRUNCATED_IMAGES = True

def clean_image_tokens(text):
    """Strip vision placeholder markers from ``text``.

    The marker patterns are removed one after another, in the order listed,
    so text that only becomes a marker after an earlier removal is still
    caught by a later pattern. The result is whitespace-stripped at both ends.
    """
    marker_patterns = (
        r'<\|image_pad\|>',
        r'<\|vision_start\|>',
        r'<\|vision_end\|>',
        r'<image>',
        r'</image>',
        r'\[IMG\d*\]',
    )
    stripped = text
    for marker in map(re.compile, marker_patterns):
        stripped = marker.sub('', stripped)
    return stripped.strip()

def load_mathverse_data(jsonl_path, images_dir, max_samples=None):
    """
    Load MathVerse samples from a JSONL file plus an image directory.

    Every non-empty line is expected to be a JSON object with ``image_path``
    (relative to ``images_dir``), ``query`` and ``answer``. A record that
    cannot be used (invalid JSON, not an object, missing fields, missing or
    unreadable image) is skipped with a warning instead of aborting the load.

    Args:
        jsonl_path: Path to the JSONL annotation file (read as UTF-8).
        images_dir: Directory that each record's ``image_path`` is joined to.
        max_samples: Stop once this many usable samples were collected; a
            falsy value (``None`` or ``0``) loads everything.

    Returns:
        A list of ``{"messages": [user_turn, assistant_turn]}`` dicts. The user
        turn holds the RGB PIL image followed by the cleaned question text;
        the assistant turn holds the answer.
    """
    data = []
    errors = 0
    image_errors = 0
    missing_images = 0

    with open(jsonl_path, 'r', encoding='utf-8') as f:
        for line_no, line in enumerate(f, start=1):
            if max_samples and len(data) >= max_samples:
                break

            # Skip empty lines
            if not line.strip():
                continue

            try:
                item = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"Warning: Skipping line {line_no} due to JSON error: {e}")
                errors += 1
                continue

            # A line can be valid JSON and still not be a usable record
            # (e.g. a bare list/number, or an object without an image reference).
            rel_path = item.get('image_path') if isinstance(item, dict) else None
            if not isinstance(rel_path, str) or not rel_path:
                print(f"Warning: Skipping line {line_no} - missing image_path")
                continue

            image_path = os.path.join(images_dir, rel_path)
            if not os.path.exists(image_path):
                print(f"Warning: Image not found: {image_path}")
                missing_images += 1
                continue

            # Get question and answer - clean any existing image tokens
            question = clean_image_tokens(item.get('query', ''))
            answer = item.get('answer', '')
            if not question or not answer:
                print(f"Warning: Skipping line {line_no} - missing question or answer")
                continue

            # convert() forces the pixel data to be decoded, so a corrupted
            # file fails here; the context manager releases the file handle.
            try:
                with Image.open(image_path) as raw_image:
                    pil_image = raw_image.convert('RGB')
            except OSError as e:
                print(f"Warning: Skipping line {line_no} - corrupted image {image_path}: {e}")
                image_errors += 1
                continue

            # Format exactly like official Unsloth notebook - image BEFORE text
            conversation = [
                {
                    "role": "user",
                    "content": [
                        {"type": "image", "image": pil_image},
                        {"type": "text", "text": question},
                    ],
                },
                {
                    "role": "assistant",
                    "content": [
                        {"type": "text", "text": answer},
                    ],
                },
            ]

            # Store ONLY messages, no separate images key
            data.append({"messages": conversation})

    print(f"✅ Loaded {len(data)} samples")
    if errors > 0:
        print(f"⚠️ Skipped {errors} lines due to JSON errors")
    if missing_images > 0:
        print(f"⚠️ Skipped {missing_images} records whose image file was not found")
    if image_errors > 0:
        print(f"⚠️ Skipped {image_errors} corrupted/truncated images")
    return data

# Load the dataset. Pass a number as max_samples (e.g. 100) for a quick
# smoke test; None loads the full dataset.
raw_data = load_mathverse_data(JSONL_PATH, IMAGES_DIR, max_samples=None)

# Preview the first sample, or report that nothing could be loaded.
if not raw_data:
    print("\n❌ Error: No data loaded. Please check your data files.")
else:
    first_sample = raw_data[0]
    first_messages = first_sample['messages']
    user_content = first_messages[0]['content']
    print("\nSample data:")
    print(f"Keys: {first_sample.keys()}")
    print(f"Number of messages: {len(first_messages)}")
    print(f"Content order: {[part['type'] for part in user_content]}")
    print(f"Question (first 200 chars): {user_content[1]['text'][:200]}...")
    print(f"Answer: {first_messages[1]['content'][0]['text']}")

In [None]:
# Hold out 10% of the loaded samples for validation (fixed seed for repeatability)
from sklearn.model_selection import train_test_split

split_options = {"test_size": 0.1, "random_state": 42}
train_data, val_data = train_test_split(raw_data, **split_options)

print(f"Training samples: {len(train_data)}")
print(f"Validation samples: {len(val_data)}")

# DO NOT convert to HF Dataset - keep as plain Python lists
# HuggingFace Dataset.from_list() converts PIL Images to dicts which breaks Unsloth
train_dataset, val_dataset = train_data, val_data

print("\n✅ Using plain Python lists (not HF Dataset)")
print(f"Train samples: {len(train_dataset)}")
print(f"Val samples: {len(val_dataset)}")

# Sanity check: the first training sample should still carry a PIL image
first_train_image = train_dataset[0]['messages'][0]['content'][0]['image']
print(f"Image type check: {type(first_train_image)}")

## 4. Load Model with Unsloth

In [None]:
# Model configuration
MODEL_NAME = "Qwen/Qwen2.5-VL-7B-Instruct"
MAX_SEQ_LENGTH = 2048
LOAD_IN_4BIT = True  # Set to False if you have enough VRAM

# Load model and processor with Unsloth; dtype=None lets the loader pick the dtype.
load_options = {
    "max_seq_length": MAX_SEQ_LENGTH,
    "load_in_4bit": LOAD_IN_4BIT,
    "dtype": None,
}
model, processor = FastVisionModel.from_pretrained(MODEL_NAME, **load_options)

print("Model loaded successfully!")

## 5. Configure LoRA for Fine-tuning

In [None]:
# Attach LoRA adapters via Unsloth.
# The adapters target the attention projections and the MLP projections.
attention_projections = ["q_proj", "k_proj", "v_proj", "o_proj"]
mlp_projections = ["gate_proj", "up_proj", "down_proj"]

model = FastVisionModel.get_peft_model(
    model,
    r=16,  # LoRA rank
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=attention_projections + mlp_projections,
    use_gradient_checkpointing="unsloth",  # Unsloth's optimized gradient checkpointing
    random_state=42,
)

# Show how many parameters are trainable after wrapping
model.print_trainable_parameters()

## 6. Prepare Dataset for Training

In [None]:
# The samples are already in Unsloth conversation format, so this cell only
# reports the split sizes - no further preprocessing happens here.
print("✅ Dataset ready for training")
for split_label, split_samples in (("Training", train_dataset), ("Validation", val_dataset)):
    print(f"{split_label} samples: {len(split_samples)}")

## 7. Configure Training Arguments

In [None]:
# Training configuration
from unsloth import UnslothTrainingArguments
from unsloth.trainer import UnslothVisionDataCollator

OUTPUT_DIR = "./qwen2.5-vl-mathverse-finetuned"

# Use bf16 where the GPU supports it, otherwise fall back to fp16.
use_bf16 = is_bfloat16_supported()

training_args = UnslothTrainingArguments(
    output_dir=OUTPUT_DIR,

    # Schedule: batch size 1 with 8 accumulation steps -> effective batch of 8
    num_train_epochs=3,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=8,

    # Optimizer
    optim="adamw_8bit",  # Memory-efficient optimizer
    learning_rate=2e-5,
    warmup_steps=5,
    weight_decay=0.01,

    # Mixed precision
    bf16=use_bf16,
    fp16=not use_bf16,

    # Logging, evaluation and checkpointing (evaluate and save every 20 steps)
    logging_steps=1,
    eval_strategy="steps",
    eval_steps=20,
    save_strategy="steps",
    save_steps=20,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",

    # Important for vision models: keep the raw "messages" column and skip
    # the trainer's own dataset preparation.
    remove_unused_columns=False,
    dataset_text_field="",
    dataset_kwargs={"skip_prepare_dataset": True},

    # Other settings
    report_to="none",  # Change to "wandb" if you want to use Weights & Biases
    seed=42,
)

per_device_batch = training_args.per_device_train_batch_size
accumulation_steps = training_args.gradient_accumulation_steps

print("Training configuration:")
print(f"  - Epochs: {training_args.num_train_epochs}")
print(f"  - Batch size: {per_device_batch}")
print(f"  - Gradient accumulation: {accumulation_steps}")
print(f"  - Effective batch size: {per_device_batch * accumulation_steps}")
print(f"  - Learning rate: {training_args.learning_rate}")

## 8. Data Collator

In [None]:
# Use Unsloth's vision data collator.
# It is built from the LoRA-wrapped model and the processor, and is handed to
# SFTTrainer as `data_collator` in the next cell.
from unsloth.trainer import UnslothVisionDataCollator

data_collator = UnslothVisionDataCollator(model, processor)
print("✅ Using UnslothVisionDataCollator")

## 9. Train the Model

In [None]:
# Initialize the SFTTrainer (imported from trl) with the LoRA model, the
# training arguments, both data splits (plain Python lists) and the vision
# data collator created above.
from trl import SFTTrainer

# NOTE(review): only the text tokenizer (processor.tokenizer) is passed here,
# not the full processor; Unsloth's published vision examples appear to pass
# the object returned by FastVisionModel.from_pretrained - confirm which is
# intended for this trl/unsloth version.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    tokenizer=processor.tokenizer,
)

# Training itself is started in the next cell; this only prints a banner.
print("Starting training...")
print("=" * 50)

In [None]:
# Start training with the trainer built in the previous cell.
# training_args sets load_best_model_at_end=True with metric eval_loss, so
# presumably the lowest-eval-loss checkpoint is restored afterwards - confirm.
trainer.train()

print("\nTraining completed!")

## 10. Save the Model

In [None]:
# Save the fine-tuned model: LoRA adapters and processor go to the same folder
FINAL_MODEL_DIR = "./qwen2.5-vl-mathverse-final"

for saveable in (model, processor):
    saveable.save_pretrained(FINAL_MODEL_DIR)

print(f"Model saved to {FINAL_MODEL_DIR}")

In [None]:
# Optional: Save to Google Drive.
# Requires Drive to be mounted at /content/drive (see the "Option 1" cell).
# {FINAL_MODEL_DIR} is expanded by IPython before the shell command runs; the
# message below is printed whether or not the copy succeeded.
!cp -r {FINAL_MODEL_DIR} /content/drive/MyDrive/
print("Model copied to Google Drive!")

## 11. Merge LoRA Weights (Optional)

In [None]:
# Merge LoRA weights with base model for easier deployment
# Warning: This requires more memory
#
# The merge is delegated to Unsloth's save_pretrained_merged instead of
# calling model.merge_and_unload(): merge_and_unload() folds the adapters into
# the in-memory (4-bit) base weights in place, which would change the `model`
# object that the later test/evaluation cells still use. Writing a merged
# 16-bit copy to disk leaves `model` as the LoRA-wrapped model.
MERGED_MODEL_DIR = "./qwen2.5-vl-mathverse-merged"

model.save_pretrained_merged(
    MERGED_MODEL_DIR,
    processor,
    save_method="merged_16bit",
)
# Kept from the original cell so the processor files are guaranteed to be
# present next to the merged weights.
processor.save_pretrained(MERGED_MODEL_DIR)

print(f"Merged model saved to {MERGED_MODEL_DIR}")

## 12. Test the Fine-tuned Model

In [None]:
# Test on a single validation sample (quick test)
from PIL import Image

# Set model to evaluation mode
model.eval()

# Pull the image, question and reference answer out of the conversation format
# (user content is [image, text]; assistant content is [text]).
test_sample = val_data[0]
test_image = test_sample['messages'][0]['content'][0]['image']
test_question = test_sample['messages'][0]['content'][1]['text']
expected_answer = test_sample['messages'][1]['content'][0]['text']

# Prepare input: same image-then-text layout as the training samples
conversation = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": test_image},
            {"type": "text", "text": test_question}
        ]
    }
]

text_prompt = processor.apply_chat_template(
    conversation,
    add_generation_prompt=True,
    tokenize=False
)

inputs = processor(
    text=[text_prompt],
    images=[test_image],
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response with greedy decoding. Sampling parameters (temperature,
# top_p) are not passed because they have no effect when do_sample=False.
with torch.no_grad():
    output = model.generate(
        **inputs,
        max_new_tokens=128,
        do_sample=False,
    )

# generate() returns prompt + completion; drop the prompt tokens so that only
# the newly generated answer is decoded and shown as the model response.
prompt_length = inputs["input_ids"].shape[1]
generated_text = processor.batch_decode(
    output[:, prompt_length:],
    skip_special_tokens=True,
    clean_up_tokenization_spaces=True
)[0]

print("Question:")
print(test_question[:200])
print("\nExpected Answer:")
print(expected_answer)
print("\nModel Response:")
print(generated_text)
print("\nImage:")
display(test_image)

## 13. Full Dataset Evaluation

In [None]:
# Full Test Dataset Evaluation - PURE VISION (No Text At All)
import torch
import json
import re
from tqdm.auto import tqdm

def extract_answer(text, question_type='multi-choice'):
    """
    Extract the answer from a model response (or a reference answer).

    For ``'multi-choice'`` the result is a single option letter ``'A'``-``'D'``
    or ``None`` when no option letter is found. Candidates are tried from the
    most to the least explicit, taking the last occurrence at each level:

    1. a stated answer such as ``"Answer: B"`` / ``"the answer is (C)"``,
    2. a standalone capital letter A-D,
    3. a standalone letter a-d in any case (kept as a fallback so that inputs
       like ``"b"`` still resolve).

    Trying capitals before the case-insensitive fallback keeps the English
    article "a" from being read as option A whenever a real option letter is
    present.

    For any other ``question_type`` the text is returned whitespace-stripped.
    """
    if question_type != 'multi-choice':
        return text.strip()

    # 1. Explicitly stated answer; the keyword is case-insensitive, the option
    #    letter itself must be a capital.
    stated = re.findall(r'(?i:\banswer\b(?:\s+is)?)\s*:?\s*\(?([A-D])\b', text)
    if stated:
        return stated[-1]

    # 2. Standalone capital option letters in the original casing.
    capitals = re.findall(r'\b([A-D])\b', text)
    if capitals:
        return capitals[-1]

    # 3. Fallback: accept lower-case option letters as well.
    any_case = re.findall(r'\b([A-D])\b', text.upper())
    return any_case[-1] if any_case else None

def _generate_image_only_response(model, processor, image, max_new_tokens=128):
    """Greedy-decode the model's reply to a user turn that contains only ``image``."""
    conversation = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image}
            ]
        }
    ]

    text_prompt = processor.apply_chat_template(
        conversation,
        add_generation_prompt=True,
        tokenize=False
    )

    inputs = processor(
        text=[text_prompt],
        images=[image],
        return_tensors="pt",
        padding=True
    ).to(model.device)

    # Greedy decoding; no temperature is passed because it has no effect when
    # do_sample=False.
    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
        )

    # generate() returns prompt + completion. Decode the completion only, so
    # that words from the chat template / system prompt cannot be mistaken for
    # part of the model's answer during scoring.
    prompt_length = inputs["input_ids"].shape[1]
    return processor.batch_decode(
        output[:, prompt_length:],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True
    )[0]


def evaluate_model_pure_vision(model, processor, test_data, max_samples=None):
    """
    Evaluate a model with image-only prompts (the question text is withheld).

    Args:
        model: Model exposing ``eval()``, ``generate()`` and ``device``.
        processor: Processor used for the chat template, input encoding and
            decoding.
        test_data: List of samples in the notebook's conversation format
            (user content ``[image, text]``, assistant content ``[text]``).
        max_samples: Evaluate only the first ``max_samples`` samples; a falsy
            value evaluates everything.

    Returns:
        ``(results, metrics)``. ``results`` has one dict per sample; a sample
        that raised is recorded with an ``'error'`` entry and
        ``'correct': False``. ``metrics`` holds overall, multi-choice and
        free-form accuracy in percent; errored samples are not counted in
        ``total_samples``.

    Scoring: a question whose text contains ``'Choices:'`` is multi-choice and
    is compared by option letter; every other question is free-form and counts
    as correct when the expected answer occurs (case-insensitively) in the
    generated text.
    """
    model.eval()
    results = []
    correct = 0
    total = 0

    # Limit samples if specified
    samples_to_eval = test_data[:max_samples] if max_samples else test_data

    print(f"Evaluating on {len(samples_to_eval)} samples (PURE VISION - NO TEXT)...")

    for idx, sample in enumerate(tqdm(samples_to_eval, desc="Evaluating")):
        user_content = sample['messages'][0]['content']
        test_image = user_content[0]['image']
        test_question = user_content[1]['text']  # Kept for reference only, never shown to the model
        expected_answer = sample['messages'][1]['content'][0]['text']

        question_type = 'multi-choice' if 'Choices:' in test_question else 'free-form'

        # Preprocessing, generation and scoring all sit inside the try so that
        # one bad sample is recorded as an error instead of ending the run.
        try:
            generated_text = _generate_image_only_response(model, processor, test_image)
            predicted_answer = extract_answer(generated_text, question_type)

            if question_type == 'multi-choice':
                expected_letter = extract_answer(expected_answer, 'multi-choice')
                if expected_letter is None:
                    expected_letter = expected_answer.strip().upper()
                is_correct = (predicted_answer == expected_letter)
            else:
                is_correct = (expected_answer.lower() in generated_text.lower())

            if is_correct:
                correct += 1
            total += 1

            results.append({
                'index': idx,
                'original_question': test_question[:200] + '...' if len(test_question) > 200 else test_question,
                'expected_answer': expected_answer,
                'predicted_answer': predicted_answer if predicted_answer else generated_text[:100],
                'full_response': generated_text,
                'question_type': question_type,
                'correct': is_correct
            })

        except Exception as e:
            print(f"\nError on sample {idx}: {e}")
            results.append({
                'index': idx,
                'error': str(e),
                'correct': False
            })

        # Print progress every 10 samples
        if (idx + 1) % 10 == 0:
            current_acc = (correct / total * 100) if total > 0 else 0
            print(f"\nProgress: {idx + 1}/{len(samples_to_eval)} | Accuracy: {current_acc:.2f}%")

    def _tally(kind):
        """Return (correct, total) over the scored results of one question type."""
        scored = [r for r in results if r.get('question_type') == kind]
        return sum(1 for r in scored if r['correct']), len(scored)

    mc_correct, mc_total = _tally('multi-choice')
    ff_correct, ff_total = _tally('free-form')

    metrics = {
        'total_samples': total,
        'correct': correct,
        'accuracy': (correct / total * 100) if total > 0 else 0,
        'multi_choice_accuracy': (mc_correct / mc_total * 100) if mc_total > 0 else 0,
        'multi_choice_count': mc_total,
        'free_form_accuracy': (ff_correct / ff_total * 100) if ff_total > 0 else 0,
        'free_form_count': ff_total
    }

    return results, metrics

# Run the image-only evaluation on the validation split, then print, save and
# preview the results.
rule = "=" * 80
for banner_line in (
    rule,
    "PURE VISION EVALUATION (NO TEXT - IMAGE ONLY)",
    rule,
    "Model receives ONLY the image, no text prompt at all",
    rule,
):
    print(banner_line)

results_pure_vision, metrics_pure_vision = evaluate_model_pure_vision(
    model,
    processor,
    val_data,
    max_samples=None,  # Set to 10 for quick test, None for full evaluation
)

# Summary metrics
summary = metrics_pure_vision
print("\n" + rule)
print("PURE VISION RESULTS")
print(rule)
print(f"Total Samples: {summary['total_samples']}")
print(f"Correct: {summary['correct']}")
print(f"Overall Accuracy: {summary['accuracy']:.2f}%")
print(f"\nMulti-Choice: {summary['multi_choice_accuracy']:.2f}% ({summary['multi_choice_count']} samples)")
print(f"Free-Form: {summary['free_form_accuracy']:.2f}% ({summary['free_form_count']} samples)")
print(rule)

# Persist metrics and per-sample results as JSON
output_file = "./evaluation_results_pure_vision.json"
payload = {'metrics': metrics_pure_vision, 'results': results_pure_vision}
with open(output_file, 'w') as f:
    json.dump(payload, f, indent=2)

print(f"\n✅ Results saved to: {output_file}")

# Preview the first five predictions
print("\n" + rule)
print("SAMPLE PREDICTIONS (First 5)")
print(rule)
for i, result in enumerate(results_pure_vision[:5]):
    if 'original_question' not in result:
        continue  # errored samples carry no question/prediction fields
    verdict = '✓' if result['correct'] else '✗'
    print(f"\nSample {i+1}:")
    print(f"Original Question (NOT shown to model): {result['original_question']}")
    print(f"Expected: {result['expected_answer']}")
    print(f"Predicted: {result['predicted_answer']}")
    print(f"Full Response: {result['full_response'][:150]}...")
    print(f"Correct: {verdict}")
    print("-" * 40)

## 14. Compare Base Model vs Fine-tuned Model

# Load Original Base Model for Comparison
print("=" * 80)
print("LOADING ORIGINAL BASE MODEL (No Fine-tuning)")
print("=" * 80)

# Free up memory first
import gc
import torch

# Clear the fine-tuned model from GPU. Deleting `model` alone is not enough:
# the trainer, the data collator and (if the merge cell was run) the merged
# model each hold their own reference to the same weights, so all of them are
# dropped before collecting. Names that do not exist are simply ignored.
# Note: `trainer` is no longer available after this cell.
for stale_name in ("trainer", "data_collator", "merged_model", "model"):
    globals().pop(stale_name, None)
gc.collect()
torch.cuda.empty_cache()

# Load the original base model (without LoRA)
from unsloth import FastVisionModel

base_model, base_processor = FastVisionModel.from_pretrained(
    "Qwen/Qwen2.5-VL-7B-Instruct",
    max_seq_length=2048,
    load_in_4bit=True,
    dtype=None,
)

base_model.eval()

print("✅ Base model loaded successfully!")
print("=" * 80)

In [None]:
# Evaluate the base model with the same image-only protocol, then print and
# save its metrics.
divider = "=" * 80
print(divider)
print("EVALUATING BASE MODEL (Pure Vision)")
print(divider)

results_base, metrics_base = evaluate_model_pure_vision(
    base_model,
    base_processor,
    val_data,
    max_samples=None,  # Set to 10 for quick test, None for full evaluation
)

# Summary metrics
base_summary = metrics_base
print("\n" + divider)
print("BASE MODEL RESULTS")
print(divider)
print(f"Total Samples: {base_summary['total_samples']}")
print(f"Correct: {base_summary['correct']}")
print(f"Overall Accuracy: {base_summary['accuracy']:.2f}%")
print(f"\nMulti-Choice: {base_summary['multi_choice_accuracy']:.2f}% ({base_summary['multi_choice_count']} samples)")
print(f"Free-Form: {base_summary['free_form_accuracy']:.2f}% ({base_summary['free_form_count']} samples)")
print(divider)

# Persist metrics and per-sample results as JSON
output_file_base = "./evaluation_results_base_model.json"
base_payload = {'metrics': metrics_base, 'results': results_base}
with open(output_file_base, 'w') as f:
    json.dump(base_payload, f, indent=2)

print(f"\n✅ Base model results saved to: {output_file_base}")

In [None]:
# Reload Fine-tuned Model
print("=" * 80)
print("RELOADING FINE-TUNED MODEL")
print("=" * 80)

# Free up base model (tolerates the cell being re-run after the names are gone)
for stale_name in ("base_model", "base_processor"):
    globals().pop(stale_name, None)
gc.collect()
torch.cuda.empty_cache()

# Reload the fine-tuned model straight from the saved adapter directory.
# The previous approach attached a fresh, randomly initialised LoRA with
# get_peft_model() and then wrapped that model again with
# PeftModel.from_pretrained(); that nesting risks the trained adapter weights
# not being applied. Pointing from_pretrained at the adapter folder lets
# Unsloth resolve the base model and load the trained adapters in one step.
from unsloth import FastVisionModel

FINAL_MODEL_DIR = "./qwen2.5-vl-mathverse-final"
finetuned_model, finetuned_processor = FastVisionModel.from_pretrained(
    model_name=FINAL_MODEL_DIR,
    max_seq_length=2048,
    load_in_4bit=True,
    dtype=None,
)
finetuned_model.eval()

print("✅ Fine-tuned model reloaded successfully!")
print("=" * 80)

In [None]:
# Evaluate the reloaded fine-tuned model with the same image-only protocol,
# then print and save its metrics.
separator = "=" * 80
print(separator)
print("EVALUATING FINE-TUNED MODEL (Pure Vision)")
print(separator)

results_finetuned, metrics_finetuned = evaluate_model_pure_vision(
    finetuned_model,
    finetuned_processor,
    val_data,
    max_samples=None,
)

# Summary metrics
tuned_summary = metrics_finetuned
print("\n" + separator)
print("FINE-TUNED MODEL RESULTS")
print(separator)
print(f"Total Samples: {tuned_summary['total_samples']}")
print(f"Correct: {tuned_summary['correct']}")
print(f"Overall Accuracy: {tuned_summary['accuracy']:.2f}%")
print(f"\nMulti-Choice: {tuned_summary['multi_choice_accuracy']:.2f}% ({tuned_summary['multi_choice_count']} samples)")
print(f"Free-Form: {tuned_summary['free_form_accuracy']:.2f}% ({tuned_summary['free_form_count']} samples)")
print(separator)

# Persist metrics and per-sample results as JSON
tuned_payload = {'metrics': metrics_finetuned, 'results': results_finetuned}
with open('./evaluation_results_finetuned_model.json', 'w') as f:
    json.dump(tuned_payload, f, indent=2)

print("\n✅ Results saved")

In [None]:
# Side-by-Side Comparison of the base and fine-tuned evaluation runs.
# Reads metrics_base / results_base and metrics_finetuned / results_finetuned
# produced by the two evaluation cells above.
print("\n" + "=" * 80)
print("COMPARISON: BASE vs FINE-TUNED")
print("=" * 80)

import pandas as pd

# Accuracy deltas are computed once and reused for the table and the JSON file.
accuracy_keys = ('accuracy', 'multi_choice_accuracy', 'free_form_accuracy')
accuracy_delta = {key: metrics_finetuned[key] - metrics_base[key] for key in accuracy_keys}
correct_delta = metrics_finetuned['correct'] - metrics_base['correct']

# Create comparison table
comparison_data = {
    'Metric': [
        'Overall Accuracy',
        'Multi-Choice Accuracy',
        'Free-Form Accuracy',
        'Total Samples',
        'Correct Answers',
    ],
    'Base Model': [f"{metrics_base[key]:.2f}%" for key in accuracy_keys] + [
        metrics_base['total_samples'],
        metrics_base['correct'],
    ],
    'Fine-Tuned Model': [f"{metrics_finetuned[key]:.2f}%" for key in accuracy_keys] + [
        metrics_finetuned['total_samples'],
        metrics_finetuned['correct'],
    ],
    'Improvement': [f"{accuracy_delta[key]:+.2f}%" for key in accuracy_keys] + [
        '-',
        f"{correct_delta:+d}",
    ],
}

df_comparison = pd.DataFrame(comparison_data)
print(df_comparison.to_string(index=False))
print("\n" + "=" * 80)

# Relative improvement (undefined when the base accuracy is zero, so skipped then)
if metrics_base['accuracy'] > 0:
    relative_improvement = (accuracy_delta['accuracy'] / metrics_base['accuracy']) * 100
    print(f"\nRelative Improvement: {relative_improvement:+.2f}%")

# Save comparison; free-form accuracy is included so the file covers the same
# accuracy metrics as the table above.
comparison_output = {
    'base_model': metrics_base,
    'finetuned_model': metrics_finetuned,
    'improvement': {
        'overall_accuracy': accuracy_delta['accuracy'],
        'multi_choice_accuracy': accuracy_delta['multi_choice_accuracy'],
        'free_form_accuracy': accuracy_delta['free_form_accuracy'],
        'correct_answers': correct_delta,
    }
}

with open('./comparison_results.json', 'w') as f:
    json.dump(comparison_output, f, indent=2)

print("✅ Comparison saved to: ./comparison_results.json")

# Per-sample comparison pairs results by position. zip() stops at the shorter
# list, so runs of different length (e.g. different max_samples) are compared
# on their common prefix instead of raising IndexError.
if len(results_base) != len(results_finetuned):
    print(
        f"\n⚠️ Result lists differ in length "
        f"({len(results_base)} base vs {len(results_finetuned)} fine-tuned); "
        f"comparing the first {min(len(results_base), len(results_finetuned))} samples only"
    )
paired_results = list(zip(results_base, results_finetuned))

# Examples where fine-tuning helped
print("\n" + "=" * 80)
print("EXAMPLES WHERE FINE-TUNING HELPED")
print("=" * 80)

helped_count = 0
for base_result, tuned_result in paired_results:
    if base_result.get('correct', False) or not tuned_result.get('correct', False):
        continue
    helped_count += 1
    if helped_count <= 3:  # print only the first three examples
        print(f"\nExample {helped_count}:")
        print(f"Question: {base_result.get('original_question', '')[:150]}...")
        print(f"Expected: {base_result.get('expected_answer', '')}")
        print(f"Base: {base_result.get('predicted_answer', '')} ✗")
        print(f"Fine-Tuned: {tuned_result.get('predicted_answer', '')} ✓")
        print("-" * 40)

print(f"\n✅ Fine-tuning helped on {helped_count} questions!")

# Regressions
print("\n" + "=" * 80)
print("REGRESSIONS")
print("=" * 80)

regression_count = 0
for base_result, tuned_result in paired_results:
    if not base_result.get('correct', False) or tuned_result.get('correct', False):
        continue
    regression_count += 1
    if regression_count <= 3:  # print only the first three examples
        print(f"\nRegression {regression_count}:")
        print(f"Question: {base_result.get('original_question', '')[:150]}...")
        print(f"Expected: {base_result.get('expected_answer', '')}")
        print(f"Base: {base_result.get('predicted_answer', '')} ✓")
        print(f"Fine-Tuned: {tuned_result.get('predicted_answer', '')} ✗")
        print("-" * 40)

print(f"\n⚠️ {regression_count} regressions")
print("=" * 80)