# ARC Fine-tuning Experiments

This notebook compares different fine-tuning strategies for Qwen2.5-VL on the ARC (AI2 Reasoning Challenge) dataset.

## Experiments:
1. **Baseline** - Original model without fine-tuning
2. **Full LoRA** - Fine-tune both vision encoder and language model with LoRA
3. **Vision-Only LoRA** - Fine-tune only vision encoder with LoRA (freeze LM)
4. **Vision Full Fine-tune** - Full fine-tuning of vision encoder (no LoRA, uses non-quantized model)
5. **Language-Only LoRA** - Fine-tune only language model with LoRA (freeze vision)
6. **Full LoRA OCR** - Fine-tune on OCR task (image → text extraction)

## Output Files:
All artifacts are saved locally and automatically copied to Google Drive (when running in Colab):
- `results/` - JSON results for each experiment
- `models/` - Saved final model weights
- `checkpoints/` - Training checkpoints (saved every `save_steps` steps; only the 3 most recent are kept locally)
- `plots/` - Training curves and comparison graphs

**Google Drive path:** `drive/MyDrive/arc_finetuning_outputs/`

**Note:** Model weights and checkpoints that already exist on Drive are skipped to avoid overwriting them; results and plots are always overwritten with the latest version.

---
## 1. Setup & Installation

In [None]:
%%capture
# Install Unsloth and its dependencies; %%capture hides the installer output.
import os, re
if "COLAB_" not in "".join(os.environ.keys()):
    # Not on Colab: a plain install, letting pip resolve dependencies.
    !pip install unsloth
else:
    # On Colab: pick the xformers release from torch's "major.minor" version.
    import torch; v = re.match(r"[0-9]{1,}\.[0-9]{1,}", str(torch.__version__)).group(0)
    xformers = "xformers==" + ("0.0.33.post1" if v=="2.9" else "0.0.32.post2" if v=="2.8" else "0.0.29.post3")
    # --no-deps installs only the named packages, not their dependencies.
    !pip install --no-deps bitsandbytes accelerate {xformers} peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets==4.3.0" "huggingface_hub>=0.34.0" hf_transfer
    !pip install --no-deps unsloth
# Pinned versions, applied in both environments.
!pip install transformers==4.56.2
!pip install --no-deps trl==0.22.2

In [None]:
# Core imports
import json
import os
import gc
import re
import shutil
from datetime import datetime
from PIL import Image
from tqdm import tqdm
from contextlib import nullcontext

import torch
import numpy as np
import matplotlib.pyplot as plt

from unsloth import FastVisionModel
from unsloth.trainer import UnslothVisionDataCollator
from trl import SFTTrainer, SFTConfig
from transformers import TrainerCallback

# Mount Google Drive (Colab only)
# Colab is detected by looking for "COLAB_" in the concatenated names of all
# environment variables. IS_COLAB is read by the Drive helpers defined later.
IS_COLAB = "COLAB_" in "".join(os.environ.keys())
if IS_COLAB:
    # Imported here because google.colab only exists inside a Colab runtime.
    from google.colab import drive
    drive.mount('/content/drive')
    print("Google Drive mounted at /content/drive")

# Report the runtime so the notebook output records which environment was used.
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")

In [None]:
# Create local output directories
# These names match the "*_dir" output paths in CONFIG (defined in the next
# section); exist_ok=True makes re-running this cell harmless.
local_dirs = ["results", "models", "plots", "checkpoints"]
for d in local_dirs:
    os.makedirs(d, exist_ok=True)
print(f"Local directories created: {', '.join(local_dirs)}")

---
## 2. Configuration

In [None]:
# Global Configuration
# Single dictionary of tunable settings shared by every experiment.
CONFIG = {
    "model_name": "unsloth/Qwen2.5-VL-7B-Instruct-bnb-4bit",  # 4-bit quantized (for LoRA experiments)
    "model_name_full": "unsloth/Qwen2.5-VL-7B-Instruct",      # Full precision (for non-LoRA experiments)
    # NOTE(review): load_fresh_model() derives its own load_in_4bit flag from
    # its use_quantized argument and does not read this entry.
    "load_in_4bit": True,
    
    # Data paths (update for Colab: "drive/MyDrive/arc_train" etc.)
    # load_dataset() expects <dir>/<dir name>.jsonl and <dir>/<dir name>_images/.
    "train_data_dir": "arc_train",
    "val_data_dir": "arc_validation", 
    "test_data_dir": "arc_test",
    
    # Output paths (local)
    "results_dir": "results",
    "models_dir": "models",
    "plots_dir": "plots",
    "checkpoints_dir": "checkpoints",
    
    # Google Drive output path (all artifacts will be copied here)
    # Relative path: Drive is mounted at /content/drive, so this presumably
    # relies on the working directory being /content — verify on Colab.
    "drive_output_dir": "drive/MyDrive/arc_finetuning_outputs",
    
    # Training settings
    # Effective batch per device = batch_size * gradient_accumulation_steps = 8.
    "batch_size": 2,
    "gradient_accumulation_steps": 4,
    "max_steps": 30,
    "learning_rate": 2e-4,
    "warmup_steps": 5,
    "save_steps": 10,  # Save checkpoint every N steps
    
    # LoRA settings
    "lora_r": 16,
    "lora_alpha": 16,
    
    # Evaluation
    "val_samples": 50,  # Number of validation samples for quick eval
    "seed": 3407,  # Used for LoRA initialisation (random_state) and the trainer seed
}

# Experiment Configurations
# Maps an experiment key (used in file names such as "<key>_results.json") to
# its settings, all read by run_experiment():
#   name              - display name used in logs, tables and plots
#   train             - False evaluates the base model without any training
#   use_lora          - True: LoRA adapters on the 4-bit model;
#                       False: direct fine-tuning of the full-precision model
#   finetune_vision   - train parameters of the vision encoder
#   finetune_language - train parameters of the language model
#   task              - "qa" (image -> answer letter) or "ocr" (image -> question text)
EXPERIMENTS = {
    "baseline": {
        "name": "Baseline (No Fine-tuning)",
        "train": False,
    },
    "full_lora": {
        "name": "Full LoRA (Vision + Language)",
        "train": True,
        "use_lora": True,
        "finetune_vision": True,
        "finetune_language": True,
        "task": "qa",
    },
    "vision_lora": {
        "name": "Vision-Only LoRA",
        "train": True,
        "use_lora": True,
        "finetune_vision": True,
        "finetune_language": False,
        "task": "qa",
    },
    "vision_full": {
        "name": "Vision Full Fine-tune (No LoRA)",
        "train": True,
        "use_lora": False,  # Full fine-tuning, requires non-quantized model
        "finetune_vision": True,
        "finetune_language": False,
        "task": "qa",
    },
    "language_lora": {
        "name": "Language-Only LoRA",
        "train": True,
        "use_lora": True,
        "finetune_vision": False,
        "finetune_language": True,
        "task": "qa",
    },
    "full_lora_ocr": {
        "name": "Full LoRA OCR Task",
        "train": True,
        "use_lora": True,
        "finetune_vision": True,
        "finetune_language": True,
        "task": "ocr",
    },
}

# Create Google Drive directories if in Colab
# Mirrors the local results/models/plots/checkpoints layout under the Drive
# output directory so the copy helpers have a destination to write to.
if IS_COLAB:
    drive_base = CONFIG["drive_output_dir"]
    for d in ["results", "models", "plots", "checkpoints"]:
        os.makedirs(os.path.join(drive_base, d), exist_ok=True)
    print(f"Google Drive directories created at: {drive_base}")

# Echo the key settings so the notebook output records what was run.
print("Configuration loaded.")
print(f"\nExperiments to run: {list(EXPERIMENTS.keys())}")
print(f"\nQuantized model: {CONFIG['model_name']}")
print(f"Full precision model: {CONFIG['model_name_full']}")

---
## 3. Data Loading

In [None]:
def load_dataset(data_dir, split_name):
    """Load one dataset split from its JSONL manifest.

    Expected layout, where ``<name>`` is the last component of ``data_dir``::

        <data_dir>/<name>.jsonl      one JSON object per line
        <data_dir>/<name>_images/    images referenced by each record's "image_path"

    Args:
        data_dir: Directory of the split. A trailing path separator is accepted.
        split_name: Label used only in the printed summary.

    Returns:
        A list of dicts with the keys "image_path" (joined with the images
        directory), "answer_key", "id", "question" and "choices".

    Raises:
        FileNotFoundError: If the manifest file does not exist.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks one of the required fields.
    """
    # normpath strips a trailing separator; without it basename("arc_train/")
    # is "" and both derived paths would be wrong.
    base_name = os.path.basename(os.path.normpath(data_dir))
    images_dir = os.path.join(data_dir, f"{base_name}_images")
    jsonl_file = os.path.join(data_dir, f"{base_name}.jsonl")

    dataset = []
    # Explicit UTF-8 so non-ASCII question text does not depend on the platform default.
    with open(jsonl_file, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines, e.g. a trailing newline at end of file
            item = json.loads(line)
            dataset.append({
                "image_path": os.path.join(images_dir, item["image_path"]),
                "answer_key": item["answer_key"],
                "id": item["id"],
                "question": item["question"],
                "choices": item["choices"],
            })

    print(f"Loaded {len(dataset)} {split_name} samples")
    return dataset

# Load all datasets
# Each call reads the split's JSONL manifest only; images are opened later,
# when samples are converted for training or evaluated.
train_data = load_dataset(CONFIG["train_data_dir"], "training")
val_data = load_dataset(CONFIG["val_data_dir"], "validation")
test_data = load_dataset(CONFIG["test_data_dir"], "test")

In [None]:
def format_question_text(item):
    """Render a question and its lettered choices as plain text.

    The question comes first, followed by one line per choice labelled
    "A.", "B.", ... in order. This text is the target of the OCR task.
    """
    labelled = []
    for offset, choice in enumerate(item["choices"]):
        letter = chr(ord("A") + offset)
        labelled.append(f"{letter}. {choice}")
    choice_block = "\n".join(labelled)
    return f"{item['question']}\n{choice_block}"

def convert_to_qa_format(sample):
    """Build a two-turn chat example for the QA task.

    The user turn holds only the sample's image (loaded as RGB); the assistant
    turn holds the answer key wrapped in <answer>...</answer> tags.
    """
    rgb_image = Image.open(sample["image_path"]).convert("RGB")
    answer_text = f"<answer>{sample['answer_key']}</answer>"
    user_turn = {"role": "user", "content": [{"type": "image", "image": rgb_image}]}
    assistant_turn = {"role": "assistant", "content": [{"type": "text", "text": answer_text}]}
    return {"messages": [user_turn, assistant_turn]}

def convert_to_ocr_format(sample):
    """Build a two-turn chat example for the OCR task.

    The user turn holds only the sample's image (loaded as RGB); the assistant
    turn holds the question and choices as produced by format_question_text().
    """
    rgb_image = Image.open(sample["image_path"]).convert("RGB")
    transcript = format_question_text(sample)
    user_turn = {"role": "user", "content": [{"type": "image", "image": rgb_image}]}
    assistant_turn = {"role": "assistant", "content": [{"type": "text", "text": transcript}]}
    return {"messages": [user_turn, assistant_turn]}

# Preview both training targets for the first training sample
# (the OCR target is truncated to its first 200 characters).
print("Data conversion functions defined.")
print(f"\nSample QA target: <answer>{train_data[0]['answer_key']}</answer>")
print(f"\nSample OCR target:\n{format_question_text(train_data[0])[:200]}...")

---
## 4. Helper Functions

In [None]:
class LossTracker(TrainerCallback):
    """Trainer callback that records every logged training loss with its step."""

    def __init__(self):
        # Parallel lists: steps[i] is the global step at which losses[i] was logged.
        self.steps = []
        self.losses = []

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Store the loss and current global step when a log entry carries "loss"."""
        if not logs or "loss" not in logs:
            return
        self.losses.append(logs["loss"])
        self.steps.append(state.global_step)

    def get_data(self):
        """Return the recorded history as a dict with "steps" and "losses" lists."""
        return dict(steps=self.steps, losses=self.losses)

def extract_answer(text):
    """Return the predicted answer letter (upper-case A-D) or None.

    Two patterns are tried in order:
      1. an explicit, case-sensitive <answer>X</answer> tag;
      2. a case-insensitive fallback: a standalone letter at the very start of
         the text, or following "answer is", "answer:" or "choice".
    """
    patterns = (
        (r'<answer>\s*([A-Da-d])\s*</answer>', 0),
        (r'(?:^|answer is|answer:|choice)\s*([A-Da-d])\b', re.IGNORECASE),
    )
    for pattern, flags in patterns:
        found = re.search(pattern, text, flags)
        if found:
            return found.group(1).upper()
    return None

def evaluate_model(model, tokenizer, dataset, num_samples=50, use_adapters=True):
    """Evaluate answer-letter accuracy of a model on image-only prompts.

    Each sample's image is sent as the sole user content, the reply is generated
    greedily, and the letter parsed by extract_answer() is compared with the
    sample's "answer_key".

    Args:
        model: Vision-language model; switched to inference mode here.
        tokenizer: Processor used for the chat template, input encoding and decoding.
        dataset: List of sample dicts with "image_path", "answer_key" and "id".
        num_samples: Only the first ``num_samples`` samples are evaluated.
        use_adapters: If False and the model exposes PEFT's ``disable_adapter()``
            context manager, adapters are disabled for the evaluation. Models
            without that method are evaluated unchanged.

    Returns:
        (results, accuracy): ``results`` is a list of per-sample dicts with
        "id", "expected", "predicted" and "correct"; ``accuracy`` is a
        percentage (0.0 when no samples were evaluated).
    """
    FastVisionModel.for_inference(model)

    # PEFT-wrapped models provide disable_adapter() as a context manager. The
    # previous import of a non-existent top-level helper always failed silently,
    # so adapters were never actually disabled.
    if not use_adapters and hasattr(model, "disable_adapter"):
        context = model.disable_adapter()
    else:
        context = nullcontext()

    results = []
    correct = 0

    with context:
        for sample in tqdm(dataset[:num_samples], desc="Evaluating"):
            image = Image.open(sample["image_path"]).convert("RGB")

            messages = [{"role": "user", "content": [{"type": "image"}]}]
            input_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True)
            inputs = tokenizer(image, input_text, add_special_tokens=False, return_tensors="pt").to("cuda")

            with torch.no_grad():
                # Greedy decoding; a temperature has no effect when do_sample=False.
                output = model.generate(**inputs, max_new_tokens=64, use_cache=True,
                                        do_sample=False)

            # Decode only the newly generated tokens. Splitting the full decoded
            # text on "assistant" breaks when the reply itself contains that word.
            prompt_length = inputs["input_ids"].shape[1]
            response = tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True).strip()

            predicted = extract_answer(response)
            expected = sample["answer_key"]
            is_correct = predicted == expected

            if is_correct:
                correct += 1

            results.append({
                "id": sample["id"],
                "expected": expected,
                "predicted": predicted,
                "correct": is_correct,
            })

    # Guard against an empty dataset (or num_samples == 0).
    accuracy = correct / len(results) * 100 if results else 0.0
    return results, accuracy

print("Helper functions defined.")

In [None]:
def copy_to_drive(local_path, drive_subdir, skip_if_exists=True):
    """Copy file or directory to Google Drive.

    Args:
        local_path: Local file or directory path
        drive_subdir: Subdirectory in drive output (e.g., 'results', 'models')
        skip_if_exists: If True, skip copying if destination exists

    Returns:
        True if the item was copied; False if nothing was copied (not running
        in Colab, destination already present and skipped, or the copy failed).
    """
    if not IS_COLAB:
        return False  # Nothing to do outside Colab; always return a bool

    drive_base = CONFIG["drive_output_dir"]
    # normpath drops a trailing separator that would make basename return "".
    filename = os.path.basename(os.path.normpath(local_path))
    drive_path = os.path.join(drive_base, drive_subdir, filename)

    # Check if already exists
    if skip_if_exists and os.path.exists(drive_path):
        print(f"  [SKIP] Already exists on Drive: {drive_path}")
        return False

    # Copy file or directory; failures are reported, never raised, so a Drive
    # problem cannot abort an experiment.
    try:
        # Ensure the destination subdirectory exists even if the setup cell was skipped.
        os.makedirs(os.path.dirname(drive_path), exist_ok=True)
        if os.path.isdir(local_path):
            # copytree needs a fresh destination, so replace any existing copy.
            if os.path.exists(drive_path):
                shutil.rmtree(drive_path)
            shutil.copytree(local_path, drive_path)
        else:
            shutil.copy2(local_path, drive_path)
        print(f"  [COPIED] {local_path} -> {drive_path}")
        return True
    except Exception as e:
        print(f"  [ERROR] Failed to copy {local_path}: {e}")
        return False

def save_results(exp_name, results_dict, save_dir="results"):
    """Write results to "<save_dir>/<exp_name>_results.json" and mirror it to Drive.

    The Drive copy is always refreshed (skip_if_exists=False).
    """
    filepath = os.path.join(save_dir, f"{exp_name}_results.json")
    with open(filepath, "w") as out_file:
        json.dump(results_dict, out_file, indent=2)
    print(f"Results saved to {filepath}")
    # Always update results
    copy_to_drive(filepath, "results", skip_if_exists=False)

def save_model(model, tokenizer, exp_name, save_dir="models"):
    """Save model and tokenizer under "<save_dir>/<exp_name>" and mirror to Drive.

    An existing Drive copy of the same experiment is left untouched.
    """
    model_dir = os.path.join(save_dir, exp_name)
    # Model first, then tokenizer, both into the same directory.
    for artifact in (model, tokenizer):
        artifact.save_pretrained(model_dir)
    print(f"Model saved to {model_dir}")
    copy_to_drive(model_dir, "models", skip_if_exists=True)

def save_plot(fig, filename, save_dir="plots"):
    """Save a figure to "<save_dir>/<filename>" and mirror it to Drive.

    The Drive copy is always refreshed (skip_if_exists=False).
    """
    filepath = os.path.join(save_dir, filename)
    fig.savefig(filepath, bbox_inches='tight', dpi=150)
    print(f"Plot saved to {filepath}")
    # Always update plots
    copy_to_drive(filepath, "plots", skip_if_exists=False)

def copy_checkpoints_to_drive(exp_name, checkpoint_dir):
    """Copy training checkpoints to Drive.

    Every "checkpoint-*" entry in ``checkpoint_dir`` is copied to
    "<drive_output_dir>/checkpoints/<exp_name>/" unless it already exists there.
    Does nothing outside Colab or when ``checkpoint_dir`` is missing.

    Copy failures are reported and skipped instead of raised, matching
    copy_to_drive(): a Drive problem must not abort an experiment whose
    training has already finished.
    """
    if not IS_COLAB or not os.path.exists(checkpoint_dir):
        return

    drive_checkpoint_dir = os.path.join(CONFIG["drive_output_dir"], "checkpoints", exp_name)
    try:
        os.makedirs(drive_checkpoint_dir, exist_ok=True)
    except OSError as e:
        print(f"  [ERROR] Failed to create {drive_checkpoint_dir}: {e}")
        return

    for item in sorted(os.listdir(checkpoint_dir)):
        if not item.startswith("checkpoint-"):
            continue
        local_ckpt = os.path.join(checkpoint_dir, item)
        drive_ckpt = os.path.join(drive_checkpoint_dir, item)
        if os.path.exists(drive_ckpt):
            print(f"  [SKIP] Checkpoint exists: {item}")
            continue
        try:
            shutil.copytree(local_ckpt, drive_ckpt)
            print(f"  [COPIED] Checkpoint: {item}")
        except Exception as e:
            # Remove a partial copy so the next run retries it instead of
            # treating it as an existing, complete checkpoint.
            shutil.rmtree(drive_ckpt, ignore_errors=True)
            print(f"  [ERROR] Failed to copy checkpoint {item}: {e}")

def load_fresh_model(use_quantized=True):
    """Load a new copy of the base model and its tokenizer.

    Garbage is collected and the CUDA cache emptied before loading.

    Args:
        use_quantized: Truthy loads CONFIG["model_name"] in 4-bit (for LoRA);
                       falsy loads CONFIG["model_name_full"] without 4-bit
                       quantization (for full fine-tuning).

    Returns:
        (model, tokenizer) as returned by FastVisionModel.from_pretrained.
    """
    gc.collect()
    torch.cuda.empty_cache()

    load_in_4bit = bool(use_quantized)
    if load_in_4bit:
        model_name = CONFIG["model_name"]
        print(f"Loading quantized model: {model_name}")
    else:
        model_name = CONFIG["model_name_full"]
        print(f"Loading full precision model: {model_name}")

    model, tokenizer = FastVisionModel.from_pretrained(
        model_name,
        use_gradient_checkpointing="unsloth",
        load_in_4bit=load_in_4bit,
    )
    return model, tokenizer

def cleanup_model(model, trainer=None):
    """Release this function's references, collect garbage, empty the CUDA cache.

    Only the references held by this call are dropped. Objects are actually
    freed once the caller has released its own references to them as well.
    """
    # Rebinding the parameters drops the local references, like `del` would.
    trainer = None
    model = None
    gc.collect()
    torch.cuda.empty_cache()

print("Save/Load functions defined.")

In [None]:
def check_existing_results(exp_key):
    """Return previously saved results for an experiment, or None.

    The local "results/<exp_key>_results.json" is preferred. In Colab, a copy
    found on Drive is first copied to that local path and then loaded.
    """
    local_path = f"results/{exp_key}_results.json"

    if os.path.exists(local_path):
        print(f"[SKIP] Results already exist: {local_path}")
    else:
        # No local file: the only remaining source is Drive (Colab only).
        if not IS_COLAB:
            return None
        drive_path = os.path.join(CONFIG["drive_output_dir"], "results", f"{exp_key}_results.json")
        if not os.path.exists(drive_path):
            return None
        print(f"[SKIP] Results already exist on Drive: {drive_path}")
        # Copy from Drive to local
        shutil.copy2(drive_path, local_path)

    with open(local_path, "r") as cached:
        return json.load(cached)

def run_experiment(exp_key, exp_config, train_data, val_data, test_data):
    """Run a single experiment and return results.

    Steps: reuse cached results if present; load a fresh base model; optionally
    train it (LoRA adapters, or direct unfreezing of selected parameters);
    save checkpoints and the final model; evaluate on the validation subset
    and the full test set; save the results JSON.

    Args:
        exp_key: Experiment identifier, used in output file and directory names.
        exp_config: One entry of EXPERIMENTS ("name", "train", and for trained
            runs "use_lora", "finetune_vision", "finetune_language", "task").
        train_data: Training samples as returned by load_dataset().
        val_data: Validation samples; the first CONFIG["val_samples"] are used.
        test_data: Test samples; all are used.

    Returns:
        dict with "name", "config", "timestamp", "model_type", "validation",
        "test" and, for trained experiments, "training".

    GPU memory is released in a ``finally`` block, so it is freed even when
    training or evaluation raises.
    """
    print(f"\n{'='*60}")
    print(f"EXPERIMENT: {exp_config['name']}")
    print(f"{'='*60}")

    # Check if results already exist
    existing_results = check_existing_results(exp_key)
    if existing_results is not None:
        return existing_results

    # Determine if we need quantized or full precision model
    is_training = exp_config.get("train", False)
    use_lora = exp_config.get("use_lora", True)
    use_quantized = use_lora  # Use quantized model only for LoRA experiments
    checkpoint_dir = os.path.join(CONFIG["checkpoints_dir"], exp_key)

    # Pre-bind everything the finally block releases.
    model = tokenizer = trainer = converted_data = None
    try:
        # Load fresh model
        model, tokenizer = load_fresh_model(use_quantized=use_quantized)

        results = {
            "name": exp_config["name"],
            "config": exp_config,
            "timestamp": datetime.now().isoformat(),
            "model_type": "quantized" if use_quantized else "full_precision",
        }

        if is_training:
            if use_lora:
                # Configure model with LoRA adapters
                model = FastVisionModel.get_peft_model(
                    model,
                    finetune_vision_layers=exp_config["finetune_vision"],
                    finetune_language_layers=exp_config["finetune_language"],
                    finetune_attention_modules=True,
                    finetune_mlp_modules=True,
                    r=CONFIG["lora_r"],
                    lora_alpha=CONFIG["lora_alpha"],
                    lora_dropout=0,
                    bias="none",
                    random_state=CONFIG["seed"],
                )
            else:
                # Full fine-tuning: unfreeze specific layers (non-quantized model).
                # A parameter counts as "vision" when its name contains "visual".
                trainable_params = 0
                total_params = 0
                for name, param in model.named_parameters():
                    total_params += param.numel()
                    is_vision_param = "visual" in name.lower()
                    if exp_config["finetune_vision"] and is_vision_param:
                        param.requires_grad = True
                        trainable_params += param.numel()
                    elif exp_config["finetune_language"] and not is_vision_param:
                        param.requires_grad = True
                        trainable_params += param.numel()
                    else:
                        param.requires_grad = False
                print(f"Trainable params: {trainable_params:,} / {total_params:,} ({100*trainable_params/total_params:.2f}%)")

            # Prepare dataset based on task
            task = exp_config.get("task", "qa")
            if task == "qa":
                converted_data = [convert_to_qa_format(s) for s in tqdm(train_data, desc="Converting to QA")]
            else:  # ocr
                converted_data = [convert_to_ocr_format(s) for s in tqdm(train_data, desc="Converting to OCR")]

            # Setup trainer with checkpoint saving
            loss_tracker = LossTracker()
            FastVisionModel.for_training(model)

            trainer = SFTTrainer(
                model=model,
                tokenizer=tokenizer,
                data_collator=UnslothVisionDataCollator(model, tokenizer),
                train_dataset=converted_data,
                callbacks=[loss_tracker],
                args=SFTConfig(
                    per_device_train_batch_size=CONFIG["batch_size"],
                    gradient_accumulation_steps=CONFIG["gradient_accumulation_steps"],
                    warmup_steps=CONFIG["warmup_steps"],
                    max_steps=CONFIG["max_steps"],
                    learning_rate=CONFIG["learning_rate"],
                    logging_steps=1,
                    optim="adamw_8bit",
                    weight_decay=0.001,
                    lr_scheduler_type="linear",
                    seed=CONFIG["seed"],
                    output_dir=checkpoint_dir,
                    save_strategy="steps",
                    save_steps=CONFIG["save_steps"],
                    save_total_limit=3,  # Keep only last 3 checkpoints
                    report_to="none",
                    remove_unused_columns=False,
                    dataset_text_field="",
                    dataset_kwargs={"skip_prepare_dataset": True},
                    max_length=2048,
                ),
            )

            # Train
            print(f"\nTraining {exp_config['name']}...")
            train_result = trainer.train()
            results["training"] = {
                "runtime_seconds": train_result.metrics["train_runtime"],
                "loss_history": loss_tracker.get_data(),
            }
            print(f"Training completed in {train_result.metrics['train_runtime']:.2f}s")

            # Copy checkpoints to Drive
            print("Copying checkpoints to Drive...")
            copy_checkpoints_to_drive(exp_key, checkpoint_dir)

            # Save final model
            save_model(model, tokenizer, exp_key)

        # Adapters exist only for trained LoRA runs.
        use_adapters = use_lora and is_training

        # Evaluate on validation
        print(f"\nEvaluating on validation set ({CONFIG['val_samples']} samples)...")
        val_results, val_accuracy = evaluate_model(
            model, tokenizer, val_data,
            num_samples=CONFIG["val_samples"],
            use_adapters=use_adapters,
        )
        results["validation"] = {"accuracy": val_accuracy, "num_samples": len(val_results)}
        print(f"Validation Accuracy: {val_accuracy:.2f}%")

        # Evaluate on test
        print(f"\nEvaluating on test set ({len(test_data)} samples)...")
        test_results, test_accuracy = evaluate_model(
            model, tokenizer, test_data,
            num_samples=len(test_data),
            use_adapters=use_adapters,
        )
        results["test"] = {"accuracy": test_accuracy, "num_samples": len(test_results)}
        print(f"Test Accuracy: {test_accuracy:.2f}%")

        # Save results
        save_results(exp_key, results)
    finally:
        # Cleanup: the references held by THIS frame must be dropped before
        # collecting, otherwise the model stays alive and nothing is freed.
        # Runs on errors too, so a failed experiment does not pin GPU memory.
        trainer = None
        model = None
        tokenizer = None
        converted_data = None
        gc.collect()
        torch.cuda.empty_cache()

    return results

print("Experiment runner defined.")

---
## 5. Run All Experiments

In [None]:
# Store all results
# Maps experiment key -> results dict returned by run_experiment(); read by
# the comparison, plotting and summary cells below.
ALL_RESULTS = {}

# Run each experiment
# Experiments run in EXPERIMENTS order; those with saved results are loaded
# from disk by run_experiment() instead of being re-run.
for exp_key, exp_config in EXPERIMENTS.items():
    ALL_RESULTS[exp_key] = run_experiment(exp_key, exp_config, train_data, val_data, test_data)
    
print("\n" + "="*60)
print("ALL EXPERIMENTS COMPLETED!")
print("="*60)

---
## 6. Results Comparison

In [None]:
# Create comprehensive results table
# One row per experiment: validation/test accuracy and the difference to the
# baseline ("--" for the baseline row itself).
print("\n" + "="*100)
print("COMPREHENSIVE RESULTS COMPARISON")
print("="*100)
print(f"\n{'Experiment':<40} {'Val Acc':<12} {'Test Acc':<12} {'Val Impr':<12} {'Test Impr':<12}")
print("-"*100)

# Reference accuracies; also used by the plotting and summary cells.
baseline_val = ALL_RESULTS["baseline"]["validation"]["accuracy"]
baseline_test = ALL_RESULTS["baseline"]["test"]["accuracy"]

for exp_key, results in ALL_RESULTS.items():
    val_acc = results["validation"]["accuracy"]
    test_acc = results["test"]["accuracy"]

    if exp_key == "baseline":
        val_impr = 0
        test_impr = 0
        impr_str_val = "--"
        impr_str_test = "--"
    else:
        val_impr = val_acc - baseline_val
        test_impr = test_acc - baseline_test
        impr_str_val = f"{val_impr:+.2f}%"
        impr_str_test = f"{test_impr:+.2f}%"

    print(f"{results['name']:<40} {val_acc:>10.2f}%  {test_acc:>10.2f}%  {impr_str_val:>10}  {impr_str_test:>10}")

print("="*100)

# Save comprehensive results
save_results("comprehensive_comparison", ALL_RESULTS)

In [None]:
# Create visualizations
# Four panels: accuracies per experiment, improvement over the baseline,
# training loss curves, and a test-accuracy ranking.
# Relies on baseline_val / baseline_test from the results-table cell.
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# Short tick labels keyed by experiment key, so they stay correct when the set
# or order of experiments changes; unknown keys fall back to the key itself.
short_label_by_key = {
    "baseline": "Baseline",
    "full_lora": "Full\nLoRA",
    "vision_lora": "Vision\nLoRA",
    "vision_full": "Vision\nFull",
    "language_lora": "Lang\nLoRA",
    "full_lora_ocr": "OCR\nLoRA",
}
exp_keys_all = list(ALL_RESULTS.keys())
exp_names = [ALL_RESULTS[k]["name"] for k in exp_keys_all]
exp_names_short = [short_label_by_key.get(k, k) for k in exp_keys_all]
val_accs = [ALL_RESULTS[k]["validation"]["accuracy"] for k in exp_keys_all]
test_accs = [ALL_RESULTS[k]["test"]["accuracy"] for k in exp_keys_all]

# 1. Bar chart - All accuracies
ax1 = axes[0, 0]
x = np.arange(len(exp_names_short))
width = 0.35
ax1.bar(x - width/2, val_accs, width, label='Validation', color='#3498db', edgecolor='black')
ax1.bar(x + width/2, test_accs, width, label='Test', color='#e74c3c', edgecolor='black')
ax1.set_ylabel('Accuracy (%)', fontsize=12)
ax1.set_title('Accuracy by Experiment', fontsize=14, fontweight='bold')
ax1.set_xticks(x)
ax1.set_xticklabels(exp_names_short, fontsize=9)
ax1.legend()
ax1.set_ylim(0, 100)

# 2. Improvement over baseline
ax2 = axes[0, 1]
exp_keys_train = [k for k in exp_keys_all if k != "baseline"]
names_train = [short_label_by_key.get(k, k) for k in exp_keys_train]
val_imprs = [ALL_RESULTS[k]["validation"]["accuracy"] - baseline_val for k in exp_keys_train]
test_imprs = [ALL_RESULTS[k]["test"]["accuracy"] - baseline_test for k in exp_keys_train]

x2 = np.arange(len(names_train))
ax2.bar(x2 - width/2, val_imprs, width, label='Validation', color='#2ecc71', edgecolor='black')
ax2.bar(x2 + width/2, test_imprs, width, label='Test', color='#9b59b6', edgecolor='black')
ax2.set_ylabel('Improvement (%)', fontsize=12)
ax2.set_title('Improvement over Baseline', fontsize=14, fontweight='bold')
ax2.set_xticks(x2)
ax2.set_xticklabels(names_train, fontsize=9)
ax2.legend()
ax2.axhline(y=0, color='black', linestyle='-', linewidth=0.5)

# 3. Training curves
ax3 = axes[1, 0]
colors = plt.cm.tab10(np.linspace(0, 1, len(exp_keys_train)))
for i, exp_key in enumerate(exp_keys_train):
    if "training" in ALL_RESULTS[exp_key] and ALL_RESULTS[exp_key]["training"]["loss_history"]["losses"]:
        losses = ALL_RESULTS[exp_key]["training"]["loss_history"]["losses"]
        steps = ALL_RESULTS[exp_key]["training"]["loss_history"]["steps"]
        ax3.plot(steps, losses, '-o', label=names_train[i], markersize=3, color=colors[i])
ax3.set_xlabel('Training Step', fontsize=12)
ax3.set_ylabel('Loss', fontsize=12)
ax3.set_title('Training Loss Curves', fontsize=14, fontweight='bold')
# Only draw a legend when at least one curve was plotted.
if ax3.get_legend_handles_labels()[0]:
    ax3.legend(fontsize=8)
ax3.grid(True, alpha=0.3)

# 4. Test accuracy ranking
ax4 = axes[1, 1]
sorted_results = sorted(ALL_RESULTS.items(), key=lambda item: item[1]["test"]["accuracy"], reverse=True)
sorted_names = [ALL_RESULTS[k]["name"][:25] for k, _ in sorted_results]
sorted_accs = [r["test"]["accuracy"] for _, r in sorted_results]
# Green for the top entry, blue for anything above the baseline, red otherwise.
colors_rank = ['#2ecc71' if i == 0 else '#3498db' if acc > baseline_test else '#e74c3c' for i, acc in enumerate(sorted_accs)]
ax4.barh(range(len(sorted_names)), sorted_accs, color=colors_rank, edgecolor='black')
ax4.set_yticks(range(len(sorted_names)))
ax4.set_yticklabels(sorted_names, fontsize=9)
ax4.set_xlabel('Test Accuracy (%)', fontsize=12)
ax4.set_title('Test Accuracy Ranking', fontsize=14, fontweight='bold')
ax4.axvline(x=baseline_test, color='red', linestyle='--', label=f'Baseline ({baseline_test:.1f}%)')
for i, acc in enumerate(sorted_accs):
    ax4.text(acc + 0.5, i, f'{acc:.1f}%', va='center', fontsize=9)
# The baseline line carries a label; it is only displayed with a legend.
ax4.legend(fontsize=9)

plt.tight_layout()

# Save plot locally and to Drive
save_plot(fig, 'comprehensive_comparison.png')
plt.show()

In [None]:
# Final Summary
# Reports the best and worst non-baseline experiments by test accuracy and
# lists where the artifacts were written.
# Relies on baseline_test from the results-table cell.
print("\n" + "="*70)
print("FINAL SUMMARY")
print("="*70)

# (experiment key, test accuracy) for every experiment except the baseline
trained_scores = [(k, v["test"]["accuracy"]) for k, v in ALL_RESULTS.items() if k != "baseline"]

print(f"\nBaseline Test Accuracy: {baseline_test:.2f}%")

if trained_scores:
    # Find best and worst performing experiments
    best_exp = max(trained_scores, key=lambda pair: pair[1])
    worst_exp = min(trained_scores, key=lambda pair: pair[1])

    print(f"\nBest Performing: {ALL_RESULTS[best_exp[0]]['name']}")
    print(f"  - Test Accuracy: {best_exp[1]:.2f}%")
    print(f"  - Improvement: {best_exp[1] - baseline_test:+.2f}%")

    print(f"\nWorst Performing: {ALL_RESULTS[worst_exp[0]]['name']}")
    print(f"  - Test Accuracy: {worst_exp[1]:.2f}%")
    print(f"  - Improvement: {worst_exp[1] - baseline_test:+.2f}%")
else:
    # max()/min() raise ValueError on an empty list, e.g. when only the
    # baseline was run.
    print("\nNo fine-tuned experiments to compare against the baseline.")

print("\n" + "="*70)
print("OUTPUT LOCATIONS:")
print("="*70)
print("\nLocal directories:")
print("  - results/     : JSON results for each experiment")
print("  - models/      : Final model weights")
print("  - checkpoints/ : Training checkpoints")
print("  - plots/       : Visualization plots")

if IS_COLAB:
    print(f"\nGoogle Drive (persistent storage):")
    print(f"  {CONFIG['drive_output_dir']}/")
    print(f"    ├── results/     : JSON results")
    print(f"    ├── models/      : Final model weights")
    print(f"    ├── checkpoints/ : Training checkpoints")
    print(f"    └── plots/       : Visualization plots")
print("="*70)