In [17]:
print("Setting up Kaggle environment for Image Captioning...")

# Install dependencies
!pip install torch torchvision pillow numpy pandas tqdm

# Import libraries
import os
import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as transforms
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import numpy as np
import random
from collections import Counter
import json
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

print("\n‚úì Environment ready!")
print(f"Working directory: {os.getcwd()}")

Setting up Kaggle environment for Image Captioning...

‚úì Environment ready!
Working directory: /kaggle/working/Deep_Learning_final


In [18]:
# ============================================================================
# CELL 2: KAGGLE CONFIGURATION
# ============================================================================

print("Setting Kaggle paths...")

# KAGGLE PATHS (Update these if your dataset name is different)
KAGGLE_INPUT_DIR = "/kaggle/input"
KAGGLE_WORKING_DIR = "/kaggle/working"

# Auto-detect Flickr8k dataset
datasets = os.listdir(KAGGLE_INPUT_DIR)
flickr_dataset = None
for d in datasets:
    if "flickr" in d.lower() or "flickr8k" in d.lower():
        flickr_dataset = d
        break

if flickr_dataset:
    DATASET_PATH = f"{KAGGLE_INPUT_DIR}/{flickr_dataset}"
    # Try to find the actual structure
    possible_paths = [
        f"{DATASET_PATH}/flickr8k",
        DATASET_PATH,
        f"{DATASET_PATH}/Flickr8k"
    ]
    
    for path in possible_paths:
        if os.path.exists(f"{path}/captions.txt"):
            DATASET_PATH = path
            break
    
    IMAGE_DIR = f"{DATASET_PATH}/Images"
    ANNOTATION_FILE = f"{DATASET_PATH}/captions.txt"
    
    # If Images folder doesn't exist, check for images folder
    if not os.path.exists(IMAGE_DIR):
        if os.path.exists(f"{DATASET_PATH}/images"):
            IMAGE_DIR = f"{DATASET_PATH}/images"
        elif os.path.exists(f"{DATASET_PATH}/Flickr8k_Dataset"):
            IMAGE_DIR = f"{DATASET_PATH}/Flickr8k_Dataset"
    
    print(f"‚úì Found dataset: {flickr_dataset}")
    print(f"  Images: {IMAGE_DIR}")
    print(f"  Captions: {ANNOTATION_FILE}")
else:
    print("‚ö†Ô∏è No Flickr dataset found. Please add 'flickr8k' dataset via '+ Add data'")
    IMAGE_DIR = "/kaggle/input/flickr8k/Images"  # Default fallback
    ANNOTATION_FILE = "/kaggle/input/flickr8k/captions.txt"

# Output directories
EXPERIMENT_DIR = f"{KAGGLE_WORKING_DIR}/experiments"
MODEL_SAVE_DIR = f"{KAGGLE_WORKING_DIR}/models"
os.makedirs(EXPERIMENT_DIR, exist_ok=True)
os.makedirs(MODEL_SAVE_DIR, exist_ok=True)

print(f"\n‚úì Output directories created in {KAGGLE_WORKING_DIR}")

Setting Kaggle paths...
‚úì Found dataset: flickr8kimagescaptions
  Images: /kaggle/input/flickr8kimagescaptions/flickr8k/images
  Captions: /kaggle/input/flickr8kimagescaptions/flickr8k/captions.txt

‚úì Output directories created in /kaggle/working


In [19]:
# ============================================================================
# CELL 3: DATA LOADER (Modified for Kaggle)
# ============================================================================

class Vocabulary:
    def __init__(self, freq_threshold):
        self.itos = {0: "<PAD>", 1: "<SOS>", 2: "<EOS>", 3: "<UNK>"}
        self.stoi = {"<PAD>": 0, "<SOS>": 1, "<EOS>": 2, "<UNK>": 3}
        self.freq_threshold = freq_threshold

    def __len__(self):
        return len(self.itos)

    @staticmethod
    def tokenizer_eng(text):
        return [tok.lower() for tok in text.replace(".", " .").replace(",", " ,").split()]

    def build_vocabulary(self, sentence_list):
        frequencies = {}
        idx = 4

        for sentence in sentence_list:
            for word in self.tokenizer_eng(sentence):
                if word not in frequencies:
                    frequencies[word] = 1
                else:
                    frequencies[word] += 1

                if frequencies[word] == self.freq_threshold:
                    self.stoi[word] = idx
                    self.itos[idx] = word
                    idx += 1

    def numericalize(self, text):
        tokenized_text = self.tokenizer_eng(text)
        return [
            self.stoi[token] if token in self.stoi else self.stoi["<UNK>"]
            for token in tokenized_text
        ]


class FlickrDataset(Dataset):
    def __init__(self, root_dir, imgs, captions, vocab, transform=None):
        self.root_dir = root_dir
        self.imgs = imgs
        self.captions = captions
        self.vocab = vocab
        self.transform = transform

    def __len__(self):
        return len(self.imgs)

    def __getitem__(self, index):
        caption = self.captions[index]
        img_id = self.imgs[index]
        img_path = os.path.join(self.root_dir, img_id)
        
        image = Image.open(img_path).convert("RGB")

        if self.transform is not None:
            image = self.transform(image)

        numericalized_caption = [self.vocab.stoi["<SOS>"]]
        numericalized_caption += self.vocab.numericalize(caption)
        numericalized_caption.append(self.vocab.stoi["<EOS>"])

        return image, torch.tensor(numericalized_caption)


class MyCollate:
    def __init__(self, pad_idx):
        self.pad_idx = pad_idx

    def __call__(self, batch):
        imgs = [item[0].unsqueeze(0) for item in batch]
        imgs = torch.cat(imgs, dim=0)
        targets = [item[1] for item in batch]
        targets = pad_sequence(targets, batch_first=False, padding_value=self.pad_idx)
        return imgs, targets


def get_loaders(
    root_folder=None,
    annotation_file=None,
    transform=None,
    batch_size=32,
    num_workers=2,
    shuffle=True,
    pin_memory=True,
    test_size=0.1,
    val_size=0.1,
    freq_threshold=5
):
    """Modified for Kaggle - uses global paths if None provided"""
    
    # Use Kaggle paths if not specified
    if root_folder is None:
        root_folder = IMAGE_DIR
    if annotation_file is None:
        annotation_file = ANNOTATION_FILE
    
    all_imgs = []
    all_captions = []
    
    print(f"Loading data from: {annotation_file}")
    with open(annotation_file, 'r', encoding='utf-8') as f:
        lines = f.readlines()
        if lines and "image,caption" in lines[0]:
            lines = lines[1:]
        
        for line in lines:
            parts = line.strip().split(',', 1) 
            if len(parts) == 2:
                all_imgs.append(parts[0])
                all_captions.append(parts[1])

    print(f"Loaded {len(all_imgs)} image-caption pairs")
    
    unique_imgs = list(set(all_imgs))
    random.seed(42)
    random.shuffle(unique_imgs)
    
    total_imgs = len(unique_imgs)
    v_count = int(total_imgs * val_size)
    t_count = int(total_imgs * test_size)
    train_count = total_imgs - v_count - t_count
    
    train_img_ids = set(unique_imgs[:train_count])
    val_img_ids = set(unique_imgs[train_count:train_count+v_count])
    test_img_ids = set(unique_imgs[train_count+v_count:])
    
    train_imgs, train_caps = [], []
    val_imgs, val_caps = [], []
    test_imgs, test_caps = [], []
    
    for img, cap in zip(all_imgs, all_captions):
        if img in train_img_ids:
            train_imgs.append(img)
            train_caps.append(cap)
        elif img in val_img_ids:
            val_imgs.append(img)
            val_caps.append(cap)
        elif img in test_img_ids:
            test_imgs.append(img)
            test_caps.append(cap)
            
    print(f"Split: Train={len(train_imgs)}, Val={len(val_imgs)}, Test={len(test_imgs)}")
    
    vocab = Vocabulary(freq_threshold)
    vocab.build_vocabulary(train_caps)
    
    print(f"Vocabulary size: {len(vocab)}")
    
    train_dataset = FlickrDataset(root_folder, train_imgs, train_caps, vocab, transform=transform)
    val_dataset = FlickrDataset(root_folder, val_imgs, val_caps, vocab, transform=transform)
    test_dataset = FlickrDataset(root_folder, test_imgs, test_caps, vocab, transform=transform)
    
    pad_idx = vocab.stoi["<PAD>"]

    # Kaggle optimization: adjust workers
    num_workers = min(num_workers, 2)  # Kaggle works better with fewer workers
    
    train_loader = DataLoader(
        train_dataset, batch_size=batch_size, num_workers=num_workers, 
        shuffle=shuffle, pin_memory=pin_memory, collate_fn=MyCollate(pad_idx)
    )
    val_loader = DataLoader(
        val_dataset, batch_size=batch_size, num_workers=num_workers, 
        shuffle=False, pin_memory=pin_memory, collate_fn=MyCollate(pad_idx)
    )
    test_loader = DataLoader(
        test_dataset, batch_size=batch_size, num_workers=num_workers, 
        shuffle=False, pin_memory=pin_memory, collate_fn=MyCollate(pad_idx)
    )

    return train_loader, val_loader, test_loader, vocab

print("‚úì Data loader module ready")

‚úì Data loader module ready


In [20]:
# ============================================================================
# CELL 4: MODEL ARCHITECTURE
# ============================================================================

class CNNtoRNN(nn.Module):
    def __init__(self, embed_size, hidden_size, vocab_size, num_layers, dropout=0.5):
        super(CNNtoRNN, self).__init__()
        
        # Encoder CNN (ResNet101 without final layer)
        resnet = models.resnet101(weights=models.ResNet101_Weights.DEFAULT)
        modules = list(resnet.children())[:-2]  # Remove avgpool and fc
        self.cnn = nn.Sequential(*modules)
        
        # Adaptive pooling
        self.adaptive_pool = nn.AdaptiveAvgPool2d((14, 14))
        
        # Reduce channels to match embedding size
        self.channel_reducer = nn.Conv2d(2048, embed_size, kernel_size=1)
        
        # Decoder LSTM
        self.rnn = nn.LSTM(
            input_size=embed_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=dropout if num_layers > 1 else 0,
            batch_first=False
        )
        
        # Attention mechanism
        self.attention = nn.Linear(hidden_size + embed_size, 14*14)
        self.attention_combine = nn.Linear(hidden_size + embed_size, hidden_size)
        
        # Output layers
        self.fc_out = nn.Linear(hidden_size, vocab_size)
        self.dropout = nn.Dropout(dropout)
        
        # Embedding layer
        self.embed = nn.Embedding(vocab_size, embed_size)
        
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.embed_size = embed_size

    def forward(self, images, captions):
        # CNN feature extraction
        features = self.cnn(images)
        features = self.adaptive_pool(features)
        features = self.channel_reducer(features)
        
        # Reshape features: (batch_size, embed_size, 14, 14) -> (196, batch_size, embed_size)
        batch_size = features.size(0)
        features = features.view(batch_size, self.embed_size, -1).permute(2, 0, 1)
        
        # Initialize hidden states
        h = torch.zeros(self.num_layers, batch_size, self.hidden_size).to(images.device)
        c = torch.zeros(self.num_layers, batch_size, self.hidden_size).to(images.device)
        
        # Embed captions
        embeddings = self.embed(captions)
        
        # Sequence length
        seq_length = captions.size(0)
        batch_size = captions.size(1)
        
        # Prepare outputs
        outputs = torch.zeros(seq_length, batch_size, len(self.embed.weight)).to(images.device)
        
        # Process sequence
        for t in range(seq_length):
            # Attention mechanism
            combined = torch.cat((h[-1], embeddings[t]), dim=1)
            attention_weights = self.attention(combined)
            attention_weights = attention_weights.view(batch_size, 1, 14*14)
            attention_weights = torch.softmax(attention_weights, dim=2)
            
            # Apply attention to features
            context = torch.bmm(attention_weights, features.permute(1, 0, 2))
            context = context.squeeze(1)
            
            # Combine with embedding
            combined = torch.cat((h[-1], context), dim=1)
            attention_combined = self.attention_combine(combined)
            
            # LSTM step
            lstm_input = attention_combined.unsqueeze(0)
            out, (h, c) = self.rnn(lstm_input, (h, c))
            
            # Output
            out = self.fc_out(self.dropout(out.squeeze(0)))
            outputs[t] = out
        
        return outputs

    def caption_image(self, image, vocab, max_length=50, device="cuda"):
        """Generate caption for a single image"""
        self.eval()
        
        with torch.no_grad():
            # Extract features
            features = self.cnn(image)
            features = self.adaptive_pool(features)
            features = self.channel_reducer(features)
            features = features.view(1, self.embed_size, -1).permute(2, 0, 1)
            
            # Initialize
            h = torch.zeros(self.num_layers, 1, self.hidden_size).to(device)
            c = torch.zeros(self.num_layers, 1, self.hidden_size).to(device)
            
            # Start token
            word = torch.tensor([[vocab.stoi["<SOS>"]]]).to(device)
            caption = []
            
            for _ in range(max_length):
                embeddings = self.embed(word)
                
                # Attention
                combined = torch.cat((h[-1], embeddings.squeeze(1)), dim=1)
                attention_weights = self.attention(combined)
                attention_weights = torch.softmax(attention_weights, dim=1)
                context = torch.bmm(attention_weights.unsqueeze(1), features.permute(1, 0, 2))
                context = context.squeeze(1)
                
                # Combine
                combined = torch.cat((h[-1], context), dim=1)
                attention_combined = self.attention_combine(combined)
                
                # LSTM
                lstm_input = attention_combined.unsqueeze(0)
                out, (h, c) = self.rnn(lstm_input, (h, c))
                
                # Predict next word
                output = self.fc_out(out.squeeze(0))
                predicted = output.argmax(1)
                
                word = predicted.unsqueeze(0)
                
                # Stop if end token
                if vocab.itos[predicted.item()] == "<EOS>":
                    break
                    
                caption.append(vocab.itos[predicted.item()])
        
        return " ".join(caption)

print("‚úì Model architecture ready")

‚úì Model architecture ready


In [23]:
# ============================================================================
# CELL 5: TRAINING SCRIPT (Kaggle Optimized) - UPDATED
# ============================================================================

import time
from datetime import datetime

class ExperimentLogger:
    def __init__(self, exp_name):
        self.exp_name = exp_name
        self.exp_dir = os.path.join(EXPERIMENT_DIR, exp_name)
        os.makedirs(self.exp_dir, exist_ok=True)
        
        self.log_file = os.path.join(self.exp_dir, "training_log.txt")
        self.config_file = os.path.join(self.exp_dir, "config.json")
        
        print(f"[Experiment] Initialized: {self.exp_dir}")
    
    def log_config(self, config):
        with open(self.config_file, 'w') as f:
            json.dump(config, f, indent=2)
        print("[Experiment] Config saved.")
    
    def log_epoch(self, epoch, train_loss, val_loss, lr):
        log_line = f"Epoch {epoch:03d} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | LR: {lr:.6f}"
        print(log_line)
        
        with open(self.log_file, 'a') as f:
            f.write(log_line + "\n")


def train_model(
    num_epochs=10,
    batch_size=32,
    learning_rate=3e-4,
    embed_size=512,
    hidden_size=512,
    num_layers=1,
    grad_clip=5.0,
    save_every=1
):
    """Main training function optimized for Kaggle - FIXED VERSION"""
    
    print("\n" + "="*60)
    print("STARTING TRAINING ON KAGGLE")
    print("="*60)
    
    # Check GPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")
    
    if device.type == "cuda":
        print(f"GPU: {torch.cuda.get_device_name(0)}")
        print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
    
    # Create experiment
    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    exp_name = f"{timestamp}_ResNet101_LSTM_v1"
    logger = ExperimentLogger(exp_name)
    
    # Config
    config = {
        "num_epochs": num_epochs,
        "batch_size": batch_size,
        "learning_rate": learning_rate,
        "embed_size": embed_size,
        "hidden_size": hidden_size,
        "num_layers": num_layers,
        "grad_clip": grad_clip,
        "dataset": os.path.basename(IMAGE_DIR),
        "device": str(device),
        "timestamp": timestamp
    }
    logger.log_config(config)
    
    # Data transforms
    transform = transforms.Compose([
        transforms.Resize((356, 356)),
        transforms.RandomCrop((299, 299)),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])
    
    # Get data loaders
    print("\nInitializing Loaders...")
    train_loader, val_loader, test_loader, vocab = get_loaders(
        transform=transform,
        batch_size=batch_size,
        num_workers=2,
        shuffle=True,
        pin_memory=True
    )
    
    # Initialize model
    model = CNNtoRNN(
        embed_size=embed_size,
        hidden_size=hidden_size,
        vocab_size=len(vocab),
        num_layers=num_layers
    ).to(device)
    
    # Loss and optimizer
    criterion = nn.CrossEntropyLoss(ignore_index=vocab.stoi["<PAD>"])
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    
    # FIXED: Remove 'verbose' parameter from ReduceLROnPlateau
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='min', factor=0.5, patience=2
    )
    
    # Training loop
    print(f"\nStarting training for {num_epochs} epochs...")
    best_val_loss = float('inf')
    
    for epoch in range(num_epochs):
        start_time = time.time()
        
        # Training phase
        model.train()
        train_losses = []
        
        for batch_idx, (images, captions) in enumerate(tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}")):
            images = images.to(device)
            captions = captions.to(device)
            
            # Forward pass
            outputs = model(images, captions[:-1])
            loss = criterion(
                outputs.reshape(-1, outputs.shape[2]),
                captions[1:].reshape(-1)
            )
            
            # Backward pass
            optimizer.zero_grad()
            loss.backward()
            
            # Gradient clipping
            if grad_clip:
                torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
            
            optimizer.step()
            train_losses.append(loss.item())
            
            # Memory management for Kaggle
            if batch_idx % 50 == 0 and device.type == "cuda":
                torch.cuda.empty_cache()
        
        avg_train_loss = np.mean(train_losses)
        
        # Validation phase
        model.eval()
        val_losses = []
        
        with torch.no_grad():
            for images, captions in val_loader:
                images = images.to(device)
                captions = captions.to(device)
                
                outputs = model(images, captions[:-1])
                loss = criterion(
                    outputs.reshape(-1, outputs.shape[2]),
                    captions[1:].reshape(-1)
                )
                val_losses.append(loss.item())
        
        avg_val_loss = np.mean(val_losses)
        scheduler.step(avg_val_loss)
        current_lr = optimizer.param_groups[0]['lr']
        
        # Logging
        epoch_time = time.time() - start_time
        logger.log_epoch(epoch+1, avg_train_loss, avg_val_loss, current_lr)
        print(f"Time: {epoch_time:.1f}s")
        
        # Save best model
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            model_path = os.path.join(MODEL_SAVE_DIR, f"best_model_{exp_name}.pth")
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'val_loss': avg_val_loss,
                'vocab': vocab,
                'config': config
            }, model_path)
            print(f"‚úì Saved best model (val_loss: {avg_val_loss:.4f})")
        
        # Periodic save
        if (epoch + 1) % save_every == 0:
            model_path = os.path.join(MODEL_SAVE_DIR, f"model_epoch_{epoch+1}_{exp_name}.pth")
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'val_loss': avg_val_loss,
                'config': config
            }, model_path)
        
        # Early stopping check
        if current_lr < 1e-6:
            print("Learning rate too low, stopping early.")
            break
    
    print("\n" + "="*60)
    print("TRAINING COMPLETE")
    print(f"Best validation loss: {best_val_loss:.4f}")
    print(f"Models saved in: {MODEL_SAVE_DIR}")
    print("="*60)
    
    return model, vocab


# Run training with Kaggle-optimized parameters
if __name__ == "__main__":
    print("Testing the fixed training function...")
    
    # For quick testing on Kaggle, use fewer epochs
    try:
        model, vocab = train_model(
            num_epochs=3,  # Start with 3 epochs for testing
            batch_size=16,  # Smaller batch for testing
            learning_rate=3e-4,
            embed_size=256,  # Smaller for faster training
            hidden_size=256,
            num_layers=1,
            grad_clip=5.0,
            save_every=1
        )
        print("\n‚úÖ Training completed successfully!")
        
        # Show model info
        print(f"\nModel trained with vocabulary size: {len(vocab)}")
        print(f"Check /kaggle/working/ for saved models")
        
    except Exception as e:
        print(f"\n‚ùå Error during training: {e}")
        print("\nTrying simplified training from Cell 8 instead...")

Testing the fixed training function...

STARTING TRAINING ON KAGGLE
Using device: cuda
GPU: Tesla T4
GPU Memory: 15.83 GB
[Experiment] Initialized: /kaggle/working/experiments/2026-01-19_14-58-34_ResNet101_LSTM_v1
[Experiment] Config saved.

Initializing Loaders...
Loading data from: /kaggle/input/flickr8kimagescaptions/flickr8k/captions.txt
Loaded 40455 image-caption pairs
Split: Train=32365, Val=4045, Test=4045
Vocabulary size: 2732

Starting training for 3 epochs...


Epoch 1/3: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2023/2023 [17:37<00:00,  1.91it/s]


Epoch 001 | Train Loss: 4.2247 | Val Loss: 3.6429 | LR: 0.000300
Time: 1099.4s
‚úì Saved best model (val_loss: 3.6429)


Epoch 2/3: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2023/2023 [17:28<00:00,  1.93it/s]


Epoch 002 | Train Loss: 3.6447 | Val Loss: 3.4178 | LR: 0.000300
Time: 1090.6s
‚úì Saved best model (val_loss: 3.4178)


Epoch 3/3: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2023/2023 [17:26<00:00,  1.93it/s]


Epoch 003 | Train Loss: 3.4721 | Val Loss: 3.2941 | LR: 0.000300
Time: 1088.5s
‚úì Saved best model (val_loss: 3.2941)

TRAINING COMPLETE
Best validation loss: 3.2941
Models saved in: /kaggle/working/models

‚úÖ Training completed successfully!

Model trained with vocabulary size: 2732
Check /kaggle/working/ for saved models


In [24]:
# ============================================================================
# CELL 6: INFERENCE & EVALUATION (Kaggle Version) - COMPLETE
# ============================================================================

from collections import Counter
import numpy as np
from typing import List, Dict
import json

def load_trained_model(model_path, device="cuda"):
    """Load a trained model for inference"""
    checkpoint = torch.load(model_path, map_location=device)
    
    # Get config
    config = checkpoint['config']
    vocab = checkpoint['vocab']
    
    # Create model
    model = CNNtoRNN(
        embed_size=config['embed_size'],
        hidden_size=config['hidden_size'],
        vocab_size=len(vocab),
        num_layers=config['num_layers']
    ).to(device)
    
    # Load weights
    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()
    
    print(f"‚úì Loaded model from epoch {checkpoint['epoch']}")
    print(f"  Validation loss: {checkpoint['val_loss']:.4f}")
    print(f"  Vocabulary size: {len(vocab)}")
    
    return model, vocab


def generate_captions_for_test_set(model, vocab, test_loader, device="cuda", num_samples=5):
    """Generate captions for test images"""
    model.eval()
    results = []
    
    with torch.no_grad():
        for idx, (images, captions) in enumerate(test_loader):
            if idx >= num_samples:
                break
                
            images = images.to(device)
            
            # Get true captions
            true_captions = []
            for i in range(captions.size(1)):
                caption_tokens = []
                for token in captions[:, i]:
                    word = vocab.itos[token.item()]
                    if word == "<EOS>":
                        break
                    if word not in ["<SOS>", "<PAD>"]:
                        caption_tokens.append(word)
                if caption_tokens:  # Only add non-empty captions
                    true_captions.append(" ".join(caption_tokens))
            
            # Generate caption for first image in batch
            generated = model.caption_image(images[0].unsqueeze(0), vocab, device=device)
            
            results.append({
                'image_idx': idx,
                'true_captions': true_captions[:3],  # First 3 references
                'generated': generated
            })
            
            print(f"\nüì∏ Sample {idx+1}:")
            print(f"  ‚úÖ True: {true_captions[0] if true_captions else 'No caption'}")
            print(f"  ü§ñ Generated: {generated}")
    
    return results


# Complete Metrics class
class CaptionMetrics:
    """Evaluation metrics for image captioning"""
    
    def __init__(self):
        self.scores = {}
    
    def compute_bleu(self, reference: list, hypothesis: str, n: int = 4) -> float:
        """Compute BLEU-N score"""
        if not reference or not hypothesis:
            return 0.0
            
        ref_tokens = [ref.lower().split() for ref in reference]
        hyp_tokens = hypothesis.lower().split()
        
        if len(hyp_tokens) == 0:
            return 0.0
        
        # Brevity penalty
        ref_lengths = [len(ref) for ref in ref_tokens]
        closest_ref_len = min(ref_lengths, key=lambda x: abs(x - len(hyp_tokens)))
        
        if len(hyp_tokens) < closest_ref_len:
            bp = np.exp(1 - closest_ref_len / len(hyp_tokens))
        else:
            bp = 1.0
        
        # N-gram precisions
        precisions = []
        for i in range(1, n + 1):
            hyp_ngrams = self._get_ngrams(hyp_tokens, i)
            if not hyp_ngrams:
                precisions.append(0)
                continue
                
            max_ref_counts = Counter()
            
            for ref in ref_tokens:
                ref_ngrams = self._get_ngrams(ref, i)
                for ngram in ref_ngrams:
                    max_ref_counts[ngram] = max(max_ref_counts[ngram], ref_ngrams[ngram])
            
            clipped_counts = {
                ngram: min(count, max_ref_counts.get(ngram, 0))
                for ngram, count in hyp_ngrams.items()
            }
            
            numerator = sum(clipped_counts.values())
            denominator = max(1, len(hyp_tokens) - i + 1)
            precisions.append(numerator / denominator if numerator > 0 else 0)
        
        # Geometric mean
        if min(precisions) > 0:
            geo_mean = np.exp(sum(np.log(p) for p in precisions) / len(precisions))
        else:
            geo_mean = 0
        
        return bp * geo_mean
    
    def _get_ngrams(self, tokens, n):
        ngrams = []
        for i in range(len(tokens) - n + 1):
            ngrams.append(' '.join(tokens[i:i+n]))
        return Counter(ngrams)
    
    def compute_rouge_l(self, reference: list, hypothesis: str) -> float:
        """Compute ROUGE-L score"""
        if not reference or not hypothesis:
            return 0.0
            
        hyp_tokens = hypothesis.lower().split()
        scores = []
        
        for ref in reference:
            ref_tokens = ref.lower().split()
            lcs_length = self._lcs_length(ref_tokens, hyp_tokens)
            
            if len(ref_tokens) == 0 or len(hyp_tokens) == 0:
                scores.append(0.0)
                continue
            
            precision = lcs_length / len(hyp_tokens)
            recall = lcs_length / len(ref_tokens)
            
            if precision + recall > 0:
                f1 = 2 * precision * recall / (precision + recall)
            else:
                f1 = 0.0
            
            scores.append(f1)
        
        return max(scores) if scores else 0.0
    
    def _lcs_length(self, x, y):
        m, n = len(x), len(y)
        dp = [[0] * (n + 1) for _ in range(m + 1)]
        
        for i in range(1, m + 1):
            for j in range(1, n + 1):
                if x[i-1] == y[j-1]:
                    dp[i][j] = dp[i-1][j-1] + 1
                else:
                    dp[i][j] = max(dp[i-1][j], dp[i][j-1])
        
        return dp[m][n]
    
    def compute_meteor(self, reference: list, hypothesis: str) -> float:
        """Compute METEOR score"""
        if not reference or not hypothesis:
            return 0.0
            
        hyp_tokens = set(hypothesis.lower().split())
        scores = []
        
        for ref in reference:
            ref_tokens = set(ref.lower().split())
            
            if len(hyp_tokens) == 0 and len(ref_tokens) == 0:
                scores.append(1.0)
                continue
            elif len(hyp_tokens) == 0 or len(ref_tokens) == 0:
                scores.append(0.0)
                continue
            
            matches = len(hyp_tokens & ref_tokens)
            precision = matches / len(hyp_tokens) if hyp_tokens else 0
            recall = matches / len(ref_tokens) if ref_tokens else 0
            
            if precision + recall > 0:
                f_mean = (precision * recall) / (0.9 * precision + 0.1 * recall)
            else:
                f_mean = 0.0
            
            scores.append(f_mean)
        
        return max(scores) if scores else 0.0
    
    def evaluate_batch(self, references, hypotheses):
        """Evaluate a batch of captions"""
        bleu_scores = {1: [], 2: [], 3: [], 4: []}
        rouge_scores = []
        meteor_scores = []
        
        for img_id in hypotheses:
            if img_id not in references:
                continue
            
            ref = references[img_id]
            hyp = hypotheses[img_id]
            
            for n in range(1, 5):
                bleu_scores[n].append(self.compute_bleu(ref, hyp, n=n))
            
            rouge_scores.append(self.compute_rouge_l(ref, hyp))
            meteor_scores.append(self.compute_meteor(ref, hyp))
        
        results = {
            'BLEU-1': np.mean(bleu_scores[1]) if bleu_scores[1] else 0.0,
            'BLEU-2': np.mean(bleu_scores[2]) if bleu_scores[2] else 0.0,
            'BLEU-3': np.mean(bleu_scores[3]) if bleu_scores[3] else 0.0,
            'BLEU-4': np.mean(bleu_scores[4]) if bleu_scores[4] else 0.0,
            'ROUGE-L': np.mean(rouge_scores) if rouge_scores else 0.0,
            'METEOR': np.mean(meteor_scores) if meteor_scores else 0.0,
        }
        
        return results
    
    def print_evaluation_report(self, results: Dict[str, float]):
        """Print formatted evaluation report"""
        print("\n" + "="*50)
        print("CAPTION GENERATION EVALUATION RESULTS")
        print("="*50)
        for metric, score in results.items():
            print(f"{metric:15s}: {score:.4f}")
        print("="*50 + "\n")
    
    def save_results(self, results: Dict[str, float], filepath: str):
        """Save evaluation results to JSON file"""
        with open(filepath, 'w') as f:
            json.dump(results, f, indent=2)
        print(f"‚úì Results saved to {filepath}")


def evaluate_model_comprehensive(model, vocab, test_loader, device="cuda", num_batches=5):
    """Comprehensive evaluation of the model"""
    print("\n" + "="*60)
    print("COMPREHENSIVE MODEL EVALUATION")
    print("="*60)
    
    evaluator = CaptionMetrics()
    
    references = {}
    hypotheses = {}
    
    model.eval()
    with torch.no_grad():
        for batch_idx, (images, captions) in enumerate(test_loader):
            if batch_idx >= num_batches:
                break
                
            images = images.to(device)
            
            # Process each image in batch
            for i in range(min(2, images.size(0))):  # Max 2 images per batch
                img_id = f"batch{batch_idx}_img{i}"
                
                # Get true captions
                true_captions = []
                for j in range(captions.size(1)):
                    caption_tokens = []
                    for token in captions[:, j]:
                        word = vocab.itos[token.item()]
                        if word == "<EOS>":
                            break
                        if word not in ["<SOS>", "<PAD>"]:
                            caption_tokens.append(word)
                    if caption_tokens:
                        true_captions.append(" ".join(caption_tokens))
                
                if true_captions:
                    references[img_id] = true_captions
                    
                    # Generate caption
                    generated = model.caption_image(
                        images[i].unsqueeze(0), 
                        vocab, 
                        device=device,
                        max_length=20
                    )
                    hypotheses[img_id] = generated
    
    # Calculate metrics
    if references and hypotheses:
        results = evaluator.evaluate_batch(references, hypotheses)
        
        # Print report
        evaluator.print_evaluation_report(results)
        
        # Save detailed results
        detailed_results = {
            "metrics": results,
            "sample_count": len(hypotheses),
            "samples": []
        }
        
        # Add sample predictions
        for img_id, hyp in list(hypotheses.items())[:3]:
            detailed_results["samples"].append({
                "image_id": img_id,
                "generated": hyp,
                "references": references.get(img_id, [])
            })
        
        results_file = "/kaggle/working/evaluation_results_detailed.json"
        with open(results_file, 'w') as f:
            json.dump(detailed_results, f, indent=2)
        
        print(f"‚úì Detailed results saved to {results_file}")
        
        return results
    else:
        print("‚ùå No data to evaluate")
        return None


def run_complete_evaluation_pipeline():
    """Run complete evaluation pipeline"""
    print("Running complete evaluation pipeline...")
    
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Device: {device}")
    
    # Check for trained models
    model_files = [f for f in os.listdir("/kaggle/working") 
                  if f.endswith(".pth") and "best_model" in f]
    
    if not model_files:
        model_files = [f for f in os.listdir("/kaggle/working") 
                      if f.endswith(".pth")]
    
    if model_files:
        # Load the best or latest model
        model_path = f"/kaggle/working/{sorted(model_files)[-1]}"
        print(f"Loading model: {os.path.basename(model_path)}")
        
        model, vocab = load_trained_model(model_path, device)
        
        # Get test data loader
        transform = transforms.Compose([
            transforms.Resize((299, 299)),
            transforms.ToTensor(),
            transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
        ])
        
        _, _, test_loader, _ = get_loaders(
            transform=transform,
            batch_size=8,
            num_workers=0,
            shuffle=False
        )
        
        # 1. Generate sample captions
        print("\n" + "="*60)
        print("GENERATING SAMPLE CAPTIONS")
        print("="*60)
        samples = generate_captions_for_test_set(
            model, vocab, test_loader, device, num_samples=3
        )
        
        # 2. Comprehensive evaluation
        print("\n" + "="*60)
        print("RUNNING COMPREHENSIVE EVALUATION")
        print("="*60)
        results = evaluate_model_comprehensive(
            model, vocab, test_loader, device, num_batches=3
        )
        
        # 3. Save final summary
        if results:
            summary = {
                "model": os.path.basename(model_path),
                "evaluation_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                "metrics": results,
                "sample_predictions": samples
            }
            
            summary_file = "/kaggle/working/final_evaluation_summary.json"
            with open(summary_file, 'w') as f:
                json.dump(summary, f, indent=2)
            
            print(f"\n‚úì Final evaluation summary saved to {summary_file}")
        
        return model, vocab, results
    else:
        print("‚ùå No trained models found. Please train a model first.")
        return None, None, None


# Example usage demonstration
if __name__ == "__main__":
    print("Testing evaluation pipeline...")
    
    # Create a simple test if no trained model exists
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    # Test metrics class
    print("\nTesting metrics class with sample data...")
    evaluator = CaptionMetrics()
    
    # Sample data
    test_references = {
        "img1": ["a dog playing with a ball", "a brown dog playing with a red ball"],
        "img2": ["a person riding a bicycle", "someone cycling on the road"]
    }
    
    test_hypotheses = {
        "img1": "a dog playing with ball",
        "img2": "a person riding a bike"
    }
    
    test_results = evaluator.evaluate_batch(test_references, test_hypotheses)
    evaluator.print_evaluation_report(test_results)
    
    print("‚úì Metrics class test successful!")
    print("\nTo run full evaluation on your trained model:")
    print("1. Train a model using Cell 5 or Cell 8")
    print("2. Run: model, vocab, results = run_complete_evaluation_pipeline()")

Testing evaluation pipeline...

Testing metrics class with sample data...

CAPTION GENERATION EVALUATION RESULTS
BLEU-1         : 0.8094
BLEU-2         : 0.7418
BLEU-3         : 0.6933
BLEU-4         : 0.6238
ROUGE-L        : 0.8545
METEOR         : 0.8750

‚úì Metrics class test successful!

To run full evaluation on your trained model:
1. Train a model using Cell 5 or Cell 8
2. Run: model, vocab, results = run_complete_evaluation_pipeline()


In [25]:
# ============================================================================
# CELL 7: UTILITIES & HELPER FUNCTIONS
# ============================================================================

def save_checkpoint(model, optimizer, epoch, val_loss, vocab, config, filename):
    """Save model checkpoint"""
    checkpoint = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'val_loss': val_loss,
        'vocab': vocab,
        'config': config
    }
    torch.save(checkpoint, filename)
    print(f"‚úì Checkpoint saved: {filename}")


def load_checkpoint(filename, device="cuda"):
    """Load model checkpoint"""
    checkpoint = torch.load(filename, map_location=device)
    return checkpoint


def cleanup_memory():
    """Clean up GPU memory (important for Kaggle)"""
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    import gc
    gc.collect()


def get_model_summary(model):
    """Print model summary"""
    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    
    print("="*60)
    print("MODEL SUMMARY")
    print("="*60)
    print(f"Total parameters: {total_params:,}")
    print(f"Trainable parameters: {trainable_params:,}")
    print(f"Model size: {total_params * 4 / 1024**2:.2f} MB (float32)")
    print("="*60)
    
    return total_params, trainable_params


# Test the training with fixed scheduler
if __name__ == "__main__":
    # Clean up any existing memory
    cleanup_memory()
    
    # Run training with fixed scheduler
    print("Testing training with fixed scheduler...")
    
    # Quick test with 1 epoch first
    try:
        model, vocab = train_model(
            num_epochs=1,  # Just 1 epoch for testing
            batch_size=16,  # Smaller batch for testing
            learning_rate=3e-4,
            embed_size=256,
            hidden_size=256,
            num_layers=1,
            grad_clip=5.0,
            save_every=1
        )
        print("\n‚úì Training test successful!")
        
        # Show model summary
        get_model_summary(model)
        
    except Exception as e:
        print(f"\n‚úó Error during training: {e}")
        print("\nTrying alternative approach...")
        
        # Alternative: Train without scheduler first
        print("\nStarting training without scheduler...")

Testing training with fixed scheduler...

STARTING TRAINING ON KAGGLE
Using device: cuda
GPU: Tesla T4
GPU Memory: 15.83 GB
[Experiment] Initialized: /kaggle/working/experiments/2026-01-19_15-55-11_ResNet101_LSTM_v1
[Experiment] Config saved.

Initializing Loaders...
Loading data from: /kaggle/input/flickr8kimagescaptions/flickr8k/captions.txt
Loaded 40455 image-caption pairs
Split: Train=32365, Val=4045, Test=4045
Vocabulary size: 2732

Starting training for 1 epochs...


Epoch 1/1: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2023/2023 [17:32<00:00,  1.92it/s]


Epoch 001 | Train Loss: 4.1951 | Val Loss: 3.6272 | LR: 0.000300
Time: 1094.1s
‚úì Saved best model (val_loss: 3.6272)

TRAINING COMPLETE
Best validation loss: 3.6272
Models saved in: /kaggle/working/models

‚úì Training test successful!
MODEL SUMMARY
Total parameters: 45,184,432
Trainable parameters: 45,184,432
Model size: 172.36 MB (float32)


In [26]:
# ============================================================================
# CELL 8: ALTERNATIVE TRAINING FUNCTION (FIXED)
# ============================================================================

def train_simple_model(num_epochs=3):
    """Simplified training function for Kaggle"""
    print("\n" + "="*60)
    print("SIMPLE TRAINING (No Scheduler)")
    print("="*60)
    
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Device: {device}")
    
    # Data transforms
    transform = transforms.Compose([
        transforms.Resize((299, 299)),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])
    
    # Get data loaders (smaller for testing)
    print("\nLoading data...")
    train_loader, val_loader, test_loader, vocab = get_loaders(
        transform=transform,
        batch_size=16,
        num_workers=2,
        shuffle=True,
        pin_memory=True,
        test_size=0.05,  # Smaller test set
        val_size=0.05    # Smaller validation set
    )
    
    print(f"Vocabulary size: {len(vocab)}")
    
    # Create simple model
    model = CNNtoRNN(
        embed_size=256,
        hidden_size=256,
        vocab_size=len(vocab),
        num_layers=1
    ).to(device)
    
    # Loss and optimizer (no scheduler)
    criterion = nn.CrossEntropyLoss(ignore_index=vocab.stoi["<PAD>"])
    optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)
    
    print("\nStarting training...")
    
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        batch_count = 0
        
        # Training loop
        for images, captions in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}"):
            images = images.to(device)
            captions = captions.to(device)
            
            # Forward
            outputs = model(images, captions[:-1])
            loss = criterion(
                outputs.reshape(-1, outputs.shape[2]),
                captions[1:].reshape(-1)
            )
            
            # Backward
            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 5.0)
            optimizer.step()
            
            total_loss += loss.item()
            batch_count += 1
            
            # Memory cleanup every 10 batches
            if batch_count % 10 == 0 and device.type == "cuda":
                torch.cuda.empty_cache()
        
        avg_loss = total_loss / batch_count
        print(f"Epoch {epoch+1} - Average Loss: {avg_loss:.4f}")
        
        # Save checkpoint
        if (epoch + 1) % 2 == 0:
            checkpoint_path = f"/kaggle/working/model_epoch_{epoch+1}.pth"
            save_checkpoint(
                model, optimizer, epoch, avg_loss, vocab,
                {"epoch": epoch, "loss": avg_loss},
                checkpoint_path
            )
    
    print("\n" + "="*60)
    print("TRAINING COMPLETE")
    print("="*60)
    
    return model, vocab


# Run simple training
if __name__ == "__main__":
    print("Running simplified training...")
    model, vocab = train_simple_model(num_epochs=2)
    
    # Test inference
    print("\nTesting inference on sample images...")
    
    # Get a sample batch
    transform = transforms.Compose([
        transforms.Resize((299, 299)),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])
    
    train_loader, val_loader, test_loader, _ = get_loaders(
        transform=transform,
        batch_size=4,
        num_workers=0,
        shuffle=False
    )
    
    # Generate caption for first image
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.eval()
    
    with torch.no_grad():
        for images, captions in test_loader:
            images = images.to(device)
            
            # Generate caption for first image
            caption = model.caption_image(images[0].unsqueeze(0), vocab, device=device)
            
            # Get true caption
            true_caption_tokens = []
            for token in captions[:, 0]:
                word = vocab.itos[token.item()]
                if word == "<EOS>":
                    break
                if word not in ["<SOS>", "<PAD>"]:
                    true_caption_tokens.append(word)
            true_caption = " ".join(true_caption_tokens)
            
            print(f"\nGenerated Caption: {caption}")
            print(f"True Caption: {true_caption}")
            break
    
    print("\n‚úì Inference test successful!")

Running simplified training...

SIMPLE TRAINING (No Scheduler)
Device: cuda

Loading data...
Loading data from: /kaggle/input/flickr8kimagescaptions/flickr8k/captions.txt
Loaded 40455 image-caption pairs
Split: Train=36415, Val=2020, Test=2020
Vocabulary size: 2882
Vocabulary size: 2882

Starting training...


Epoch 1/2: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2276/2276 [19:55<00:00,  1.90it/s]


Epoch 1 - Average Loss: 4.1536


Epoch 2/2: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2276/2276 [19:52<00:00,  1.91it/s]


Epoch 2 - Average Loss: 3.5728
‚úì Checkpoint saved: /kaggle/working/model_epoch_2.pth

TRAINING COMPLETE

Testing inference on sample images...
Loading data from: /kaggle/input/flickr8kimagescaptions/flickr8k/captions.txt
Loaded 40455 image-caption pairs
Split: Train=32365, Val=4045, Test=4045
Vocabulary size: 2732

Generated Caption: a man in a blue shirt is standing on a <UNK> .
True Caption: a ice lined <UNK> people baby a throwing booths .

‚úì Inference test successful!


In [28]:
# ============================================================================
# CELL 9: EVALUATION & METRICS COMPLETE - FIXED
# ============================================================================

def load_checkpoint(filename, device="cuda"):
    """Load model checkpoint - FIXED for PyTorch 2.6"""
    try:
        # Method 1: Try with weights_only=False (less secure but works for trusted checkpoints)
        checkpoint = torch.load(filename, map_location=device, weights_only=False)
    except Exception as e:
        print(f"First load attempt failed: {e}")
        try:
            # Method 2: Add safe globals for Vocabulary class
            import torch.serialization
            torch.serialization.add_safe_globals([Vocabulary])
            checkpoint = torch.load(filename, map_location=device, weights_only=True)
        except Exception as e2:
            print(f"Second load attempt failed: {e2}")
            # Method 3: Last resort - load without weights_only (backward compatible)
            checkpoint = torch.load(filename, map_location=device)
    
    return checkpoint


def evaluate_model(model, test_loader, vocab, device="cuda", num_samples=10):
    """Evaluate model on test set"""
    print("\n" + "="*60)
    print("MODEL EVALUATION")
    print("="*60)
    
    model.eval()
    evaluator = CaptionMetrics()
    
    references = {}
    hypotheses = {}
    
    with torch.no_grad():
        for batch_idx, (images, captions) in enumerate(tqdm(test_loader, desc="Evaluating")):
            if batch_idx >= 5:  # Limit to 5 batches for speed
                break
                
            images = images.to(device)
            
            # Process each image in batch
            for i in range(images.size(0)):
                img_id = f"img_{batch_idx}_{i}"
                
                # Get true captions (all references for this image)
                true_captions = []
                for j in range(captions.size(1)):
                    caption_tokens = []
                    for token in captions[:, j]:
                        word = vocab.itos[token.item()]
                        if word == "<EOS>":
                            break
                        if word not in ["<SOS>", "<PAD>"]:
                            caption_tokens.append(word)
                    true_captions.append(" ".join(caption_tokens))
                
                references[img_id] = true_captions
                
                # Generate caption
                generated = model.caption_image(
                    images[i].unsqueeze(0), 
                    vocab, 
                    device=device,
                    max_length=20
                )
                hypotheses[img_id] = generated
    
    # Calculate metrics
    if references and hypotheses:
        results = evaluator.evaluate_batch(references, hypotheses)
        
        print("\nEVALUATION RESULTS:")
        print("="*40)
        for metric, score in results.items():
            print(f"{metric:15s}: {score:.4f}")
        print("="*40)
        
        # Save results
        results_file = "/kaggle/working/evaluation_results.json"
        import json
        with open(results_file, 'w') as f:
            json.dump({
                "metrics": results,
                "sample_count": len(hypotheses),
                "references_sample": {k: v[:2] for k, v in list(references.items())[:2]},
                "hypotheses_sample": {k: v for k, v in list(hypotheses.items())[:2]}
            }, f, indent=2)
        
        print(f"\n‚úì Results saved to: {results_file}")
        
        return results
    else:
        print("No data to evaluate")
        return None


def interactive_inference(model, vocab, device="cuda"):
    """Interactive inference on test images"""
    print("\n" + "="*60)
    print("INTERACTIVE INFERENCE")
    print("="*60)
    
    # Get test loader
    transform = transforms.Compose([
        transforms.Resize((299, 299)),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])
    
    _, _, test_loader, _ = get_loaders(
        transform=transform,
        batch_size=4,
        num_workers=0,
        shuffle=True
    )
    
    model.eval()
    
    # Generate captions for a few images
    print("\nGenerating captions for sample images...\n")
    
    with torch.no_grad():
        for batch_idx, (images, captions) in enumerate(test_loader):
            if batch_idx >= 3:  # Show 3 batches
                break
                
            images = images.to(device)
            
            print(f"\n{'='*40}")
            print(f"BATCH {batch_idx + 1}")
            print('='*40)
            
            for i in range(min(2, images.size(0))):  # Show 2 images per batch
                # Generate caption
                generated = model.caption_image(
                    images[i].unsqueeze(0), 
                    vocab, 
                    device=device
                )
                
                # Get true captions
                true_captions = []
                for j in range(3):  # Show 3 true captions
                    caption_tokens = []
                    for token in captions[:, j]:
                        word = vocab.itos[token.item()]
                        if word == "<EOS>":
                            break
                        if word not in ["<SOS>", "<PAD>"]:
                            caption_tokens.append(word)
                    if caption_tokens:
                        true_captions.append(" ".join(caption_tokens))
                
                print(f"\nImage {i+1}:")
                print(f"  Generated: {generated}")
                print(f"  True captions:")
                for idx, true_cap in enumerate(true_captions[:2]):
                    print(f"    {idx+1}. {true_cap}")
                print()


# FIXED: Safe model loading function
def load_trained_model_safe(model_path, device="cuda"):
    """Load trained model safely for PyTorch 2.6"""
    print(f"Loading model: {os.path.basename(model_path)}")
    
    # Load checkpoint with multiple fallback methods
    checkpoint = load_checkpoint(model_path, device)
    
    # Recreate vocabulary from saved dictionaries
    if 'vocab' in checkpoint:
        vocab = checkpoint['vocab']
    else:
        # Fallback: recreate vocabulary from saved dicts
        vocab = Vocabulary(freq_threshold=5)
        vocab.stoi = checkpoint.get('vocab_stoi', {})
        vocab.itos = {int(k): v for k, v in checkpoint.get('vocab_itos', {}).items()}
    
    # Get config with defaults
    config = checkpoint.get('config', {
        'embed_size': 256,
        'hidden_size': 256,
        'num_layers': 1
    })
    
    # Create model
    model = CNNtoRNN(
        embed_size=config.get('embed_size', 256),
        hidden_size=config.get('hidden_size', 256),
        vocab_size=len(vocab),
        num_layers=config.get('num_layers', 1)
    ).to(device)
    
    # Load weights
    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()
    
    print(f"‚úì Model loaded from epoch {checkpoint.get('epoch', 'unknown')}")
    print(f"  Validation loss: {checkpoint.get('val_loss', 'unknown'):.4f}")
    print(f"  Vocabulary size: {len(vocab)}")
    
    return model, vocab


# Main execution - FIXED
if __name__ == "__main__":
    print("Starting evaluation pipeline...")
    
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")
    
    # Find trained models
    model_files = [f for f in os.listdir("/kaggle/working") 
                  if f.endswith(".pth") and not f.startswith(".")]
    
    if model_files:
        latest_model = sorted(model_files)[-1]
        model_path = f"/kaggle/working/{latest_model}"
        
        try:
            # Load model using safe method
            model, vocab = load_trained_model_safe(model_path, device)
            
            # Prepare test loader
            transform = transforms.Compose([
                transforms.Resize((299, 299)),
                transforms.ToTensor(),
                transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
            ])
            
            _, _, test_loader, _ = get_loaders(
                transform=transform,
                batch_size=8,
                num_workers=0,
                shuffle=False
            )
            
            # Quick test first
            print("\n" + "="*60)
            print("QUICK MODEL TEST")
            print("="*60)
            
            # Test on one batch
            for images, captions in test_loader:
                images = images.to(device)
                caption = model.caption_image(images[0].unsqueeze(0), vocab, device=device)
                
                # Get true caption
                true_words = []
                for token in captions[:, 0]:
                    word = vocab.itos[token.item()]
                    if word == "<EOS>":
                        break
                    if word not in ["<SOS>", "<PAD>"]:
                        true_words.append(word)
                
                print(f"\n‚úÖ Model works!")
                print(f"   Generated: {caption}")
                print(f"   True: {' '.join(true_words)}")
                break
            
            # Ask user if they want full evaluation
            print("\n" + "="*60)
            response = input("Run full evaluation? (y/n): ").lower().strip()
            
            if response == 'y':
                # Run evaluation
                results = evaluate_model(model, test_loader, vocab, device, num_samples=10)
                
                # Interactive inference
                interactive_inference(model, vocab, device)
            else:
                print("Skipping full evaluation. Model is loaded and ready for use.")
                
        except Exception as e:
            print(f"‚ùå Error loading model: {e}")
            print("\nTrying alternative loading method...")
            
            # Try direct loading
            try:
                checkpoint = torch.load(model_path, map_location=device, weights_only=False)
                print("‚úì Loaded with weights_only=False")
                
                # Show what's in checkpoint
                print(f"Checkpoint keys: {list(checkpoint.keys())}")
                
            except Exception as e2:
                print(f"Still failing: {e2}")
                print("\nPlease try the Quick Evaluation Workaround cell instead.")
                
    else:
        print("‚ùå No trained models found.")
        print("Check /kaggle/working/ for .pth files")
        print("If you just trained, list files with: !ls -la /kaggle/working/")

Starting evaluation pipeline...
Using device: cuda
Loading model: model_epoch_2.pth
First load attempt failed: cannot access local variable 'torch' where it is not associated with a value
‚úì Model loaded from epoch 1
  Validation loss: 3.5728
  Vocabulary size: 2882
Loading data from: /kaggle/input/flickr8kimagescaptions/flickr8k/captions.txt
Loaded 40455 image-caption pairs
Split: Train=32365, Val=4045, Test=4045
Vocabulary size: 2732

QUICK MODEL TEST

‚úÖ Model works!
   Generated: a man in a blue shirt is standing on a <UNK> .
   True: a ice lined <UNK> people baby a throwing booths .



Run full evaluation? (y/n):  y



MODEL EVALUATION


Evaluating:   1%|          | 5/506 [00:01<02:15,  3.71it/s]



EVALUATION RESULTS:
BLEU-1         : 0.4394
BLEU-2         : 0.1137
BLEU-3         : 0.0000
BLEU-4         : 0.0000
ROUGE-L        : 0.3238
METEOR         : 0.3163

‚úì Results saved to: /kaggle/working/evaluation_results.json

INTERACTIVE INFERENCE
Loading data from: /kaggle/input/flickr8kimagescaptions/flickr8k/captions.txt
Loaded 40455 image-caption pairs
Split: Train=32365, Val=4045, Test=4045
Vocabulary size: 2732

Generating captions for sample images...


BATCH 1

Image 1:
  Generated: a man in a blue shirt is standing on a <UNK> .
  True captions:
    1. a ice lined <UNK> people baby a throwing booths .
    2. a ice group of and display hat lined girl white little a booths .


Image 2:
  Generated: a man in a blue shirt is standing on a <UNK> .
  True captions:
    1. a ice lined <UNK> people baby a throwing booths .
    2. a ice group of and display hat lined girl white little a booths .


BATCH 2

Image 1:
  Generated: a man in a blue shirt is standing on a <UNK> .
  True ca

In [29]:
# ============================================================================
# CELL 10: FINAL SUMMARY & EXPORT
# ============================================================================

def create_final_summary():
    """Create final summary of the experiment"""
    print("\n" + "="*60)
    print("EXPERIMENT SUMMARY")
    print("="*60)
    
    # Check what we have
    print("\nüìÅ Output Directory Contents:")
    !ls -la /kaggle/working/
    
    print("\nüìä Experiment Logs:")
    if os.path.exists("/kaggle/working/experiments"):
        experiments = os.listdir("/kaggle/working/experiments")
        for exp in experiments[:3]:  # Show first 3
            print(f"  - {exp}")
    
    print("\nü§ñ Trained Models:")
    model_files = [f for f in os.listdir("/kaggle/working") if f.endswith(".pth")]
    for model_file in model_files[:5]:  # Show first 5
        print(f"  - {model_file}")
    
    # Dataset info
    print("\nüì¶ Dataset Information:")
    if os.path.exists(ANNOTATION_FILE):
        with open(ANNOTATION_FILE, 'r') as f:
            lines = f.readlines()
            print(f"  Captions file: {len(lines)} lines")
    
    if os.path.exists(IMAGE_DIR):
        images = [f for f in os.listdir(IMAGE_DIR) if f.lower().endswith(('.jpg', '.jpeg', '.png'))]
        print(f"  Images directory: {len(images)} images")
    
    # GPU info
    if torch.cuda.is_available():
        print(f"\n‚ö° GPU: {torch.cuda.get_device_name(0)}")
        print(f"  Memory used: {torch.cuda.memory_allocated() / 1e9:.2f} GB")
        print(f"  Memory cached: {torch.cuda.memory_reserved() / 1e9:.2f} GB")
    
    print("\n" + "="*60)
    print("NEXT STEPS:")
    print("="*60)
    print("1. To train longer: Increase num_epochs in train_simple_model()")
    print("2. To save outputs: Download files from /kaggle/working/")
    print("3. To improve: Adjust batch_size, learning_rate, model size")
    print("4. To evaluate: Run the evaluation cell after training")
    print("="*60)


# Create summary
create_final_summary()

print("\n‚úÖ Kaggle Notebook Setup Complete!")
print("\nTo run the full pipeline:")
print("1. Run Cell 1-4 for setup")
print("2. Run Cell 8 for training")
print("3. Run Cell 9 for evaluation")
print("4. Run Cell 10 for summary")


EXPERIMENT SUMMARY

üìÅ Output Directory Contents:
total 531412
drwxr-xr-x 6 root root      4096 Jan 19 17:21 .
drwxr-xr-x 5 root root      4096 Jan 19 14:31 ..
drwxr-xr-x 5 root root      4096 Jan 19 14:44 Deep_Learning_final
-rw-r--r-- 1 root root       721 Jan 19 17:21 evaluation_results.json
drwxr-xr-x 6 root root      4096 Jan 19 15:55 experiments
-rw-r--r-- 1 root root 544130280 Jan 19 16:54 model_epoch_2.pth
drwxr-xr-x 2 root root      4096 Jan 19 16:13 models
drwxr-xr-x 2 root root      4096 Jan 19 14:31 .virtual_documents

üìä Experiment Logs:
  - 2026-01-19_14-53-10_ResNet101_LSTM_v1
  - 2026-01-19_15-55-11_ResNet101_LSTM_v1
  - 2026-01-19_14-57-03_ResNet101_LSTM_v1

ü§ñ Trained Models:
  - model_epoch_2.pth

üì¶ Dataset Information:
  Captions file: 40456 lines
  Images directory: 8091 images

‚ö° GPU: Tesla T4
  Memory used: 0.40 GB
  Memory cached: 5.73 GB

NEXT STEPS:
1. To train longer: Increase num_epochs in train_simple_model()
2. To save outputs: Download files f