In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
## ============================================================
# Task 4 – Game Title Detection (Image Classification)
# Production-Grade Pipeline with Swin Transformer
# ============================================================

# Install dependencies
!pip install timm --quiet

import os
import random
import time
import warnings
from pathlib import Path
from typing import Dict, List, Tuple, Optional

import numpy as np
import pandas as pd
from PIL import Image

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler

from sklearn.model_selection import StratifiedKFold
from sklearn.utils.class_weight import compute_class_weight

import timm
from torchvision import transforms

warnings.filterwarnings('ignore')

# ============================================================
# Configuration
# ============================================================

class CFG:
    """Single place holding every path, model choice and hyperparameter of the pipeline."""

    # --- Dataset layout (CSV/dir names are joined onto DATA_ROOT where used) ---
    DATA_ROOT = "/kaggle/input/cpe342-karena/public_dataset/task4"
    TRAIN_CSV, VAL_CSV, TEST_CSV = "train.csv", "val.csv", "test_refined.csv"
    TRAIN_DIR, VAL_DIR, TEST_DIR = "train", "val", "test"

    # --- Backbone (name is handed to timm.create_model) ---
    # Other options: "vit_base_patch16_224", "vgg16", "efficientnet_b0"
    MODEL_NAME = "swin_small_patch4_window7_224"

    # --- Optimisation ---
    IMG_SIZE = 224
    BATCH_SIZE = 64
    LR = 1e-4
    WEIGHT_DECAY = 1e-4
    NUM_EPOCHS = 5
    N_FOLDS = 5
    # Number of consecutive epochs without a validation-accuracy gain
    # that is tolerated before a fold stops early.
    EARLY_STOPPING_PATIENCE = 3

    # --- Runtime ---
    SEED = 42
    NUM_WORKERS = 2
    # When True and more than one GPU is visible, models get wrapped in DataParallel.
    USE_MULTI_GPU = True
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

    # --- Artifacts ---
    OUTPUT_DIR = "/kaggle/working"
    SUBMISSION_NAME = "task4_submission.csv"

# ============================================================
# Utilities
# ============================================================

def set_seed(seed: int = 42) -> None:
    """Seed Python, NumPy and PyTorch RNGs and switch cuDNN to deterministic mode.

    Args:
        seed: Value applied to every random number generator.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # Disable cuDNN autotuning and request deterministic kernels.
    cudnn = torch.backends.cudnn
    cudnn.benchmark = False
    cudnn.deterministic = True

def get_label_mapping(labels: pd.Series) -> Tuple[Optional[Dict], int]:
    """
    Create a label-to-index mapping for non-numeric labels.

    Numeric label columns are used as-is; every other dtype (object,
    pandas ``string``, ``category``, ...) gets a mapping built from its
    sorted unique values.

    Args:
        labels: Column holding one class label per sample.

    Returns:
        label2idx: Dictionary mapping labels to indices (None if numeric)
        num_classes: Total number of classes
    """
    # Checking only for object dtype would send pandas "string"/"category"
    # columns down the numeric path, where int(labels.max()) fails.
    if pd.api.types.is_numeric_dtype(labels):
        # The class count is taken as max + 1, i.e. labels are treated as
        # 0-based class indices.
        return None, int(labels.max()) + 1

    unique_labels = sorted(labels.unique())
    label2idx = {label: idx for idx, label in enumerate(unique_labels)}
    return label2idx, len(label2idx)

# ============================================================
# Data Augmentation
# ============================================================

def get_train_transform(img_size: int = 224) -> transforms.Compose:
    """
    Build the augmentation pipeline applied to training images.

    Steps, in order:
    - Resize to (img_size + 32) square
    - Random resized crop down to img_size (scale 0.8-1.0)
    - Horizontal flip with probability 0.5
    - RandomRotation(15)
    - Color jitter (brightness, contrast, saturation, hue)
    - Tensor conversion and ImageNet normalization
    """
    oversized = img_size + 32
    imagenet_mean = [0.485, 0.456, 0.406]  # ImageNet stats
    imagenet_std = [0.229, 0.224, 0.225]

    geometric_steps = [
        transforms.Resize((oversized, oversized)),
        transforms.RandomResizedCrop(img_size, scale=(0.8, 1.0)),
        transforms.RandomHorizontalFlip(p=0.5),
        transforms.RandomRotation(15),
    ]
    color_steps = [
        transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
    ]
    tensor_steps = [
        transforms.ToTensor(),
        transforms.Normalize(mean=imagenet_mean, std=imagenet_std),
    ]

    return transforms.Compose(geometric_steps + color_steps + tensor_steps)

def get_valid_transform(img_size: int = 224) -> transforms.Compose:
    """Build the deterministic validation/test pipeline: resize, to-tensor, normalize."""
    target_size = (img_size, img_size)
    normalize = transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    )
    steps = [
        transforms.Resize(target_size),
        transforms.ToTensor(),
        normalize,
    ]
    return transforms.Compose(steps)

# ============================================================
# Dataset
# ============================================================

class GameDataset(Dataset):
    """
    Dataset class for game title detection.

    Handles both training (with labels) and test (without labels) datasets.
    Items are ``(image, label)`` for labelled data and ``(image, img_name)``
    when ``is_test`` is True.
    """

    # File names already ending in one of these (case-insensitive) are used as-is.
    _KNOWN_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.bmp', '.gif')
    # Extensions probed on disk, in order, for names stored without a suffix.
    _PROBE_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.JPG', '.JPEG', '.PNG')

    def __init__(
        self,
        df: pd.DataFrame,
        img_dir: str,
        transform: Optional[transforms.Compose] = None,
        label2idx: Optional[Dict] = None,
        is_test: bool = False
    ):
        """
        Args:
            df: One row per image; must contain an image-name column and,
                unless ``is_test``, a label column.
            img_dir: Directory the image names are resolved against.
            transform: Optional callable applied to the loaded PIL image.
            label2idx: Optional mapping from string labels to class indices.
            is_test: When True no label column is looked up or returned.

        Raises:
            ValueError: If a label column is required but cannot be determined.
        """
        self.df = df.reset_index(drop=True)
        self.img_dir = img_dir
        self.transform = transform
        self.is_test = is_test
        self.label2idx = label2idx

        columns = self.df.columns

        # Detect image column name (prioritize file_name over id)
        self.image_col = self._detect_column(
            ["file_name", "filename", "image", "id"],
            default=columns[1] if len(columns) > 1 else columns[0]
        )

        # Detect label column name (if not test)
        if not is_test:
            # Only fall back to the second column when it exists, so that a
            # single-column frame fails with a clear message instead of a
            # bare IndexError.
            self.label_col = self._detect_column(
                ["label", "target"],
                default=columns[1] if len(columns) > 1 else None
            )
            if self.label_col is None:
                raise ValueError(
                    "Could not determine the label column: expected 'label' or "
                    "'target', or at least two columns in the dataframe."
                )

    def _detect_column(self, candidates: List[str], default: Optional[str]) -> Optional[str]:
        """Return the first candidate present in the dataframe, else ``default``."""
        for col in candidates:
            if col in self.df.columns:
                return col
        return default

    def _resolve_path(self, img_name: str) -> str:
        """Turn an image name from the CSV into a path inside ``img_dir``."""
        if img_name.lower().endswith(self._KNOWN_EXTENSIONS):
            return os.path.join(self.img_dir, img_name)

        # No recognised suffix: use the first extension that exists on disk.
        for ext in self._PROBE_EXTENSIONS:
            candidate = os.path.join(self.img_dir, img_name + ext)
            if os.path.exists(candidate):
                return candidate

        # If no extension works, default to .jpg
        return os.path.join(self.img_dir, img_name + '.jpg')

    def __len__(self) -> int:
        return len(self.df)

    def __getitem__(self, idx: int) -> Tuple:
        row = self.df.iloc[idx]
        img_name = str(row[self.image_col])
        img_path = self._resolve_path(img_name)

        # Load and transform image
        try:
            # The context manager closes the file handle as soon as the
            # pixel data has been converted into an independent RGB image.
            with Image.open(img_path) as img:
                image = img.convert("RGB")
        except Exception as e:
            # Deliberate best-effort: one unreadable file must not abort the
            # whole run, so the failure is reported and a placeholder is used.
            print(f"Error loading {img_path}: {e}")
            # Return a black image as fallback
            image = Image.new("RGB", (CFG.IMG_SIZE, CFG.IMG_SIZE))

        if self.transform:
            image = self.transform(image)

        if self.is_test:
            return image, img_name

        # Process label
        label = row[self.label_col]
        if self.label2idx is not None and isinstance(label, str):
            label = self.label2idx[label]
        label = int(label)

        return image, label

# ============================================================
# Model Creation
# ============================================================

def create_model(
    num_classes: int,
    model_name: str = CFG.MODEL_NAME,
    pretrained: bool = True
) -> nn.Module:
    """
    Create a model from the timm library.

    Args:
        num_classes: Number of output classes
        model_name: Name of the model architecture
        pretrained: Passed through to ``timm.create_model``. Defaults to True
            (the previous hard-coded behavior); pass False when the weights
            will be replaced by a saved checkpoint right after creation.

    Returns:
        PyTorch model (wrapped in DataParallel if multi-GPU is enabled)
    """
    model = timm.create_model(
        model_name,
        pretrained=pretrained,
        num_classes=num_classes
    )

    # Enable multi-GPU training
    if CFG.USE_MULTI_GPU and torch.cuda.device_count() > 1:
        print(f"Using {torch.cuda.device_count()} GPUs for training")
        model = nn.DataParallel(model)

    return model

# ============================================================
# Training & Evaluation
# ============================================================

def train_one_epoch(
    model: nn.Module,
    loader: DataLoader,
    criterion: nn.Module,
    optimizer: optim.Optimizer,
    scheduler: Optional[optim.lr_scheduler._LRScheduler] = None,
    device: str = CFG.DEVICE
) -> Tuple[float, float]:
    """
    Run one optimisation pass over ``loader``.

    The scheduler, when given, is stepped after every batch (not every epoch).

    Returns:
        avg_loss: Sample-weighted mean loss over the epoch
        accuracy: Fraction of samples whose argmax prediction equals the label
    """
    model.train()
    loss_sum, n_correct, n_seen = 0.0, 0, 0

    for batch_images, batch_labels in loader:
        batch_images = batch_images.to(device)
        batch_labels = batch_labels.to(device)

        # Forward + backward + parameter update
        optimizer.zero_grad()
        logits = model(batch_images)
        batch_loss = criterion(logits, batch_labels)
        batch_loss.backward()
        optimizer.step()

        if scheduler is not None:
            scheduler.step()

        # Accumulate epoch statistics; the loss is re-weighted by batch size
        # so the final average is per sample.
        loss_sum += batch_loss.item() * batch_images.size(0)
        n_correct += (logits.argmax(dim=1) == batch_labels).sum().item()
        n_seen += batch_labels.size(0)

    return loss_sum / n_seen, n_correct / n_seen

@torch.no_grad()
def eval_one_epoch(
    model: nn.Module,
    loader: DataLoader,
    criterion: nn.Module,
    device: str = CFG.DEVICE
) -> Tuple[float, float]:
    """
    Score the model on ``loader`` without tracking gradients.

    Returns:
        avg_loss: Sample-weighted mean loss over the loader
        accuracy: Fraction of samples whose argmax prediction equals the label
    """
    model.eval()
    loss_sum, n_correct, n_seen = 0.0, 0, 0

    for batch_images, batch_labels in loader:
        batch_images = batch_images.to(device)
        batch_labels = batch_labels.to(device)

        logits = model(batch_images)
        batch_loss = criterion(logits, batch_labels)

        # Re-weight by batch size so the final average is per sample.
        loss_sum += batch_loss.item() * batch_images.size(0)
        n_correct += (logits.argmax(dim=1) == batch_labels).sum().item()
        n_seen += batch_labels.size(0)

    return loss_sum / n_seen, n_correct / n_seen

@torch.no_grad()
def predict_proba(
    model: nn.Module,
    loader: DataLoader,
    num_classes: int,
    device: str = CFG.DEVICE
) -> np.ndarray:
    """
    Generate probability predictions for a dataset.

    Args:
        model: Network producing one logit per class.
        loader: Yields either image batches or ``(images, extra)`` pairs;
            the second element of a pair is ignored.
        num_classes: Number of classes; used to shape the result when the
            loader yields no batches.
        device: Device the image batches are moved to.

    Returns:
        Array of shape (n_samples, num_classes) with softmax probabilities
        (shape ``(0, num_classes)`` for an empty loader).
    """
    model.eval()
    all_probs = []

    for batch in loader:
        if isinstance(batch, (list, tuple)) and len(batch) == 2:
            images, _ = batch
        else:
            images = batch

        images = images.to(device)
        outputs = model(images)
        probs = torch.softmax(outputs, dim=1)
        all_probs.append(probs.cpu().numpy())

    # np.concatenate raises on an empty list, so return an explicit
    # zero-row result instead.
    if not all_probs:
        return np.empty((0, num_classes), dtype=np.float32)

    return np.concatenate(all_probs, axis=0)

# ============================================================
# Main Training Pipeline
# ============================================================

def _unwrap_model(model: nn.Module) -> nn.Module:
    """Return the underlying module when ``model`` is wrapped in DataParallel."""
    return model.module if isinstance(model, nn.DataParallel) else model


def _snapshot_state_dict(model: nn.Module) -> Dict[str, torch.Tensor]:
    """Return an independent CPU copy of the (unwrapped) model's weights."""
    # state_dict() hands out references to the live tensors, which the
    # optimizer keeps updating in place; copy them so the snapshot is frozen.
    return {
        name: tensor.detach().to("cpu", copy=True)
        for name, tensor in _unwrap_model(model).state_dict().items()
    }


def main():
    """
    Main training and inference pipeline.

    Steps:
    1. Load the train/val/test CSVs and derive the label mapping.
    2. Train one model per stratified fold with early stopping, saving the
       best weights of each fold to ``CFG.OUTPUT_DIR``.
    3. Report out-of-fold accuracy.
    4. Average the fold models' test probabilities and write the submission CSV.
    """

    # Initialize
    set_seed(CFG.SEED)
    print(f"Using device: {CFG.DEVICE}")

    # Check available GPUs
    if torch.cuda.is_available():
        print(f"Number of GPUs available: {torch.cuda.device_count()}")
        for i in range(torch.cuda.device_count()):
            print(f"  GPU {i}: {torch.cuda.get_device_name(i)}")

    print(f"Model: {CFG.MODEL_NAME}")
    print("=" * 60)

    # Load data
    train_df = pd.read_csv(os.path.join(CFG.DATA_ROOT, CFG.TRAIN_CSV))
    # NOTE(review): val_df is only used for the shape printout below; the
    # models are validated on cross-validation folds of train_df.
    val_df = pd.read_csv(os.path.join(CFG.DATA_ROOT, CFG.VAL_CSV))
    test_df = pd.read_csv(os.path.join(CFG.DATA_ROOT, CFG.TEST_CSV))

    print(f"Train shape: {train_df.shape}")
    print(f"Val shape:   {val_df.shape}")
    print(f"Test shape:  {test_df.shape}")

    # Get label mapping
    label_col = "label" if "label" in train_df.columns else train_df.columns[1]
    label2idx, num_classes = get_label_mapping(train_df[label_col])
    print(f"Number of classes: {num_classes}")

    if label2idx:
        print(f"Label mapping: {label2idx}")

    # ============================================================
    # K-Fold Cross Validation
    # ============================================================

    skf = StratifiedKFold(
        n_splits=CFG.N_FOLDS,
        shuffle=True,
        random_state=CFG.SEED
    )

    y_labels = train_df[label_col].map(label2idx).values if label2idx else train_df[label_col].values

    fold_models = []
    oof_preds = np.zeros((len(train_df), num_classes), dtype=np.float32)

    for fold, (train_idx, val_idx) in enumerate(skf.split(train_df, y_labels)):
        print(f"\n{'=' * 60}")
        print(f"Fold {fold + 1}/{CFG.N_FOLDS}")
        print(f"{'=' * 60}")

        # Split data
        train_fold = train_df.iloc[train_idx].reset_index(drop=True)
        val_fold = train_df.iloc[val_idx].reset_index(drop=True)

        # Compute class weights
        y_train = train_fold[label_col].map(label2idx).values if label2idx else train_fold[label_col].values
        y_train = y_train.astype(int)

        class_weights = compute_class_weight(
            class_weight="balanced",
            classes=np.arange(num_classes),
            y=y_train
        )
        class_weights_tensor = torch.tensor(class_weights, dtype=torch.float32).to(CFG.DEVICE)
        criterion = nn.CrossEntropyLoss(weight=class_weights_tensor)

        # Create sampler for handling class imbalance
        # NOTE(review): the same class weights drive both the loss above and
        # this sampler, so imbalance is compensated twice — confirm intended.
        sample_weights = class_weights[y_train]
        sampler = WeightedRandomSampler(
            weights=sample_weights,
            num_samples=len(sample_weights),
            replacement=True
        )

        # Create datasets
        train_dataset = GameDataset(
            train_fold,
            os.path.join(CFG.DATA_ROOT, CFG.TRAIN_DIR),
            transform=get_train_transform(CFG.IMG_SIZE),
            label2idx=label2idx,
            is_test=False
        )

        val_dataset = GameDataset(
            val_fold,
            os.path.join(CFG.DATA_ROOT, CFG.TRAIN_DIR),
            transform=get_valid_transform(CFG.IMG_SIZE),
            label2idx=label2idx,
            is_test=False
        )

        # Create dataloaders
        train_loader = DataLoader(
            train_dataset,
            batch_size=CFG.BATCH_SIZE,
            sampler=sampler,
            num_workers=CFG.NUM_WORKERS,
            pin_memory=True
        )

        val_loader = DataLoader(
            val_dataset,
            batch_size=CFG.BATCH_SIZE,
            shuffle=False,
            num_workers=CFG.NUM_WORKERS,
            pin_memory=True
        )

        # Create model
        model = create_model(num_classes).to(CFG.DEVICE)

        # Optimizer and scheduler
        optimizer = optim.AdamW(
            model.parameters(),
            lr=CFG.LR,
            weight_decay=CFG.WEIGHT_DECAY
        )

        # The scheduler is stepped once per batch inside train_one_epoch,
        # so its horizon is the total number of batches across all epochs.
        num_training_steps = CFG.NUM_EPOCHS * len(train_loader)
        scheduler = optim.lr_scheduler.CosineAnnealingLR(
            optimizer,
            T_max=num_training_steps,
            eta_min=CFG.LR * 0.01
        )

        # Training loop
        best_val_acc = 0.0
        best_state = None
        patience_counter = 0

        for epoch in range(1, CFG.NUM_EPOCHS + 1):
            start_time = time.time()

            train_loss, train_acc = train_one_epoch(
                model, train_loader, criterion, optimizer, scheduler
            )

            val_loss, val_acc = eval_one_epoch(
                model, val_loader, criterion
            )

            elapsed = time.time() - start_time

            print(f"Epoch {epoch:02d} | "
                  f"Train Loss: {train_loss:.4f} Acc: {train_acc:.4f} | "
                  f"Val Loss: {val_loss:.4f} Acc: {val_acc:.4f} | "
                  f"Time: {elapsed:.1f}s")

            # Save best model. The first epoch is always snapshotted so that
            # best_state is never None, even if accuracy stays at 0.
            if best_state is None or val_acc > best_val_acc:
                best_val_acc = val_acc
                # Frozen copy of the weights, without the DataParallel wrapper
                best_state = _snapshot_state_dict(model)
                patience_counter = 0
                print(f"  → New best validation accuracy: {best_val_acc:.4f}")
            else:
                patience_counter += 1
                print(f"  → No improvement (patience: {patience_counter}/{CFG.EARLY_STOPPING_PATIENCE})")

            # Early stopping
            if patience_counter >= CFG.EARLY_STOPPING_PATIENCE:
                print(f"  → Early stopping triggered at epoch {epoch}")
                break

        print(f"\nBest validation accuracy for fold {fold + 1}: {best_val_acc:.4f}")

        # Save fold model (already unwrapped in best_state)
        fold_model_path = os.path.join(CFG.OUTPUT_DIR, f"model_fold{fold}.pth")
        torch.save(best_state, fold_model_path)
        fold_models.append(fold_model_path)

        # Load best model for OOF predictions
        _unwrap_model(model).load_state_dict(best_state)

        fold_probs = predict_proba(model, val_loader, num_classes)
        oof_preds[val_idx] = fold_probs

        # Release this fold's model before the next one is allocated.
        del model, optimizer, scheduler
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    # ============================================================
    # Out-of-Fold Evaluation
    # ============================================================

    oof_pred_labels = oof_preds.argmax(axis=1)
    oof_acc = (oof_pred_labels == y_labels).mean()

    print(f"\n{'=' * 60}")
    print(f"Out-of-Fold Accuracy: {oof_acc:.4f}")
    print(f"{'=' * 60}")

    # ============================================================
    # Test Inference with Ensemble
    # ============================================================

    print("\nGenerating test predictions with ensemble...")

    test_dataset = GameDataset(
        test_df,
        os.path.join(CFG.DATA_ROOT, CFG.TEST_DIR),
        transform=get_valid_transform(CFG.IMG_SIZE),
        label2idx=label2idx,
        is_test=True
    )

    test_loader = DataLoader(
        test_dataset,
        batch_size=CFG.BATCH_SIZE,
        shuffle=False,
        num_workers=CFG.NUM_WORKERS,
        pin_memory=True
    )

    # Ensemble predictions
    all_fold_probs = []

    for fold, model_path in enumerate(fold_models):
        print(f"Loading fold {fold + 1} model...")
        model = create_model(num_classes).to(CFG.DEVICE)

        # Load state dict (unwrapped version saved)
        state_dict = torch.load(model_path, map_location=CFG.DEVICE)

        # Load into the correct model (handle DataParallel wrapper)
        _unwrap_model(model).load_state_dict(state_dict)

        fold_probs = predict_proba(model, test_loader, num_classes)
        all_fold_probs.append(fold_probs)

        # Release this fold's model before the next one is allocated.
        del model, state_dict
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    # Average probabilities across folds
    ensemble_probs = np.mean(all_fold_probs, axis=0)
    test_pred_labels = ensemble_probs.argmax(axis=1)

    # Convert back to original labels if needed
    if label2idx:
        idx2label = {v: k for k, v in label2idx.items()}
        test_pred_labels = [idx2label[int(pred)] for pred in test_pred_labels]

    # ============================================================
    # Create Submission
    # ============================================================

    # Use the "id" column for the submission (not file_name); fall back to
    # the first column when the test CSV has no "id" column.
    id_col = "id" if "id" in test_df.columns else test_df.columns[0]

    submission = pd.DataFrame({
        "id": test_df[id_col],
        "label": test_pred_labels
    })

    submission_path = os.path.join(CFG.OUTPUT_DIR, CFG.SUBMISSION_NAME)
    submission.to_csv(submission_path, index=False)

    print(f"\nSubmission saved to: {submission_path}")
    print(f"\nSubmission preview:")
    print(submission.head(10))
    print(f"\nLabel distribution in predictions:")
    print(submission["label"].value_counts().sort_index())

# ============================================================
# Execute Pipeline
# ============================================================

# Run the pipeline only when this code is executed directly, not when it is imported.
if __name__ == "__main__":
    main()

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m90.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m77.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m35.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m30.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━



Using device: cuda
Number of GPUs available: 2
  GPU 0: Tesla T4
  GPU 1: Tesla T4
Model: swin_small_patch4_window7_224
Train shape: (31546, 3)
Val shape:   (24772, 3)
Test shape:  (25889, 3)
Number of classes: 5

Fold 1/5


model.safetensors:   0%|          | 0.00/200M [00:00<?, ?B/s]

Using 2 GPUs for training
Epoch 01 | Train Loss: 0.2772 Acc: 0.8844 | Val Loss: 0.1306 Acc: 0.9564 | Time: 347.5s
  → New best validation accuracy: 0.9564
Epoch 02 | Train Loss: 0.0918 Acc: 0.9636 | Val Loss: 0.0971 Acc: 0.9574 | Time: 340.5s
  → New best validation accuracy: 0.9574
Epoch 03 | Train Loss: 0.0531 Acc: 0.9784 | Val Loss: 0.0744 Acc: 0.9699 | Time: 340.5s
  → New best validation accuracy: 0.9699
Epoch 04 | Train Loss: 0.0378 Acc: 0.9843 | Val Loss: 0.0579 Acc: 0.9810 | Time: 340.4s
  → New best validation accuracy: 0.9810
Epoch 05 | Train Loss: 0.0274 Acc: 0.9888 | Val Loss: 0.0493 Acc: 0.9846 | Time: 340.6s
  → New best validation accuracy: 0.9846
Epoch 06 | Train Loss: 0.0185 Acc: 0.9912 | Val Loss: 0.0482 Acc: 0.9818 | Time: 340.5s
  → No improvement (patience: 1/3)
Epoch 07 | Train Loss: 0.0154 Acc: 0.9931 | Val Loss: 0.0428 Acc: 0.9843 | Time: 340.8s
  → No improvement (patience: 2/3)
Epoch 08 | Train Loss: 0.0095 Acc: 0.9959 | Val Loss: 0.0416 Acc: 0.9870 | Time: 34