# The "Springfield" Identity - Training Notebook


In [None]:
# IMPORTANT: Copy dataset to local disk FIRST (run this before training!)
# This makes training MUCH faster (10x speedup)
import subprocess
import os
from pathlib import Path

# Locate the 'characters_train' folder on Google Drive and mirror it onto the
# Colab VM's local disk. Every path below lives under /content/drive, so
# Google Drive has to be mounted before this cell can find anything.
import shutil

drive_root = '/content/drive/MyDrive'

# Well-known locations, checked in order before falling back to a full walk.
possible_paths = [
    '/content/drive/MyDrive/shared/bonusHW/characters_train',
    '/content/drive/MyDrive/bonusHW/characters_train',
    '/content/drive/MyDrive/characters_train',
    '/content/drive/MyDrive/shared/characters_train',
]

# First existing candidate wins; None when none of them is present.
source = next((path for path in possible_paths if os.path.exists(path)), None)
if source is not None:
    print(f"Found characters_train at: {source}")

if source is None:
    print("Searching for characters_train in Google Drive...")
    for root, dirs, files in os.walk(drive_root):
        if 'characters_train' in dirs:
            source = os.path.join(root, 'characters_train')
            print(f"Found characters_train at: {source}")
            break

dest = '/content/characters_train'

if source and os.path.exists(source):
    if not os.path.exists(dest):
        print(f"Copying dataset from {source} to local disk... This may take a few minutes.")
        # Argument list instead of a shell string: quotes, '$' or spaces in a
        # Drive folder name can no longer be interpreted by a shell.
        try:
            copied = subprocess.run(['cp', '-r', source, dest], check=False).returncode == 0
        except OSError:
            # e.g. no 'cp' executable on this platform
            copied = False

        if copied and os.path.exists(dest):
            print("✓ Dataset copied to local disk!")
        else:
            print("✗ Copy failed. Trying alternative method...")
            # A failed 'cp' can leave a partial tree behind; remove it so the
            # fallback starts clean and a re-run does not mistake the partial
            # copy for a complete dataset.
            shutil.rmtree(dest, ignore_errors=True)
            shutil.copytree(source, dest)
            print("✓ Dataset copied!")
    else:
        print("✓ Dataset already exists on local disk!")
else:
    print("✗ characters_train folder not found!")
    print("Please make sure:")
    print("1. The folder is uploaded to Google Drive")
    print("2. Update the 'source' variable above with the correct path")
    print("\nCurrent directory structure:")
    if os.path.exists(drive_root):
        print("MyDrive contents:")
        for item in os.listdir(drive_root)[:10]:
            print(f"  - {item}")
    else:
        # Most common cause: this cell was run before drive.mount().
        print(f"{drive_root} does not exist - mount Google Drive (drive.mount('/content/drive')) and re-run this cell.")


In [None]:
# Mount Google Drive at /content/drive. The dataset-copy cell and WORK_DIR
# both point below /content/drive/MyDrive, so this mount must be in place
# before those cells can see any files.
from google.colab import drive
drive.mount('/content/drive')


In [None]:
import os
# Directory on Google Drive where the notebook keeps its outputs
# (class_mappings.json, model.pth). It is created if missing and becomes the
# current working directory.
WORK_DIR = '/content/drive/MyDrive/bonusHW'
os.makedirs(WORK_DIR, exist_ok=True)
os.chdir(WORK_DIR)

print(f"Working directory: {os.getcwd()}")
print("\nContents of current directory:")
# chdir() succeeded above, so '.' is guaranteed to exist here.
contents = os.listdir('.')
if contents:
    # Show at most ten entries to keep the cell output short.
    for item in contents[:10]:
        print(f"  - {item}")
    if len(contents) > 10:
        print(f"  ... and {len(contents) - 10} more items")
else:
    print("  (empty)")

if 'characters_train' in contents:
    print("\n✓ Found 'characters_train' directory!")
else:
    # Point the user at the folder that was actually checked (WORK_DIR),
    # expressed the way it appears in the Drive web UI.
    drive_folder = os.path.relpath(WORK_DIR, '/content/drive')
    print("\n✗ 'characters_train' directory not found")
    print("\nTo fix this:")
    print("1. Go to Google Drive (drive.google.com)")
    print(f"2. Navigate to: {drive_folder}/")
    print("3. Upload the 'characters_train' folder here")
    print("4. Make sure it contains subdirectories like homer_simpson/, marge_simpson/, etc.")


In [None]:
# Optional: search Google Drive for 'characters_train' if you are not sure
# where it is. The helper below walks /content/drive/MyDrive, stops descending
# once it is more than max_depth levels below the start path, and prints every
# match it finds.
# The code is disabled by being wrapped in a triple-quoted string: to run it,
# remove the opening and closing triple quotes (there is nothing to uncomment).
"""
import os
from pathlib import Path

def find_characters_train(start_path='/content/drive/MyDrive', max_depth=3):
    start = Path(start_path)
    found_paths = []
    
    for root, dirs, files in os.walk(start):
        depth = root.replace(str(start), '').count(os.sep)
        if depth > max_depth:
            dirs.clear()
            continue
        
        if 'characters_train' in dirs:
            full_path = os.path.join(root, 'characters_train')
            found_paths.append(full_path)
            print(f"Found: {full_path}")
    
    return found_paths

print("Searching for 'characters_train' folder...")
found = find_characters_train()
if found:
    print(f"\nFound {len(found)} location(s). You can update data_dir to use one of these paths.")
else:
    print("\nNot found. Please upload the characters_train folder to Google Drive.")
"""


In [None]:
import torch
import random
import numpy as np

# Reproducibility setup: one seed value for every random number generator the
# notebook uses, plus deterministic cuDNN behaviour.
_seed = 42

# NOTE(review): PYTHONHASHSEED is normally read when an interpreter starts, so
# setting it here presumably only affects processes launched later - confirm.
os.environ['PYTHONHASHSEED'] = str(_seed)

# Disable cuDNN autotuning and request deterministic kernels.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# Python, NumPy and PyTorch generators.
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(_seed)

# CUDA generators (current device and all devices) when a GPU is present.
if torch.cuda.is_available():
    for _seed_fn in (torch.cuda.manual_seed, torch.cuda.manual_seed_all):
        _seed_fn(_seed)


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
from torchvision import transforms
from PIL import Image
from pathlib import Path
import json


In [None]:
class SimpsonsDataset(Dataset):
    """Image dataset laid out as one sub-directory of ``data_dir`` per class.

    Class indices are assigned in sorted order of the sub-directory names, and
    the files inside each class are sorted too, so the sample order is the
    same on every run.

    Args:
        data_dir: Directory whose immediate sub-directories are the classes.
        transform: Optional callable applied to the RGB ``PIL.Image`` before
            it is returned.
        patterns: Glob pattern(s) selecting the image files inside each class
            directory. Defaults to ``('*.jpg',)``, the original behaviour;
            pass e.g. ``('*.jpg', '*.jpeg', '*.png')`` to pick up more formats.

    Attributes:
        images: Image paths, grouped by class.
        labels: Integer class index for each entry of ``images``.
        class_to_idx / idx_to_class: Mappings between class name and index.
    """

    def __init__(self, data_dir, transform=None, patterns=('*.jpg',)):
        self.data_dir = Path(data_dir)
        self.transform = transform
        self.images = []
        self.labels = []
        self.class_to_idx = {}
        self.idx_to_class = {}

        # A bare string would otherwise be iterated character by character.
        if isinstance(patterns, str):
            patterns = (patterns,)

        character_dirs = sorted(d for d in self.data_dir.iterdir() if d.is_dir())

        for idx, char_dir in enumerate(character_dirs):
            class_name = char_dir.name
            self.class_to_idx[class_name] = idx
            self.idx_to_class[idx] = class_name

            # The set removes files matched by more than one pattern.
            image_files = sorted(
                {path for pattern in patterns for path in char_dir.glob(pattern)}
            )
            self.images.extend(image_files)
            self.labels.extend([idx] * len(image_files))

        print(f"Loaded {len(self.images)} images from {len(character_dirs)} classes")

    def __len__(self):
        """Return the number of images found."""
        return len(self.images)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for sample ``idx``.

        The image is converted to RGB and, if a transform was given, passed
        through it.
        """
        img_path = self.images[idx]
        label = self.labels[idx]

        # convert() returns a new, fully loaded image, so the underlying file
        # can be closed as soon as the conversion is done.
        with Image.open(img_path) as raw_image:
            image = raw_image.convert('RGB')

        if self.transform:
            image = self.transform(image)

        return image, label


In [None]:
class SimpsonsCNN(nn.Module):
    """Five conv/batch-norm/ReLU/max-pool stages followed by a 3-layer MLP head.

    Each stage halves the spatial size, so a 128x128 input reaches the head as
    a 512x4x4 feature map. An adaptive average pool in front of the head
    resizes any other feature-map size to 4x4, which lets the network accept
    inputs other than 128x128 (at least 32 pixels per side, so that five
    halvings still leave one pixel). For 128x128 inputs the adaptive pool maps
    4x4 to 4x4 and leaves the values unchanged. It has no parameters or
    buffers, so the state-dict layout is the same as without it.

    Args:
        num_classes: Number of output logits.
    """

    def __init__(self, num_classes):
        super().__init__()

        # Attribute names are kept as-is so existing checkpoints still load.
        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm2d(32)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm2d(64)
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
        self.bn3 = nn.BatchNorm2d(128)
        self.conv4 = nn.Conv2d(128, 256, kernel_size=3, padding=1)
        self.bn4 = nn.BatchNorm2d(256)
        self.conv5 = nn.Conv2d(256, 512, kernel_size=3, padding=1)
        self.bn5 = nn.BatchNorm2d(512)

        self.pool = nn.MaxPool2d(2, 2)
        # Guarantees the 4x4 spatial size that fc1 expects.
        self.adaptive_pool = nn.AdaptiveAvgPool2d((4, 4))
        self.dropout = nn.Dropout(0.5)

        self.fc1 = nn.Linear(512 * 4 * 4, 1024)
        self.fc2 = nn.Linear(1024, 512)
        self.fc3 = nn.Linear(512, num_classes)

        self.relu = nn.ReLU()

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)`` for image batch ``x``."""
        stages = (
            (self.conv1, self.bn1),
            (self.conv2, self.bn2),
            (self.conv3, self.bn3),
            (self.conv4, self.bn4),
            (self.conv5, self.bn5),
        )
        for conv, bn in stages:
            x = self.pool(self.relu(bn(conv(x))))

        x = self.adaptive_pool(x)
        x = torch.flatten(x, 1)

        x = self.dropout(self.relu(self.fc1(x)))
        x = self.dropout(self.relu(self.fc2(x)))
        return self.fc3(x)


In [None]:
# Preprocessing pipelines. Both splits are resized to 128x128, converted to
# tensors and normalised with the same per-channel mean/std (the values match
# the widely used ImageNet statistics). Only the training pipeline inserts
# random augmentation between the resize and the tensor conversion.
_input_size = (128, 128)
_channel_mean = [0.485, 0.456, 0.406]
_channel_std = [0.229, 0.224, 0.225]

_train_augmentations = [
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomRotation(10),
    transforms.ColorJitter(brightness=0.2, contrast=0.2),
]

train_transform = transforms.Compose(
    [transforms.Resize(_input_size)]
    + _train_augmentations
    + [
        transforms.ToTensor(),
        transforms.Normalize(mean=_channel_mean, std=_channel_std),
    ]
)

val_transform = transforms.Compose(
    [
        transforms.Resize(_input_size),
        transforms.ToTensor(),
        transforms.Normalize(mean=_channel_mean, std=_channel_std),
    ]
)


In [None]:
# Train from the copy on the VM's local disk (created by the dataset copy
# cell, or by a manual rsync) rather than reading from Drive.
data_dir = '/content/characters_train'

if not os.path.isdir(data_dir):
    print(f"ERROR: '{data_dir}' directory not found!")
    print(f"Current directory: {os.getcwd()}")
    print("\nPlease make sure:")
    print("1. You ran the dataset copy cell, or copied the folder yourself, e.g.:")
    # The trailing slashes matter: without one on the source, rsync would
    # create /content/characters_train/characters_train instead.
    print("   !rsync -a --info=progress2 /content/drive/MyDrive/shared/bonusHW/characters_train/ /content/characters_train/")
    print("2. Or update data_dir to the correct path")
    print("\nCurrent directory contents:")
    print(os.listdir('.'))
    raise FileNotFoundError(f"Data directory '{data_dir}' not found in {os.getcwd()}")

# Loaded without a transform: the per-split transforms are applied later by
# the wrappers built around the two index lists.
full_dataset = SimpsonsDataset(data_dir, transform=None)

# 80/20 train/validation split; the dedicated seeded generator makes the
# split identical on every run.
train_size = int(0.8 * len(full_dataset))
val_size = len(full_dataset) - train_size
train_subset, val_subset = random_split(
    full_dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(42)
)

class SubsetDataset(Dataset):
    """View onto selected samples of ``base_dataset`` with its own transform.

    ``indices`` lists the positions in ``base_dataset`` that belong to this
    subset; item ``i`` of the subset is ``base_dataset[indices[i]]`` with
    ``transform`` (when given) applied to the image part.
    """

    def __init__(self, base_dataset, indices, transform=None):
        self.base_dataset, self.indices, self.transform = base_dataset, indices, transform

    def __len__(self):
        """Number of samples in the subset."""
        return len(self.indices)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for subset position ``idx``."""
        sample, target = self.base_dataset[self.indices[idx]]
        return (self.transform(sample) if self.transform else sample), target

# Wrap the two index lists from the split so that each one applies its own
# preprocessing to the shared, transform-free base dataset.
val_dataset = SubsetDataset(
    full_dataset,
    val_subset.indices,
    transform=val_transform,
)
train_dataset = SubsetDataset(
    full_dataset,
    train_subset.indices,
    transform=train_transform,
)

num_classes = len(full_dataset.class_to_idx)

# Persist the name <-> index mappings next to the model weights.
# Note: JSON object keys are always strings, so the integer keys of
# idx_to_class are written out as strings.
class_mappings = dict(
    class_to_idx=full_dataset.class_to_idx,
    idx_to_class=full_dataset.idx_to_class,
)

with open(os.path.join(WORK_DIR, 'class_mappings.json'), 'w') as f:
    f.write(json.dumps(class_mappings))

print(f"Train: {len(train_dataset)}, Val: {len(val_dataset)}, Classes: {num_classes}")


In [None]:
# Batch loaders. Bigger batches and pinned host memory are used only when a
# GPU is available; data is loaded in the main process (num_workers=0).
# Only the training loader shuffles.
_cuda_ready = torch.cuda.is_available()
batch_size = 64 if _cuda_ready else 32
_shared_loader_args = dict(batch_size=batch_size, num_workers=0, pin_memory=_cuda_ready)

train_loader = DataLoader(train_dataset, shuffle=True, **_shared_loader_args)
val_loader = DataLoader(val_dataset, shuffle=False, **_shared_loader_args)


In [None]:
# Model, loss, optimiser and learning-rate schedule.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

model = SimpsonsCNN(num_classes=num_classes)
model = model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Halve the learning rate once the monitored value (the training loop passes
# the validation loss) has not improved for more than `patience` epochs.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=0.5,
    patience=3,
)


In [None]:
def train_epoch(model, train_loader, criterion, optimizer, device):
    """Run one optimisation pass over ``train_loader``.

    Puts the model in training mode and performs one optimiser step per batch.

    Returns:
        ``(loss, accuracy)`` where ``loss`` is the mean of the per-batch loss
        values and ``accuracy`` is the percentage of correctly classified
        samples, both measured while training.
    """
    model.train()
    loss_sum = 0.0
    n_correct = 0
    n_seen = 0

    for batch_images, batch_labels in train_loader:
        batch_images = batch_images.to(device)
        batch_labels = batch_labels.to(device)

        optimizer.zero_grad()
        logits = model(batch_images)
        batch_loss = criterion(logits, batch_labels)
        batch_loss.backward()
        optimizer.step()

        loss_sum += batch_loss.item()
        n_seen += batch_labels.size(0)
        # Predicted class = index of the largest logit in each row.
        n_correct += (logits.argmax(dim=1) == batch_labels).sum().item()

    return loss_sum / len(train_loader), 100 * n_correct / n_seen

def validate(model, val_loader, criterion, device):
    """Evaluate ``model`` on ``val_loader`` without updating any weights.

    Puts the model in evaluation mode and disables gradient tracking.

    Returns:
        ``(loss, accuracy)`` where ``loss`` is the mean of the per-batch loss
        values and ``accuracy`` is the percentage of correctly classified
        samples.
    """
    model.eval()
    loss_sum = 0.0
    n_correct = 0
    n_seen = 0

    with torch.no_grad():
        for batch_images, batch_labels in val_loader:
            batch_images = batch_images.to(device)
            batch_labels = batch_labels.to(device)

            logits = model(batch_images)
            loss_sum += criterion(logits, batch_labels).item()

            n_seen += batch_labels.size(0)
            # Predicted class = index of the largest logit in each row.
            n_correct += (logits.argmax(dim=1) == batch_labels).sum().item()

    return loss_sum / len(val_loader), 100 * n_correct / n_seen


In [None]:
# Main training loop: train, validate, let the scheduler react to the
# validation loss, and checkpoint whenever validation accuracy improves.
num_epochs = 20
best_val_acc = 0.0

for epoch in range(num_epochs):
    train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer, device)
    val_loss, val_acc = validate(model, val_loader, criterion, device)

    scheduler.step(val_loss)

    print(
        f"Epoch [{epoch+1}/{num_epochs}] - "
        f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}% | "
        f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.2f}%"
    )

    # Nothing to save unless this epoch beat the best accuracy so far.
    if not val_acc > best_val_acc:
        continue

    best_val_acc = val_acc
    model_path = os.path.join(WORK_DIR, 'model.pth')
    torch.save(model.state_dict(), model_path)
    print(f"Model saved. Best val acc: {best_val_acc:.2f}%")

print(f"Training completed. Best validation accuracy: {best_val_acc:.2f}%")


In [None]:
# Reload the checkpoint that the training loop wrote each time validation
# accuracy improved, so `model` holds the best-scoring weights rather than the
# weights from the final epoch.
model_path = os.path.join(WORK_DIR, 'model.pth')
model.load_state_dict(torch.load(model_path, map_location=device))
# NOTE(review): this writes the state dict that was just loaded back to the
# same file, which looks redundant - confirm whether it is intentional.
torch.save(model.state_dict(), model_path)
