## Setup

In [1]:
import sys
from pathlib import Path

# Make the repository root importable so the `src.*` modules used below resolve.
# The root is taken to be the parent of the current working directory
# (assumes the kernel was started from the notebook's own folder — TODO confirm).
project_root = Path().resolve().parent
# Only register the path once: re-running this cell in a live kernel must not
# keep appending duplicate entries to sys.path.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, random_split
from sklearn.model_selection import StratifiedKFold
from src.preprocess import build_preprocessor
from src.dataset import TitanicDataset
from src.model import TitanicNN
import joblib
import numpy as np
import os
import pandas as pd

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'Using device: {device}')

Using device: cuda


## Build and Save Preprocessor

In [3]:
# Build the preprocessor from the cleaned training data.
# NOTE(review): build_preprocessor is presumably what writes
# ../models/preprocessor.pkl (the file loaded just below) — confirm in src.preprocess.
preprocessor, cat_cols, num_cols = build_preprocessor('../data/processed/train_clean.csv')

# Determine the network input width: transform the training features (label
# column removed) with the pickled preprocessor and read the column count.
train_features = pd.read_csv('../data/processed/train_clean.csv').drop(columns=['Survived'])
saved_preprocessor = joblib.load('../models/preprocessor.pkl')
input_dim = saved_preprocessor.transform(train_features).shape[1]
print(f'Input dimension after encoding: {input_dim}')

Input dimension after encoding: 34


## Training Dataset

In [4]:
# Labelled training dataset, constructed from the cleaned CSV and the path of
# the pickled preprocessor (how the two are combined is defined in src.dataset).
train_csv = '../data/processed/train_clean.csv'
preprocessor_pkl = '../models/preprocessor.pkl'
full_dataset = TitanicDataset(train_csv, preprocessor_pkl, has_label=True)

## 4-fold stratified CV + early stopping

In [5]:
# Stratified K-fold cross-validation with per-fold early stopping.
#
# For each fold a fresh TitanicNN is trained on the training split and scored
# on the held-out split after every epoch. Whenever validation accuracy
# improves, the weights are saved to ../models/nn_fold{k}_best.pt; training
# stops after `trials` consecutive epochs without improvement (or 200 epochs).
#
# NOTE: a fold's score is the best validation accuracy observed during
# training, i.e. it is selected on the same split it is measured on, so the
# reported CV mean is an optimistic estimate.
CV_SEED = 420
skf = StratifiedKFold(n_splits=4, shuffle=True, random_state=CV_SEED)
# Labels are re-read from the CSV only to stratify the split.
# NOTE(review): assumes full_dataset keeps the CSV row order — confirm in src.dataset.
labels = pd.read_csv('../data/processed/train_clean.csv')['Survived']
cv_scores = []

for fold, (train_idx, val_idx) in enumerate(skf.split(np.zeros(len(full_dataset)), labels)):
    print(f'\n=== Fold {fold+1} ===')
    # Seed torch per fold so weight initialisation and batch shuffling are
    # repeatable across runs (the split itself is already seeded above).
    torch.manual_seed(CV_SEED + fold)

    train_sub = torch.utils.data.Subset(full_dataset, train_idx)
    val_sub   = torch.utils.data.Subset(full_dataset, val_idx)

    train_loader = DataLoader(train_sub, batch_size=64, shuffle=True)
    val_loader   = DataLoader(val_sub,   batch_size=128, shuffle=False)

    model = TitanicNN(input_dim).to(device)
    # BCELoss expects probabilities; presumably TitanicNN ends in a sigmoid — TODO confirm.
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=2e-3, weight_decay=1e-4)
    # ReduceLROnPlateau minimises its metric by default, so it is fed the
    # validation error (1 - accuracy) further down.
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=6, factor=0.3)

    best_val = 0.0
    # `patience` counts epochs since the last improvement; `trials` is the limit.
    patience, trials = 0, 10

    for epoch in range(200):
        # ---- training ----
        model.train()
        train_loss = 0.0  # sum of per-batch losses, averaged when printed
        for Xb, yb in train_loader:
            Xb, yb = Xb.to(device), yb.to(device)
            optimizer.zero_grad()
            preds = model(Xb)
            loss = criterion(preds, yb)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        # ---- validation ----
        model.eval()
        val_preds, val_true = [], []
        with torch.no_grad():
            for Xb, yb in val_loader:
                Xb, yb = Xb.to(device), yb.to(device)
                val_preds.append(model(Xb).cpu().numpy())
                val_true.append(yb.cpu().numpy())
        val_preds = np.concatenate(val_preds).squeeze()
        val_true  = np.concatenate(val_true).squeeze()
        # Threshold the outputs at 0.5 and compare with the labels.
        val_acc = ( (val_preds > 0.5) == val_true ).mean()

        scheduler.step(1 - val_acc)

        if val_acc > best_val:
            # New best: reset the counter and checkpoint the weights.
            best_val = val_acc
            patience = 0
            torch.save(model.state_dict(), f'../models/nn_fold{fold+1}_best.pt')
        else:
            patience += 1
            if patience >= trials:
                print(f'Early stop at epoch {epoch+1}')
                break

        if (epoch+1) % 20 == 0:
            print(f'Epoch {epoch+1:3d} | Train loss {train_loss/len(train_loader):.4f} | Val acc {val_acc:.4f}')

    cv_scores.append(best_val)
    print(f'Fold {fold+1} best validation accuracy: {best_val:.4f}')

print(f'\nCV mean accuracy: {np.mean(cv_scores):.4f} ± {np.std(cv_scores):.4f}')


=== Fold 1 ===
Early stop at epoch 18
Fold 1 best validation accuracy: 0.9058

=== Fold 2 ===
Epoch  20 | Train loss 0.3139 | Val acc 0.7713
Early stop at epoch 25
Fold 2 best validation accuracy: 0.7803

=== Fold 3 ===
Early stop at epoch 13
Fold 3 best validation accuracy: 0.8027

=== Fold 4 ===
Epoch  20 | Train loss 0.3415 | Val acc 0.8243
Early stop at epoch 23
Fold 4 best validation accuracy: 0.8423

CV mean accuracy: 0.8328 ± 0.0477


## Model

In [None]:
# Final model: warm-start from the fold-1 checkpoint, then continue training
# on the complete training set.
# NOTE: load_state_dict restores trained *weights*, not hyper-parameters; the
# architecture built here must match the one used during cross-validation.
torch.manual_seed(420)  # repeatable batch shuffling for the retraining run
final_model = TitanicNN(input_dim).to(device)
# map_location makes the checkpoint loadable on the current device even when
# it was written on a different one (e.g. saved on CUDA, restored on CPU).
final_model.load_state_dict(
    torch.load('../models/nn_fold1_best.pt', map_location=device)
)  # pick any

# Retrain on whole training set
# NOTE: there is no held-out split here, so training runs for a fixed number
# of epochs with no early stopping.
full_loader = DataLoader(full_dataset, batch_size=64, shuffle=True)
optimizer = optim.Adam(final_model.parameters(), lr=5e-4)
criterion = nn.BCELoss()

for epoch in range(420):
    final_model.train()
    for Xb, yb in full_loader:
        Xb, yb = Xb.to(device), yb.to(device)
        optimizer.zero_grad()
        loss = criterion(final_model(Xb), yb)
        loss.backward()
        optimizer.step()
    if (epoch+1) % 20 == 0:
        print(f'Full-train epoch {epoch+1}')

# Persist the retrained weights for later inference.
torch.save(final_model.state_dict(), '../models/nn_final.pt')

Full-train epoch 20
Full-train epoch 40
Full-train epoch 60
Full-train epoch 80
Full-train epoch 100
Full-train epoch 120
Full-train epoch 140
Full-train epoch 160
Full-train epoch 180
Full-train epoch 200
Full-train epoch 220
Full-train epoch 240
Full-train epoch 260
Full-train epoch 280
Full-train epoch 300
Full-train epoch 320
Full-train epoch 340
Full-train epoch 360
Full-train epoch 380
Full-train epoch 400
Full-train epoch 420


## Submission

In [None]:
# Unlabelled test set, built from the cleaned test CSV and the pickled preprocessor path.
test_dataset = TitanicDataset('../data/processed/test_clean.csv', '../models/preprocessor.pkl', has_label=False)
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False)

# Inference only: switch to eval mode and disable gradient tracking.
final_model.eval()
with torch.no_grad():
    batch_outputs = [final_model(features.to(device)).cpu().numpy() for features in test_loader]
test_preds = np.concatenate(batch_outputs).squeeze()

# gender_submission.csv is used as the template; only its 'Survived' column is
# overwritten, with the model outputs thresholded at 0.5.
# NOTE(review): assumes test_clean.csv rows are in the same order as the template — confirm.
submission = pd.read_csv('../data/raw/gender_submission.csv')
survived_flags = (test_preds > 0.5).astype(int)
submission['Survived'] = survived_flags

# Create the output folder on first use, then write the CSV without the index column.
os.makedirs('../submissions', exist_ok=True)
submission.to_csv('../submissions/nn_v1.csv', index=False)
print('Submission saved!')