In [1]:
# USE AS MANY CELLS AS NEEDED
import torch
import torch.nn as nn
import pandas as pd
import numpy as np

from torch import optim
from sklearn.model_selection import KFold
from torch.utils.data import TensorDataset, DataLoader
from sklearn.preprocessing import LabelEncoder, StandardScaler

#from torch.utils.data import TensorDataset, DataLoader
    

In [3]:
# Training split: the X_train / y_train CSVs from the local Downloads folder
X_train, y_train = (
    pd.read_csv('~/Downloads/X_train.csv'),
    pd.read_csv('~/Downloads/y_train.csv'),
)
# Test split: the X_test / y_test CSVs, read the same way
X_test, y_test = (
    pd.read_csv('~/Downloads/X_test.csv'),
    pd.read_csv('~/Downloads/y_test.csv'),
)

In [4]:
class MyModel(nn.Module):
    """Empty placeholder subclass of nn.Module with no layers of its own.

    A complete MyModel class is defined in a later cell of this notebook and
    replaces this name once that cell runs.
    """

In [5]:
class MyModel(nn.Module):
    """Fully connected network: Linear+ReLU hidden layers and a final Linear.

    Called with no arguments it builds the layout
    12 -> 24 -> 48 -> 96 -> 1, with a ReLU after every layer except the last.

    Args:
        input_dim: Number of features per input sample. Defaults to 12.
        hidden_dims: Output width of each hidden Linear layer, in order; each
            one is followed by a ReLU. Defaults to (24, 48, 96).
        output_dim: Output width of the final Linear layer, which has no
            activation. Defaults to 1.
    """

    def __init__(self, input_dim=12, hidden_dims=(24, 48, 96), output_dim=1):
        super().__init__()
        layers = []
        in_features = input_dim
        # Layers are created in forward order so that seeded weight
        # initialisation is deterministic for a given configuration.
        for width in hidden_dims:
            layers.append(nn.Linear(in_features, width))
            layers.append(nn.ReLU())
            in_features = width
        layers.append(nn.Linear(in_features, output_dim))
        # Stored as an nn.ModuleList under `module_list` so parameter names
        # keep the form module_list.<index>.weight / module_list.<index>.bias.
        self.module_list = nn.ModuleList(layers)

    def forward(self, x):
        """Pass `x` through every layer in order and return the last output."""
        for layer in self.module_list:
            x = layer(x)
        return x

In [6]:
# Drop junk
X_train = X_train.drop(columns=['ID', 'Case Number', 'Updated On', 'Date'])

# Booleans
X_train['Arrest'] = X_train['Arrest'].astype(int)
X_train['Domestic'] = X_train['Domestic'].astype(int)

cat_cols = ['Block', 'Primary Type', 'Description', 'IUCR', 'FBI Code']

encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    X_train[col] = le.fit_transform(X_train[col].astype(str))
    encoders[col] = le


scaler = StandardScaler()
X_train[X_train.columns] = scaler.fit_transform(X_train[X_train.columns])



In [63]:
def _to_float32_matrix(data, zero_fill=False):
    """Coerce a DataFrame or array-like into a 2-D float32 NumPy array.

    Entries that cannot be parsed as numbers become NaN and are replaced by
    their column mean. With ``zero_fill=True`` anything still NaN afterwards
    (a column with no numeric entries at all) is replaced by 0.
    """
    frame = data.copy() if isinstance(data, pd.DataFrame) else pd.DataFrame(data)
    frame = frame.apply(pd.to_numeric, errors='coerce')
    frame = frame.fillna(frame.mean())
    if zero_fill:
        frame = frame.fillna(0)
    # DataFrame.values is always 2-D, so no reshape is needed for 1-D input.
    return frame.values.astype(np.float32)


def _dataset_mean_loss(model, loader, cost_function, n_samples):
    """Return the sample-weighted mean loss of `model` over `loader` in eval mode."""
    model.eval()
    total_cost = 0.0
    with torch.no_grad():
        for x_batch, y_batch in loader:
            batch_cost = cost_function(model(x_batch), y_batch)
            total_cost += batch_cost.item() * x_batch.size(0)
    return total_cost / n_samples


def kfold_cross_validation_pytorch(X, y, k=5, batch_size=16, learning_rate=0.0001, n_epochs=100):
    """Train and validate a fresh MyModel on each of `k` shuffled folds.

    Both inputs are coerced to float32 matrices (non-numeric entries become
    NaN and are filled with the column mean; target columns with no numeric
    entries are filled with 0). Each fold trains a new ``MyModel()`` with
    SGD on an MSE loss and is validated once after its last epoch. Progress
    and a final summary are printed.

    Args:
        X: Feature table (DataFrame or array-like), one row per sample.
            ``MyModel()`` is built with its default arguments, so X must have
            the number of columns that model expects.
        y: Target table (DataFrame, Series or array-like) with one row per
            row of X.
        k: Number of folds (KFold with shuffle=True, random_state=42).
        batch_size: Mini-batch size for both training and validation loaders.
        learning_rate: SGD learning rate.
        n_epochs: Training epochs per fold.

    Returns:
        dict with per-fold lists ``'train_loss'`` (last-epoch training loss,
        or None when ``n_epochs`` is 0), ``'val_loss'`` and ``'models'``,
        plus the scalars ``'avg_train_loss'``, ``'avg_val_loss'`` and
        ``'std_val_loss'``.

    Raises:
        ValueError: If X and y do not have the same number of rows.
    """
    X_arr = _to_float32_matrix(X)
    y_arr = _to_float32_matrix(y, zero_fill=True)

    # Fold indices come from X alone; a shorter or longer y would otherwise
    # fail late with an IndexError or be silently paired with the wrong rows.
    if len(X_arr) != len(y_arr):
        raise ValueError(
            f"X and y must have the same number of rows, got {len(X_arr)} and {len(y_arr)}"
        )

    kf = KFold(n_splits=k, shuffle=True, random_state=42)
    fold_results = {
        'train_loss': [],
        'val_loss': [],
        'models': []
    }

    for fold, (train_idx, val_idx) in enumerate(kf.split(X_arr), 1):
        print(f"Training Fold {fold}/{k}")

        # Tensors for this fold
        X_train_fold = torch.tensor(X_arr[train_idx], dtype=torch.float32)
        y_train_fold = torch.tensor(y_arr[train_idx], dtype=torch.float32)
        X_val_fold = torch.tensor(X_arr[val_idx], dtype=torch.float32)
        y_val_fold = torch.tensor(y_arr[val_idx], dtype=torch.float32)

        train_loader = DataLoader(
            TensorDataset(X_train_fold, y_train_fold), batch_size=batch_size, shuffle=True
        )
        val_loader = DataLoader(
            TensorDataset(X_val_fold, y_val_fold), batch_size=batch_size, shuffle=False
        )

        # Re-seed before building the model so every fold starts from the
        # same initial weights.
        torch.manual_seed(1234)
        model = MyModel()
        cost_function = nn.MSELoss()
        optimizer = optim.SGD(model.parameters(), lr=learning_rate)

        # Stays None when n_epochs is 0 (no training pass was run).
        final_train_loss = None

        for epoch in range(n_epochs):
            model.train()
            running_cost = 0.0
            for x_batch, y_batch in train_loader:
                optimizer.zero_grad()
                y_pred = model(x_batch)
                cost = cost_function(y_pred, y_batch)
                cost.backward()
                optimizer.step()
                # Weight by batch size so a smaller last batch is not over-counted.
                running_cost += cost.item() * x_batch.size(0)

            final_train_loss = running_cost / len(X_train_fold)
            if epoch % 10 == 0:
                print(f"Epoch {epoch+1}: Train Loss = {final_train_loss:.4f}")

        # Validation on the held-out part of this fold
        val_loss = _dataset_mean_loss(model, val_loader, cost_function, len(X_val_fold))

        fold_results['train_loss'].append(final_train_loss)
        fold_results['val_loss'].append(val_loss)
        fold_results['models'].append(model)

        # None cannot be formatted with ':.4f', so show a placeholder instead.
        train_loss_text = "n/a" if final_train_loss is None else f"{final_train_loss:.4f}"
        print(f"\nFold {fold} Results:")
        print(f"  Final Train Loss: {train_loss_text}")
        print(f"  Validation Loss: {val_loss:.4f}")

    recorded_train_losses = [t for t in fold_results['train_loss'] if t is not None]
    # np.mean([]) warns and returns NaN; return NaN explicitly when nothing was trained.
    avg_train_loss = np.mean(recorded_train_losses) if recorded_train_losses else float('nan')
    avg_val_loss = np.mean(fold_results['val_loss'])
    std_val_loss = np.std(fold_results['val_loss'])
    print(f"K-Fold Cross-Validation Results (k={k})")
    print(f"Average Train Loss: {avg_train_loss:.4f}")
    print(f"Average Val Loss: {avg_val_loss:.4f} (+/- {std_val_loss:.4f})")
    print(f"Individual Fold Val Losses: {[f'{loss:.4f}' for loss in fold_results['val_loss']]}")

    fold_results['avg_train_loss'] = avg_train_loss
    fold_results['avg_val_loss'] = avg_val_loss
    fold_results['std_val_loss'] = std_val_loss
    return fold_results


In [None]:

if __name__ == "__main__":

    # 5-fold cross-validation on X_train / y_train with the settings below
    results = kfold_cross_validation_pytorch(
        X_train, y_train,
        k=5, batch_size=16, learning_rate=0.0001, n_epochs=100,
    )

    # argmin is 0-based; add 1 to report the fold number as printed during training
    print(f"\nBest fold (lowest val loss): {np.argmin(results['val_loss']) + 1}")

Training Fold 1/5
Epoch 1: Train Loss = 0.0004
Epoch 1: Train Loss = 0.0004
Epoch 11: Train Loss = 0.0001
Epoch 11: Train Loss = 0.0001
Epoch 21: Train Loss = 0.0000
Epoch 21: Train Loss = 0.0000
Epoch 31: Train Loss = 0.0000
Epoch 31: Train Loss = 0.0000
Epoch 41: Train Loss = 0.0000
Epoch 41: Train Loss = 0.0000
Epoch 51: Train Loss = 0.0000
Epoch 51: Train Loss = 0.0000
Epoch 61: Train Loss = 0.0000
Epoch 61: Train Loss = 0.0000
Epoch 71: Train Loss = 0.0000
Epoch 71: Train Loss = 0.0000
Epoch 81: Train Loss = 0.0000
Epoch 81: Train Loss = 0.0000
Epoch 91: Train Loss = 0.0000
Epoch 91: Train Loss = 0.0000

Fold 1 Results:
  Final Train Loss: 0.0000
  Validation Loss: 0.0000
Training Fold 2/5

Fold 1 Results:
  Final Train Loss: 0.0000
  Validation Loss: 0.0000
Training Fold 2/5
Epoch 1: Train Loss = 0.0004
Epoch 1: Train Loss = 0.0004
