In [17]:
# USE AS MANY CELLS AS NEEDED
import torch
import torch.nn as nn
import pandas as pd
import numpy as np

from torch import optim
from sklearn.model_selection import KFold
from torch.utils.data import TensorDataset, DataLoader
from sklearn.preprocessing import LabelEncoder, StandardScaler

#from torch.utils.data import TensorDataset, DataLoader
    

In [18]:
# Read the four CSV splits (train/test features and targets) from the local
# Downloads folder. The tuple order below matches the unpacking order.
X_train, y_train, X_test, y_test = map(
    pd.read_csv,
    (
        '~/Downloads/X_train.csv',
        '~/Downloads/y_train.csv',
        '~/Downloads/X_test.csv',
        '~/Downloads/y_test.csv',
    ),
)

In [19]:
import ast

# NOTE: this cell rebinds X_train / y_train at the end, so it must be run on the
# freshly loaded frames; re-running it without re-running the load cell would
# feed it the already-processed data.

# Clean and parse data
# Features and targets are joined side by side so that dropping incomplete rows
# removes the same rows from both.
train = pd.concat([X_train, y_train], axis=1)
train_clean = train.dropna()
X_train_clean = train_clean.iloc[:, :-1]
# The last column is taken as the target; assumes y_train has exactly one column.
y_train_clean = train_clean.iloc[:, -1]

# Parse coordinates
# Each target cell is parsed from its string literal form and the parsed rows
# are stacked into one 2-D array.
y_coords = y_train_clean.apply(ast.literal_eval)
y_clean = np.vstack(y_coords.values)

print(f"Total samples: {len(X_train_clean):,}")

# sample the data
# np.random.choice(..., replace=False) raises ValueError when asked for more
# items than the population holds, so the request is capped at the row count.
sample_size = min(100000, len(X_train_clean))
np.random.seed(42)  # fixed seed so the same subsample is drawn on every run
sample_indices = np.random.choice(len(X_train_clean), size=sample_size, replace=False)
X_train_sample = X_train_clean.iloc[sample_indices].reset_index(drop=True)
y_train_sample = y_clean[sample_indices]

print(f"Using {sample_size:,} samples for Neural Network training")

# Update the variables
X_train = X_train_sample
# Column names assume each parsed pair is ordered (lat, lon) -- TODO confirm.
y_train = pd.DataFrame(y_train_sample, columns=['lat', 'lon'])

print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")

Total samples: 946,609
Using 100,000 samples for Neural Network training
X_train shape: (100000, 16)
y_train shape: (100000, 2)


In [20]:
class MyModel(nn.Module):
    """Empty placeholder; the next cell redefines ``MyModel`` with the real layers."""

In [21]:

class MyModel(nn.Module):
    """Fully connected regressor: ``input_dim`` features in, two values out.

    Layer widths are input_dim -> 64 -> 128 -> 64 -> 2, with ReLU after each
    hidden layer and Dropout(0.2) after the first two hidden layers.
    """

    def __init__(self, input_dim):
        """Build the layer stack for inputs that have ``input_dim`` features."""
        super().__init__()
        first_hidden = [nn.Linear(input_dim, 64), nn.ReLU(), nn.Dropout(0.2)]
        second_hidden = [nn.Linear(64, 128), nn.ReLU(), nn.Dropout(0.2)]
        third_hidden = [nn.Linear(128, 64), nn.ReLU()]
        output_head = [nn.Linear(64, 2)]  # 2 outputs: latitude and longitude
        # Registered in application order; forward() walks this list as-is.
        self.module_list = nn.ModuleList(
            first_hidden + second_hidden + third_hidden + output_head
        )

    def forward(self, x):
        """Apply every registered layer to ``x`` in order and return the result."""
        activations = x
        for step in self.module_list:
            activations = step(activations)
        return activations

In [22]:
# Remove every column that is not used as a model input: identifiers and
# timestamps, the highly unique text columns (dropped for speed), and the
# area-code columns. Names that are absent are skipped.
high_unique = ['Block', 'Description']
unused_cols = (
    ['ID', 'Case Number', 'Updated On', 'Date']
    + high_unique
    + ['Ward', 'Community Area', 'Beat', 'District']
)
X_train = X_train.drop(columns=unused_cols, errors='ignore')

# Cast the boolean flag columns to integers
for flag in ('Arrest', 'Domestic'):
    X_train[flag] = X_train[flag].astype(int)

# Integer-encode the remaining categorical columns, keeping each fitted
# encoder under its column name
cat_cols = ['Primary Type', 'IUCR', 'FBI Code']
encoders = {}
for col in cat_cols:
    if col not in X_train.columns:
        continue
    le = LabelEncoder()
    encoders[col] = le
    X_train[col] = le.fit_transform(X_train[col].astype(str))

# Standardise all remaining feature columns
scaler = StandardScaler()
feature_names = X_train.columns
X_train[feature_names] = scaler.fit_transform(X_train[feature_names])

print(f"Final X_train shape: {X_train.shape}")
print(f"Features: {X_train.columns.tolist()}")


Final X_train shape: (100000, 6)
Features: ['IUCR', 'Primary Type', 'Arrest', 'Domestic', 'FBI Code', 'Year']


In [23]:
def _to_float32_matrix(data):
    """Coerce a DataFrame or array-like to a 2-D float32 NumPy array without NaNs."""
    frame = data if isinstance(data, pd.DataFrame) else pd.DataFrame(data)
    # apply() returns a new frame, so the caller's data is never modified.
    frame = frame.apply(pd.to_numeric, errors='coerce')
    # Column means first; 0 as a last resort for columns that are entirely NaN
    # (their mean is NaN as well, so the first fill leaves them untouched).
    frame = frame.fillna(frame.mean()).fillna(0)
    return frame.values.astype(np.float32)


def _train_one_epoch(model, loader, cost_function, optimizer):
    """Run one optimisation pass over ``loader``; return the sample-weighted mean loss."""
    model.train()
    running_cost = 0.0
    for x_batch, y_batch in loader:
        optimizer.zero_grad()
        cost = cost_function(model(x_batch), y_batch)
        cost.backward()
        optimizer.step()
        # Weight by batch size so a smaller final batch is not over-counted.
        running_cost += cost.item() * x_batch.size(0)
    return running_cost / len(loader.dataset)


def _evaluate(model, loader, cost_function):
    """Return the sample-weighted mean loss over ``loader`` in eval mode, without gradients."""
    model.eval()
    running_cost = 0.0
    with torch.no_grad():
        for x_batch, y_batch in loader:
            running_cost += cost_function(model(x_batch), y_batch).item() * x_batch.size(0)
    return running_cost / len(loader.dataset)


def kfold_cross_validation_pytorch(X, y, k=5, batch_size=32, learning_rate=0.001, n_epochs=50):
    """Train and validate a fresh ``MyModel`` on each of ``k`` shuffled folds.

    Args:
        X: Feature table (DataFrame or array-like). Values are coerced to
            numeric; anything non-numeric becomes NaN and is imputed with the
            column mean (0 if the whole column is NaN).
        y: Target table (DataFrame or array-like), coerced the same way as X.
        k: Number of folds.
        batch_size: Mini-batch size for both the training and validation loaders.
        learning_rate: Learning rate passed to the Adam optimiser.
        n_epochs: Number of training epochs per fold.

    Returns:
        dict with per-fold lists ``'train_loss'`` (last-epoch training loss, or
        None when ``n_epochs`` is 0), ``'val_loss'`` and ``'models'``, plus the
        scalars ``'avg_train_loss'``, ``'avg_val_loss'`` and ``'std_val_loss'``.

    Raises:
        ValueError: if X and y do not have the same number of rows.
    """
    X_arr = _to_float32_matrix(X)
    y_arr = _to_float32_matrix(y)
    if len(X_arr) != len(y_arr):
        raise ValueError(
            f"X and y must have the same number of rows, got {len(X_arr)} and {len(y_arr)}"
        )

    #get input dimension
    input_dim = X_arr.shape[1]
    print(f"Input dimension: {input_dim}")
    print(f"Output dimension: {y_arr.shape[1]}")

    kf = KFold(n_splits=k, shuffle=True, random_state=42)
    fold_results = {
        'train_loss': [],
        'val_loss': [],
        'models': []
    }

    for fold, (train_idx, val_idx) in enumerate(kf.split(X_arr), 1):
        print(f"Training Fold {fold}/{k}")

        # Create tensors for this fold
        X_train_fold = torch.tensor(X_arr[train_idx], dtype=torch.float32)
        y_train_fold = torch.tensor(y_arr[train_idx], dtype=torch.float32)
        X_val_fold = torch.tensor(X_arr[val_idx], dtype=torch.float32)
        y_val_fold = torch.tensor(y_arr[val_idx], dtype=torch.float32)

        # Create DataLoaders
        train_loader = DataLoader(
            TensorDataset(X_train_fold, y_train_fold), batch_size=batch_size, shuffle=True
        )
        val_loader = DataLoader(
            TensorDataset(X_val_fold, y_val_fold), batch_size=batch_size, shuffle=False
        )

        # Re-seed before building the model so every fold starts from the same
        # random state.
        torch.manual_seed(1234)
        model = MyModel(input_dim=input_dim)
        cost_function = nn.MSELoss()
        optimizer = optim.Adam(model.parameters(), lr=learning_rate)  # Adam instead of SGD

        # The training loss is accumulated while the model is in train() mode,
        # whereas the validation loss below is measured in eval() mode, so the
        # two numbers are not measured under identical conditions.
        train_losses = []
        for epoch in range(n_epochs):
            epoch_train_loss = _train_one_epoch(model, train_loader, cost_function, optimizer)
            train_losses.append(epoch_train_loss)

            # Print every 10 epochs
            if (epoch + 1) % 10 == 0:
                print(f"Epoch {epoch+1}/{n_epochs}: Train Loss = {epoch_train_loss:.6f}")

        # Validation
        val_loss = _evaluate(model, val_loader, cost_function)
        final_train_loss = train_losses[-1] if train_losses else None

        fold_results['train_loss'].append(final_train_loss)
        fold_results['val_loss'].append(val_loss)
        fold_results['models'].append(model)

        # With n_epochs == 0 there is no training loss; formatting None with
        # ':.6f' would raise TypeError, so fall back to a placeholder.
        train_loss_text = "n/a" if final_train_loss is None else f"{final_train_loss:.6f}"
        print(f"\nFold {fold} Results:")
        print(f"  Final Train Loss: {train_loss_text}")
        print(f"  Validation Loss: {val_loss:.6f}")

    # Summary statistics
    recorded_train_losses = [t for t in fold_results['train_loss'] if t is not None]
    # np.mean([]) warns and returns nan; produce nan explicitly instead.
    avg_train_loss = np.mean(recorded_train_losses) if recorded_train_losses else float('nan')
    avg_val_loss = np.mean(fold_results['val_loss'])
    std_val_loss = np.std(fold_results['val_loss'])

    print(f"K-Fold Cross-Validation Results (k={k})")
    print(f"Average Train Loss: {avg_train_loss:.6f}")
    print(f"Average Val Loss: {avg_val_loss:.6f} (+/- {std_val_loss:.6f})")
    print(f"Individual Fold Val Losses: {[f'{loss:.6f}' for loss in fold_results['val_loss']]}")

    fold_results['avg_train_loss'] = avg_train_loss
    fold_results['avg_val_loss'] = avg_val_loss
    fold_results['std_val_loss'] = std_val_loss
    return fold_results

In [42]:
# Run training

if __name__ == "__main__":
    # Cross-validate on the preprocessed training sample; the result holds one
    # trained model and one validation loss per fold.
    cv_settings = dict(k=5, batch_size=32, learning_rate=0.001, n_epochs=50)
    results = kfold_cross_validation_pytorch(X=X_train, y=y_train, **cv_settings)

    # Select the fold whose model reached the lowest validation loss
    fold_val_losses = results['val_loss']
    best_fold = np.argmin(fold_val_losses)

    print(f"\nBest fold (lowest val loss): {best_fold + 1}")
    print(f"Best val loss: {min(fold_val_losses):.6f}")

    best_model = results['models'][best_fold]

    # Score that model on the complete training sample. This includes the rows
    # that served as its validation fold during cross-validation.
    X_tensor = torch.tensor(X_train.values, dtype=torch.float32)
    y_tensor = torch.tensor(y_train.values, dtype=torch.float32)
    mse_loss = nn.MSELoss()

    best_model.eval()
    with torch.no_grad():
        y_pred = best_model(X_tensor)
        best_fold_train_mse = mse_loss(y_pred, y_tensor).item()

    print(f"Train MSE for best fold: {best_fold_train_mse:.6f}")

Input dimension: 6
Output dimension: 2
Training Fold 1/5
Epoch 10/50: Train Loss = 0.009081
Epoch 20/50: Train Loss = 0.006073
Epoch 30/50: Train Loss = 0.006105
Epoch 40/50: Train Loss = 0.006059
Epoch 50/50: Train Loss = 0.006083

Fold 1 Results:
  Final Train Loss: 0.006083
  Validation Loss: 0.005958
Training Fold 2/5
Epoch 10/50: Train Loss = 0.009258
Epoch 20/50: Train Loss = 0.006058
Epoch 30/50: Train Loss = 0.006050
Epoch 40/50: Train Loss = 0.006052
Epoch 50/50: Train Loss = 0.006029

Fold 2 Results:
  Final Train Loss: 0.006029
  Validation Loss: 0.006051
Training Fold 3/5
Epoch 10/50: Train Loss = 0.009135
Epoch 20/50: Train Loss = 0.006008
Epoch 30/50: Train Loss = 0.006035
Epoch 40/50: Train Loss = 0.006023
Epoch 50/50: Train Loss = 0.006052

Fold 3 Results:
  Final Train Loss: 0.006052
  Validation Loss: 0.006727
Training Fold 4/5
Epoch 10/50: Train Loss = 0.008881
Epoch 20/50: Train Loss = 0.006042
Epoch 30/50: Train Loss = 0.006008
Epoch 40/50: Train Loss = 0.006080
Ep