In [64]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

# sklearn imports for preprocessing
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

# 1) Read the raw training table from disk
df = pd.read_csv('/content/train.csv')

# 2) Remove the columns that are not fed to the model
df = df.drop(columns=['id', 'CustomerId', 'Surname'])

# 3) Pull out the target vector; every remaining column is a feature
y = df['Exited'].values
X = df.drop(columns='Exited')

# 4) Replace the two categorical columns with integer codes. The fitted
#    encoders stay in scope so the same mapping can be re-applied later.
le_geo = LabelEncoder()
le_gen = LabelEncoder()
X['Geography'] = le_geo.fit_transform(X['Geography'])
X['Gender'] = le_gen.fit_transform(X['Gender'])

# 5) Stratified 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X.values,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# 6) Standardize: statistics come from the training rows only and are then
#    applied unchanged to the held-out rows
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# 7) Build float32 tensors on the compute device and a mini-batch loader
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Features keep their 2-D layout; targets get a trailing axis of size 1 so
# they line up with the single-column output of the network.
X_train_t = torch.as_tensor(X_train, dtype=torch.float32, device=device)
y_train_t = torch.as_tensor(y_train, dtype=torch.float32, device=device).unsqueeze(1)
X_test_t = torch.as_tensor(X_test, dtype=torch.float32, device=device)
y_test_t = torch.as_tensor(y_test, dtype=torch.float32, device=device).unsqueeze(1)

train_ds = TensorDataset(X_train_t, y_train_t)
train_loader = DataLoader(dataset=train_ds, shuffle=True, batch_size=64)

# 8) Define simple MLP
class MLP(nn.Module):
    """Feed-forward binary classifier that outputs one raw logit per row.

    Each hidden stage is ``Linear -> ReLU -> Dropout``; a final ``Linear``
    maps to a single output. No sigmoid is applied here, so the output is
    meant for a logit-based loss such as ``nn.BCEWithLogitsLoss``.

    Args:
        in_dim: Number of input features per row.
        hidden_dims: Widths of the hidden layers, in order. The default
            ``(64, 32)`` reproduces the original fixed architecture
            (including the ``net.0`` / ``net.3`` / ``net.6`` parameter
            names). An empty sequence yields a single linear layer.
        dropout: Dropout probability applied after every hidden activation.
    """

    def __init__(self, in_dim: int, hidden_dims=(64, 32), dropout: float = 0.3):
        super().__init__()
        layers = []
        prev_dim = in_dim
        for width in hidden_dims:
            layers.extend([nn.Linear(prev_dim, width), nn.ReLU(), nn.Dropout(dropout)])
            prev_dim = width
        # Output head: one logit per input row.
        layers.append(nn.Linear(prev_dim, 1))
        self.net = nn.Sequential(*layers)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return the raw logits produced by the layer stack for ``x``."""
        return self.net(x)

# Network sized to the number of feature columns, placed on the chosen device
model = MLP(X_train.shape[1])
model = model.to(device)

# The loss consumes raw logits, matching the un-activated output of the model
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# 9) Training loop
epochs = 20
n_train_rows = len(train_ds)
for epoch in range(1, epochs + 1):
    model.train()  # enable dropout for the optimisation pass
    running_loss = 0.0
    for batch_x, batch_y in train_loader:
        optimizer.zero_grad()
        loss = criterion(model(batch_x), batch_y)
        loss.backward()
        optimizer.step()
        # Weight each batch loss by its row count so that dividing by the
        # dataset size below gives a per-row average even when the last
        # batch is smaller.
        running_loss += loss.item() * batch_x.size(0)
    avg_loss = running_loss / n_train_rows
    print(f"Epoch {epoch}/{epochs}  Loss: {avg_loss:.4f}")

# 10) Score the held-out split
model.eval()  # switch dropout off for inference
with torch.no_grad():
    logits = model(X_test_t)
    probs = torch.sigmoid(logits).cpu().numpy().reshape(-1)

# Hard labels at a 0.5 probability cut-off
preds = (probs > 0.5).astype(int)
y_true = y_test

# Confusion-matrix counts built from boolean masks
predicted_pos = preds == 1
predicted_neg = preds == 0
actual_pos = y_true == 1
actual_neg = y_true == 0

tp = (predicted_pos & actual_pos).sum()
tn = (predicted_neg & actual_neg).sum()
fp = (predicted_pos & actual_neg).sum()
fn = (predicted_neg & actual_pos).sum()

# The small constant keeps each ratio defined when a denominator would be 0
eps = 1e-8
precision = tp / (tp + fp + eps)
recall = tp / (tp + fn + eps)
f1 = 2 * precision * recall / (precision + recall + eps)

print(f"Test Precision: {precision:.4f}")
print(f"Test Recall:    {recall:.4f}")
print(f"Test F1-score:  {f1:.4f}")

# 11) Inference on /content/test.csv and write prediction.csv
test_df = pd.read_csv('/content/test.csv')
ids = test_df['id']  # kept aside for the submission file

# Drop the same unused columns and re-apply the encoders fitted on the
# training data, so category -> integer codes match what the model saw.
test_df = test_df.drop(columns=['CustomerId', 'Surname'])
test_df['Geography'] = le_geo.transform(test_df['Geography'])
test_df['Gender'] = le_gen.transform(test_df['Gender'])

# Select features BY NAME in the training column order. The scaler and the
# model were fitted on a bare array, so they cannot detect a reordered
# column; picking columns via X.columns keeps each feature in the position
# it had at fit time and raises KeyError if a training feature is absent,
# instead of silently scoring misaligned data.
feature_cols = list(X.columns)
X_sub = scaler.transform(test_df[feature_cols].values)
X_sub_t = torch.from_numpy(X_sub).float().to(device)

model.eval()  # dropout off for prediction
with torch.no_grad():
    logits = model(X_sub_t)
    probs = torch.sigmoid(logits).cpu().numpy().reshape(-1)
    preds = (probs > 0.5).astype(int)

submission = pd.DataFrame({'id': ids, 'Exited': preds})
submission.to_csv('prediction.csv', index=False)
print("Saved prediction.csv")

Epoch 1/20  Loss: 0.4357
Epoch 2/20  Loss: 0.3405
Epoch 3/20  Loss: 0.3205
Epoch 4/20  Loss: 0.3105
Epoch 5/20  Loss: 0.2948
Epoch 6/20  Loss: 0.2858
Epoch 7/20  Loss: 0.2830
Epoch 8/20  Loss: 0.2822
Epoch 9/20  Loss: 0.2735
Epoch 10/20  Loss: 0.2769
Epoch 11/20  Loss: 0.2714
Epoch 12/20  Loss: 0.2716
Epoch 13/20  Loss: 0.2662
Epoch 14/20  Loss: 0.2676
Epoch 15/20  Loss: 0.2647
Epoch 16/20  Loss: 0.2671
Epoch 17/20  Loss: 0.2663
Epoch 18/20  Loss: 0.2664
Epoch 19/20  Loss: 0.2609
Epoch 20/20  Loss: 0.2632
Test Precision: 0.8234
Test Recall:    0.6258
Test F1-score:  0.7112
Saved prediction.csv
