# Binary - Load Data

In [17]:
import pandas as pd
import torch
from torch.utils.data import TensorDataset
from torch.utils.data.dataset import random_split
from math import ceil

train_data = pd.read_csv("/Users/svenschnydrig/My Drive/Data Science Project - Team D/data/fe2-binary/train_fe2_binary.csv")
y = train_data["target"]
X = train_data.drop(["ID_code", "target"], axis=1)
X_tensor = torch.tensor(X.values, dtype=torch.float32)
y_tensor = torch.tensor(y.values, dtype=torch.float32)
ds = TensorDataset(X_tensor, y_tensor)
train_ds, val_ds = random_split(ds, [int(0.9*len(ds)), ceil(0.1*len(ds))])
#for final training we use the whole dataset
train_ds = TensorDataset(X_tensor, y_tensor)

test_data = pd.read_csv("/Users/svenschnydrig/My Drive/Data Science Project - Team D/data/fe2-binary/test_fe2_binary.csv")
test_ids = test_data["ID_code"]
X = test_data.drop(["ID_code"], axis=1)
X_tensor = torch.tensor(X.values, dtype=torch.float32)
y_tensor = torch.tensor(y.values, dtype=torch.float32)
test_ds = TensorDataset(X_tensor, y_tensor)

# Binary - Train model

In [18]:
import torch
from sklearn import metrics
from tqdm import tqdm
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import torch.nn.functional as F
import pandas as pd
import numpy as np
import torch

class NN(nn.Module):
    def __init__(self, input_size, hidden_dim):
        super(NN, self).__init__()
        self.bn = nn.BatchNorm1d(input_size)
        self.fc1 = nn.Linear(2, hidden_dim)
        self.fc2 = nn.Linear(input_size//2*hidden_dim, 1)

    def forward(self, x):
        N = x.shape[0]
        x = self.bn(x)
        og_features = x[:, :185].unsqueeze(2) # (N, 200, 1)
        new_features = x[:, 185:].unsqueeze(2) # (N, 200, 1)
        x = torch.cat([og_features, new_features], dim = 2) # (N, 200, 2)
        x = F.relu(self.fc1(x)).reshape(N, -1) # (N, 200*hidden_dim)
        return torch.sigmoid(self.fc2(x)).view(-1)

DEVICE = torch.device("mps")

model = NN(input_size=370, hidden_dim=200).to(DEVICE)
optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-4)
loss_fn = nn.BCELoss()
train_loader = DataLoader(train_ds, batch_size=1000, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=1000)
test_loader = DataLoader(test_ds, batch_size=1000)

def get_predictions(loader, model, device):
    model.eval()
    saved_preds = []
    true_labels = []

    with torch.no_grad():
        for x,y in loader:
            x = x.to(device)
            y = y.to(device)
            scores = model(x)
            saved_preds += scores.tolist()
            true_labels += y.tolist()

    model.train()
    return saved_preds, true_labels

for epoch in range(20):
    probabilities, true = get_predictions(val_loader, model, device=DEVICE)
    print(f"VALIDATION ROC: {metrics.roc_auc_score(true, probabilities)}")

    for batch_idx, (data, targets) in enumerate(train_loader):
        data = data.to(DEVICE)
        targets = targets.to(DEVICE)

        # forward
        scores = model(data)
        loss = loss_fn(scores, targets)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

def get_submission(model, loader, test_ids, device):
    all_preds = []
    model.eval()
    with torch.no_grad():
        for x,y in loader:
            print(x.shape)
            x = x.to(device)
            score = model(x)
            prediction = score.float()
            all_preds += prediction.tolist()

    model.train()

    df = pd.DataFrame({
        "ID_code" : test_ids.values,
        "target" : np.array(all_preds)
    })

    df.to_csv("/Users/svenschnydrig/My Drive/Data Science Project - Team D/submission/various_submissions/fe2_binary.csv", index=False)

get_submission(model, test_loader, test_ids, DEVICE)

VALIDATION ROC: 0.5284195592292358
VALIDATION ROC: 0.9146103340682112
VALIDATION ROC: 0.9195261916252279
VALIDATION ROC: 0.9218650588646315
VALIDATION ROC: 0.9221349864894455
VALIDATION ROC: 0.9226086389595776
VALIDATION ROC: 0.9243971061264146
VALIDATION ROC: 0.9249448699932465
VALIDATION ROC: 0.9243810054290247
VALIDATION ROC: 0.9247614410974553
VALIDATION ROC: 0.9256089670679793
VALIDATION ROC: 0.9257375317035629
VALIDATION ROC: 0.9258765561231902
VALIDATION ROC: 0.9264922093742627
VALIDATION ROC: 0.9253587259472801
VALIDATION ROC: 0.9259041370713595
VALIDATION ROC: 0.9261218649879989
VALIDATION ROC: 0.9259249716005526
VALIDATION ROC: 0.9259120598617127
VALIDATION ROC: 0.9260746542283111
torch.Size([1000, 370])
torch.Size([1000, 370])
torch.Size([1000, 370])
torch.Size([1000, 370])
torch.Size([1000, 370])
torch.Size([1000, 370])
torch.Size([1000, 370])
torch.Size([1000, 370])
torch.Size([1000, 370])
torch.Size([1000, 370])
torch.Size([1000, 370])
torch.Size([1000, 370])
torch.Size([

# Sum - Load data

In [19]:
import pandas as pd
import torch
from torch.utils.data import TensorDataset
from torch.utils.data.dataset import random_split
from math import ceil

train_data = pd.read_csv("/Users/svenschnydrig/My Drive/Data Science Project - Team D/data/fe2-sum/train_fe2_sum.csv")
y = train_data["target"]
X = train_data.drop(["ID_code", "target"], axis=1)
X_tensor = torch.tensor(X.values, dtype=torch.float32)
y_tensor = torch.tensor(y.values, dtype=torch.float32)
ds = TensorDataset(X_tensor, y_tensor)
train_ds, val_ds = random_split(ds, [int(0.9*len(ds)), ceil(0.1*len(ds))])
#for final training we use the whole dataset
train_ds = TensorDataset(X_tensor, y_tensor)

test_data = pd.read_csv("/Users/svenschnydrig/My Drive/Data Science Project - Team D/data/fe2-sum/test_fe2_sum.csv")
test_ids = test_data["ID_code"]
X = test_data.drop(["ID_code"], axis=1)
X_tensor = torch.tensor(X.values, dtype=torch.float32)
y_tensor = torch.tensor(y.values, dtype=torch.float32)
test_ds = TensorDataset(X_tensor, y_tensor)

# Sum - Train Model

In [20]:
import torch
from sklearn import metrics
from tqdm import tqdm
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import torch.nn.functional as F
import pandas as pd
import numpy as np
import torch

class NN(nn.Module):
    def __init__(self, input_size, hidden_dim):
        super(NN, self).__init__()
        self.bn = nn.BatchNorm1d(input_size)
        self.fc1 = nn.Linear(5, hidden_dim)
        self.fc2 = nn.Linear(input_size//5*hidden_dim, 1)

    def forward(self, x):
        N = x.shape[0]
        x = self.bn(x)
        og_features = x[:, :185].unsqueeze(2) # (N, 200, 1)
        vc = x[:, 185:370].unsqueeze(2) # (N, 200, 1)
        sum = x[:, 370:555].unsqueeze(2) # (N, 200, 1)
        sum2 = x[:, 555:740].unsqueeze(2) # (N, 200, 1)
        sum3 = x[:, 740:].unsqueeze(2) # (N, 200, 1)
        x = torch.cat([og_features, vc, sum, sum2, sum3], dim = 2) # (N, 200, 2)
        x = F.relu(self.fc1(x)).reshape(N, -1) # (N, 200*hidden_dim)
        return torch.sigmoid(self.fc2(x)).view(-1)

DEVICE = torch.device("mps")

model = NN(input_size=925, hidden_dim=200).to(DEVICE)
optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-4)
loss_fn = nn.BCELoss()
train_loader = DataLoader(train_ds, batch_size=1000, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=1000)
test_loader = DataLoader(test_ds, batch_size=1000)

def get_predictions(loader, model, device):
    model.eval()
    saved_preds = []
    true_labels = []

    with torch.no_grad():
        for x,y in loader:
            x = x.to(device)
            y = y.to(device)
            scores = model(x)
            saved_preds += scores.tolist()
            true_labels += y.tolist()

    model.train()
    return saved_preds, true_labels

for epoch in range(20):
    probabilities, true = get_predictions(val_loader, model, device=DEVICE)
    print(f"VALIDATION ROC: {metrics.roc_auc_score(true, probabilities)}")

    for batch_idx, (data, targets) in enumerate(train_loader):
        data = data.to(DEVICE)
        targets = targets.to(DEVICE)

        # forward
        scores = model(data)
        loss = loss_fn(scores, targets)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

def get_submission(model, loader, test_ids, device):
    all_preds = []
    model.eval()
    with torch.no_grad():
        for x,y in loader:
            print(x.shape)
            x = x.to(device)
            score = model(x)
            prediction = score.float()
            all_preds += prediction.tolist()

    model.train()

    df = pd.DataFrame({
        "ID_code" : test_ids.values,
        "target" : np.array(all_preds)
    })

    df.to_csv("/Users/svenschnydrig/My Drive/Data Science Project - Team D/submission/various_submissions/fe2_sum.csv", index=False)

get_submission(model, test_loader, test_ids, DEVICE)

VALIDATION ROC: 0.47650633057504393
VALIDATION ROC: 0.9074651522098831
VALIDATION ROC: 0.9153364195319942
VALIDATION ROC: 0.9201236751721716
VALIDATION ROC: 0.9236999221710588
VALIDATION ROC: 0.9253961925549404
VALIDATION ROC: 0.9271585537175396
VALIDATION ROC: 0.9285721948982817
VALIDATION ROC: 0.9280809361568305
VALIDATION ROC: 0.9291376183595754
VALIDATION ROC: 0.9310836444895448
VALIDATION ROC: 0.9298832818966888
VALIDATION ROC: 0.9314374674222665
VALIDATION ROC: 0.9316476374740464
VALIDATION ROC: 0.9325056484576875
VALIDATION ROC: 0.9324301632977289
VALIDATION ROC: 0.9332097456403194
VALIDATION ROC: 0.9334438761783691
VALIDATION ROC: 0.9329319305417688
VALIDATION ROC: 0.9332427979069704
torch.Size([1000, 925])
torch.Size([1000, 925])
torch.Size([1000, 925])
torch.Size([1000, 925])
torch.Size([1000, 925])
torch.Size([1000, 925])
torch.Size([1000, 925])
torch.Size([1000, 925])
torch.Size([1000, 925])
torch.Size([1000, 925])
torch.Size([1000, 925])
torch.Size([1000, 925])
torch.Size(

Sources
- https://www.youtube.com/watch?v=MOnk75_8b9M&t=2995s
- https://pytorch.org/tutorials/beginner/basics/buildmodel_tutorial.html
- https://www.amazon.de/Sebastian-Raschka/dp/1801819319/ref=sr_1_1_sspa?crid=1LIHNO6T7C2AI&keywords=pylampe&qid=1671791736&sprefix=pytorch%2Caps%2C99&sr=8-1-spons&sp_csd=d2lkZ2V0TmFtZT1zcF9hdGY&psc=1
- https://medium.com/analytics-vidhya/santander-customer-transaction-prediction-an-end-to-end-machine-learning-project-2cb763172f8a