# Binary - Load Data

In [5]:
import pandas as pd
import torch
from torch.utils.data import TensorDataset
from torch.utils.data.dataset import random_split
from math import ceil

train_data = pd.read_csv("/Users/svenschnydrig/My Drive/Data Science Project - Team D/data/fe1-binary/train_fe1_binary.csv")
y = train_data["target"]
X = train_data.drop(["ID_code", "target"], axis=1)
X_tensor = torch.tensor(X.values, dtype=torch.float32)
y_tensor = torch.tensor(y.values, dtype=torch.float32)
ds = TensorDataset(X_tensor, y_tensor)
train_ds, val_ds = random_split(ds, [int(0.9*len(ds)), ceil(0.1*len(ds))])
#for final training we use the whole dataset
train_ds = TensorDataset(X_tensor, y_tensor)

test_data = pd.read_csv("/Users/svenschnydrig/My Drive/Data Science Project - Team D/data/fe1-binary/test_fe1_binary.csv")
test_ids = test_data["ID_code"]
X = test_data.drop(["ID_code"], axis=1)
X_tensor = torch.tensor(X.values, dtype=torch.float32)
y_tensor = torch.tensor(y.values, dtype=torch.float32)
test_ds = TensorDataset(X_tensor, y_tensor)

# Binary - Train model

In [6]:
import torch
from sklearn import metrics
from tqdm import tqdm
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import torch.nn.functional as F
import pandas as pd
import numpy as np
import torch

class NN(nn.Module):
    def __init__(self, input_size, hidden_dim):
        super(NN, self).__init__()
        self.bn = nn.BatchNorm1d(input_size)
        self.fc1 = nn.Linear(2, hidden_dim)
        self.fc2 = nn.Linear(input_size//2*hidden_dim, 1)

    def forward(self, x):
        N = x.shape[0]
        x = self.bn(x)
        og_features = x[:, :200].unsqueeze(2) # (N, 200, 1)
        new_features = x[:, 200:].unsqueeze(2) # (N, 200, 1)
        x = torch.cat([og_features, new_features], dim = 2) # (N, 200, 2)
        x = F.relu(self.fc1(x)).reshape(N, -1) # (N, 200*hidden_dim)
        return torch.sigmoid(self.fc2(x)).view(-1)

DEVICE = torch.device("mps")

model = NN(input_size=400, hidden_dim=200).to(DEVICE)
optimizer = optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.BCELoss()
train_loader = DataLoader(train_ds, batch_size=1000, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=1000)
test_loader = DataLoader(test_ds, batch_size=1000)

def get_predictions(loader, model, device):
    model.eval()
    saved_preds = []
    true_labels = []

    with torch.no_grad():
        for x,y in loader:
            x = x.to(device)
            y = y.to(device)
            scores = model(x)
            saved_preds += scores.tolist()
            true_labels += y.tolist()

    model.train()
    return saved_preds, true_labels

for epoch in range(20):
    probabilities, true = get_predictions(val_loader, model, device=DEVICE)
    print(f"VALIDATION ROC: {metrics.roc_auc_score(true, probabilities)}")

    for batch_idx, (data, targets) in enumerate(train_loader):
        data = data.to(DEVICE)
        targets = targets.to(DEVICE)

        # forward
        scores = model(data)
        loss = loss_fn(scores, targets)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

def get_submission(model, loader, test_ids, device):
    all_preds = []
    model.eval()
    with torch.no_grad():
        for x,y in loader:
            print(x.shape)
            x = x.to(device)
            score = model(x)
            prediction = score.float()
            all_preds += prediction.tolist()

    model.train()

    df = pd.DataFrame({
        "ID_code" : test_ids.values,
        "target" : np.array(all_preds)
    })

    df.to_csv("/Users/svenschnydrig/My Drive/Data Science Project - Team D/submission/various_submissions/fe1_binary.csv", index=False)

get_submission(model, test_loader, test_ids, DEVICE)

VALIDATION ROC: 0.5720433929158676
VALIDATION ROC: 0.9064744447372165
VALIDATION ROC: 0.911923451660937
VALIDATION ROC: 0.9148552854562378
VALIDATION ROC: 0.9167999869273837
VALIDATION ROC: 0.9185485583014857
VALIDATION ROC: 0.9197756684566619
VALIDATION ROC: 0.9201595676232146
VALIDATION ROC: 0.9212593695594598
VALIDATION ROC: 0.9215379388433048
VALIDATION ROC: 0.9213789295297468
VALIDATION ROC: 0.9233873736228646
VALIDATION ROC: 0.9232724843894576
VALIDATION ROC: 0.9225463005517052
VALIDATION ROC: 0.9231271332365141
VALIDATION ROC: 0.9243469445142332
VALIDATION ROC: 0.9239077182224782
VALIDATION ROC: 0.9241019418963055
VALIDATION ROC: 0.9246208702958156
VALIDATION ROC: 0.9241714446398528
torch.Size([1000, 400])
torch.Size([1000, 400])
torch.Size([1000, 400])
torch.Size([1000, 400])
torch.Size([1000, 400])
torch.Size([1000, 400])
torch.Size([1000, 400])
torch.Size([1000, 400])
torch.Size([1000, 400])
torch.Size([1000, 400])
torch.Size([1000, 400])
torch.Size([1000, 400])
torch.Size([1

# Sum - Load data

In [7]:
import pandas as pd
import torch
from torch.utils.data import TensorDataset
from torch.utils.data.dataset import random_split
from math import ceil

train_data = pd.read_csv("/Users/svenschnydrig/My Drive/Data Science Project - Team D/data/fe1-sum/train_fe1_sum.csv")
y = train_data["target"]
X = train_data.drop(["ID_code", "target"], axis=1)
X_tensor = torch.tensor(X.values, dtype=torch.float32)
y_tensor = torch.tensor(y.values, dtype=torch.float32)
ds = TensorDataset(X_tensor, y_tensor)
train_ds, val_ds = random_split(ds, [int(0.9*len(ds)), ceil(0.1*len(ds))])
#for final training we use the whole dataset
train_ds = TensorDataset(X_tensor, y_tensor)

test_data = pd.read_csv("/Users/svenschnydrig/My Drive/Data Science Project - Team D/data/fe1-sum/test_fe1_sum.csv")
test_ids = test_data["ID_code"]
X = test_data.drop(["ID_code"], axis=1)
X_tensor = torch.tensor(X.values, dtype=torch.float32)
y_tensor = torch.tensor(y.values, dtype=torch.float32)
test_ds = TensorDataset(X_tensor, y_tensor)

# Sum - Train Model

In [8]:
import torch
from sklearn import metrics
from tqdm import tqdm
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import torch.nn.functional as F
import pandas as pd
import numpy as np
import torch

class NN(nn.Module):
    def __init__(self, input_size, hidden_dim):
        super(NN, self).__init__()
        self.bn = nn.BatchNorm1d(input_size)
        self.fc1 = nn.Linear(5, hidden_dim)
        self.fc2 = nn.Linear(input_size//5*hidden_dim, 1)

    def forward(self, x):
        N = x.shape[0]
        x = self.bn(x)
        og_features = x[:, :200].unsqueeze(2) # (N, 200, 1)
        vc = x[:, 200:400].unsqueeze(2) # (N, 200, 1)
        sum = x[:, 400:600].unsqueeze(2) # (N, 200, 1)
        sum2 = x[:, 600:800].unsqueeze(2) # (N, 200, 1)
        sum3 = x[:, 800:].unsqueeze(2) # (N, 200, 1)
        x = torch.cat([og_features, vc, sum, sum2, sum3], dim = 2) # (N, 200, 2)
        x = F.relu(self.fc1(x)).reshape(N, -1) # (N, 200*hidden_dim)
        return torch.sigmoid(self.fc2(x)).view(-1)

DEVICE = torch.device("mps")

model = NN(input_size=1000, hidden_dim=200).to(DEVICE)
optimizer = optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.BCELoss()
train_loader = DataLoader(train_ds, batch_size=1000, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=1000)
test_loader = DataLoader(test_ds, batch_size=1000)

def get_predictions(loader, model, device):
    model.eval()
    saved_preds = []
    true_labels = []

    with torch.no_grad():
        for x,y in loader:
            x = x.to(device)
            y = y.to(device)
            scores = model(x)
            saved_preds += scores.tolist()
            true_labels += y.tolist()

    model.train()
    return saved_preds, true_labels

for epoch in range(20):
    probabilities, true = get_predictions(val_loader, model, device=DEVICE)
    print(f"VALIDATION ROC: {metrics.roc_auc_score(true, probabilities)}")

    for batch_idx, (data, targets) in enumerate(train_loader):
        data = data.to(DEVICE)
        targets = targets.to(DEVICE)

        # forward
        scores = model(data)
        loss = loss_fn(scores, targets)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

def get_submission(model, loader, test_ids, device):
    all_preds = []
    model.eval()
    with torch.no_grad():
        for x,y in loader:
            print(x.shape)
            x = x.to(device)
            score = model(x)
            prediction = score.float()
            all_preds += prediction.tolist()

    model.train()

    df = pd.DataFrame({
        "ID_code" : test_ids.values,
        "target" : np.array(all_preds)
    })

    df.to_csv("/Users/svenschnydrig/My Drive/Data Science Project - Team D/submission/various_submissions/fe1_sum.csv", index=False)

get_submission(model, test_loader, test_ids, DEVICE)

VALIDATION ROC: 0.5043079079914253
VALIDATION ROC: 0.9038657187952707
VALIDATION ROC: 0.9123142262787911
VALIDATION ROC: 0.9175648517055162
VALIDATION ROC: 0.9196406456629936
VALIDATION ROC: 0.9212910844650224
VALIDATION ROC: 0.9229965351048455
VALIDATION ROC: 0.9240308273788771
VALIDATION ROC: 0.9253327463172817
VALIDATION ROC: 0.9262446599123566
VALIDATION ROC: 0.9272840001674479
VALIDATION ROC: 0.9284252343385759
VALIDATION ROC: 0.9286720276225524
VALIDATION ROC: 0.9293919506009594
VALIDATION ROC: 0.9297987174725602
VALIDATION ROC: 0.9311986676453614
VALIDATION ROC: 0.9307444051287599
VALIDATION ROC: 0.9311391795591685
VALIDATION ROC: 0.932142207817906
VALIDATION ROC: 0.9317892256505824
torch.Size([1000, 1000])
torch.Size([1000, 1000])
torch.Size([1000, 1000])
torch.Size([1000, 1000])
torch.Size([1000, 1000])
torch.Size([1000, 1000])
torch.Size([1000, 1000])
torch.Size([1000, 1000])
torch.Size([1000, 1000])
torch.Size([1000, 1000])
torch.Size([1000, 1000])
torch.Size([1000, 1000])
t

Sources
- https://www.youtube.com/watch?v=MOnk75_8b9M&t=2995s
- https://pytorch.org/tutorials/beginner/basics/buildmodel_tutorial.html
- https://www.amazon.de/Sebastian-Raschka/dp/1801819319/ref=sr_1_1_sspa?crid=1LIHNO6T7C2AI&keywords=pylampe&qid=1671791736&sprefix=pytorch%2Caps%2C99&sr=8-1-spons&sp_csd=d2lkZ2V0TmFtZT1zcF9hdGY&psc=1
- https://medium.com/analytics-vidhya/santander-customer-transaction-prediction-an-end-to-end-machine-learning-project-2cb763172f8a