In [1]:
%pip install torch

Note: you may need to restart the kernel to use updated packages.


In [2]:
import json
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split

In [3]:
TARGET = "metastatic_diagnosis_period"
selected_features = json.load(open("data/selected_features.json"))

train_df = pd.read_csv("data/train_preprocessed.csv")[selected_features + [TARGET]]
train_df.head()

Unnamed: 0,patient_race,patient_age,patient_gender,breast_cancer_diagnosis_code,metastatic_cancer_diagnosis_code,metastatic_first_novel_treatment,metastatic_first_novel_treatment_type,dummy_patient_race_4,dummy_payer_type_0,dummy_patient_state_13,...,dummy_metastatic_cancer_diagnosis_code_39,dummy_metastatic_cancer_diagnosis_code_31,dummy_metastatic_cancer_diagnosis_code_21,dummy_metastatic_cancer_diagnosis_code_25,dummy_metastatic_cancer_diagnosis_code_13,dummy_metastatic_first_novel_treatment_2,dummy_metastatic_first_novel_treatment_0,dummy_metastatic_first_novel_treatment_1,dummy_metastatic_first_novel_treatment_type_1,metastatic_diagnosis_period
0,5,39,0,45,3,2,1,0,1,0,...,0,0,0,0,0,1,0,0,1,191.0
1,4,55,0,29,3,2,1,1,0,1,...,0,0,0,0,0,1,0,0,1,33.0
2,5,59,0,7,3,2,1,0,1,0,...,0,0,0,0,0,1,0,0,1,157.0
3,2,59,0,44,3,2,1,0,0,0,...,0,0,0,0,0,1,0,0,1,146.0
4,5,71,0,6,32,2,1,0,1,0,...,0,0,0,0,0,1,0,0,1,286.0


In [4]:
batch_size = 128
learning_rate = 1e-2

In [5]:
X_train, X_val, y_train, y_val = train_test_split(
    train_df[selected_features], train_df[TARGET], test_size=0.2, random_state=777
)
train_data = [[X_train.values[i], y_train.values[i]] for i in range(len(X_train))]
val_data = [[X_val.values[i], y_val.values[i]] for i in range(len(X_val))]

train_loader = torch.utils.data.DataLoader(
    train_data, batch_size=batch_size, shuffle=True
)
val_loader = torch.utils.data.DataLoader(val_data, batch_size=batch_size, shuffle=False)

In [6]:
class NN(nn.Module):
    def __init__(self, input_size, hidden_size=512):
        super(NN, self).__init__()
        self.layers = nn.Sequential(
            nn.Linear(input_size, hidden_size),
            nn.GELU(),
            nn.Linear(hidden_size, hidden_size),
            nn.GELU(),
            nn.Dropout(0.3),
            nn.Linear(hidden_size, 1),
        )

    def forward(self, x):
        return self.layers(x)

In [7]:
model = NN(len(selected_features))
criterion = nn.MSELoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

n_epochs = 92
for epoch in range(n_epochs):
    model.train()
    for i, (x_batch, y_batch) in enumerate(train_loader):
        optimizer.zero_grad()
        y_pred = model(x_batch.float())
        loss = criterion(y_pred, y_batch.float().reshape(-1, 1))
        loss.backward()
        optimizer.step()
    model.eval()
    with torch.no_grad():
        val_loss = 0
        for x_batch, y_batch in val_loader:
            y_pred = model(x_batch.float())
            val_loss += criterion(y_pred, y_batch.float().reshape(-1, 1))
        print(
            f"Epoch {epoch + 1}/{n_epochs}, val_loss: {val_loss.item() / len(val_loader)}"
        )

Epoch 1/92, val_loss: 7192.334077380952
Epoch 2/92, val_loss: 7166.584077380952
Epoch 3/92, val_loss: 6811.773065476191
Epoch 4/92, val_loss: 6912.171875
Epoch 5/92, val_loss: 6754.961309523809
Epoch 6/92, val_loss: 7149.723958333333
Epoch 7/92, val_loss: 6696.693452380952
Epoch 8/92, val_loss: 6872.374255952381
Epoch 9/92, val_loss: 7123.037202380952
Epoch 10/92, val_loss: 6730.520089285715
Epoch 11/92, val_loss: 6678.228422619048
Epoch 12/92, val_loss: 6834.256696428572
Epoch 13/92, val_loss: 6746.7641369047615
Epoch 14/92, val_loss: 6827.141369047619
Epoch 15/92, val_loss: 6805.537202380952
Epoch 16/92, val_loss: 7670.909226190476
Epoch 17/92, val_loss: 6784.287202380952
Epoch 18/92, val_loss: 6736.212797619048
Epoch 19/92, val_loss: 6984.960565476191
Epoch 20/92, val_loss: 6747.883928571428
Epoch 21/92, val_loss: 7022.824404761905
Epoch 22/92, val_loss: 6881.055803571428
Epoch 23/92, val_loss: 6708.364583333333
Epoch 24/92, val_loss: 7156.453869047619
Epoch 25/92, val_loss: 6676.93

In [8]:
test_df = pd.read_csv("data/test_preprocessed.csv")[selected_features + ["patient_id"]]
test_data = [
    [test_df[selected_features].values[i], test_df["patient_id"].values[i]]
    for i in range(len(test_df))
]
test_loader = torch.utils.data.DataLoader(
    test_data, batch_size=batch_size, shuffle=False
)
evaluations = []

model.eval()
with torch.no_grad():
    for x_batch, ids in test_loader:
        y_pred = model(x_batch.to(torch.float))
        for i in range(len(ids)):
            ypred = y_pred[i].item()
            y = np.uint16(np.around(np.clip(ypred, a_min = 0, a_max = np.inf),0))
            evaluations.append(
                {"patient_id": int(ids[i]), "metastatic_diagnosis_period": y}
            )

pd.DataFrame(evaluations).to_csv("evaluation.csv", index=False)