In [1]:
import os
import re
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

# Setup

In [2]:
# Set to True to run on CUDA when a GPU is present; otherwise everything runs on CPU.
USE_GPU_IF_AVAILABLE = False

# Directory the trained TorchScript models are saved to and later loaded from.
MODEL_DIR = 'C:/Users/Abdul Zakkar/Documents/UICOM/research/salahudeen/models/'

# CSV columns fed to the network as features.
INPUTS = [
    'age', 'race_2', 'pred_yr1',
]

# CSV columns used as labels (one per year).
OUTPUTS = [
    'canc_yr_1', 'canc_yr_2', 'canc_yr_3', 'canc_yr_4', 'canc_yr_5', 'canc_yr_6',
]

# Training hyperparameters.
BATCH_SIZE = 64
HIDDEN_SIZE = 128
DROPOUT = 0.2
LEARNING_RATE = 1e-4
EPOCHS = 10

# Number of independently trained networks in the ensemble.
N_NETWORKS = 100
# One prediction column per label column; derived so the two cannot drift apart.
N_YEARS = len(OUTPUTS)

In [3]:
# CUDA is used only when it is both requested and actually available.
device = torch.device(
    'cuda' if (USE_GPU_IF_AVAILABLE and torch.cuda.is_available()) else 'cpu'
)
print('Using device:', device)

Using device: cpu


In [4]:
# Feed-forward classifier with built-in input standardization and feature gating
class NeuralNetwork(nn.Module):
    """Three-layer MLP that standardizes and gates its inputs before classifying.

    Args:
        input_size: Number of input features.
        output_size: Number of sigmoid outputs.
        mean: Per-feature values subtracted from the input in ``forward``.
        std: Per-feature values the centered input is divided by in ``forward``.
        **kwargs: Optional ``hidden_size`` (default 128) and ``dropout``
            probability (default 0.2).
    """

    def __init__(self, input_size, output_size, mean, std, **kwargs):
        super().__init__()

        hidden_size = kwargs.get('hidden_size', 128)
        dropout_rate = kwargs.get('dropout', 0.2)

        # Produces one gate value per input feature (see forward).
        self.weight_layer = nn.Linear(input_size, input_size)

        self.dropout = nn.Dropout(dropout_rate)

        self.layer1 = nn.Linear(input_size, hidden_size)
        self.layer2 = nn.Linear(hidden_size, hidden_size)
        self.layer3 = nn.Linear(hidden_size, output_size)

        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

        # Stored as frozen parameters so they are saved and moved with the module.
        self.mean = nn.Parameter(
            torch.tensor(mean, dtype=torch.float32), requires_grad=False
        )
        self.std = nn.Parameter(
            torch.tensor(std, dtype=torch.float32), requires_grad=False
        )

    def forward(self, x):
        """Return sigmoid outputs for a batch of raw (unstandardized) inputs."""
        standardized = (x - self.mean) / self.std

        # Scale every feature by a learned gate in (0, 1).
        gate = torch.sigmoid(self.weight_layer(standardized))
        gated = standardized * gate

        hidden = self.relu(self.layer1(gated))
        hidden = self.dropout(hidden)
        hidden = self.relu(self.layer2(hidden))
        hidden = self.dropout(hidden)
        return self.sigmoid(self.layer3(hidden))

In [5]:
# Custom loss function to handle masks
class MaskedBCELoss(nn.Module):
    """Binary cross-entropy averaged over the entries selected by a mask.

    ``forward(outputs, targets, masks)`` computes the element-wise BCE between
    ``outputs`` and ``targets``, multiplies it by ``masks`` and divides the sum
    by ``masks.sum()``. If the mask sums to zero, the result is 0 instead of NaN.
    """

    def __init__(self):
        super(MaskedBCELoss, self).__init__()
        # Element-wise losses are needed so the mask can be applied before reducing.
        self.bce_loss = nn.BCELoss(reduction='none')

    def forward(self, outputs, targets, masks):
        loss = self.bce_loss(outputs, targets)
        loss = loss * masks  # Zero out the entries whose mask is 0
        # Guard the denominator: with an all-zero mask the numerator is 0 as
        # well, so 0 / eps yields a clean 0 rather than the NaN from 0 / 0.
        denominator = torch.clamp(
            masks.sum().to(loss.dtype), min=torch.finfo(loss.dtype).eps
        )
        return loss.sum() / denominator  # Average loss over present labels

In [6]:
class MaskedDataset(Dataset):
    """Dataset yielding ``(features, labels, masks)`` triples by index.

    The three containers are indexed with the same ``idx``; the dataset length
    is the length of ``features``.
    """

    def __init__(self, features, labels, masks):
        self.features, self.labels, self.masks = features, labels, masks

    def __len__(self):
        n_samples = len(self.features)
        return n_samples

    def __getitem__(self, idx):
        sample = (self.features[idx], self.labels[idx], self.masks[idx])
        return sample

In [7]:
# Mini-batch training loop with optional per-epoch loss reporting
def train_model(model, criterion, optimizer, dataloader, epochs,
    report_every_n_epochs=1, verbose=0
):
    """Train ``model`` in place for ``epochs`` passes over ``dataloader``.

    Args:
        model: Module called on each batch of features.
        criterion: Callable ``criterion(outputs, labels, masks)`` returning the loss.
        optimizer: Optimizer stepped once per batch.
        dataloader: Iterable of ``(features, labels, masks)`` batches.
        epochs: Number of passes over ``dataloader``.
        report_every_n_epochs: Reporting interval, in epochs.
        verbose: The mean batch loss is printed only when this is >= 1.
    """
    for epoch_number in range(1, epochs + 1):
        model.train()
        epoch_loss = 0.0
        for batch_features, batch_labels, batch_masks in dataloader:
            optimizer.zero_grad()
            batch_loss = criterion(model(batch_features), batch_labels, batch_masks)
            batch_loss.backward()
            optimizer.step()
            epoch_loss += batch_loss.item()

        is_report_epoch = epoch_number % report_every_n_epochs == 0
        if is_report_epoch and verbose >= 1:
            mean_loss = epoch_loss / len(dataloader)
            print(f"Epoch [{epoch_number}/{epochs}], Loss: {mean_loss:.4f}")

In [8]:
# Function to evaluate the model
def evaluate_model(model, dataloader, n_years=None):
    """Run ``model`` over ``dataloader`` and return its outputs as a DataFrame.

    Args:
        model: Module called on the features of each batch; switched to eval mode.
        dataloader: Iterable of ``(features, labels, masks)`` batches; only the
            features are used.
        n_years: Number of prediction columns. Defaults to the notebook-level
            ``N_YEARS`` when omitted.

    Returns:
        A DataFrame with columns ``pred_yr1`` .. ``pred_yr<n_years>`` and one row
        per sample, in dataloader order. An empty dataloader yields an empty
        DataFrame with those columns.
    """
    if n_years is None:
        n_years = N_YEARS
    pred_cols = ['pred_yr' + str(year) for year in range(1, n_years + 1)]

    model.eval()
    batch_outputs = []
    with torch.no_grad():
        for features, _, _ in dataloader:
            batch_outputs.append(model(features))

    # torch.cat raises on an empty list, so handle "no batches" explicitly.
    if not batch_outputs:
        return pd.DataFrame(columns=pred_cols, dtype='float32')

    # The reshape width follows the column count instead of a hard-coded 6.
    predictions = torch.cat(batch_outputs).cpu().numpy().reshape(-1, n_years)
    return pd.DataFrame(predictions, columns=pred_cols)

# Training

In [9]:
# CSV file holding the training set.
TRAIN_FILE = '/'.join((
    'C:',
    'Users',
    'Abdul Zakkar',
    'Documents',
    'UICOM',
    'research',
    'salahudeen',
    'test_train_sets',
    'train',
    'train_nlst_svm7sybil.csv',
))

In [10]:
# Keep only the model's columns and drop rows with no first-year label.
df_train = (
    pd.read_csv(TRAIN_FILE)[INPUTS + OUTPUTS]
    .dropna(subset=['canc_yr_1'])
    .reset_index(drop=True)
)

In [11]:
# Per-feature mean/scale of the training inputs; handed to the network, which
# standardizes its inputs internally.
scaler = StandardScaler().fit(df_train[INPUTS])
mean, std = scaler.mean_, scaler.scale_

features_train = df_train[INPUTS].to_numpy().astype(np.float32)

# Missing labels become 0 in the label array and are flagged by a 0 in the mask.
labels_train = df_train[OUTPUTS].fillna(0).to_numpy().astype(np.float32)
masks_train = df_train[OUTPUTS].notna().to_numpy().astype(np.float32)

In [12]:
# Move the whole training set onto the target device up front.
features_train = torch.tensor(features_train, device=device)
labels_train = torch.tensor(labels_train, device=device)
masks_train = torch.tensor(masks_train, device=device)

dataset_train = MaskedDataset(features_train, labels_train, masks_train)
# pin_memory is left off: the tensors already live on `device`, so there is no
# host-to-GPU copy to speed up, and pinning only applies to CPU tensors (it is
# rejected for CUDA tensors).
dataloader_train = DataLoader(dataset_train, batch_size=BATCH_SIZE, shuffle=True, pin_memory=False)

In [13]:
# Train N_NETWORKS independently initialized networks on the same data and save
# each one as a TorchScript file in MODEL_DIR.
os.makedirs(MODEL_DIR, exist_ok=True)  # saving fails if the directory is missing

criterion = MaskedBCELoss()  # holds no per-model state, so one instance is reused

for network in range(N_NETWORKS):
    print(f'Training model {network+1}: ', end='')
    model = NeuralNetwork(
        len(INPUTS),
        len(OUTPUTS),
        mean,
        std,
        hidden_size=HIDDEN_SIZE,
        dropout=DROPOUT
    ).to(device)
    # A fresh optimizer per network so no optimizer state carries over.
    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
    train_model(
        model,
        criterion,
        optimizer,
        dataloader_train,
        EPOCHS)
    model_scripted = torch.jit.script(model) # Export to TorchScript
    model_scripted.save(
        os.path.join(MODEL_DIR, f'nn_{len(INPUTS)}inputs_nlst_uic_{network+1}.pth')
    ) # Save
    print('DONE.')

Training model 1: DONE.
Training model 2: DONE.
Training model 3: DONE.
Training model 4: DONE.
Training model 5: DONE.
Training model 6: DONE.
Training model 7: DONE.
Training model 8: DONE.
Training model 9: DONE.
Training model 10: DONE.
Training model 11: DONE.
Training model 12: DONE.
Training model 13: DONE.
Training model 14: DONE.
Training model 15: DONE.
Training model 16: DONE.
Training model 17: DONE.
Training model 18: DONE.
Training model 19: DONE.
Training model 20: DONE.
Training model 21: DONE.
Training model 22: DONE.
Training model 23: DONE.
Training model 24: DONE.
Training model 25: DONE.
Training model 26: DONE.
Training model 27: DONE.
Training model 28: DONE.
Training model 29: DONE.
Training model 30: DONE.
Training model 31: DONE.
Training model 32: DONE.
Training model 33: DONE.
Training model 34: DONE.
Training model 35: DONE.
Training model 36: DONE.
Training model 37: DONE.
Training model 38: DONE.
Training model 39: DONE.
Training model 40: DONE.
Training 

# Testing

In [32]:
# CSV file holding the test set.
TEST_FILE = '/'.join((
    'C:',
    'Users',
    'Abdul Zakkar',
    'Documents',
    'UICOM',
    'research',
    'salahudeen',
    'test_train_sets',
    'test',
    'test_uic_svm7sybil.csv',
))

# Destination of the ensemble-averaged results.
RESULTS_FILE = '/'.join((
    'C:',
    'Users',
    'Abdul Zakkar',
    'Documents',
    'UICOM',
    'research',
    'salahudeen',
    'test_train_sets',
    'test',
    'test_uic_nn2s.csv',
))

# Per-model results; the literal '{x}' is substituted with the model index.
INDIVIDUAL_RESULTS_FILE = '/'.join((
    'C:',
    'Users',
    'Abdul Zakkar',
    'Documents',
    'UICOM',
    'research',
    'salahudeen',
    'test_train_sets',
    'test_ensemble',
    'test_uic_nn2s_{x}.csv',
))

In [33]:
def find_files_matching_pattern(directory, pattern):
    """Recursively collect files under ``directory`` whose name matches ``pattern``.

    ``pattern`` is a regular expression applied with ``search`` to each bare
    file name (not the full path). Returns the matching paths, each joined
    with the directory it was found in, in ``os.walk`` order.
    """
    name_regex = re.compile(pattern)
    return [
        os.path.join(parent, filename)
        for parent, _, filenames in os.walk(directory)
        for filename in filenames
        if name_regex.search(filename)
    ]

# The pattern follows len(INPUTS), matching how the training loop names the
# saved models, instead of a hard-coded input count.
_MODEL_FILE_PATTERN = rf'nn_{len(INPUTS)}inputs_nlst_uic_(\d+)\.pth'

# os.walk yields files in arbitrary order; sort by the numeric network id so the
# position in model_paths is deterministic and follows the network numbering.
model_paths = sorted(
    find_files_matching_pattern(MODEL_DIR, _MODEL_FILE_PATTERN),
    key=lambda path: int(
        re.search(_MODEL_FILE_PATTERN, os.path.basename(path)).group(1)
    ),
)

In [34]:
# Keep only the model's columns and drop rows with no first-year label.
df_test = (
    pd.read_csv(TEST_FILE)[INPUTS + OUTPUTS]
    .dropna(subset=['canc_yr_1'])
    .reset_index(drop=True)
)

In [35]:
# Raw (unstandardized) inputs; the saved networks standardize internally.
features_test = df_test[INPUTS].to_numpy().astype(np.float32)

# Missing labels become 0 in the label array and are flagged by a 0 in the mask.
labels_test = df_test[OUTPUTS].fillna(0).to_numpy().astype(np.float32)
masks_test = df_test[OUTPUTS].notna().to_numpy().astype(np.float32)

In [36]:
# Move the whole test set onto the target device up front.
features_test = torch.tensor(features_test, device=device)
labels_test = torch.tensor(labels_test, device=device)
masks_test = torch.tensor(masks_test, device=device)

dataset_test = MaskedDataset(features_test, labels_test, masks_test)
# shuffle=False keeps predictions row-aligned with df_test. pin_memory is left
# off: the tensors already live on `device`, and pinning only applies to CPU
# tensors (it is rejected for CUDA tensors).
dataloader_test = DataLoader(dataset_test, batch_size=BATCH_SIZE, shuffle=False, pin_memory=False)

In [37]:
# Evaluate every saved network on the test set, write one CSV per network, then
# average all networks row-by-row into the final results file.
if not model_paths:
    # Without this, pd.concat below fails with an opaque "No objects to concatenate".
    raise ValueError('model_paths is empty: no saved models matched the search pattern')

# Output directories may not exist yet; to_csv does not create them.
for output_file in (INDIVIDUAL_RESULTS_FILE, RESULTS_FILE):
    os.makedirs(os.path.dirname(output_file), exist_ok=True)

# Ground-truth columns are identical for every model, so select them once.
truth = df_test[['canc_yr_' + str(year) for year in range(1, N_YEARS + 1)]]

results = []
for i, model_path in enumerate(model_paths):
    # map_location keeps the model on the same device as the test tensors,
    # regardless of the device it was saved from.
    model = torch.jit.load(model_path, map_location=device)
    result = pd.concat([evaluate_model(model, dataloader_test), truth], axis=1)
    # The file index is the position in model_paths (1-based).
    result.to_csv(INDIVIDUAL_RESULTS_FILE.replace('{x}', str(i+1)), index=False)
    results.append(result)

# Stack with (model, row) index, then average over models for each row.
results = pd.concat(results, keys=range(len(results)))
results = results.groupby(level=1).mean()

results.to_csv(RESULTS_FILE, index=False)