## Pancreatic Cancer Detection

### Dataset Creation

In [53]:
from dataset import Dataset
from pathlib import Path
import torch
from sklearn import preprocessing
import numpy as np
torch.manual_seed(94)
%load_ext autoreload
%autoreload 2


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [54]:
urinary_data = Dataset(filePath='../data/urinary_data.csv',
                       label_column='diagnosis',
                       separator=',', 
                       name='urinary_data')
display(urinary_data.dataframe.head())

Unnamed: 0,sample_id,patient_cohort,sample_origin,age,sex,diagnosis,stage,benign_sample_diagnosis,plasma_CA19_9,creatinine,LYVE1,REG1B,TFF1,REG1A
0,S1,Cohort1,BPTB,33,F,1,,,11.7,1.83222,0.893219,52.94884,654.282174,1262.0
1,S10,Cohort1,BPTB,81,F,1,,,,0.97266,2.037585,94.46703,209.48825,228.407
2,S100,Cohort2,BPTB,51,M,1,,,7.0,0.78039,0.145589,102.366,461.141,
3,S101,Cohort2,BPTB,61,M,1,,,8.0,0.70122,0.002805,60.579,142.95,
4,S102,Cohort2,BPTB,62,M,1,,,9.0,0.21489,0.00086,65.54,41.088,


### Dataset cleaning and preparation

In [55]:
# urinary_data.cleanDataframe()
# Encode labels
for column in urinary_data.dataframe.columns:
    urinary_data.encode_column(column)

display(urinary_data.dataframe.head())

# Dataset Cleaning
urinary_data.cleanDataframe()

display(urinary_data.dataframe.head())


Unnamed: 0,sample_id,patient_cohort,sample_origin,age,sex,diagnosis,stage,benign_sample_diagnosis,plasma_CA19_9,creatinine,LYVE1,REG1B,TFF1,REG1A
0,0,0,0,6,0,0,8,52,78,158,181,337,391,247
1,1,0,0,54,0,0,8,52,266,93,265,402,242,151
2,2,1,0,24,1,0,8,52,62,75,108,412,347,298
3,3,1,0,34,1,0,8,52,67,68,47,351,209,298
4,4,1,0,35,1,0,8,52,70,16,14,360,117,298


Removed: 0 rows | 1 columns


Unnamed: 0,patient_cohort,sample_origin,age,sex,diagnosis,stage,benign_sample_diagnosis,plasma_CA19_9,creatinine,LYVE1,REG1B,TFF1,REG1A
0,0,0,6,0,0,8,52,78,158,181,337,391,247
1,0,0,54,0,0,8,52,266,93,265,402,242,151
2,1,0,24,1,0,8,52,62,75,108,412,347,298
3,1,0,34,1,0,8,52,67,68,47,351,209,298
4,1,0,35,1,0,8,52,70,16,14,360,117,298


In [56]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

# Dataset split
X = urinary_data.dataframe.iloc[:, urinary_data.dataframe.columns != urinary_data.label_column].squeeze()
y = urinary_data.dataframe[urinary_data.label_column].values.reshape(-1, 1)
# Transform y to tensor of size equal to the number of classes
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False).fit(y)
y = ohe.transform(y)


In [57]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

print("Set sizes: Train: {}, Validation: {}, Test: {}".format(len(X_train), len(X_val), len(X_test)))

# Convert to tensors
X_train = torch.from_numpy(X_train.values).float().squeeze()
X_val = torch.from_numpy(X_val.values).float().squeeze()  
X_test = torch.from_numpy(X_test.values).float().squeeze()
y_train = torch.from_numpy(y_train).float().squeeze()
y_val = torch.from_numpy(y_val).float().squeeze()
y_test = torch.from_numpy(y_test).float().squeeze()


print("X_train shape: {}".format(X_train.shape))
print("y_train shape: {}".format(y_train.shape))
print("X_val shape: {}".format(X_val.shape))
print("y_val shape: {}".format(y_val.shape))
print("X_test shape: {}".format(X_test.shape))
print("y_test shape: {}".format(y_test.shape))


Set sizes: Train: 377, Validation: 95, Test: 118
X_train shape: torch.Size([377, 12])
y_train shape: torch.Size([377, 3])
X_val shape: torch.Size([95, 12])
y_val shape: torch.Size([95, 3])
X_test shape: torch.Size([118, 12])
y_test shape: torch.Size([118, 3])


###

In [58]:
# Define dictionary for label column
# 3 (pancreatic cancer), 2 (non-cancerous pancreas condition),  1 (healthy)
label_dict = {1: 'healthy', 2: 'non-cancerous pancreas condition', 3: 'pancreatic cancer'}
urinary_data.init_label_dictionary(label_column='diagnosis', label_dict=label_dict)

### Model Training

In [85]:
# Define loss function
# Binary classification => BCELoss
from models import train_binary_logits
from models import PCDModel_1
from models import MulticlassClassification
from models import accuracy_fn
import tqdm
import copy

# TRAINING
n_epochs = 1000
model = PCDModel_1(urinary_data.get_feature_count(), urinary_data.get_label_count())
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# prepare model and training parameters
n_epochs = 200
batch_size = 5
batches_per_epoch = len(X_train) // batch_size
 
best_acc = - np.inf   # init to negative infinity
best_weights = None
train_loss_hist = []
train_acc_hist = []
test_loss_hist = []
test_acc_hist = []

# training loop
for epoch in range(n_epochs):
    epoch_loss = []
    epoch_acc = []
    # set model in training mode and run through each batch
    model.train()
    with tqdm.trange(batches_per_epoch, unit="batch", mininterval=0) as bar:
        bar.set_description(f"Epoch {epoch}")
        for i in bar:
            # take a batch
            start = i * batch_size
            X_batch = X_train[start:start+batch_size]
            y_batch = y_train[start:start+batch_size]
            # forward pass
            y_pred = model(X_batch)
            loss = loss_fn(y_pred, y_batch)
            # backward pass
            optimizer.zero_grad()
            loss.backward()
            # update weights
            optimizer.step()
            # compute and store metrics
            acc = (torch.argmax(y_pred, 1) == torch.argmax(y_batch, 1)).float().mean()
            epoch_loss.append(float(loss))
            epoch_acc.append(float(acc))
            bar.set_postfix(
                loss=float(loss),
                acc=float(acc)
            )
    # set model in evaluation mode and run through the test set
    model.eval()
    y_pred = model(X_test)
    ce = loss_fn(y_pred, y_test)
    acc = (torch.argmax(y_pred, 1) == torch.argmax(y_test, 1)).float().mean()
    ce = float(ce)
    acc = float(acc)
    train_loss_hist.append(np.mean(epoch_loss))
    train_acc_hist.append(np.mean(epoch_acc))
    test_loss_hist.append(ce)
    test_acc_hist.append(acc)
    if acc > best_acc:
        best_acc = acc
        best_weights = copy.deepcopy(model.state_dict())
    print(f"Epoch {epoch} validation: Cross-entropy={ce:.2f}, Accuracy={acc*100:.1f}%")




Epoch: 0, Loss: 13.135809898376465, Accuracy: 0.3448275923728943, Test Loss: 5.38438081741333, Test Accuracy: 0.31355932354927063
Epoch: 10, Loss: 2.221109390258789, Accuracy: 0.5331565141677856, Test Loss: 1.8657960891723633, Test Accuracy: 0.5593220591545105
Epoch: 20, Loss: 0.9937533736228943, Accuracy: 0.6100795865058899, Test Loss: 1.0471991300582886, Test Accuracy: 0.5169491767883301
Epoch: 30, Loss: 0.9039359092712402, Accuracy: 0.7135278582572937, Test Loss: 0.7489650249481201, Test Accuracy: 0.7372881174087524
Epoch: 40, Loss: 0.6573114991188049, Accuracy: 0.7241379022598267, Test Loss: 0.6895939707756042, Test Accuracy: 0.694915235042572
Epoch: 50, Loss: 0.600689172744751, Accuracy: 0.761273205280304, Test Loss: 0.5787359476089478, Test Accuracy: 0.7796609997749329
Epoch: 60, Loss: 0.5566666722297668, Accuracy: 0.7931034564971924, Test Loss: 0.5115457773208618, Test Accuracy: 0.7966101765632629
Epoch: 70, Loss: 0.5331751704216003, Accuracy: 0.8063660264015198, Test Loss: 0.51

In [87]:
pred = model(X_test[10]).squeeze()
print(y_test[10])
print(pred)

tensor([0., 0., 1.])
tensor([-18.6728, -20.9609, -12.0012], grad_fn=<SqueezeBackward0>)
