## Pancreatic Cancer Detection

### Dataset Creation

In [22]:
from pathlib import Path
import torch
from sklearn import preprocessing
import numpy as np
torch.manual_seed(94)
%load_ext autoreload
%autoreload 2


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
import os
from pathlib import Path
import pandas as pd
import networkx as nx
import numpy as np
from utils import Utilities as utils
from sklearn import preprocessing
from torch.utils.data import Dataset
import torch

class ToTensor(object):
    """Convert sample to Tensors."""

    def __call__(self, sample):
        return torch.Tensor(sample).float() 
    
class DataFrameEntry():
    def __init__(self, columns : list, values : list, name = '') -> None:
        self.columns = columns
        self.values = values
        self.name = name

class DataFrameLabel():
    def __init__(self, columns : list, values : list, name = '') -> None:
        self.columns = columns
        self.values = values
        self.name = name
        
    
class Dataset(Dataset):
    def __init__(self, filePath : str, label_column : list, separator = ';', name=''):
        self.dataframe = utils.createDataframe(filepath=filePath, 
                                               separator=separator)
        self.label_column = label_column
        self.encoders = {}
        self.label_dicts = {}

    def init_label_dictionary(self, label_column : str | int, label_dict : dict):
        self.label_dicts[label_column] = label_dict


    def __len__(self):
        return len(self.dataframe)
    
    def get_labels(self):
        return self.df[self.label_column]

    def encode_column(self, column : str | int) -> None:
        if self.encoders.get(column) is None:
            self.encoders[column] = preprocessing.LabelEncoder()
            self.encoders[column].fit(self.dataframe[column].values)
        self.dataframe[column] = self.encoders[column].transform(self.dataframe[column].values)

    def decode_column(self, column : str | int) -> None:
        if self.encoders.get(column) is not None:
            self.dataframe[column] = self.encoders[column].inverse_transform(self.dataframe[column].values)
        else:
            print('Warning: Column not encoded')

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        return self.dataframe.iloc[idx]
    
    def get_column_types(self, column : str | int) -> str:
        if self.__len__() > 0:
            return type(self.dataframe[column][0])
        else:
            raise Exception('Dataset is empty')
    

            
    def addDataset(self, filePath : str, separator = ';', name='') -> None:
        self.df = utils.createDataframe(self.base_path + filePath, separator=separator)
        
        if self.df is None:
            print('Error: File not found or not valid')
        else:
            if self.dataframes.get(name) is None:
                self.dataframes[name] = self.df
                self.encode_df = self.df.copy()
                print('Added ' + filePath + ' to dataset')
            else: 
                print('Warning: Dataset name already exists')
             
    
    def createDataset(self, files : list) -> None:
        frames = []
        for file in files:
            csv = self.base_path + file
            df = pd.read_csv(csv, sep=";")
            frames.append(df)
        self.df = pd.concat(frames) 
        self.df.drop_duplicates(inplace=True)

    def get_feature_count(self):
        return len(self.dataframe.columns) - 1
    
    def get_label_count(self):
        return len(self.dataframe[self.label_column].unique())
        
    def cleanDataframe(self):
        # Check for columns with all different values
        size = self.dataframe.shape
        self.dataframe = self.dataframe.loc[:, self.dataframe.apply(pd.Series.nunique) != self.dataframe.shape[0]]
        
        # Exclude some entries as to make it even
        self.dataframe = self.dataframe[:self.dataframe.shape[0] - (self.dataframe.shape[0] % 10)]
        print("Removed: " + str(size[0] - self.dataframe.shape[0]) + " rows | " + 
              str(size[1] - self.dataframe.shape[1]) + " columns")
         
    def applyPreprocessing(self, columns:list):
        size = self.df.shape[1]
        self.select(columns)
        print("Removed " + str(size - self.df.shape[1]) + " columns")
        
    def select(self, columns:list):
        if self.deleted is None:
            self.deleted = pd.DataFrame()
        
        # Restore the deleted columns
        # self.restore(columns)
                
        # Keep track of the deleted columns
        _deletedColumns = self.df.columns.difference(columns)
        
        if self.deleted.empty:
            self.deleted = self.df[_deletedColumns]
        else:
            self.deleted = pd.concat([self.deleted, self.df[_deletedColumns]], axis=0)

            
        self.df.drop(_deletedColumns, axis=1, inplace=True)

    ## NEEDS TO BE FIXED ##
    def restore(self, columns : list):
        restored = 0
        if self.deleted is None or self.deleted.empty:
            print("No columns to restore")
            return
        else:
            for col in (set(self.deleted.columns) & set(columns)):
                restored += 1
                _restored = self.deleted[col]

                self.df = pd.concat([self.df, _restored], axis=1, ignore_index=True)
                print(self.df.columns)
                # self.df.append(self.deleted[col])
        print("Restored " + str(restored) + " columns")
            
    
    def applyFilter(self, column, value, maxrows=None, criterion='equal'):
        if maxrows is not None:
            self.df = self.df.head(maxrows)
        if criterion == 'equal':
            self.df = self.df[self.df[column] == value]
        elif criterion == 'contains':
            self.df = self.df[self.df[column].str.contains(value)]

    def colSize(self):
        return len(self.df.columns)
    
    def rowSize(self):
        return len(self.df.index)

In [23]:
urinary_data = Dataset(filePath='../data/urinary_data.csv',
                       label_column='diagnosis',
                       separator=',', 
                       name='urinary_data')
display(urinary_data.dataframe.head())

Unnamed: 0,sample_id,patient_cohort,sample_origin,age,sex,diagnosis,stage,benign_sample_diagnosis,plasma_CA19_9,creatinine,LYVE1,REG1B,TFF1,REG1A
0,S1,Cohort1,BPTB,33,F,1,,,11.7,1.83222,0.893219,52.94884,654.282174,1262.0
1,S10,Cohort1,BPTB,81,F,1,,,,0.97266,2.037585,94.46703,209.48825,228.407
2,S100,Cohort2,BPTB,51,M,1,,,7.0,0.78039,0.145589,102.366,461.141,
3,S101,Cohort2,BPTB,61,M,1,,,8.0,0.70122,0.002805,60.579,142.95,
4,S102,Cohort2,BPTB,62,M,1,,,9.0,0.21489,0.00086,65.54,41.088,


### Dataset cleaning and preparation

In [24]:
# urinary_data.cleanDataframe()
# Encode labels
for column in urinary_data.dataframe.columns:
    urinary_data.encode_column(column)

display(urinary_data.dataframe.head())

# Dataset Cleaning
urinary_data.cleanDataframe()

display(urinary_data.dataframe.head())


Unnamed: 0,sample_id,patient_cohort,sample_origin,age,sex,diagnosis,stage,benign_sample_diagnosis,plasma_CA19_9,creatinine,LYVE1,REG1B,TFF1,REG1A
0,0,0,0,6,0,0,8,52,78,158,181,337,391,247
1,1,0,0,54,0,0,8,52,266,93,265,402,242,151
2,2,1,0,24,1,0,8,52,62,75,108,412,347,298
3,3,1,0,34,1,0,8,52,67,68,47,351,209,298
4,4,1,0,35,1,0,8,52,70,16,14,360,117,298


Removed: 0 rows | 1 columns


Unnamed: 0,patient_cohort,sample_origin,age,sex,diagnosis,stage,benign_sample_diagnosis,plasma_CA19_9,creatinine,LYVE1,REG1B,TFF1,REG1A
0,0,0,6,0,0,8,52,78,158,181,337,391,247
1,0,0,54,0,0,8,52,266,93,265,402,242,151
2,1,0,24,1,0,8,52,62,75,108,412,347,298
3,1,0,34,1,0,8,52,67,68,47,351,209,298
4,1,0,35,1,0,8,52,70,16,14,360,117,298


In [25]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

# Dataset split
X = urinary_data.dataframe.iloc[:, urinary_data.dataframe.columns != urinary_data.label_column].squeeze()
y = urinary_data.dataframe[urinary_data.label_column].values.reshape(-1, 1)
# Transform y to tensor of size equal to the number of classes
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False).fit(y)
y = ohe.transform(y)


In [26]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

print("Set sizes: Train: {}, Validation: {}, Test: {}".format(len(X_train), len(X_val), len(X_test)))

# Convert to tensors
X_train = torch.from_numpy(X_train.values).float().squeeze()
X_val = torch.from_numpy(X_val.values).float().squeeze()  
X_test = torch.from_numpy(X_test.values).float().squeeze()
y_train = torch.from_numpy(y_train).float().squeeze()
y_val = torch.from_numpy(y_val).float().squeeze()
y_test = torch.from_numpy(y_test).float().squeeze()


print("X_train shape: {}".format(X_train.shape))
print("y_train shape: {}".format(y_train.shape))
print("X_val shape: {}".format(X_val.shape))
print("y_val shape: {}".format(y_val.shape))
print("X_test shape: {}".format(X_test.shape))
print("y_test shape: {}".format(y_test.shape))


Set sizes: Train: 377, Validation: 95, Test: 118
X_train shape: torch.Size([377, 12])
y_train shape: torch.Size([377, 3])
X_val shape: torch.Size([95, 12])
y_val shape: torch.Size([95, 3])
X_test shape: torch.Size([118, 12])
y_test shape: torch.Size([118, 3])


###

In [27]:
# Define dictionary for label column
# 3 (pancreatic cancer), 2 (non-cancerous pancreas condition),  1 (healthy)
label_dict = {1: 'healthy', 2: 'non-cancerous pancreas condition', 3: 'pancreatic cancer'}
urinary_data.init_label_dictionary(label_column='diagnosis', label_dict=label_dict)

### Model Definition


In [None]:
class PCDModel_1(torch.nn.Module):
    def __init__(self, input_shape, output_shape):
        super().__init__()
        self.linearBlock = nn.Sequential(
            nn.Linear(in_features=input_shape, out_features=32),
            nn.ReLU(),
            nn.Linear(in_features=32, out_features=16),
            nn.ReLU(),
            nn.Linear(in_features=16, out_features=8),
            nn.ReLU(),
            nn.Linear(in_features=8, out_features=output_shape),
        )
        self.initWeights()

        
    def initWeights(self):
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.kaiming_normal_(m.weight)
                nn.init.zeros_(m.bias)

    def forward(self, x):
        x = self.linearBlock(x)
        return x
        

### Model Training

In [37]:
# Define loss function
# Binary classification => BCELoss
from models import train_binary_logits
from models import PCDModel_1
from models import MulticlassClassification
from models import accuracy_fn
from torch.optim.lr_scheduler import ReduceLROnPlateau
from matplotlib import pyplot as plt
# TRAINING
epochs = 100000
model_0 = PCDModel_1(urinary_data.get_feature_count(), urinary_data.get_label_count())
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model_0.parameters(), lr=0.0001)

losses = []
for epoch in range(epochs):
    model_0.train()

    # 1. Forward pass
    y_logits = model_0(X_train).squeeze()
    y_pred = torch.sigmoid(y_logits)
    #print(y_pred.shape)
    # 2. Compute loss
    loss = loss_fn(y_pred, y_train)
    losses.append(loss)

    # 2.1 Compute accuracy
    acc = (torch.argmax(y_pred, 1) == torch.argmax(y_train, 1)).float().mean()

    # 3. Optimizer zero_grad
    optimizer.zero_grad()

    # 4. Backward pass
    loss.backward()

    # 5. Optimizer step
    optimizer.step()
    # 6. Test
    model_0.eval()
    with torch.inference_mode():
        # 6.1 Forward pass
        test_logits = model_0(X_test).squeeze() 
        test_pred = torch.sigmoid(test_logits)
        # 6.2 Compute loss
        test_loss = loss_fn(test_pred, y_test)
        # 6.3 Compute accuracy
        test_acc = (torch.argmax(test_pred, 1) == torch.argmax(y_test, 1)).float().mean()

    if epoch % 10 == 0:
        print("Epoch: {}, Loss: {}, Accuracy: {}, Test Loss: {}, Test Accuracy: {}".format(epoch, loss, acc, test_loss, test_acc))





Epoch: 0, Loss: 1.2768505811691284, Accuracy: 0.28381961584091187, Test Loss: 1.2760202884674072, Test Accuracy: 0.27966102957725525
Epoch: 10, Loss: 1.2763742208480835, Accuracy: 0.29177719354629517, Test Loss: 1.2760858535766602, Test Accuracy: 0.27966102957725525
Epoch: 20, Loss: 1.2758129835128784, Accuracy: 0.29177719354629517, Test Loss: 1.2761517763137817, Test Accuracy: 0.27966102957725525
Epoch: 30, Loss: 1.2751892805099487, Accuracy: 0.2997347414493561, Test Loss: 1.2761931419372559, Test Accuracy: 0.27966102957725525
Epoch: 40, Loss: 1.274582862854004, Accuracy: 0.3076923191547394, Test Loss: 1.2761867046356201, Test Accuracy: 0.27966102957725525
Epoch: 50, Loss: 1.2740744352340698, Accuracy: 0.3103448152542114, Test Loss: 1.2761434316635132, Test Accuracy: 0.2711864411830902
Epoch: 60, Loss: 1.2736865282058716, Accuracy: 0.3156498670578003, Test Loss: 1.2761012315750122, Test Accuracy: 0.2711864411830902
Epoch: 70, Loss: 1.2733973264694214, Accuracy: 0.3183023929595947, Tes

In [87]:
pred = model_0(X_test[10]).squeeze()
print(y_test[10])
print(pred)

tensor([0., 0., 1.])
tensor([-18.6728, -20.9609, -12.0012], grad_fn=<SqueezeBackward0>)
