## Pancreatic Cancer Detection

### Dataset Creation

In [1]:
from dataset import Dataset
from pathlib import Path
import torch
from sklearn import preprocessing
import numpy as np
torch.manual_seed(94)
%load_ext autoreload
%autoreload 2


In [2]:
urinary_data = Dataset(filePath='../data/urinary_data.csv',
                       label_column='diagnosis',
                       separator=',', 
                       name='urinary_data')
display(urinary_data.dataframe.head())

Unnamed: 0,sample_id,patient_cohort,sample_origin,age,sex,diagnosis,stage,benign_sample_diagnosis,plasma_CA19_9,creatinine,LYVE1,REG1B,TFF1,REG1A
0,S1,Cohort1,BPTB,33,F,1,,,11.7,1.83222,0.893219,52.94884,654.282174,1262.0
1,S10,Cohort1,BPTB,81,F,1,,,,0.97266,2.037585,94.46703,209.48825,228.407
2,S100,Cohort2,BPTB,51,M,1,,,7.0,0.78039,0.145589,102.366,461.141,
3,S101,Cohort2,BPTB,61,M,1,,,8.0,0.70122,0.002805,60.579,142.95,
4,S102,Cohort2,BPTB,62,M,1,,,9.0,0.21489,0.00086,65.54,41.088,


### Dataset cleaning and preparation

In [3]:
# urinary_data.cleanDataframe()
# Encode labels
for column in urinary_data.dataframe.columns:
    urinary_data.encode_column(column)

display(urinary_data.dataframe.head())

# Dataset Cleaning
urinary_data.cleanDataframe()

display(urinary_data.dataframe.head())


Unnamed: 0,sample_id,patient_cohort,sample_origin,age,sex,diagnosis,stage,benign_sample_diagnosis,plasma_CA19_9,creatinine,LYVE1,REG1B,TFF1,REG1A
0,0,0,0,6,0,0,8,52,78,158,181,337,391,247
1,1,0,0,54,0,0,8,52,266,93,265,402,242,151
2,2,1,0,24,1,0,8,52,62,75,108,412,347,298
3,3,1,0,34,1,0,8,52,67,68,47,351,209,298
4,4,1,0,35,1,0,8,52,70,16,14,360,117,298


Removed: 0 rows | 1 columns


Unnamed: 0,patient_cohort,sample_origin,age,sex,diagnosis,stage,benign_sample_diagnosis,plasma_CA19_9,creatinine,LYVE1,REG1B,TFF1,REG1A
0,0,0,6,0,0,8,52,78,158,181,337,391,247
1,0,0,54,0,0,8,52,266,93,265,402,242,151
2,1,0,24,1,0,8,52,62,75,108,412,347,298
3,1,0,34,1,0,8,52,67,68,47,351,209,298
4,1,0,35,1,0,8,52,70,16,14,360,117,298


In [4]:
from sklearn.model_selection import train_test_split
# Dataset split
X = urinary_data.dataframe.iloc[:, urinary_data.dataframe.columns != urinary_data.label_column]
y = urinary_data.dataframe[urinary_data.label_column]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

print("Set sizes: Train: {}, Validation: {}, Test: {}".format(len(X_train), len(X_val), len(X_test)))

# Convert to tensors
X_train = torch.from_numpy(X_train.values).float().squeeze()
X_val = torch.from_numpy(X_val.values).float().squeeze()  
X_test = torch.from_numpy(X_test.values).float().squeeze()
y_train = torch.from_numpy(y_train.values).float().squeeze()
y_val = torch.from_numpy(y_val.values).float().squeeze()
y_test = torch.from_numpy(y_test.values).float().squeeze()


print("X_train shape: {}".format(X_train.shape))
print("y_train shape: {}".format(y_train.shape))
print("X_val shape: {}".format(X_val.shape))
print("y_val shape: {}".format(y_val.shape))
print("X_test shape: {}".format(X_test.shape))
print("y_test shape: {}".format(y_test.shape))


Set sizes: Train: 377, Validation: 95, Test: 118
X_train shape: torch.Size([377, 12])
y_train shape: torch.Size([377])
X_val shape: torch.Size([95, 12])
y_val shape: torch.Size([95])
X_test shape: torch.Size([118, 12])
y_test shape: torch.Size([118])


###

### Model Training

In [5]:
# Define loss function
# Binary classification => BCELoss
from models import train_binary_logits
from models import PCDModel_1
from models import MulticlassClassification
from models import accuracy_fn

# Check shapes


# TRAINING
epochs = 1000
model_0 = PCDModel_1(urinary_data.get_feature_count(), urinary_data.get_label_count())
loss_fn = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model_0.parameters(), lr=1e-10)

for epoch in range(epochs):
    model_0.train()

    # 1. Forward pass
    y_logits = model_0(X_train).squeeze()
    y_pred = torch.round(torch.sigmoid(y_logits))
    print(y_pred.shape)
    # 2. Compute loss
    loss = loss_fn(y_pred, y_train)

    # 2.1 Compute accuracy
    acc = accuracy_fn(y_train, y_pred)

    # 3. Optimizer zero_grad
    optimizer.zero_grad()

    # 4. Backward pass
    loss.backward()

    # 5. Optimizer step
    optimizer.step()

    # 6. Test
    model_0.eval()
    with torch.inference_mode():
        # 6.1 Forward pass
        test_logits = model_0(X_test).squeeze()
        test_pred = torch.round(torch.sigmoid(test_logits))
        # 6.2 Compute loss
        test_loss = loss_fn(test_logits, y_test)
        # 6.3 Compute accuracy
        test_acc = accuracy_fn(y_test, test_pred)

    if epoch % 100 == 0:
        print("Epoch: {}, Loss: {}, Accuracy: {}, Test Loss: {}, Test Accuracy: {}".format(epoch, loss.item(), acc.item(), test_loss.item(), test_acc.item()))





  return F.mse_loss(input, target, reduction=self.reduction)


RuntimeError: The size of tensor a (3) must match the size of tensor b (377) at non-singleton dimension 1