## !!! Baseline similarity using CNN and Cosine Similarity, Euclidean and Manhattan Distance !!!
This uses 2d convolution

In [81]:
import numpy as np
import pandas as pd
import torch 
import torch.nn as nn

import random
import os

In [82]:
seed = 101
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
os.environ["PYTHONHASHSEED"] = str(seed)

### Loading data
* Re-using the saved pre-processed data of patients grouped by visits. 

In [83]:
X = torch.load("processed-data/p_x.pt")
Y = torch.load("processed-data/p_y.pt")

In [84]:
print(len(Y))
print(len(X))
print(set(Y))

184123
184123
{'N189', 'N180', 'N088', 'N039', 'N188', 'E142', 'I10', 'E102', 'I120', 'N083'}


In [85]:
print(X.shape)
print(len(Y))

(184123, 40, 42)
184123


### Number of patients in each target class

In [86]:
for l in set(Y):
    print(f"{l}:", len(np.where(np.array(Y) == l)[0]))

N189: 2690
N180: 178540
N088: 129
N039: 316
N188: 307
E142: 178
I10: 360
E102: 100
I120: 1189
N083: 314


### Converting target lables to one-hot encoding 

In [87]:
y_labels = list(set(Y))
Y_oh = np.zeros((len(Y), len(y_labels)))
for idx, y in enumerate(Y):
    Y_oh[idx][y_labels.index(y)] = 1

In [88]:
print(Y_oh.shape)
Y_oh

(184123, 10)


array([[0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.]])

### Converting patient data (X) and target one-hot encoded data (Y_oh) as tensors to build the model

In [89]:
Xt = torch.tensor(X)
Yt = torch.tensor(Y_oh)
Xt = Xt.type(torch.FloatTensor)
Yt = Yt.type(torch.LongTensor)
print("Xt shape:", Xt.shape)
print("Yt shape:", Yt.shape)

Xt shape: torch.Size([184123, 40, 42])
Yt shape: torch.Size([184123, 10])


### A custom dataset to load pairwise data. We are just picking adjacent patient data as pair to learn patient similiarity. Data is shuffled by dataloader for training 

In [137]:
from torch.utils.data import Dataset
from torch.utils.data import random_split

class PairwiseDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y
        self.n = len(X)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        x1 = self.X[idx]
        y1 = self.y[idx]
        if idx+1 == self.n:
            x2 = self.X[idx]
            y2 = self.y[idx]
        else:
            x2 = self.X[idx+1]
            y2 = self.y[idx+1]
        
        y = int(not np.array_equal(y1, y2))
        
        return [x1, x2, np.asarray([y]), y1, y2]

    def get_splits(self, n_test=0.2):
        test_size = round(n_test * len(self.X))
        train_size = len(self.X) - test_size
        
        return random_split(self, [train_size, test_size])

In [149]:
from torch.utils.data import DataLoader

BATCH_SIZE = 32

dataset = PairwiseDataset(Xt, Yt)
train, test = dataset.get_splits()
train_dl = DataLoader(train, batch_size=BATCH_SIZE, shuffle=True)
test_dl = DataLoader(test, batch_size=BATCH_SIZE, shuffle=False)
print("# of train batches:", len(train_dl))
print("# of test batches:", len(test_dl))

# of train batches: 4604
# of test batches: 1151


In [150]:
train_iter = iter(train_dl)
x1, x2, y, y1, y2 = next(train_iter)

print('Shape of a batch x1:', x1.shape)
print('Shape of a batch y1:', y1.shape)
print('Shape of a batch x2:', x2.shape)
print('Shape of a batch y2:', y2.shape)
print('Shape of a batch y:', y.shape)

Shape of a batch x1: torch.Size([32, 40, 42])
Shape of a batch y1: torch.Size([32, 10])
Shape of a batch x2: torch.Size([32, 40, 42])
Shape of a batch y2: torch.Size([32, 10])
Shape of a batch y: torch.Size([32, 1])


### CNN Similairty models
* CNNCosineSimNet - CNN using consine similarity
* CNNEuclideanSimNet - CNN using euclidean distance 
* CNNManhattanSimNet - CNN using manhattan distance

In [164]:
import torch.nn.functional as F

class CNNSimNet(nn.Module):
    def __init__(self):
        super(CNNSimNet, self).__init__()
        self.conv = nn.Conv2d(1, 128, (3, 6), stride=2)
        self.relu = nn.ReLU()
        self.maxpool = nn.MaxPool2d((3, 6), stride=2)
                
        self.fc = nn.Linear(8064, 10)
        
        # Initialized by the child classes
        self.similarity = None
        
    def _forward(self, x):
        x = x[:, None, :]

        x = self.conv(x)
        x = self.relu(x)
        x = self.maxpool(x)

        f_out = x.flatten(start_dim=1)
        x_out = self.fc(f_out)
        
        return x_out
    
    def forward(self, x1, x2):
        x1_out = self._forward(x1)
        x2_out = self._forward(x2)
        
        y_out = self.similarity(x1_out, x2_out)
        y_out = y_out.unsqueeze(0)
        
        return y_out
    
    
class CNNCosineSimNet(CNNSimNet):
    def __init__(self):
        super(CNNCosineSimNet, self).__init__()
        self.similarity = nn.CosineSimilarity()

class CNNEuclideanSimNet(CNNSimNet):
    def __init__(self):
        super(CNNEuclideanSimNet, self).__init__()
        self.similarity = nn.PairwiseDistance(p=2)
        
class CNNManhattanSimNet(CNNSimNet):
    def __init__(self):
        super(CNNManhattanSimNet, self).__init__()
        self.similarity = nn.PairwiseDistance(p=1)

In [165]:
from sklearn.metrics import accuracy_score, \
precision_recall_fscore_support, recall_score, f1_score
import warnings
warnings.filterwarnings("ignore")

def model_metrics(y_test, y_pred):
    acc = accuracy_score(y_test, y_pred)
    p, r, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='macro')
    return acc, p, r, f1

In [166]:
from numpy import vstack
from numpy import argmax

def evaluate(model, dl):
    model.eval()
    all_y_pred, all_y_true = list(), list()
    for x1, x2, y0, y1, y2 in dl:
        y_hat = model(x1, x2)

        y_true = y0.flatten().unsqueeze(0).type(torch.FloatTensor)
        y_pred = y_hat
        
        y_pred = (y_pred > 0.5).type(torch.FloatTensor)

        pad_len = BATCH_SIZE - y_true.shape[1]
        if pad_len > 0:
            all_y_pred.append(np.pad(y_pred.flatten(), pad_width=(0,pad_len)))
            all_y_true.append(np.pad(y_true.flatten(), pad_width=(0,pad_len)))
        else:
            all_y_pred.append(y_pred.flatten())
            all_y_true.append(y_true.flatten())
    all_y_pred, all_y_true = vstack(all_y_pred), vstack(all_y_true)
    acc, p, r, f1 = model_metrics(all_y_true.flatten(), all_y_pred.flatten())
    print(f"acc: {acc:.4f}, precision: {p:.4f}, recall: {r:.4f}, f1: {f1:.4f}")  

In [167]:
def train(model, optimizer, criterion):
    n_epochs = 10
    model.train()
    train_loss_arr = []
    for epoch in range(n_epochs):
        train_loss = 0
        for x1, x2, y0, y1, y2 in train_dl:
            optimizer.zero_grad()
            y_hat = model(x1, x2)

            y_true = y0.flatten().unsqueeze(0).type(torch.FloatTensor)
            y_pred = y_hat
            
            loss = criterion(y_pred, y_true)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        train_loss = train_loss / len(train_dl)
        print('Epoch: {} \tTraining Loss: {:.4f}'.format(epoch, train_loss))
        evaluate(model, test_dl)

### Training and evalulating the models 

#### CNN using Cosine Similarity

In [168]:
from torch.optim import Adam
from torch.nn import CrossEntropyLoss

In [155]:
%%time
model_c = CNNCosineSimNet()
optimizer_c = Adam(model_c.parameters(), lr=0.01)
criterion_c = CrossEntropyLoss()
train(model_c, optimizer_c, criterion_c)

Epoch: 0 	Training Loss: 5.0140
acc: 0.2084, precision: 0.5134, recall: 0.5440, f1: 0.1957
Epoch: 1 	Training Loss: 4.9940
acc: 0.2379, precision: 0.5151, recall: 0.5560, f1: 0.2189
Epoch: 2 	Training Loss: 4.9846
acc: 0.2387, precision: 0.5158, recall: 0.5590, f1: 0.2196
Epoch: 3 	Training Loss: 4.9774
acc: 0.2453, precision: 0.5160, recall: 0.5610, f1: 0.2247
Epoch: 4 	Training Loss: 4.9708
acc: 0.2620, precision: 0.5146, recall: 0.5592, f1: 0.2368
Epoch: 5 	Training Loss: 4.9661
acc: 0.2817, precision: 0.5160, recall: 0.5687, f1: 0.2512
Epoch: 6 	Training Loss: 4.9616
acc: 0.2667, precision: 0.5163, recall: 0.5671, f1: 0.2406
Epoch: 7 	Training Loss: 4.9578
acc: 0.2838, precision: 0.5169, recall: 0.5729, f1: 0.2529
Epoch: 8 	Training Loss: 4.9536
acc: 0.2918, precision: 0.5162, recall: 0.5717, f1: 0.2584
Epoch: 9 	Training Loss: 4.9494
acc: 0.2833, precision: 0.5174, recall: 0.5750, f1: 0.2527
CPU times: user 54min 20s, sys: 4min 9s, total: 58min 30s
Wall time: 30min 47s


#### CNN using Euclidean Distance

In [161]:
%%time
model_e = CNNEuclideanSimNet()
optimizer_e = Adam(model_e.parameters(), lr=0.01)
criterion_e = CrossEntropyLoss()
train(model_e, optimizer_e, criterion_e)

Epoch: 0 	Training Loss: 5.0864
acc: 0.5481, precision: 0.5003, recall: 0.5016, f1: 0.3915
Epoch: 1 	Training Loss: 5.0443
acc: 0.6881, precision: 0.5042, recall: 0.5203, f1: 0.4503
Epoch: 2 	Training Loss: 5.0335
acc: 0.9536, precision: 0.5255, recall: 0.5008, f1: 0.4910
Epoch: 3 	Training Loss: 5.0356
acc: 0.9526, precision: 0.5200, recall: 0.5012, f1: 0.4924
Epoch: 4 	Training Loss: 5.0347
acc: 0.9547, precision: 0.4774, recall: 0.5000, f1: 0.4884
Epoch: 5 	Training Loss: 5.0353
acc: 0.9543, precision: 0.5774, recall: 0.5015, f1: 0.4918
Epoch: 6 	Training Loss: 5.0348
acc: 0.9526, precision: 0.5159, recall: 0.5009, f1: 0.4919
Epoch: 7 	Training Loss: 5.0346
acc: 0.9547, precision: 0.7274, recall: 0.5009, f1: 0.4902
Epoch: 8 	Training Loss: 5.0338
acc: 0.9470, precision: 0.5096, recall: 0.5019, f1: 0.4969
Epoch: 9 	Training Loss: 5.0350
acc: 0.9533, precision: 0.5305, recall: 0.5013, f1: 0.4921
CPU times: user 55min 1s, sys: 3min 24s, total: 58min 25s
Wall time: 30min 9s


#### CNN using Manhattan Distance

In [169]:
%%time
model_m = CNNManhattanSimNet()
optimizer_m = Adam(model_m.parameters(), lr=0.01)
criterion_m = CrossEntropyLoss()
train(model_m, optimizer_m, criterion_m)

Epoch: 0 	Training Loss: 5.2048
acc: 0.9512, precision: 0.5013, recall: 0.5001, f1: 0.4913
Epoch: 1 	Training Loss: 5.0578
acc: 0.9535, precision: 0.5062, recall: 0.5002, f1: 0.4898
Epoch: 2 	Training Loss: 5.0612
acc: 0.8573, precision: 0.5006, recall: 0.5012, f1: 0.4939
Epoch: 3 	Training Loss: 5.0568
acc: 0.9540, precision: 0.4935, recall: 0.4999, f1: 0.4888
Epoch: 4 	Training Loss: 5.0616
acc: 0.9381, precision: 0.4976, recall: 0.4990, f1: 0.4956
Epoch: 5 	Training Loss: 5.0598
acc: 0.8868, precision: 0.5069, recall: 0.5113, f1: 0.5063
Epoch: 6 	Training Loss: 5.0616
acc: 0.9070, precision: 0.4906, recall: 0.4896, f1: 0.4900
Epoch: 7 	Training Loss: 5.0594
acc: 0.8282, precision: 0.4945, recall: 0.4851, f1: 0.4797
Epoch: 8 	Training Loss: 5.0582
acc: 0.7859, precision: 0.4962, recall: 0.4867, f1: 0.4708
Epoch: 9 	Training Loss: 5.0603
acc: 0.9533, precision: 0.4773, recall: 0.4993, f1: 0.4881
CPU times: user 49min 37s, sys: 3min 23s, total: 53min
Wall time: 35min 3s
