In [None]:
import h5py
import inspect
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import seaborn as sns
import sys
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
import torchvision.datasets as dset
import torchvision.models as models
import torchvision.transforms as T
from collections import OrderedDict
from collections import defaultdict
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from torch.utils.data import Dataset, DataLoader, sampler
from PIL import Image

from tqdm.notebook import tqdm, trange

%matplotlib inline

## Inspect Available GPUs

In [2]:
use_gpu = torch.cuda.is_available()
print('Using gpu: %s ' % use_gpu)

def gpu(x,use_gpu=use_gpu):
    if use_gpu:
        return x.cuda()
    else:
        return x

Using gpu: True 


In [3]:
print("Cuda is available:", torch.cuda.is_available())
print("Cuda device count:", torch.cuda.device_count())

Cuda is available: True
Cuda device count: 2


## Create Datasets

In [4]:
PATH_TO_TRAIN = "/deep/group/aihc-bootcamp-fall2021/lymphoma/processed/data_splits/custom_splits/triplenet_features/triplenet_train_features.hdf5"
PATH_TO_VAL = "/deep/group/aihc-bootcamp-fall2021/lymphoma/processed/data_splits/custom_splits/triplenet_features/triplenet_val_features.hdf5"
PATH_TO_TEST = "/deep/group/aihc-bootcamp-fall2021/lymphoma/processed/data_splits/custom_splits/triplenet_features/triplenet_test_features.hdf5"

In [5]:
class MILDataset(Dataset):
    
    def __init__(self, hdf5_path: str, transform=None):
        self.hdf5_path = hdf5_path
        self.h5data = h5py.File(self.hdf5_path, "r")
        self.cores = list(self.h5data.keys())
        self.transform = transform
        
    def __len__(self):
        return len(self.cores)
    
    def __getitem__(self, idx):
        patient_id = self.cores[idx]
        patches = self.h5data[patient_id][()]
        label = self.h5data[patient_id].attrs["y"]
        if self.transform:
            patches = self.transform(patches)
        return torch.tensor(patches), torch.tensor(int(label))

class NaiveDataset(Dataset):
    def __init__(self, hdf5_path: str, transform=None):

        self.hdf5_path = hdf5_path
        self.h5data = h5py.File(self.hdf5_path, "r")
        self.cores = list(self.h5data.keys())
        
        self.lengths = [len(self.h5data[i]) for i in self.cores]

        self.transform = transform

    def __len__(self):
        return sum(self.lengths)

    def __getitem__(self, idx):

        core_idx = 0
        for l in self.lengths:
            if idx - self.lengths[core_idx] < 0:
                break
            idx -= self.lengths[core_idx]
            core_idx += 1
        
        
        core_id = self.cores[core_idx]
        patch: np.ndarray = self.h5data[core_id][()][idx]
        label = self.h5data[core_id].attrs["y"]
        
        if self.transform:
            patch = self.transform(patch)

        return patch, torch.tensor(int(label))

In [6]:
train_dataset = NaiveDataset(PATH_TO_TRAIN)
val_dataset = NaiveDataset(PATH_TO_VAL)
test_dataset = NaiveDataset(PATH_TO_TEST)

## Create Dataloaders

In [7]:
# Dataloaders
train_dataloader = DataLoader(train_dataset, batch_size=64, num_workers=1, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=64, num_workers=1, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=64, num_workers=1, shuffle=True)

In [8]:
# Output the label distribution in the train_dataloader
label_counts = defaultdict(int)
for (x,y) in train_dataloader:
    label_counts[y.item()] += 1
label_counts

ValueError: only one element tensors can be converted to Python scalars

## Create Linear Layer Above Feature Extractor

In [9]:
class LinearLayer(nn.Module):
    def __init__(self, num_classes):
        super().__init__()
        self.linear = nn.Linear(256*3, num_classes)
    
    def forward(self, x):
        out = self.linear(x)
        return out

model = LinearLayer(9)

## Define Optimizer and Loss Criterion

In [10]:
# learning_rate = 1e-2
# optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

weights = [0.0058, 0.0156, 0.1861, 0.0357, 0.0291, 0.0994, 0.0341, 0.0542, 0.5400]

optimizer = optim.Adam(model.parameters())
criterion = torch.nn.CrossEntropyLoss(weight=torch.tensor(weights).cuda())

## Define Training Loop

In [19]:
def train_model(model, optimizer, epochs=1):
    train_loss_history = []
    train_accuracy_history = []
    val_loss_history = []
    val_accuracy_history = []
    for epoch in range(epochs):
        print(f"Epoch: {epoch}")
        model.train()
        batches = train_dataloader
        num_correct, num_samples, total_loss = 0, 0, 0
        i = 0
        for x, y in tqdm(batches):
            optimizer.zero_grad()
            # x, y = gpu(x.squeeze(dim=0)), gpu(y)
            x, y = gpu(x), gpu(y)
            scores = model(x)
            # aggregated_scores = torch.max(scores, 0, keepdim=True)[0]
            # loss = criterion(aggregated_scores, y)
            loss = criterion(scores, y)
            loss.backward()
            optimizer.step()
            # _,preds = torch.max(aggregated_scores, 1)
            _,preds = torch.max(scores, 1)
            total_loss += loss.data.item()
            num_correct += torch.sum(preds == y.data)
            num_samples += preds.size(0)
            i += 1
        print(f"Completed {i} iterations in epoch")
        average_loss = total_loss / num_samples
        acc = num_correct / num_samples
        train_loss_history.append(average_loss)
        train_accuracy_history.append(acc)
        print('Epoch: {} Training Loss: {:.4f} Got {} / {} correct. Acc: {:.2f}%'.format(
                     epoch + 1, average_loss, num_correct, num_samples, 100 * acc))
        
        check_accuracy(model, val_loss_history, val_accuracy_history, epoch)
    return train_loss_history, train_accuracy_history, val_loss_history, val_accuracy_history

def check_accuracy(model, val_loss_history, val_accuracy_history, epoch):
    num_correct, num_samples, total_loss = 0, 0, 0
    model.eval()
    batches = val_dataloader
    with torch.no_grad():
        for x, y in tqdm(batches):
            # x, y = gpu(x.squeeze(dim=0)), gpu(y)
            x, y = gpu(x), gpu(y)
            scores = model(x)
            # aggregated_scores = torch.max(scores, 0, keepdim=True)[0]
            # loss = criterion(aggregated_scores, y)
            loss = criterion(scores, y)
            # _, preds = torch.max(aggregated_scores, 1)
            _, preds = torch.max(scores, 1)
            total_loss += loss.data.item()
            num_correct += torch.sum(preds == y.data)
            num_samples += preds.size(0)
        average_loss = total_loss / num_samples
        acc = num_correct / num_samples
    val_loss_history.append(average_loss)
    val_accuracy_history.append(acc)
    print('Epoch: {} Validation Loss: {:.4f} Got {} / {} correct {:.2f}%'.format(
        epoch + 1, average_loss, num_correct, num_samples, 100 * acc))

In [None]:
%%time
epochs = 10
LOSS, ACC, LOSS_V, ACC_V = train_model(gpu(model), optimizer, epochs=epochs)

Epoch: 0


  0%|          | 0/1183 [00:00<?, ?it/s]

In [None]:
PATH = "triplenet_model.dat"
torch.save(model, PATH)
print("Successfully saved model!")

In [None]:
# Print out the model parameters after training.
for name, param in model.named_parameters():
    print(param)

## Load Saved Model

In [None]:
PATH = "triplenet_model.dat"
loaded_model = torch.load(PATH)
loaded_model = gpu(loaded_model)
model = loaded_model

In [None]:
optimizer = optim.Adam(model.parameters(), lr=1e-4)
criterion = torch.nn.CrossEntropyLoss() 

## Resume Training with Loaded Model

In [None]:
%%capture output
%%time
num_epochs = 20
LOSS, ACC, LOSS_V, ACC_V = train_model(gpu(model), optimizer, epochs=num_epochs)

In [18]:
PATH = f"./model_ckpts/triplenet_finetune_epochs_{num_epochs}.dat"
torch.save(model, PATH)
print("Success!")

Success!


## Evaluate Top-1, Top-3 Accuracy on Test Set + Confusion Matrices

In [19]:
model = torch.load(f"./model_ckpts/triplenet_finetune_epochs_{num_epochs}.dat")
model = gpu(model)

In [20]:
def compute_top_1_accuracy(model):
    correct = 0
    total = 0
    batches = test_dataloader
    with torch.no_grad():
        for x, y in tqdm(batches):
            x, y = gpu(x.squeeze(dim=0)), gpu(y)
            scores = model(x)
            # aggregated_scores = torch.max(scores, 0, keepdim=True)[0]
            # _, preds = torch.max(aggregated_scores, 1)
            _, preds = torch.max(scores, 1)
            total += preds.size(0)
            correct += (preds == y).sum().item()

    print('Top 1 Accuracy of the network on the all test images: %.2f %%' % (
        100 * correct / total))

compute_top_1_accuracy(model)

  0%|          | 0/1830 [00:00<?, ?it/s]

Top 1 Accuracy of the network on the all test images: 38.01 %


In [None]:
def compute_top_3_accuracy(model):
    correct = 0
    total = 0
    with torch.no_grad():
        model.eval()
        for x, y in tqdm(test_dataloader):
            x, y = gpu(x.squeeze(dim=0)), gpu(y)
            scores = model(x)
            # aggregated_scores = torch.max(scores, 0, keepdim=True)[0]
            # _, preds = torch.topk(aggregated_scores, 3, dim=1)
            _, preds = torch.topk(scores, 3, dim=1)
            total += y.size(0)
            for i in range(preds.shape[0]):
                top_5_predictions = preds[i]
                label = y[i]
                if label in top_5_predictions:
                    correct += 1
    print('Top 3 Accuracy of the network on the all test images: %.2f %%' % (
        100 * correct / total))
    
compute_top_3_accuracy(model)

  0%|          | 0/1830 [00:00<?, ?it/s]

In [None]:
# Store y_pred and y_test on the test set for evaluation.

y_pred = []
y_test = []
with torch.no_grad():
    model.eval()
    for x, y in test_dataloader:
        x, y = gpu(x.squeeze(dim=0)), gpu(y)
        scores = model(x)
        # aggregated_scores = torch.max(scores, 0, keepdim=True)[0]
        # _, predicted = torch.max(aggregated_scores, 1)
        _, predicted = torch.max(scores, 1)
        y_pred.extend(predicted.cpu().numpy().tolist())
        y_test.extend(y.cpu().numpy().tolist())

In [None]:
cnf_matrix = confusion_matrix(y_test, y_pred)
print('Confusion Matrix\n')
print(cnf_matrix)
print(cnf_matrix[0][0], sum(cnf_matrix[0]))

In [None]:
num_classes = 8
array = cnf_matrix.tolist()
df_cm = pd.DataFrame(array, index = [i for i in range(8)],
                  columns = [i for i in range(8)])
plt.figure(figsize = (8,8))
sns.heatmap(df_cm, annot=True)