In [1]:
# Goal : To classify the type of ovarian cancer from microscopy scans of biopsy samples.

# Search SoTA model on Kaggle, Papers with code - Medical Data, Graph Transformer, GNN, Global and Local Analysis

# You should manage, including differences in image dimensions, quality, slide staining techniques, and more

# There are 2 types of images - TMA and WSI (25 images are TMA and 513(rest) are WSI)


In [2]:
import sys
sys.prefix

'/projectnb/cs640grp/students/avarshn/.conda/envs/ocean'

# Check if GPU is available

In [3]:
import torch

In [4]:
torch.cuda.is_available()

True

In [5]:
!nvidia smi


/bin/bash: nvidia: command not found


In [6]:
!lspci | grep -i nvidia

02:00.0 3D controller: NVIDIA Corporation GP100GL [Tesla P100 PCIe 12GB] (rev a1)
82:00.0 3D controller: NVIDIA Corporation GP100GL [Tesla P100 PCIe 12GB] (rev a1)


# Import Libraries

In [7]:
import os
from time import time
from collections import defaultdict
from tqdm import tqdm

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

from PIL import Image
import cv2

import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW

from torchsummary import summary

import torchvision
from torchvision import transforms, utils
from torchvision.models import efficientnet_b0, EfficientNet_B0_Weights

# Configurations

In [8]:
CONFIG = {
    "seed": 42,
    # "img_size": 512,
    # "model_name": "tf_efficientnetv2_s_in21ft1k",
    "num_classes": 5,
    # "valid_batch_size": 32,
    "batch_size" : 8,
    "epochs" : 10,
    "device": torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
}

In [9]:
def set_seed(seed=42):
    '''Sets the seed of the entire notebook so results are the same every time we run.
    This is for REPRODUCIBILITY.'''
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # When running on the CuDNN backend, two further options must be set
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Set a fixed value for the hash seed
    os.environ['PYTHONHASHSEED'] = str(seed)
    
set_seed(CONFIG['seed'])

# Loading Data

In [10]:
SCC_folder = "/projectnb/cs640grp/"
data_folder = "materials/UBC-OCEAN_CS640/"

In [11]:
data_path = SCC_folder + data_folder

In [12]:
os.listdir(data_path)

['test_script.py',
 'test_images_compressed_80',
 'train.csv',
 'test.csv',
 'train_images_compressed_80',
 'all_labels.npy']

In [13]:
labels = np.load(data_path +  'all_labels.npy')
labels

array(['HGSC', 'EC', 'MC', 'CC', 'LGSC'], dtype='<U4')

In [14]:
train_df = pd.read_csv(data_path +  'train.csv')
train_df["image_path"] = train_df["image_id"].apply(lambda x : str(x) + ".jpg")
train_df.head()

Unnamed: 0,image_id,label,image_path
0,57598,CC,57598.jpg
1,30868,MC,30868.jpg
2,42549,CC,42549.jpg
3,64824,CC,64824.jpg
4,15293,HGSC,15293.jpg


In [15]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 430 entries, 0 to 429
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   image_id    430 non-null    int64 
 1   label       430 non-null    object
 2   image_path  430 non-null    object
dtypes: int64(1), object(2)
memory usage: 10.2+ KB


In [16]:
train_df["image_id"].nunique()

430

## Training Data Distribution

In [17]:
train_df["label"].value_counts()

label
HGSC    177
EC       99
CC       79
LGSC     38
MC       37
Name: count, dtype: int64

In [18]:
100*train_df["label"].value_counts(normalize = True)

label
HGSC    41.162791
EC      23.023256
CC      18.372093
LGSC     8.837209
MC       8.604651
Name: proportion, dtype: float64

In [19]:
test_df = pd.read_csv(data_path +  'test.csv')
test_df["image_path"] = test_df["image_id"].apply(lambda x : str(x) + ".jpg")
test_df.head()

Unnamed: 0,image_id,label,image_path
0,7482,CC,7482.jpg


In [20]:
# There is class imbalance

# high-grade serous carcinoma, clear-cell ovarian carcinoma, endometrioid, low-grade serous, and mucinous carcinoma

# Can use data augmentation to resolve this as slide can be in any fashion so can rotate the slides

# Custom Dataset

In [21]:


Image.MAX_IMAGE_PIXELS = None


In [22]:
model_weights = EfficientNet_B0_Weights.DEFAULT
preprocess = model_weights.transforms()

In [23]:
# preprocess = transforms.Compose([
#         # transforms.RandomSizedCrop(224),
#         # transforms.RandomHorizontalFlip(),
#         transforms.Resize((4000, 4000)),
#         transforms.ToTensor(),   # Converts a PIL Image or a NumPy array with values in the range [0, 255] to a PyTorch tensor
#                                  # with values in the range [0.0, 1.0].
#         # transforms.Normalize(mean=[0.485, 0.456, 0.406],
#         #                      std=[0.229, 0.224, 0.225])
# ])

In [24]:
class ocean_dataset(Dataset):
    
    def __init__(self, dataframe, image_dir, transform = None):
        
        self.ocean_df = dataframe
        self.ocean_df["image_path"] = self.ocean_df["image_id"].apply(lambda x : str(x) + ".jpg")
        
        self.image_dir = image_dir
        self.transform = transform

    def __len__(self):
        return len(self.ocean_df)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        img_path = os.path.join(self.image_dir, self.ocean_df["image_path"][idx])
        image = Image.open(img_path)

        # # Convert Pillow image to NumPy array
        # img_array = np.array(image)
        
        # # Display the range of pixel values
        # print(f"Minimum pixel value: {np.min(img_array)}")
        # print(f"Maximum pixel value: {np.max(img_array)}")


        if self.transform:
            image = self.transform(image)

        label_mapper = {'HGSC' : 0, 'EC' : 1, 'MC' : 2, 'CC' : 3, 'LGSC' : 4}
        label = label_mapper[self.ocean_df.iloc[idx, 1]]

        
        # sample = {'image': image, 'label': label}

        return image, label
        

In [25]:
df = pd.read_csv(data_path +  'train.csv')

# For testing the pipeline
# df = df[0:30]

train_df, test_df = train_test_split(df, test_size = 0.3, random_state = CONFIG['seed'])

train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

In [26]:
train_dataset = ocean_dataset(train_df, data_path + 'train_images_compressed_80/', preprocess)

test_dataset = ocean_dataset(test_df, data_path + 'train_images_compressed_80/', preprocess)

In [27]:
# a,b = train_dataset.__getitem__(torch.tensor(8))

In [28]:
# b

0

# Class Weights - From Training Data since the dataset is imbalanced

In [29]:
train_df["label"]

0        CC
1      HGSC
2      HGSC
3        MC
4        MC
       ... 
296    HGSC
297    HGSC
298      EC
299      EC
300    HGSC
Name: label, Length: 301, dtype: object

In [30]:
label_mapper = {'HGSC' : 0, 'EC' : 1, 'MC' : 2, 'CC' : 3, 'LGSC' : 4}
labels = train_df["label"].apply(lambda x: label_mapper[x])
labels_arr = labels.values

In [31]:
train_df["label"].value_counts()

label
HGSC    115
EC       73
CC       63
MC       27
LGSC     23
Name: count, dtype: int64

In [32]:
# Formula -
# class_weight_of_that_class = n_samples / (n_classes * n_samples_of_that_class)

In [33]:
# NumPy array of class labels - labels_arr

class_weights = compute_class_weight('balanced',classes = np.unique(labels_arr), y = labels_arr)

In [34]:
class_weights

array([0.52347826, 0.82465753, 2.22962963, 0.95555556, 2.6173913 ])

In [35]:
class_weights_tensor = torch.tensor(class_weights)
class_weights_tensor

tensor([0.5235, 0.8247, 2.2296, 0.9556, 2.6174], dtype=torch.float64)

# Train

In [36]:
base_model = efficientnet_b0(weights = model_weights)

In [37]:
for i, layer in enumerate(base_model.children()):
    print(i , layer)
    print("*-"*20)

0 Sequential(
  (0): Conv2dNormActivation(
    (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
    (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): SiLU(inplace=True)
  )
  (1): Sequential(
    (0): MBConv(
      (block): Sequential(
        (0): Conv2dNormActivation(
          (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
          (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (2): SiLU(inplace=True)
        )
        (1): SqueezeExcitation(
          (avgpool): AdaptiveAvgPool2d(output_size=1)
          (fc1): Conv2d(32, 8, kernel_size=(1, 1), stride=(1, 1))
          (fc2): Conv2d(8, 32, kernel_size=(1, 1), stride=(1, 1))
          (activation): SiLU(inplace=True)
          (scale_activation): Sigmoid()
        )
        (2): Conv2dNormActivation(
          (0): Conv2d(32, 16, kernel_size=(1, 1), stride=(1, 1)

In [38]:
base_layers = nn.Sequential()
for i, layer in enumerate(base_model.children()):
    if i <2:
        base_layers.add_module(str(i), layer)

In [39]:
class Baseline_Model(nn.Module):

    def __init__(self):
        super().__init__()
        
        self.base_model_headless = base_layers
        self.base_model_headless.requires_grad = False
        
        self.fc1 = nn.Linear(1280, 512)
        self.fc2 = nn.Linear(512, 128)
        self.fc3 = nn.Linear(128, 5)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p = 0.1)


    def forward(self, x):
        x = self.base_model_headless(x)
        # print("Output shape after Passing through Pretrained Model")
        # print(x.shape)

        # Flatten
        x = x.view((x.shape[0], -1))

        # FCNN
        x = self.fc1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc3(x)
        # print(x.shape)

        return x      

In [40]:
# https://github.com/pytorch/vision/issues/7744

from torchvision.models._api import WeightsEnum
from torch.hub import load_state_dict_from_url

def get_state_dict(self, *args, **kwargs):
    kwargs.pop("check_hash")
    return load_state_dict_from_url(self.url, *args, **kwargs)
WeightsEnum.get_state_dict = get_state_dict

In [41]:
# Initialize Model
model = Baseline_Model().to(CONFIG["device"])

In [42]:
summary(model, input_size = (3,224,224))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 32, 112, 112]             864
       BatchNorm2d-2         [-1, 32, 112, 112]              64
              SiLU-3         [-1, 32, 112, 112]               0
            Conv2d-4         [-1, 32, 112, 112]             288
       BatchNorm2d-5         [-1, 32, 112, 112]              64
              SiLU-6         [-1, 32, 112, 112]               0
 AdaptiveAvgPool2d-7             [-1, 32, 1, 1]               0
            Conv2d-8              [-1, 8, 1, 1]             264
              SiLU-9              [-1, 8, 1, 1]               0
           Conv2d-10             [-1, 32, 1, 1]             288
          Sigmoid-11             [-1, 32, 1, 1]               0
SqueezeExcitation-12         [-1, 32, 112, 112]               0
           Conv2d-13         [-1, 16, 112, 112]             512
      BatchNorm2d-14         [-1, 16, 1

In [43]:
trainloader = DataLoader(train_dataset, batch_size = CONFIG['batch_size'], shuffle = True, drop_last = True, num_workers = 2)

testloader = DataLoader(test_dataset, batch_size = CONFIG['batch_size'], shuffle = False, drop_last = True, num_workers = 2)

In [44]:
def train_one_epoch(model, trainloader, device, optimizer, criterion):
    """
      Train 1 epoch on trainloader."
    """

    model = model.train()
    
    # Keep track of loss and other evaluation metrics
    train_loss = 0.0

    YPredict = []
    YTrue = []

    ## Loop over all the batches
    for i, (images, labels) in tqdm(enumerate(trainloader, 1), total=len(trainloader), desc=f"Training on whole batch..."):
            
        # Move images and labels to `device` (CPU or GPU)
        images = images.to(device)
        labels = labels.to(device)

        logits = model(images)

        # print(logits)
        # print(labels)
        # print(logits.dtype, labels.dtype)
        # print(type(labels[0]))
        # print(type(labels))
        loss = criterion(logits, labels)
        # loss = criterion(logits, labels.to(torch.long))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Compute loss and accuracy for this batch
        train_loss += loss.detach().item()

        predictions = torch.argmax(logits, dim = 1)
        predictions = predictions.view(-1)
        YTrue.extend(labels.tolist())
        YPredict.extend(predictions.tolist())

    # Compute Scores
    accuracy = accuracy_score(YTrue, YPredict)
    f1 = f1_score(YTrue, YPredict, average = "macro")

    return (train_loss/i, accuracy, f1) ## avg loss and acc over all batches


In [46]:
# Use `torch.no_grad()` here to disable gradient calculation. 
# It will reduce memory consumption as we don't need to compute gradients in inference.

@torch.no_grad()
def evaluate(model, testloader, criterion, device):
    
    model.eval()
    
    # Keep track of loss and other evaluation metrics
    test_loss = 0.0

    YPredict = []
    YTrue = []
    
    # Loop through each batch on test set
    for i, (images, labels) in enumerate(testloader, 1):
        
        images = images.to(device)
        labels = labels.to(device)
        
        logits = model(images)

        loss = criterion(logits, labels)
        
        test_loss += loss.detach().item()

        predictions = torch.argmax(logits, dim = 1)
        predictions = predictions.view(-1)
        YTrue.extend(labels.tolist())
        YPredict.extend(predictions.tolist())

    # Compute Scores
    accuracy = accuracy_score(YTrue, YPredict)
    f1 = f1_score(YTrue, YPredict, average = "macro")

    return (test_loss/i, accuracy, f1) ## avg loss and acc over all batches


In [47]:
def train_model(model, trainloader, testloader, CONFIG):
    
    optimizer = AdamW(model.parameters(), lr = 0.01, weight_decay = 1e-5)

    # criterion = nn.CrossEntropyLoss(weight = class_weights_tensor)

    criterion = nn.CrossEntropyLoss()


    ## Measure runtime
    t_start = time()
    
    ## Store training log
    history = defaultdict(list)
    
    # We will train the model `num_epochs` times
    for i in range(1, CONFIG["epochs"]+1):

        # Training
        train_loss, train_acc, train_f1 = train_one_epoch(model = model, trainloader = trainloader, optimizer = optimizer, criterion = criterion, device = CONFIG["device"])

        # Model Checkpointing
        torch.save(model, f'models/Baseline_model/epoch{i}_batch_size{CONFIG["batch_size"]}.pth')

        # Evaluate on test set
        test_loss, test_acc, test_f1 = evaluate(model = model, testloader = testloader, criterion = criterion, device = CONFIG["device"])

        ## Store train/test loss, accuracy, f1 scores
        history["train_loss"].append(train_loss)
        history["train_acc"].append(train_acc)
        history["train_f1"].append(train_f1)
        
        history["test_loss"].append(test_loss)
        history["test_acc"].append(test_acc)
        history["test_f1"].append(test_f1)

        
        
        ## print out train/test loss, accuracy
        print(f'Epoch: {i} | Runtime: {((time()-t_start)/60):.2f}[m] | train_loss: {train_loss:.3f} | test_loss: {test_loss:.3f} | train_acc: {train_acc:.3f} |  test_acc: {test_acc:.3f} | train_f1: {train_f1:.3f} | test_f1: {test_f1:.3f}')    
    return history

In [None]:
# Train Model


history_model = train_model(model, trainloader, testloader, CONFIG)

Training on whole batch...: 100%|██████████| 37/37 [07:07<00:00, 11.54s/it]


Epoch: 1 | Runtime: 10.27[m] | train_loss: 1.829 | test_loss: 8.623 | train_acc: 0.314 |  test_acc: 0.336 | train_f1: 0.191 | test_f1: 0.126


Training on whole batch...: 100%|██████████| 37/37 [07:30<00:00, 12.18s/it]


Epoch: 2 | Runtime: 20.97[m] | train_loss: 1.528 | test_loss: 1.545 | train_acc: 0.321 |  test_acc: 0.203 | train_f1: 0.133 | test_f1: 0.068


Training on whole batch...: 100%|██████████| 37/37 [07:04<00:00, 11.49s/it]


Epoch: 3 | Runtime: 31.24[m] | train_loss: 1.558 | test_loss: 1.454 | train_acc: 0.338 |  test_acc: 0.484 | train_f1: 0.162 | test_f1: 0.131


Training on whole batch...: 100%|██████████| 37/37 [07:39<00:00, 12.43s/it]


Epoch: 4 | Runtime: 42.09[m] | train_loss: 1.537 | test_loss: 1.467 | train_acc: 0.382 |  test_acc: 0.484 | train_f1: 0.127 | test_f1: 0.131


Training on whole batch...: 100%|██████████| 37/37 [07:10<00:00, 11.64s/it]


Epoch: 5 | Runtime: 52.44[m] | train_loss: 1.508 | test_loss: 1.495 | train_acc: 0.378 |  test_acc: 0.438 | train_f1: 0.110 | test_f1: 0.141


Training on whole batch...: 100%|██████████| 37/37 [07:14<00:00, 11.76s/it]


Epoch: 6 | Runtime: 62.86[m] | train_loss: 1.504 | test_loss: 1.440 | train_acc: 0.372 |  test_acc: 0.484 | train_f1: 0.139 | test_f1: 0.131


Training on whole batch...: 100%|██████████| 37/37 [07:07<00:00, 11.56s/it]


# Plots

In [None]:
## Plot train_loss, test_loss
num_epochs = CONFIG['epochs']
plt.plot(np.arange(num_epochs), history_model["train_loss"], label='train loss')
plt.plot(np.arange(num_epochs), history_model["test_loss"], label='test loss')
plt.xlabel('#epochs')
plt.ylabel('loss')
plt.legend()

In [None]:
## Plot accuracy

plt.plot(np.arange(num_epochs), history_model["train_acc"], label='train acc')
plt.plot(np.arange(num_epochs), history_model["test_acc"], label='test acc')
plt.xlabel('#epochs')
plt.ylabel('accuracy')
plt.legend()

In [None]:
## Plot accuracy

plt.plot(np.arange(num_epochs), history_model["train_f1"], label='train f1')
plt.plot(np.arange(num_epochs), history_model["test_f1"], label='test f1')
plt.xlabel('#epochs')
plt.ylabel('f1')
plt.legend()

In [None]:
# torch.save(model.state_dict(), f'models/Baseline_model/state_dict_epoch{i}_batch_size{CONFIG["batch_size"]}.pth')