In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from openslide import OpenSlide
from pathlib import Path
import torch
import torchvision
from torch import nn
from torch.nn.utils.rnn import pad_sequence
from torchvision import transforms
from PIL import Image

from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelBinarizer

from utils.focal_loss import FocalLoss

SEED = 42

# Actually apply the seed: without these calls SEED is never consumed, so weight
# initialisation and any numpy-based randomness differ on every kernel restart.
np.random.seed(SEED)
torch.manual_seed(SEED)

In [2]:
# Biopsy-level outcome tables (columns per the original note: biopsy_id, label).
train_df, test_df = (
    pd.read_csv('./csv_dir/train_outcomes.csv'),
    pd.read_csv('./csv_dir/test_outcomes.csv'),
)

# Slide-level mapping tables (per the original note: slide_id, biopsy_id, img path).
train_mapping, test_mapping = (
    pd.read_csv('./csv_dir/train_mapping.csv'),
    pd.read_csv('./csv_dir/test_mapping.csv'),
)

In [3]:
# train_outcome_map: biopsy_id -> label (stage number 0-4 per the original note).
# NOTE(review): the original note said NaN labels are excluded, but nothing in
# this cell filters them — presumably the CSV is pre-filtered; confirm.
#
# The maps are built column-wise instead of with DataFrame.iterrows():
# iterrows() packs every row into a Series, which is slow and can silently
# up-cast ids/labels to a common dtype. `.tolist()` yields native Python scalars.
train_outcome_map = dict(
    zip(train_df['biopsy_id'].tolist(), train_df['label'].tolist())
)

# train_slide_map: slide_id -> (biopsy_id, slide_path, label).
# A biopsy_id that is missing from train_outcome_map raises KeyError.
train_slide_map = {
    slide_id: (biopsy_id, slide_path, train_outcome_map[biopsy_id])
    for slide_id, biopsy_id, slide_path in zip(
        train_mapping['slide_id'].tolist(),
        train_mapping['biopsy_id'].tolist(),
        train_mapping['downsampled_path'].tolist(),
    )
}

In [4]:
# test_outcome_map: biopsy_id -> label (stage number 0-4 per the original note).
# NOTE(review): the original note said NaN labels are excluded, but nothing in
# this cell filters them — presumably the CSV is pre-filtered; confirm.
#
# The maps are built column-wise instead of with DataFrame.iterrows():
# iterrows() packs every row into a Series, which is slow and can silently
# up-cast ids/labels to a common dtype. `.tolist()` yields native Python scalars.
test_outcome_map = dict(
    zip(test_df['biopsy_id'].tolist(), test_df['label'].tolist())
)

# test_slide_map: slide_id -> (biopsy_id, slide_path, label).
# A biopsy_id that is missing from test_outcome_map raises KeyError.
test_slide_map = {
    slide_id: (biopsy_id, slide_path, test_outcome_map[biopsy_id])
    for slide_id, biopsy_id, slide_path in zip(
        test_mapping['slide_id'].tolist(),
        test_mapping['biopsy_id'].tolist(),
        test_mapping['downsampled_path'].tolist(),
    )
}

In [5]:
# Sanity check: number of entries in the training map (one per slide_id).
len(train_slide_map)

6810

In [6]:
class FeatureDataset(torch.utils.data.Dataset):
    """Index-addressable dataset over the values of a slide map.

    Each value of ``slide_map`` is a ``(biopsy_id, path, label)`` tuple; indexing
    the dataset loads the object stored at ``path`` with ``torch.load`` and
    returns ``(loaded_object, label, biopsy_id)``.
    """

    def __init__(self, slide_map):
        self.slide_map = slide_map
        # Freeze the values into a list so samples can be fetched by position.
        self.data = [entry for entry in slide_map.values()]

    def __len__(self):
        return len(self.slide_map)

    def __getitem__(self, index):
        record = self.data[index]
        biopsy_id, feature_path, label = record
        features = torch.load(feature_path)
        return features, label, biopsy_id

def pad_collate(batch):
    """Collate ``(x, label, biopsy_id)`` samples into a single zero-padded batch.

    Returns a 4-tuple ``(xx_pad, x_lens, label, biopsy_id)``:
      * ``xx_pad``    - the ``x`` tensors padded with 0 along their first
                        dimension and stacked batch-first,
      * ``x_lens``    - list with the original ``len`` of every ``x``,
      * ``label``     - the labels gathered into one tensor,
      * ``biopsy_id`` - tuple of the biopsy ids, in batch order.
    """
    sequences, labels, biopsy_ids = zip(*batch)
    lengths = list(map(len, sequences))
    padded = pad_sequence(sequences, batch_first=True, padding_value=0)
    return padded, lengths, torch.tensor(labels), biopsy_ids

In [7]:
# ---- Optimisation settings ----
batch_size = 8
epochs = 50
learning_rate = 1e-3
momentum = 0.9       # only referenced by the commented-out SGD optimizer
weight_decay = 1e-4  # 1e-8 was noted as an alternative

# ---- Task dimensions ----
num_classes = 5          # labels are [0,1,2,3,4]
out_dim = num_classes
# NOTE(review): annotated "ResNet50" originally, but the Model class in this
# notebook sets its own hidden_dim (1024) and does not read this value.
hidden_dim = 2048

# ---- Compute device: first CUDA device when available, otherwise CPU ----
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [8]:
train_dataset = FeatureDataset(train_slide_map)
# Shuffle the training slides every epoch. train_epoch caps the number of steps
# per epoch, so an unshuffled loader would feed it the same leading slides each
# time and never touch the rest. The generator is seeded so the order is repeatable.
train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    generator=torch.Generator().manual_seed(SEED),
    collate_fn=pad_collate,
)

test_dataset = FeatureDataset(test_slide_map)
# Evaluation order stays fixed.
test_loader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=pad_collate,
)

In [9]:
# len(train_loader)
# for idx, t in enumerate(train_loader):
#     if idx == 10: break
#     a, length, b,c = t
#     print(a.shape, length, b,c)

In [10]:
def get_ce_loss(y_pred, y_true):
    """Cross-entropy of ``y_pred`` against ``y_true`` using PyTorch's default settings."""
    return nn.functional.cross_entropy(y_pred, y_true)

def get_focal_loss(y_pred, y_true):
    """Apply a default-constructed ``FocalLoss`` to the predictions and targets."""
    return FocalLoss()(y_pred, y_true)

# Loss callable handed to train_epoch by the training loop; assign get_ce_loss
# here instead to train with cross-entropy.
criterion = get_focal_loss

In [11]:
# One-hot encoder fitted on the class ids 0 .. num_classes - 1.
label_binarizer = LabelBinarizer()
label_binarizer.fit(np.arange(num_classes))


def get_score(y_true, y_pred):
    """Macro-averaged ROC AUC of ``y_pred`` against the one-hot encoding of ``y_true``.

    y_true: sequence of discrete class ids, e.g. [0, 1, 2, 3, 4, 3, 2, ...].
    y_pred: array-like of per-sample class scores with one column per class
            (num_classes = 5 for this task).
    """
    onehot_truth = label_binarizer.transform(y_true)
    return roc_auc_score(onehot_truth, y_pred, multi_class="ovr", average="macro")

In [12]:
class Model(nn.Module):
    """Per-vector MLP whose class scores are averaged per sample and soft-maxed.

    Every feature vector goes through ``Linear(hidden_dim, hidden_dim // 16)``,
    ReLU and ``Linear(hidden_dim // 16, num_classes)``. For each sample the
    resulting scores are averaged over its first ``x_len[i]`` positions (padding
    is ignored) and then normalised with a softmax over the classes.

    Args:
        hidden_dim: size of each input feature vector (default 1024).
        num_classes: number of output classes (default 5).

    NOTE(review): ``forward`` returns probabilities, not logits.
    ``nn.CrossEntropyLoss`` applies log-softmax itself, so feeding it this output
    (e.g. through ``get_ce_loss``) normalises twice. Whether ``FocalLoss`` expects
    probabilities cannot be determined from this notebook — confirm.
    """

    def __init__(self, hidden_dim=1024, num_classes=5):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.num_classes = num_classes

        self.linear1 = nn.Linear(self.hidden_dim, self.hidden_dim // 16)
        self.activation = nn.ReLU()
        self.linear2 = nn.Linear(self.hidden_dim // 16, self.num_classes)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x, x_len):
        """Return class probabilities for a padded batch.

        Args:
            x: padded features of shape ``(batch, max_len, hidden_dim)``.
            x_len: per-sample count of valid (non-padding) positions.

        Returns:
            Tensor of shape ``(batch, num_classes)`` on the same device and with
            the same dtype as ``x``. A sample with length 0 yields NaN.
        """
        x = self.linear2(self.activation(self.linear1(x)))  # (batch, max_len, num_classes)

        max_len = x.shape[1]
        # Lengths live on x's device so the result never silently falls back to
        # the CPU (the previous torch.zeros buffer was always allocated there).
        lengths = torch.as_tensor(x_len, device=x.device).clamp(max=max_len)
        positions = torch.arange(max_len, device=x.device)
        valid = positions.unsqueeze(0) < lengths.unsqueeze(1)  # (batch, max_len)

        # Masked mean over the sequence axis: zero out padding, sum, divide by length.
        summed = x.masked_fill(~valid.unsqueeze(-1), 0.0).sum(dim=1)
        out = summed / lengths.unsqueeze(1).to(x.dtype)
        return self.softmax(out)

# Build the network on the selected device and optimise all of its parameters with AdamW.
model = Model()
model = model.to(device)
optimizer = torch.optim.AdamW(
    params=model.parameters(),
    lr=learning_rate,
    weight_decay=weight_decay,
)
# optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=momentum, weight_decay=weight_decay)

In [13]:
def train_epoch(model, dataloader, loss_fn, optimizer, max_steps=200):
    """Run one (optionally truncated) training pass and return the mean batch loss.

    Args:
        model: module called as ``model(batch_x, batch_len)``.
        dataloader: iterable yielding ``(batch_x, batch_len, batch_y, batch_biopsy_id)``.
        loss_fn: callable ``loss_fn(output, batch_y)`` returning a scalar tensor.
        optimizer: optimizer that updates ``model``'s parameters.
        max_steps: stop after this many batches; ``None`` consumes the whole
            loader. Defaults to 200, the cap that used to be hard-coded.

    Returns:
        Mean of the per-batch losses, or ``nan`` if no batch was processed.
    """
    # Follow the model's own device rather than a notebook-level global, so the
    # function keeps working regardless of cell execution order.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    train_loss = []
    model.train()
    for step, data in enumerate(dataloader):
        # Check the cap before logging so no step is announced that never runs.
        if max_steps is not None and step >= max_steps:
            break
        if step % 50 == 0:
            print(f"Training... Step={step}")

        batch_x, batch_len, batch_y, batch_biopsy_id = data
        batch_x = batch_x.float().to(target_device)
        batch_y = batch_y.long().to(target_device)

        optimizer.zero_grad()
        output = model(batch_x, batch_len)
        loss = loss_fn(output, batch_y)
        train_loss.append(loss.item())
        loss.backward()
        optimizer.step()

    if not train_loss:
        return float("nan")
    return np.mean(train_loss)

def val_epoch(model, dataloader, max_steps=200, score_fn=None):
    """Score the model at biopsy level on (a prefix of) ``dataloader``.

    Per-slide outputs are grouped by biopsy id and averaged into one score
    vector per biopsy; the resulting matrix is scored against one label per biopsy.

    Args:
        model: module called as ``model(batch_x, batch_len)``.
        dataloader: iterable yielding ``(batch_x, batch_len, batch_y, batch_biopsy_id)``.
        max_steps: stop after this many batches; ``None`` consumes the whole
            loader. Defaults to 200, the cap that used to be hard-coded.
        score_fn: callable ``score_fn(y_true, y_pred)``; defaults to ``get_score``.

    Returns:
        Whatever ``score_fn`` returns for the aggregated biopsy-level arrays.
    """
    if score_fn is None:
        score_fn = get_score

    # Follow the model's own device rather than a notebook-level global.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    y_pred = {}  # biopsy_id -> list of per-slide class-score vectors
    y_true = {}  # biopsy_id -> label of the first slide seen for that biopsy

    model.eval()
    with torch.no_grad():
        for step, data in enumerate(dataloader):
            # Check the cap before logging so no step is announced that never runs.
            if max_steps is not None and step >= max_steps:
                break
            if step % 50 == 0:
                print(f"Validating... Step={step}")

            batch_x, batch_len, batch_y, batch_biopsy_id = data
            batch_x = batch_x.float().to(target_device)
            slide_preds = model(batch_x, batch_len).cpu().tolist()
            slide_labels = batch_y.long().tolist()

            for biopsy_id, pred, label in zip(batch_biopsy_id, slide_preds, slide_labels):
                y_pred.setdefault(biopsy_id, []).append(pred)
                # Labels are categorical, so keep one label per biopsy instead of
                # averaging them: a mean of class ids is only meaningful when all
                # slides of the biopsy agree, in which case it equals this value.
                y_true.setdefault(biopsy_id, label)

    prediction_list = np.array([np.mean(preds, axis=0) for preds in y_pred.values()])
    ground_truth_list = np.array([y_true[biopsy_id] for biopsy_id in y_pred])
    return score_fn(ground_truth_list, prediction_list)

In [None]:
best_score = -1e8
valid_step = 1   # validate every `valid_step` epochs
patience = 20    # stop once this many epochs pass without a better validation score
checkpoint_path = "./checkpoints/model_0417.ckpt"

# torch.save does not create missing directories, so make sure the target exists.
Path(checkpoint_path).parent.mkdir(parents=True, exist_ok=True)

# NOTE(review): the checkpoint is selected on `test_loader`; if that split is also
# the final hold-out set, the reported score is optimistically biased — confirm.
early_stop_cnt = 0
for epoch in range(epochs):
    train_loss = train_epoch(model, train_loader, criterion, optimizer)
    print(f"Epoch {epoch}: Loss = {train_loss}")
    if epoch % valid_step == 0:
        metric_valid = val_epoch(model, test_loader)
        print("Val Score:", metric_valid)
        if metric_valid > best_score:
            early_stop_cnt = 0
            best_score = metric_valid
            print(f"Saving best model at Epoch {epoch}")
            torch.save(model.state_dict(), checkpoint_path)
        else:
            early_stop_cnt += valid_step
    # `>=` rather than `==`: the counter advances in steps of `valid_step`, so it
    # can jump over the exact patience value and never stop.
    if early_stop_cnt >= patience:
        break

Training... Step=0
Training... Step=50
Training... Step=100
Training... Step=150
Training... Step=200
Epoch 0: Loss = 0.6673073956370353
Validating... Step=0
Validating... Step=50
Validating... Step=100
Validating... Step=150
Validating... Step=200
Val Score: 0.5266089833696275
Saving best model at Epoch 0
Training... Step=0
Training... Step=50
Training... Step=100
Training... Step=150
Training... Step=200
Epoch 1: Loss = 0.6456900563836098
Validating... Step=0


In [None]:
# Re-run validation on the test loader with the weights `model` currently holds.
# NOTE(review): these are the in-memory weights from the last executed epoch, not
# necessarily the best checkpoint written by the training loop — confirm intent.
metric_valid = val_epoch(model, test_loader)