In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
# from openslide import OpenSlide
from pathlib import Path
import torch
import torchvision
from torch import nn
from torch.nn.utils.rnn import pad_sequence
from torchvision import transforms
from PIL import Image

from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelBinarizer

from utils.focal_loss import FocalLoss

SEED = 42

  warn(f"Failed to load image Python extension: {e}")


In [2]:
# Load the holdout mapping table from disk. The following cell reads its
# 'slide_id', 'biopsy_id' and 'downsampled_path' columns.
holdout_mapping = pd.read_csv('./csv_dir/holdout_mapping.csv')

In [3]:
# holdout_slide_map layout:
#   key:   slide_id
#   value: (biopsy_id, slide_path, label) -- label is always -1 here.
holdout_slide_map = {
    record['slide_id']: (record['biopsy_id'], record['downsampled_path'], -1)
    for _, record in holdout_mapping.iterrows()
}

In [4]:
class FeatureDataset(torch.utils.data.Dataset):
    """Dataset that loads one saved feature tensor per slide.

    ``slide_map`` maps slide_id -> (biopsy_id, slide_path, label). Only the
    file name of ``slide_path`` is used; the tensor itself is loaded from
    ``./datasets/train/`` when the path contains the substring ``'train'``
    and from ``./datasets/holdout/`` otherwise.
    """

    def __init__(self, slide_map):
        self.slide_map = slide_map
        # Snapshot the records as a list so they can be addressed by position.
        self.data = list(slide_map.values())

    def __len__(self):
        return len(self.slide_map)

    def __getitem__(self, index):
        """Return ``(features, label, biopsy_id)`` for the record at ``index``."""
        biopsy_id, path, label = self.data[index]
        file_name = path.rsplit('/', 1)[-1]
        split_dir = './datasets/train/' if 'train' in path else './datasets/holdout/'
        # Per the original note the tensor is expected to be [3, 1024]
        # (mean/max/min rows) -- not checked here.
        features = torch.load(split_dir + file_name)
        return features, label, biopsy_id

def pad_collate(batch):
    """Collate variable-length samples into one zero-padded batch.

    ``batch`` is a sequence of ``(x, label, biopsy_id)`` samples.

    Returns:
        ``(padded, lengths, labels, biopsy_ids)`` where ``padded`` stacks the
        ``x`` tensors batch-first with zero padding, ``lengths`` is the list of
        original first-dimension sizes, ``labels`` is a tensor built from the
        labels, and ``biopsy_ids`` is the tuple of ids in batch order.
    """
    sequences, labels, biopsy_ids = zip(*batch)
    lengths = list(map(len, sequences))
    padded = pad_sequence(sequences, batch_first=True, padding_value=0)
    return padded, lengths, torch.tensor(labels), biopsy_ids

In [5]:
# Number of samples per DataLoader batch.
batch_size = 1024

# Training hyperparameters; none of the visible inference cells reference them.
epochs = 500
learning_rate = 1e-1
momentum = 0.9
weight_decay = 1.0e-8 # 1e-8

# Model dimensions. Note: the Model class carries its own 1024 / 5 values
# rather than reading these names.
hidden_dim = 1024
num_classes = 5
out_dim = num_classes # class indices [0,1,2,3,4]

# Use the first CUDA device when one is available, otherwise fall back to CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [6]:
# Wrap the holdout slide map in a dataset and batch it. No shuffle flag or
# collate_fn is passed, so the DataLoader defaults apply.
holdout_dataset = FeatureDataset(holdout_slide_map)
holdout_loader = torch.utils.data.DataLoader(holdout_dataset, batch_size=batch_size)

In [7]:
label_binarizer = LabelBinarizer().fit(np.arange(num_classes))

def get_score(y_true, y_pred):
    """Return the macro-averaged one-vs-rest ROC AUC.

    Assumptions carried over from the original notes:
      * ``y_true`` holds discrete class indices, e.g. [0, 1, 2, 3, 4, 3, 2, ...]
      * ``y_pred`` has shape (batch_size, num_classes), with num_classes = 5
        for this task

    ``y_true`` is one-hot encoded with the module-level ``label_binarizer``
    before scoring.
    """
    onehot_targets = label_binarizer.transform(y_true)
    return roc_auc_score(
        onehot_targets,
        y_pred,
        multi_class="ovr",
        average="macro",
    )

In [8]:
class Model(nn.Module):
    """Linear softmax classifier over the first feature row of each sample.

    Args:
        hidden_dim: Width of each input feature row (default 1024).
        num_classes: Number of output classes (default 5).

    ``forward`` only uses ``proj`` followed by a softmax. ``linear1``,
    ``activation`` and ``linear2`` (an MLP head) are not used by ``forward``
    but remain registered so the module's state-dict keys are unchanged; a
    strict ``load_state_dict`` of an existing checkpoint would otherwise
    reject the mismatch.
    """

    def __init__(self, hidden_dim=1024, num_classes=5):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.num_classes = num_classes

        # Layer creation order is kept as before so parameter initialisation
        # consumes the RNG in the same sequence.
        self.linear1 = nn.Linear(self.hidden_dim, self.hidden_dim // 16)
        self.activation = nn.GELU()
        self.linear2 = nn.Linear(self.hidden_dim // 16, self.num_classes)
        self.proj = nn.Linear(self.hidden_dim, self.num_classes)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Return class probabilities of shape (batch, num_classes).

        ``x`` is indexed as ``x[:, 0]``, so it needs at least two leading
        dimensions with ``hidden_dim`` as the last one; per the dataset note,
        index 0 is the mean feature (TODO confirm).
        """
        logits = self.proj(x[:, 0])
        return self.softmax(logits)

# Instantiate the classifier and move its parameters to the selected device.
model = Model().to(device)

In [9]:
# map_location remaps tensors saved on another device (e.g. a checkpoint
# written on a GPU) onto `device`, so loading also works on the CPU fallback.
model.load_state_dict(torch.load('checkpoints/model_0417.ckpt', map_location=device), strict=True)

<All keys matched successfully>

In [10]:
def val_epoch(model, dataloader, device=None):
    """Run inference and aggregate per-slide predictions per biopsy.

    Args:
        model: Module mapping a float batch to per-class scores of shape
            (batch, num_classes).
        dataloader: Iterable of batches. The first element of each batch is
            the input tensor and the last element holds the biopsy ids, so
            both ``(x, label, biopsy_id)`` batches and the
            ``(x, lengths, label, biopsy_id)`` layout produced by
            ``pad_collate`` are accepted.
        device: Device to run on. When omitted, the device of the model's
            first parameter is used (CPU if the model has no parameters).

    Returns:
        dict mapping biopsy_id -> ``(preds, stage)`` where ``preds`` is the
        mean of the model outputs over that biopsy's slides and ``stage`` is
        ``np.argmax(preds)``. An empty dataloader yields an empty dict.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    preds_by_biopsy = {}  # key: biopsy_id, value: list of per-slide outputs
    model.eval()
    with torch.no_grad():
        for batch in dataloader:
            batch_x, batch_biopsy_id = batch[0], batch[-1]
            # Numeric ids are collated into a tensor by the default collate
            # function. Tensor elements hash by identity, so they must be
            # converted to plain Python values before being used as dict keys.
            if torch.is_tensor(batch_biopsy_id):
                batch_biopsy_id = batch_biopsy_id.tolist()

            output = model(batch_x.float().to(device))
            output = output.detach().cpu().numpy().tolist()

            for biopsy_id, slide_output in zip(batch_biopsy_id, output):
                preds_by_biopsy.setdefault(biopsy_id, []).append(slide_output)

    submit_result_dict = {}
    for biopsy_id, slide_outputs in preds_by_biopsy.items():
        preds = np.array(slide_outputs).mean(axis=0)
        stage = np.argmax(preds)
        submit_result_dict[biopsy_id] = (preds, stage)
    return submit_result_dict

In [11]:
# Run inference over the holdout loader. The result maps each biopsy_id to
# (mean per-class prediction, argmax stage).
submit_result_dict = val_epoch(model, holdout_loader)

In [12]:
# Flatten the result dict into three parallel lists (dict iteration order):
# the biopsy ids, the first tuple element and the second tuple element.
biopsy_id_list = list(submit_result_dict)
probability_stage_list = [entry[0] for entry in submit_result_dict.values()]
stage_list = [entry[1] for entry in submit_result_dict.values()]

In [13]:
import csv
probability_stage_ndarray = np.array(probability_stage_list)

# newline="" hands line-ending control to the csv module, as its
# documentation requires; without it some platforms emit blank rows.
with open("submit_0417.csv", "w", newline="") as outfile:
    writer = csv.writer(outfile)
    # No header row is written. Each row is:
    #   biopsy_id, one value per column of the prediction array, stage
    # Iterating row-wise works for any number of classes and writes an
    # empty file (instead of raising) when there are no results.
    for row_id, row_probs, row_stage in zip(biopsy_id_list, probability_stage_ndarray.tolist(), stage_list):
        writer.writerow([row_id, *row_probs, row_stage])