In [2]:
import os
from pathlib import Path

import pandas as pd
import numpy as np
from tqdm import tqdm
from openslide import OpenSlide

import torch
from torch import nn
from torch.utils.data import (
    ConcatDataset,
    DataLoader,
    Dataset,
    Subset,
    SubsetRandomSampler,
    TensorDataset,
    random_split,
)

import torchvision
from torchvision import transforms
from PIL import Image

# import einops

In [3]:
# Outcome tables; downstream cells read the biopsy_id and label columns.
train_df, test_df, holdout_df = (
    pd.read_csv(outcomes_csv)
    for outcomes_csv in (
        './csv_dir/train_outcomes.csv',
        './csv_dir/test_outcomes.csv',
        './csv_dir/holdout_outcomes.csv',
    )
)

# Slide mapping tables; downstream cells read slide_id, biopsy_id and downsampled_path.
train_mapping, test_mapping, holdout_mapping = (
    pd.read_csv(mapping_csv)
    for mapping_csv in (
        './csv_dir/train_mapping.csv',
        './csv_dir/test_mapping.csv',
        './csv_dir/holdout_mapping.csv',
    )
)

In [4]:
# biopsy_id -> stage label (noted originally as 0,1,2,3,4 with NaN excluded).
train_outcome_map = {
    record['biopsy_id']: record['label']
    for _, record in train_df.iterrows()
}

# slide_id -> (biopsy_id, slide_path)
train_slide_map = {
    record['slide_id']: (record['biopsy_id'], record['downsampled_path'])
    for _, record in train_mapping.iterrows()
}


In [5]:
# biopsy_id -> stage label (noted originally as 0,1,2,3,4 with NaN excluded).
test_outcome_map = {
    record['biopsy_id']: record['label']
    for _, record in test_df.iterrows()
}

# slide_id -> (biopsy_id, slide_path)
test_slide_map = {
    record['slide_id']: (record['biopsy_id'], record['downsampled_path'])
    for _, record in test_mapping.iterrows()
}


In [6]:
# slide_id -> (biopsy_id, slide_path)
holdout_slide_map = {
    record['slide_id']: (record['biopsy_id'], record['downsampled_path'])
    for _, record in holdout_mapping.iterrows()
}

In [7]:
# One entry per slide, in train_slide_map iteration order.
# train_x holds (biopsy_id, img_path) pairs; train_y holds the stage label of each pair's biopsy.
train_x = [
    (biopsy_id, img_path)
    for biopsy_id, img_path in train_slide_map.values()
]
train_y = [train_outcome_map[biopsy_id] for biopsy_id, _ in train_x]

In [8]:
# One entry per slide, in test_slide_map iteration order.
# test_x holds (biopsy_id, img_path) pairs; test_y holds the stage label of each pair's biopsy.
test_x = [
    (biopsy_id, img_path)
    for biopsy_id, img_path in test_slide_map.values()
]
test_y = [test_outcome_map[biopsy_id] for biopsy_id, _ in test_x]

In [9]:
# One (biopsy_id, img_path) pair per slide, in holdout_slide_map iteration order.
holdout_x = [
    (biopsy_id, img_path)
    for biopsy_id, img_path in holdout_slide_map.values()
]
# No outcomes are looked up for the holdout split; every slide gets a placeholder label of 0.
holdout_y = [0] * len(holdout_x)

In [10]:
# Sanity check: each x list should be the same length as its y list.
tuple(len(split) for split in (train_x, train_y, test_x, test_y, holdout_x))

(10206, 10206, 10205, 10205, 16607)

In [11]:
# Per-channel statistics shared by both pipelines
# (presumably the standard ImageNet mean/std — TODO confirm).
_norm_mean = [0.485, 0.456, 0.406]
_norm_std = [0.229, 0.224, 0.225]

# Training pipeline: resize, random crop / rotation / horizontal flip,
# then tensor conversion and normalisation.
_train_steps = [
    transforms.Resize(224),
    transforms.RandomResizedCrop(size=224, scale=(0.8, 1.0)),
    transforms.RandomRotation(degrees=15),
    transforms.RandomHorizontalFlip(),
    transforms.CenterCrop(size=224),
    transforms.ToTensor(),
    transforms.Normalize(_norm_mean, _norm_std),
]
transform_aug_train = transforms.Compose(_train_steps)

# Evaluation pipeline: same resize / centre crop / normalisation, with no random steps.
_test_steps = [
    transforms.Resize(224),
    transforms.CenterCrop(size=224),
    transforms.ToTensor(),
    transforms.Normalize(_norm_mean, _norm_std),
]
transform_aug_test = transforms.Compose(_test_steps)

In [12]:
class ImageDataset(Dataset):
    """Map-style dataset that loads one slide image from disk per index.

    Args:
        x: sequence of ``(biopsy_id, img_path)`` pairs.
        y: sequence of labels, indexed in parallel with ``x``.
        mode: ``'train'`` applies ``transform_aug_train``; ``'test'`` applies
            ``transform_aug_test``.

    Raises:
        ValueError: if ``mode`` is not ``'train'`` or ``'test'``.
    """

    _VALID_MODES = ('train', 'test')

    def __init__(self, x, y, mode='train'):
        # Reject unknown modes here: previously they were accepted and only
        # surfaced later as an UnboundLocalError inside __getitem__.
        if mode not in self._VALID_MODES:
            raise ValueError(
                f"mode must be one of {self._VALID_MODES}, got {mode!r}"
            )
        self.x = x  # (biopsy_id, img_path) pairs
        self.y = y  # labels
        self.mode = mode  # 'train' or 'test'

    def __getitem__(self, index):
        """Return ``(biopsy_id, image_tensor, label)`` for the sample at ``index``."""
        biopsy_id, path = self.x[index]

        # The transforms are looked up at call time, so redefining them in a
        # later cell still takes effect for existing dataset objects.
        if self.mode == 'train':
            transform = transform_aug_train
        elif self.mode == 'test':
            transform = transform_aug_test
        else:
            # Only reachable if self.mode was reassigned after construction.
            raise ValueError(
                f"mode must be one of {self._VALID_MODES}, got {self.mode!r}"
            )

        # The context manager closes the file handle deterministically. The
        # transform runs inside the block because PIL loads pixel data lazily.
        with Image.open(path) as x_pil:
            # Both transforms end in a 3-channel Normalize, so force RGB.
            x_tensor = transform(x_pil.convert('RGB'))
        return biopsy_id, x_tensor, self.y[index]

    def __len__(self):
        """Number of samples, i.e. the number of ``(biopsy_id, img_path)`` pairs."""
        return len(self.x)

In [13]:
# Data loading
batch_size = 2

# Optimisation hyper-parameters
epochs, learning_rate = 20, 1e-3
momentum, weight_decay = 0.9, 0  # weight decay off; 1e-8 was the alternative noted originally

# First CUDA device when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [14]:
# One dataset per split; only the training split uses the augmenting transform.
train_dataset = ImageDataset(train_x, train_y, mode='train')
test_dataset = ImageDataset(test_x, test_y, mode='test')
holdout_dataset = ImageDataset(holdout_x, holdout_y, mode='test')

# Each split gets a loader with the same batch size and default settings.
# NOTE(review): the training loader does not shuffle — confirm this is intended.
train_loader, test_loader, holdout_loader = (
    DataLoader(split_dataset, batch_size=batch_size)
    for split_dataset in (train_dataset, test_dataset, holdout_dataset)
)

In [15]:
# for data in holdout_loader:
#     biopsy_id, x, y = data
#     print(biopsy_id, x.shape, y)

In [16]:
# Pre-compute the transformed tensor for every training slide and cache the
# result (tensors, labels, biopsy ids) as a single pickle.
# NOTE(review): transform_aug_train contains random ops, so each cached tensor
# is one fixed random draw of the augmentation — confirm this is intended.

# pd.to_pickle does not create missing parent directories.
os.makedirs('./datasets', exist_ok=True)

train_x_list = []
train_biopsy_id_list = []
for biopsy_id, path in tqdm(train_x):
    train_biopsy_id_list.append(biopsy_id)
    # Close the image file as soon as its tensor has been produced.
    with Image.open(path) as x_pil:
        train_x_list.append(transform_aug_train(x_pil))

pd.to_pickle(
    {'x': train_x_list, 'y': train_y, 'id': train_biopsy_id_list},
    './datasets/train.pkl',
)

# Drop the large tensor list from the kernel. train_y is kept on purpose:
# deleting it made this cell and the dataset cell fail with NameError on re-run.
del train_x_list, train_biopsy_id_list

100%|██████████| 10206/10206 [00:48<00:00, 211.02it/s]


In [19]:
# Pre-compute the transformed tensor for every test slide and cache the
# result (tensors, labels, biopsy ids) as a single pickle.

# pd.to_pickle does not create missing parent directories.
os.makedirs('./datasets', exist_ok=True)

test_x_list = []
test_biopsy_id_list = []
for biopsy_id, path in tqdm(test_x):
    test_biopsy_id_list.append(biopsy_id)
    # Close the image file as soon as its tensor has been produced.
    with Image.open(path) as x_pil:
        test_x_list.append(transform_aug_test(x_pil))

pd.to_pickle(
    {'x': test_x_list, 'y': test_y, 'id': test_biopsy_id_list},
    './datasets/test.pkl',
)

# Drop the large tensor list from the kernel. test_y is kept on purpose:
# deleting it made this cell and the dataset cell fail with NameError on re-run.
del test_x_list, test_biopsy_id_list

100%|██████████| 10205/10205 [01:53<00:00, 90.00it/s] 


In [21]:
# Pre-compute the transformed tensor for every holdout slide and cache the
# result (tensors, placeholder labels, biopsy ids) as a single pickle.

# pd.to_pickle does not create missing parent directories.
os.makedirs('./datasets', exist_ok=True)

holdout_x_list = []
holdout_biopsy_id_list = []
for biopsy_id, path in tqdm(holdout_x):
    holdout_biopsy_id_list.append(biopsy_id)
    # Close the image file as soon as its tensor has been produced.
    with Image.open(path) as x_pil:
        holdout_x_list.append(transform_aug_test(x_pil))

pd.to_pickle(
    {'x': holdout_x_list, 'y': holdout_y, 'id': holdout_biopsy_id_list},
    './datasets/holdout.pkl',
)

# Drop the large tensor list from the kernel. holdout_y is kept on purpose:
# deleting it made this cell and the dataset cell fail with NameError on re-run.
del holdout_x_list, holdout_biopsy_id_list

100%|██████████| 16607/16607 [02:50<00:00, 97.50it/s] 
