In [None]:
!nvidia-smi

Sun Oct 16 05:41:59 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   39C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install monai
!pip install pydicom

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting monai
  Downloading monai-1.0.0-202209161346-py3-none-any.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 5.1 MB/s 
Installing collected packages: monai
Successfully installed monai-1.0.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pydicom
  Downloading pydicom-2.3.0-py3-none-any.whl (2.0 MB)
[K     |████████████████████████████████| 2.0 MB 5.0 MB/s 
[?25hInstalling collected packages: pydicom
Successfully installed pydicom-2.3.0


In [None]:
import sys
import os
import cv2
import glob
import pydicom
import random
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm
import time
import datetime
from dataclasses import dataclass, field
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import roc_auc_score
from copy import deepcopy

from monai.data import CacheDataset, DataLoader
from monai.transforms import *

In [None]:
def seed_everything(seed):
    """Seed every random number generator used in this notebook.

    Sets the ``PYTHONHASHSEED`` environment variable, seeds Python's
    ``random`` module, NumPy's global generator and PyTorch (CPU and CUDA),
    and switches cuDNN to deterministic mode.

    Args:
        seed: integer seed applied to all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    seeders = (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed)
    for apply_seed in seeders:
        apply_seed(seed)
    torch.backends.cudnn.deterministic = True


# Seed once at import/run time so everything below starts from a fixed state.
seed_everything(42)

class AverageMeter:
    """Tracks the latest value and the running weighted mean of a scalar."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Set the latest value, mean, weighted sum and count back to zero."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` with weight ``n`` and recompute the running mean.

        Args:
            val: the new observation (e.g. a batch loss).
            n: how many samples the observation stands for.
        """
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count

In [None]:
# Project folder on the Google Drive mounted at /content/drive.
DATA_DIR = '/content/drive/MyDrive/PROJECT_kaggle'
# MRI sequence names accepted by the loading helpers (checked with assert there).
MRI_TYPES = ["FLAIR", "T1w", "T2w", "T1wCE"]

In [None]:
MRI_TYPES = ["FLAIR", "T1w", "T2w", "T1wCE"]  # re-declared; same values as in the previous cell
# BraTS21ID values removed from the training table below.
excluded_imgaes = [109, 123, 709]

# Same folder as DATA_DIR; root for the CSV tables and the per-patient DICOM folders.
DATA_DIR_path = '/content/drive/MyDrive/PROJECT_kaggle'

train_df = pd.read_csv(DATA_DIR_path + "/train_df.csv")
test_df = pd.read_csv(DATA_DIR_path + '/test_df.csv')
# Only the training table is filtered; test_df stays as loaded.
train_df = train_df[~train_df.BraTS21ID.isin(excluded_imgaes)]

def load_dicom(path, size = 224):
    '''
    Read one DICOM slice and return it as a square 8-bit image.

    The pixel array is divided by its maximum (skipped when the maximum is 0),
    multiplied by 255, cast to uint8 and resized with OpenCV. Only the maximum
    is used for scaling; the minimum is not subtracted.

    NOTE(review): whether this per-slice scaling is appropriate for MRI
    intensities was an open question of the original author.

    Args:
        path: path of the DICOM file to read.
        size: side length, in pixels, of the returned image.

    Returns:
        The uint8 image produced by cv2.resize with target size (size, size).
    '''
    # dcmread is pydicom's reader; the legacy read_file alias was removed in pydicom 3.
    dicom = pydicom.dcmread(path)
    data = dicom.pixel_array
    peak = np.max(data)
    if peak != 0:
        data = data / peak
    data = (data * 255).astype(np.uint8)
    return cv2.resize(data, (size, size))

def get_all_image_paths(brats21id, image_type, folder='train'): 
    '''
    Collect the slice files of one MRI sequence for one patient.

    Files are ordered by the integer that follows the last "-" in the path
    (after the 4-character extension is cut off). Only the middle half of the
    series is kept, taking every third file, or every file when the series
    has fewer than 10 files.

    Args:
        brats21id: patient ID; zero-padded to 5 digits to form the folder name.
        image_type: one of MRI_TYPES.
        folder: sub-folder of DATA_DIR_path that holds the patient folders.

    Returns:
        numpy array of the selected file paths.
    '''
    assert(image_type in MRI_TYPES)

    patient_dir = os.path.join(DATA_DIR_path + "/%s/" % folder, str(brats21id).zfill(5))
    candidates = glob.glob(os.path.join(patient_dir, image_type, "*"))
    candidates.sort(key=lambda p: int(p[:-4].split("-")[-1]))

    total = len(candidates)
    step = 1 if total < 10 else 3
    first, last = int(total * 0.25), int(total * 0.75)

    return np.array(candidates[first:last:step])

def get_all_images(brats21id, image_type, folder='train', size=225):
    """Load every selected slice of one patient and sequence.

    Args:
        brats21id: patient ID.
        image_type: MRI sequence name.
        folder: data sub-folder forwarded to get_all_image_paths.
        size: side length each slice is resized to by load_dicom.

    Returns:
        list with one image per path returned by get_all_image_paths.
    """
    loaded = []
    for slice_path in get_all_image_paths(brats21id, image_type, folder):
        loaded.append(load_dicom(slice_path, size))
    return loaded
# Side length (pixels) used by the bulk loaders below.
IMAGE_SIZE = 128

def get_all_data_for_train(image_type):
    """Load the selected slices of every patient in ``train_df``.

    Args:
        image_type: MRI sequence name to load.

    Returns:
        Tuple of three numpy arrays with one entry per slice: the images, the
        patient's ``MGMT_value`` repeated for each slice, and the patient ID
        repeated for each slice.
    """
    images_all, labels_all, ids_all = [], [], []

    for row_label in tqdm(train_df.index):
        row = train_df.loc[row_label]
        patient_id = int(row['BraTS21ID'])
        slices = get_all_images(patient_id, image_type, 'train', IMAGE_SIZE)
        n_slices = len(slices)

        images_all.extend(slices)
        labels_all.extend([row['MGMT_value']] * n_slices)
        ids_all.extend([patient_id] * n_slices)
        assert(len(images_all) == len(labels_all))
    return np.array(images_all), np.array(labels_all), np.array(ids_all)

def get_all_data_for_test(image_type):
    """Load the selected slices of every patient in ``test_df``.

    Slices are read from the 'train' folder.
    NOTE(review): presumably test_df is a held-out part of the training
    cases, which would explain the folder name; confirm.

    Args:
        image_type: MRI sequence name to load.

    Returns:
        Tuple of two numpy arrays with one entry per slice: the images and
        the patient ID repeated for each slice.
    """
    # test_df is only read, never rebound, so no `global` statement is needed.
    X = []
    test_ids = []

    for i in tqdm(test_df.index):
        x = test_df.loc[i]
        patient_id = int(x['BraTS21ID'])
        images = get_all_images(patient_id, image_type, 'train', IMAGE_SIZE)
        X += images
        test_ids += [patient_id] * len(images)

    return np.array(X), np.array(test_ids)

def get_all_data_for_val(image_type):
    """Load the selected slices of every patient in ``test_df``.

    This loader reads exactly the same table and folder as
    get_all_data_for_test, so it delegates to it instead of repeating the loop.

    Args:
        image_type: MRI sequence name to load.

    Returns:
        Tuple of two numpy arrays with one entry per slice: the images and
        the patient ID repeated for each slice.
    """
    return get_all_data_for_test(image_type)

# Load the T1wCE slices for the patients in train_df and in test_df.
X, y, trainidt = get_all_data_for_train('T1wCE')
X_test, testidt = get_all_data_for_test('T1wCE')
# Cell output: shapes of the training arrays (one row per slice).
X.shape, y.shape, trainidt.shape

100%|██████████| 466/466 [02:50<00:00,  2.74it/s]
100%|██████████| 117/117 [00:45<00:00,  2.60it/s]


((12832, 128, 128), (12832,), (12832,))

In [None]:
train_df

Unnamed: 0,BraTS21ID,MGMT_value
0,185,1
1,816,1
2,707,1
3,683,0
4,6,1
...,...,...
463,356,0
464,89,1
465,217,0
466,834,0


In [None]:
test_df

Unnamed: 0,BraTS21ID,MGMT_value
0,107,1
1,753,0
2,303,1
3,106,1
4,171,1
...,...,...
112,703,0
113,21,0
114,444,0
115,95,0


In [None]:
# 80/20 split over the rows of X. Each row is one slice, so slices of the same
# patient can end up in both the training and the validation part.
X_train, X_valid, y_train, y_valid, trainidt_train, trainidt_valid = train_test_split(X, y, trainidt, test_size=0.2, random_state=40)

# Row count of an 80% share of X; not used further in this cell.
split = int(X.shape[0] * 0.8)

X_train = torch.Tensor(X_train)
X_valid = torch.Tensor(X_valid)

# Append a trailing channel axis. The tensors themselves are kept: taking
# .size() here would replace the image data with a torch.Size object.
X_train = torch.unsqueeze(X_train, -1)
X_valid = torch.unsqueeze(X_valid, -1)

y_train = torch.Tensor(y_train)
y_valid = torch.Tensor(y_valid)

# Cell output: shapes only, so the arrays themselves are not dumped.
X_train.shape, y_train.shape, X_valid.shape, y_valid.shape, trainidt_train.shape, trainidt_valid.shape

(torch.Size([10265, 128, 128, 1]),
 tensor([0., 0., 0.,  ..., 1., 1., 1.]),
 torch.Size([2567, 128, 128, 1]),
 tensor([1., 0., 0.,  ..., 0., 1., 1.]),
 array([1010,  649,  378, ...,  638,  485,   58]),
 array([ 52, 657, 514, ..., 346, 468,  11]))

Dataset

In [None]:
class BrainTumorDataset(CacheDataset):
    """CacheDataset with one item per selected DICOM slice.

    Each item is a dict with the keys 'image' (path of the slice file),
    'label' and 'patient_id'. The transform passed through ``kwargs`` is
    expected to turn the path into pixel data.
    """
    def __init__(self, root_dir, patient_ids, mri_types, annotations, section, *args, **kwargs):
        """
        Args:
            root_dir: folder holding one sub-folder per patient, named by the
                patient ID zero-padded to 5 digits.
            patient_ids: iterable of patient IDs to include.
            mri_types: MRI sequence names to read; must be a subset of MRI_TYPES.
            annotations: DataFrame with 'BraTS21ID' and 'MGMT_value' columns,
                or None to give every slice the dummy label 0.
            section: None keeps all slices. Otherwise the slices are split
                80/20 with a fixed seed; 'train' keeps the 80% part and any
                other value keeps the 20% part. The split is over slices, so
                one patient can contribute to both parts.
            *args, **kwargs: forwarded to CacheDataset.
        """
        self.root_dir = root_dir
        self.patient_ids = patient_ids
        self.mri_types = mri_types
        self.annotations = annotations
        data = self.get_data()
        if section is not None:
            train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)
            data = train_data if section=='train' else val_data
        super(BrainTumorDataset, self).__init__(data, *args, **kwargs)
    
    def get_data(self):
        """Build the list of item dicts, one per selected slice of every patient."""
        data = []
        for patient_id in tqdm(self.patient_ids):
            if self.annotations is not None:
                # .item() requires exactly one annotation row for this patient
                label = self.annotations[self.annotations['BraTS21ID'] 
                                         == int(patient_id)]['MGMT_value'].item()
            else:
                label = 0 # dummy value
            for slice_path in self.get_patient_slice_paths(patient_id):
                data.append({
                    'image': slice_path,
                    'label': label,
                    'patient_id': patient_id
                })
        return data
    
    def get_patient_slice_paths(self, patient_id):
        '''
        Returns a list with the selected slice paths of every requested MRI
        type for one patient ID.

        Per MRI type the .dcm files are ordered by slice number, the middle
        half of the series is kept, and every third file is taken (every file
        when the series has fewer than 10 files).
        '''
        assert(set(self.mri_types) <= set(MRI_TYPES))
        patient_path = os.path.join(self.root_dir, str(patient_id).zfill(5))
        patient_slice_paths = []
        for mri_type in self.mri_types:
            paths = sorted(
                glob.glob(os.path.join(patient_path, mri_type, "*.dcm")), 
                # numeric key: a string key would put "-10" before "-2" and the
                # start/end window below would no longer follow slice order
                key=lambda x: int(x[:-4].split("-")[-1]),
            )

            num_images = len(paths)
            start = int(num_images * 0.25)
            end = int(num_images * 0.75)

            interval = 3
            if num_images < 10: 
                interval = 1
            patient_slice_paths.extend(paths[start:end:interval])
        return patient_slice_paths
    
class LoadDicomd(MapTransform):
    """Dictionary transform that replaces DICOM file paths with image arrays."""
    def __init__(self, img_size, *args, **kwargs):
        """
        Args:
            img_size: side length, in pixels, each slice is resized to.
            *args, **kwargs: forwarded to MapTransform (e.g. ``keys``).
        """
        self.img_size = img_size
        super(LoadDicomd, self).__init__(*args, **kwargs)
    
    def __call__(self, data):
        """Return a shallow copy of ``data`` with every configured key loaded."""
        d = dict(data)
        for key in self.keys:
            d[key] = self.load_dicom(d[key])
        return d

    def load_dicom(self, path):
        ''' 
        Reads a DICOM image, divides it by its maximum (skipped when the
        maximum is 0), quantizes it to 0-255 as uint8, resizes it to
        (img_size, img_size) and scales the result back to the 0-1 range.

        Returns the image with a new leading axis of length 1.
        '''
        # dcmread is pydicom's reader; the legacy read_file alias was removed in pydicom 3.
        dicom = pydicom.dcmread(path)
        data = dicom.pixel_array
        peak = np.max(data)
        if peak != 0:
            data = data / peak
        data = (data * 255).astype(np.uint8)
        data = cv2.resize(data, (self.img_size, self.img_size)) / 255
        return np.expand_dims(data, axis=0)

Model

In [None]:
class Simple2dCNN(nn.Module):
    """Small two-convolution classifier for square images.

    Layer order: Conv(4x4) -> ReLU -> MaxPool(2) -> Conv(2x2) -> ReLU ->
    MaxPool(1) -> Dropout -> flatten -> Linear -> ReLU -> Linear.
    The last layer outputs one raw score (logit) per class.
    """
    def __init__(self, 
                 input_channels=1, 
                 n_classes=2, 
                 img_size=32, 
                 conv1_filters=128,
                 conv2_filters=64,
                 dropout_prob=0.1,
                 fc1_units=48):
        super().__init__()

        # Side length of the feature map after each size-changing layer
        # (both convolutions use stride 1 and no padding).
        side_after_conv1 = img_size - 3            # 4x4 kernel
        side_after_pool1 = side_after_conv1 // 2   # 2x2 max-pool
        side_after_conv2 = side_after_pool1 - 1    # 2x2 kernel; the 1x1 pool keeps it
        flat_features = conv2_filters * side_after_conv2 ** 2

        self.relu = nn.ReLU()

        self.conv1 = nn.Conv2d(input_channels, conv1_filters, 4)
        self.maxpool1 = nn.MaxPool2d(2)

        self.conv2 = nn.Conv2d(conv1_filters, conv2_filters, 2)
        self.maxpool2 = nn.MaxPool2d(1)

        self.dropout = nn.Dropout(dropout_prob)
        self.fc1 = nn.Linear(flat_features, fc1_units)
        self.fc2 = nn.Linear(fc1_units, n_classes)

    def forward(self, x):
        """Map a batch of images to class logits.

        Shapes in the comments are for the default arguments (img_size=32).
        """
        # input: (N, 1, 32, 32)
        feats = self.maxpool1(self.relu(self.conv1(x)))      # (N, 128, 14, 14)
        feats = self.maxpool2(self.relu(self.conv2(feats)))  # (N, 64, 13, 13)
        feats = self.dropout(feats)

        flat = feats.view(feats.size(0), -1)                 # (N, 64 * 13 * 13)
        hidden = self.relu(self.fc1(flat))                   # (N, 48)
        return self.fc2(hidden)                              # (N, 2)

In [None]:
model = Simple2dCNN()

In [None]:
print(model)

Simple2dCNN(
  (relu): ReLU()
  (conv1): Conv2d(1, 128, kernel_size=(4, 4), stride=(1, 1))
  (maxpool1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(128, 64, kernel_size=(2, 2), stride=(1, 1))
  (maxpool2): MaxPool2d(kernel_size=1, stride=1, padding=0, dilation=1, ceil_mode=False)
  (dropout): Dropout(p=0.1, inplace=False)
  (fc1): Linear(in_features=10816, out_features=48, bias=True)
  (fc2): Linear(in_features=48, out_features=2, bias=True)
)


Pipeline

In [None]:
@dataclass
class Config:
    """Settings read by ``Pipeline``; every field can be overridden per run."""
    # Folder with one sub-folder per patient, used for training and validation.
    train_dir: str = os.path.join(DATA_DIR_path, 'train')
    # NOTE(review): points at the same 'train' folder as train_dir; confirm intended.
    test_dir: str = os.path.join(DATA_DIR_path, 'train')
    # CSV with the BraTS21ID / MGMT_value table. Built from DATA_DIR_path like
    # the two folders above so all three paths share one root.
    annotation_path: str = os.path.join(DATA_DIR_path, 'train_df.csv')
    # Number of output classes of the model.
    n_classes: int = 2
    # Side length, in pixels, slices are resized to.
    img_size: int = 32
    # Worker count passed to the datasets and data loaders.
    n_workers: int = 4
    # Early-stopping threshold compared against epochs since the best epoch.
    early_stopping_rounds: int = 3
    # Number of cross-validation folds.
    n_folds: int = 5
        
        
class Pipeline:
    def __init__(self, config):
        """Store the config, choose the device, build the model and the transform lists.

        Args:
            config: settings object (see ``Config``); ``n_classes`` and
                ``img_size`` are read here, other fields by other methods.
        """
        self.args = config
        device_name = 'cuda:0' if torch.cuda.is_available() else 'cpu'
        self.device = torch.device(device_name)
        self.annotations = None  # set by load_annotations()
        self.model = None
        self.load_model()  # replaces the None above with a fresh model on self.device
        # transforms
        dicom_loader = LoadDicomd(keys="image", img_size=self.args.img_size)
        self.preaugment_transform = [dicom_loader]
        self.augment_transform = [] # todo: add some augmentations
        image_to_tensor = ToTensord(keys="image", dtype=torch.float)
        label_to_tensor = ToTensord(keys="label", dtype=torch.int64)
        self.postaugment_transform = [image_to_tensor, label_to_tensor]
        
    def load_annotations(self):
        """Read the annotation CSV, drop the excluded cases and assign CV folds.

        The result is stored in ``self.annotations`` with a fresh 0..n-1 index
        and an integer 'fold' column: the number of the stratified fold in
        which the row's patient is held out.
        """
        table = pd.read_csv(self.args.annotation_path)
        # exclude 3 cases
        keep_row = ~table['BraTS21ID'].isin([109, 123, 709])
        table = table[keep_row].reset_index(drop=True)

        splitter = StratifiedKFold(n_splits=self.args.n_folds, shuffle=True, random_state=42)
        # split by patient, stratify based on target value
        fold_pairs = splitter.split(table['BraTS21ID'].values, table['MGMT_value'].values)
        fold_of_row = np.empty(len(table), dtype=int)
        for fold_no, (_, held_out_rows) in enumerate(fold_pairs):
            fold_of_row[held_out_rows] = fold_no
        table['fold'] = fold_of_row
        self.annotations = table
    
    def load_model(self, weights_path=None):
        """Replace ``self.model`` with a new Simple2dCNN on ``self.device``.

        Args:
            weights_path: optional path of a saved state dict; when given, it
                is loaded into the new model.
        """
        network = Simple2dCNN(input_channels=1,
                              n_classes=self.args.n_classes,
                              img_size=self.args.img_size)
        self.model = network.to(self.device)
        if not weights_path:
            return
        state = torch.load(weights_path, map_location=self.device)
        self.model.load_state_dict(state)
        
    def prepare_datasets(self, mri_types, fold, cache_rate):
        """
        Build the three datasets for one cross-validation fold.

        Patients whose 'fold' value differs from ``fold`` feed both the
        training set (section 'train') and the validation set (section 'val');
        patients of ``fold`` itself form the hold-out set (all their slices).

        Data format:
        {
            'image': torch tensor (batch_size, 1, 32, 32),
            'label': torch tensor (batch_size, )
            'patient_id'
        }
        Output: torch tensor (batch_size, 2)

        Returns:
            (train_ds, val_ds, val_holdout_ds)
        """
        train_transform = Compose(
            self.preaugment_transform + self.augment_transform + self.postaugment_transform
        )
        val_transform = Compose(self.preaugment_transform + self.postaugment_transform)

        fold_column = self.annotations['fold']
        id_column = self.annotations['BraTS21ID']
        train_ids = id_column[fold_column != fold].values.tolist()
        val_holdout_ids = id_column[fold_column == fold].values.tolist()

        # arguments shared by all three datasets
        shared = dict(root_dir=self.args.train_dir,
                      mri_types=mri_types,
                      annotations=self.annotations,
                      cache_rate=cache_rate,
                      num_workers=self.args.n_workers)

        train_ds = BrainTumorDataset(patient_ids=train_ids,
                                     transform=train_transform,
                                     section='train',
                                     **shared)
        val_ds = BrainTumorDataset(patient_ids=train_ids,
                                   transform=val_transform,
                                   section='val',
                                   **shared)
        val_holdout_ds = BrainTumorDataset(patient_ids=val_holdout_ids,
                                           transform=val_transform,
                                           section=None,
                                           **shared)
        return train_ds, val_ds, val_holdout_ds
    
    def prepare_test_dataset(self, mri_types, cache_rate):
        """Build an unlabelled dataset from every patient folder in ``test_dir``.

        Args:
            mri_types: MRI sequence names to read.
            cache_rate: forwarded to the dataset.

        Returns:
            BrainTumorDataset over all slices selected for the patients found,
            in ascending patient-ID order, with the dummy label 0.
        """
        test_transform = Compose(
            self.preaugment_transform +
            self.postaugment_transform
        )
        # Patient folders are looked up by numeric ID, so entries whose name is
        # not purely digits (stray files, hidden folders) are skipped instead
        # of failing in int().
        test_ids = sorted(
            int(entry) for entry in os.listdir(self.args.test_dir) if entry.isdigit()
        )
        test_ds = BrainTumorDataset(root_dir=self.args.test_dir, 
                                    patient_ids=test_ids, 
                                    mri_types=mri_types, 
                                    annotations=None, 
                                    transform=test_transform,
                                    section=None,
                                    cache_rate=cache_rate,
                                    num_workers=self.args.n_workers)
        return test_ds
    
    def train_epoch(self, loader, loss_function, optimizer, verbose):
        """Run one optimisation pass over ``loader``.

        Args:
            loader: iterable of batches with "image" and "label" entries.
            loss_function: callable applied to (model output, labels).
            optimizer: optimizer stepping the model parameters.
            verbose: when true, print the running loss after every step.

        Returns:
            Sample-weighted mean loss over the epoch.
        """
        self.model.train()
        running = AverageMeter()
        began = time.time()
        total_steps = len(loader)
        for step_no, batch in enumerate(loader, start=1):
            images = batch["image"].to(self.device)
            targets = batch["label"].to(self.device)
            # back propagation
            optimizer.zero_grad()
            logits = self.model(images)
            loss = loss_function(logits, targets)
            loss.backward()
            optimizer.step()
            # update stats, weighting the batch loss by the batch size
            running.update(loss.item(), images.size(0))
            if verbose:
                print('Train step {}/{}, loss: {:.5f}'.format(step_no, total_steps, 
                                                              running.avg), end='\r')
        elapsed_time = str(datetime.timedelta(seconds=time.time() - began))
        print('Train loss: {:.5f} - time: {}'.format(running.avg, elapsed_time))
        return running.avg
    
    def evaluate_epoch(self, loader, loss_function, verbose):
        """Score the model on ``loader`` without tracking gradients.

        Args:
            loader: iterable of batches with "image", "label" and "patient_id".
            loss_function: callable applied to (model output, labels).
            verbose: when true, print the running loss after every step.

        Returns:
            Tuple (mean loss, patient-level ROC AUC, per-patient DataFrame).
            The DataFrame has the columns BraTS21ID, probability and label,
            each averaged over the patient's slices.
        """
        self.model.eval()
        running = AverageMeter()
        began = time.time()
        total_steps = len(loader)
        ids_seen, positive_probs, targets_seen = [], [], []
        with torch.no_grad():
            for step_no, batch in enumerate(loader, start=1):
                images = batch["image"].to(self.device)
                targets = batch["label"].to(self.device)
                # forward pass only; no parameter update during evaluation
                logits = self.model(images)
                loss = loss_function(logits, targets)
                # update stats: probability of class 1 for every slice
                positive_probs += F.softmax(logits, dim=1)[:, 1].tolist()
                targets_seen += targets.tolist()
                ids_seen += [pid.item() for pid in batch["patient_id"]]

                running.update(loss.item(), images.size(0))
                if verbose:
                    print('Val step {}/{}, loss: {:.5f}'.format(step_no, total_steps, 
                                                                running.avg), end='\r')
        elapsed_time = str(datetime.timedelta(seconds=time.time() - began))
        print('Val loss: {:.5f} - time: {}'.format(running.avg, elapsed_time))

        per_slice = pd.DataFrame({
            'BraTS21ID': ids_seen,
            'probability': positive_probs,
            'label': targets_seen,
        })
        slice_auc = roc_auc_score(per_slice['label'], per_slice['probability'])
        # one row per patient: mean probability (and label) over its slices
        result = per_slice.groupby("BraTS21ID", as_index=False).mean()
        patient_auc = roc_auc_score(result['label'], result['probability'])
        print('Patient AUC: {:.5f} - Slice AUC: {:.5f}'.format(patient_auc, slice_auc))

        return running.avg, patient_auc, result
    
    def infer_epoch(self, loader, verbose):
        self.model.eval()
        start = time.time()
        n = len(loader)
        patient_ids_all = []
        probabilities_all = []
        with torch.no_grad():
            for step, batch_data in enumerate(loader):
                inputs, patient_ids = (
                    batch_data["image"].to(self.device), # (None, 1, 32, 32)
                    batch_data["patient_id"], # (None, )
                )
                batch_size = inputs.size(0)
                # forward
                outputs = self.model(inputs) # (None, 2)
                # update stats
                probabilities = F.softmax(outputs, dim=1)[:, 1].tolist()
                probabilities_all.extend(probabilities)
                patient_ids_all.extend(patient_ids)
                if verbose:
                    print('Infer step {}/{}'.format(step + 1, n), end='\r')
        
        result = {
            'BraTS21ID': list(map(lambda x: x.item(), patient_ids_all)), 
            'probability': probabilities_all,
        }
        result = pd.DataFrame(result)
        result = result.groupby("BraTS21ID", as_index=False).mean()
        
        elapsed_time = str(datetime.timedelta(seconds=time.time() - start))
        print('Elapsed time: {}'.format(elapsed_time))
        
        return result
    
    def fit(self, train_ds, val_ds, val_holdout_ds, batch_size, epochs, lr, model_name, verbose):
        """Train ``self.model`` and save the weights of the best validation epoch.

        After every epoch the model is scored on the validation and hold-out
        sets. The state dict with the highest validation patient AUC is kept
        and written to disk when training stops early or reaches the last epoch.

        Args:
            train_ds, val_ds, val_holdout_ds: datasets for the three loaders.
            batch_size: batch size of all loaders.
            epochs: maximum number of epochs.
            lr: Adam learning rate.
            model_name: prefix of the saved weights file.
            verbose: forwarded to the per-epoch routines.
        """
        def make_loader(dataset, shuffle):
            return DataLoader(dataset,
                              batch_size=batch_size,
                              shuffle=shuffle,
                              num_workers=self.args.n_workers)

        train_loader = make_loader(train_ds, True)
        val_loader = make_loader(val_ds, False)
        val_holdout_loader = make_loader(val_holdout_ds, False)
        loss_function = nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(self.model.parameters(), lr=lr)

        best_metric = -np.inf
        best_loss = np.inf
        best_epoch = 1
        best_state_dict = None
        save_path = '{}_imgsize{}_valloss{:.3f}_valauc{:.3f}.pth'

        def report_and_save(headline):
            # Reads the best_* variables at call time, i.e. their latest values.
            print(headline.format(best_epoch))
            print('Val loss: {:.5f}, Val auc: {:.5f}'.format(best_loss, best_metric))
            print('Saving model...')
            torch.save(best_state_dict,
                       save_path.format(model_name,
                                        self.args.img_size,
                                        best_loss,
                                        best_metric))

        for epoch in range(1, epochs + 1):
            print('\nEpoch {}/{}:'.format(epoch, epochs))
            self.train_epoch(train_loader, loss_function, optimizer, verbose)
            print(' Validation:')
            val_loss, val_metric, _ = self.evaluate_epoch(val_loader, loss_function, verbose)
            print(' Hold out:')
            # hold-out scores are only printed; they do not drive model selection
            self.evaluate_epoch(val_holdout_loader, loss_function, verbose)

            if val_metric > best_metric:
                print('Val AUC improved from {:.5f} to {:.5f}'.format(best_metric, val_metric))
                best_metric, best_loss, best_epoch = val_metric, val_loss, epoch
                best_state_dict = deepcopy(self.model.state_dict())
            # strict ">": stops once more than early_stopping_rounds epochs
            # have passed since the best epoch
            elif (epoch - best_epoch) > self.args.early_stopping_rounds:
                report_and_save('Early stopping. Best model is epoch {}')
                break
            if epoch == epochs:
                report_and_save('Finished training. Best model is epoch {}')
                
    def evaluate(self, val_holdout_ds, batch_size, verbose):
        """Score the current model on a hold-out dataset.

        Returns:
            Tuple (patient-level AUC, per-patient result DataFrame) as produced
            by evaluate_epoch; the loss is discarded.
        """
        loader = DataLoader(val_holdout_ds,
                            batch_size=batch_size,
                            shuffle=False,
                            num_workers=self.args.n_workers)
        criterion = nn.CrossEntropyLoss()
        print(' Hold out:')
        scores = self.evaluate_epoch(loader, criterion, verbose)
        val_holdout_metric, val_holdout_result = scores[1], scores[2]
        return val_holdout_metric, val_holdout_result
    
    def predict(self, test_ds, batch_size, verbose):
        """Run inference over ``test_ds`` and return the result of infer_epoch."""
        loader = DataLoader(test_ds,
                            batch_size=batch_size,
                            shuffle=False,
                            num_workers=self.args.n_workers)
        return self.infer_epoch(loader, verbose)

In [None]:
# Run settings used by the cells below.
mri_types = ['T1wCE']  # MRI sequence(s) passed to prepare_datasets
img_size = 16 #32  (side length slices are resized to; 32 is the alternative noted by the author)
batch_size = 32  # batch size of every data loader
n_workers = 4  # worker count for datasets and loaders
early_stopping_rounds = 3  # threshold on epochs since the best validation AUC
n_folds = 5  # number of cross-validation folds
epochs = 40 #50  (maximum epochs per fold; 50 is the alternative noted by the author)
lr = 1e-3  # Adam learning rate

In [None]:
# Build the run configuration from the settings above; the path fields and
# n_classes keep their Config defaults.
args = Config(img_size=img_size, 
              n_workers=n_workers, 
              early_stopping_rounds=early_stopping_rounds,
              n_folds=n_folds)
# Constructing the pipeline also creates the model on the selected device.
pipeline = Pipeline(args)

Train

In [None]:
# %%time
# Assign folds once, then train one freshly initialised model per fold.
pipeline.load_annotations()
for fold in range(n_folds):
    print(f'### Train {mri_types} on fold {fold}: ###')
    # cache_rate=1.0 is passed through to the CacheDataset-based datasets
    train_ds, val_ds, val_holdout_ds = pipeline.prepare_datasets(mri_types=mri_types, 
                                                                 fold=fold,
                                                                 cache_rate=1.0)
    # new weights for every fold, so no fold starts from the previous fold's model
    pipeline.load_model()
    pipeline.fit(train_ds, val_ds, val_holdout_ds,
                 batch_size=batch_size, epochs=epochs, lr=lr, 
                 model_name=f'{"_".join(mri_types)}_fold{fold}',
                 verbose=True)

### Train ['T1wCE'] on fold 0: ###


100%|██████████| 372/372 [00:01<00:00, 310.73it/s]
Loading dataset: 100%|██████████| 8245/8245 [00:27<00:00, 302.13it/s]
100%|██████████| 372/372 [00:01<00:00, 297.70it/s]
Loading dataset: 100%|██████████| 2062/2062 [00:06<00:00, 296.63it/s]
100%|██████████| 94/94 [00:00<00:00, 304.79it/s]
Loading dataset: 100%|██████████| 2525/2525 [00:08<00:00, 302.02it/s]



Epoch 1/40:
Train loss: 0.68355 - time: 0:00:11.641119
 Validation:
Val loss: 0.68745 - time: 0:00:00.815083
Patient AUC: 0.59263 - Slice AUC: 0.55945
 Hold out:
Val loss: 0.72193 - time: 0:00:00.911635
Patient AUC: 0.46909 - Slice AUC: 0.49627
Val AUC improved from -inf to 0.59263

Epoch 2/40:
Train loss: 0.67390 - time: 0:00:02.682613
 Validation:
Val loss: 0.66884 - time: 0:00:00.837492
Patient AUC: 0.65073 - Slice AUC: 0.60152
 Hold out:
Val loss: 0.71262 - time: 0:00:00.916960
Patient AUC: 0.44636 - Slice AUC: 0.48162
Val AUC improved from 0.59263 to 0.65073

Epoch 3/40:
Train loss: 0.67140 - time: 0:00:02.756412
 Validation:
Val loss: 0.66354 - time: 0:00:00.842722
Patient AUC: 0.66059 - Slice AUC: 0.60445
 Hold out:
Val loss: 0.71782 - time: 0:00:00.927150
Patient AUC: 0.43818 - Slice AUC: 0.48062
Val AUC improved from 0.65073 to 0.66059

Epoch 4/40:
Train loss: 0.66777 - time: 0:00:02.707593
 Validation:
Val loss: 0.66964 - time: 0:00:00.814863
Patient AUC: 0.63942 - Slice AUC

100%|██████████| 373/373 [00:01<00:00, 297.30it/s]
Loading dataset: 100%|██████████| 8336/8336 [00:27<00:00, 298.13it/s]
100%|██████████| 373/373 [00:01<00:00, 251.77it/s]
Loading dataset: 100%|██████████| 2084/2084 [00:07<00:00, 280.52it/s]
100%|██████████| 93/93 [00:00<00:00, 319.21it/s]
Loading dataset: 100%|██████████| 2412/2412 [00:08<00:00, 293.98it/s]


Epoch 1/40:





Train loss: 0.68906 - time: 0:00:02.883006
 Validation:
Val loss: 0.68341 - time: 0:00:00.846174
Patient AUC: 0.57965 - Slice AUC: 0.56117
 Hold out:
Val loss: 0.67697 - time: 0:00:00.924753
Patient AUC: 0.72171 - Slice AUC: 0.63722
Val AUC improved from -inf to 0.57965

Epoch 2/40:
Train loss: 0.68491 - time: 0:00:02.737656
 Validation:
Val loss: 0.68338 - time: 0:00:00.840208
Patient AUC: 0.58273 - Slice AUC: 0.56023
 Hold out:
Val loss: 0.67793 - time: 0:00:00.910671
Patient AUC: 0.72171 - Slice AUC: 0.60936
Val AUC improved from 0.57965 to 0.58273

Epoch 3/40:
Train loss: 0.68252 - time: 0:00:02.844179
 Validation:
Val loss: 0.67932 - time: 0:00:00.861555
Patient AUC: 0.59464 - Slice AUC: 0.56663
 Hold out:
Val loss: 0.67152 - time: 0:00:00.982299
Patient AUC: 0.70455 - Slice AUC: 0.61541
Val AUC improved from 0.58273 to 0.59464

Epoch 4/40:
Train loss: 0.68094 - time: 0:00:02.830863
 Validation:
Val loss: 0.67849 - time: 0:00:00.859633
Patient AUC: 0.58474 - Slice AUC: 0.56407
 Ho

100%|██████████| 373/373 [00:01<00:00, 310.29it/s]
Loading dataset: 100%|██████████| 8149/8149 [00:27<00:00, 298.26it/s]
100%|██████████| 373/373 [00:01<00:00, 309.21it/s]
Loading dataset: 100%|██████████| 2038/2038 [00:07<00:00, 286.66it/s]
100%|██████████| 93/93 [00:00<00:00, 303.51it/s]
Loading dataset: 100%|██████████| 2645/2645 [00:09<00:00, 288.41it/s]



Epoch 1/40:
Train loss: 0.69100 - time: 0:00:02.844769
 Validation:
Val loss: 0.68674 - time: 0:00:00.825300
Patient AUC: 0.58748 - Slice AUC: 0.54414
 Hold out:
Val loss: 0.67917 - time: 0:00:00.961038
Patient AUC: 0.70733 - Slice AUC: 0.60145
Val AUC improved from -inf to 0.58748

Epoch 2/40:
Train loss: 0.68660 - time: 0:00:02.750431
 Validation:
Val loss: 0.68071 - time: 0:00:00.837731
Patient AUC: 0.61801 - Slice AUC: 0.57590
 Hold out:
Val loss: 0.67626 - time: 0:00:00.959459
Patient AUC: 0.68646 - Slice AUC: 0.58508
Val AUC improved from 0.58748 to 0.61801

Epoch 3/40:
Train loss: 0.68438 - time: 0:00:02.646092
 Validation:
Val loss: 0.68006 - time: 0:00:00.817813
Patient AUC: 0.62384 - Slice AUC: 0.57785
 Hold out:
Val loss: 0.67912 - time: 0:00:00.978538
Patient AUC: 0.68506 - Slice AUC: 0.58750
Val AUC improved from 0.61801 to 0.62384

Epoch 4/40:
Train loss: 0.68365 - time: 0:00:02.748641
 Validation:
Val loss: 0.67955 - time: 0:00:00.841828
Patient AUC: 0.62888 - Slice AUC

100%|██████████| 373/373 [00:01<00:00, 296.08it/s]
Loading dataset: 100%|██████████| 8132/8132 [00:27<00:00, 297.72it/s]
100%|██████████| 373/373 [00:01<00:00, 304.79it/s]
Loading dataset: 100%|██████████| 2034/2034 [00:06<00:00, 294.86it/s]
100%|██████████| 93/93 [00:00<00:00, 289.19it/s]
Loading dataset: 100%|██████████| 2666/2666 [00:09<00:00, 286.17it/s]



Epoch 1/40:
Train loss: 0.69079 - time: 0:00:02.798481
 Validation:
Val loss: 0.68387 - time: 0:00:00.827820
Patient AUC: 0.59397 - Slice AUC: 0.55911
 Hold out:
Val loss: 0.67295 - time: 0:00:00.969850
Patient AUC: 0.66651 - Slice AUC: 0.57319
Val AUC improved from -inf to 0.59397

Epoch 2/40:
Train loss: 0.68520 - time: 0:00:02.707095
 Validation:
Val loss: 0.68101 - time: 0:00:00.833557
Patient AUC: 0.60790 - Slice AUC: 0.56802
 Hold out:
Val loss: 0.67413 - time: 0:00:00.975228
Patient AUC: 0.67764 - Slice AUC: 0.58480
Val AUC improved from 0.59397 to 0.60790

Epoch 3/40:
Train loss: 0.68276 - time: 0:00:02.697724
 Validation:
Val loss: 0.68196 - time: 0:00:00.799435
Patient AUC: 0.60313 - Slice AUC: 0.57518
 Hold out:
Val loss: 0.67775 - time: 0:00:00.943769
Patient AUC: 0.67996 - Slice AUC: 0.57852

Epoch 4/40:
Train loss: 0.68084 - time: 0:00:02.797603
 Validation:
Val loss: 0.67723 - time: 0:00:00.819013
Patient AUC: 0.61331 - Slice AUC: 0.57657
 Hold out:
Val loss: 0.66975 - 

100%|██████████| 373/373 [00:01<00:00, 301.45it/s]
Loading dataset: 100%|██████████| 8198/8198 [00:27<00:00, 295.49it/s]
100%|██████████| 373/373 [00:01<00:00, 312.09it/s]
Loading dataset: 100%|██████████| 2050/2050 [00:06<00:00, 295.39it/s]
100%|██████████| 93/93 [00:00<00:00, 307.30it/s]
Loading dataset: 100%|██████████| 2584/2584 [00:08<00:00, 300.26it/s]


Epoch 1/40:





Train loss: 0.68700 - time: 0:00:02.867749
 Validation:
Val loss: 0.68658 - time: 0:00:00.846588
Patient AUC: 0.56035 - Slice AUC: 0.55673
 Hold out:
Val loss: 0.69533 - time: 0:00:00.954969
Patient AUC: 0.55798 - Slice AUC: 0.53881
Val AUC improved from -inf to 0.56035

Epoch 2/40:
Train loss: 0.68101 - time: 0:00:02.666949
 Validation:
Val loss: 0.67263 - time: 0:00:00.878240
Patient AUC: 0.60537 - Slice AUC: 0.57884
 Hold out:
Val loss: 0.70718 - time: 0:00:00.945262
Patient AUC: 0.53479 - Slice AUC: 0.52121
Val AUC improved from 0.56035 to 0.60537

Epoch 3/40:
Train loss: 0.67685 - time: 0:00:02.714814
 Validation:
Val loss: 0.67334 - time: 0:00:00.856774
Patient AUC: 0.60842 - Slice AUC: 0.58284
 Hold out:
Val loss: 0.70012 - time: 0:00:00.941704
Patient AUC: 0.53618 - Slice AUC: 0.51920
Val AUC improved from 0.60537 to 0.60842

Epoch 4/40:
Train loss: 0.67514 - time: 0:00:02.686106
 Validation:
Val loss: 0.67536 - time: 0:00:00.827130
Patient AUC: 0.61244 - Slice AUC: 0.58067
 Ho

Evaluate

In [None]:
def find_weight(prefix):
    """Return the name of the weights file in the working directory matching ``prefix``.

    A file matches when ``prefix`` occurs anywhere in its name. ``os.listdir()``
    returns entries in arbitrary order, so the candidates are sorted to make the
    choice reproducible, and a warning is printed when more than one file matches.

    Raises:
        FileNotFoundError: if no file name contains ``prefix``.
    """
    candidates = sorted(name for name in os.listdir() if prefix in name)
    if not candidates:
        raise FileNotFoundError(
            f'No weights file containing "{prefix}" found in {os.getcwd()}')
    if len(candidates) > 1:
        # NOTE(review): several checkpoints match; confirm the first one in sorted
        # order is the checkpoint that should be evaluated.
        print(f'Warning: {len(candidates)} files match "{prefix}": {candidates}; '
              f'using {candidates[0]}')
    return candidates[0]


metrics = []
results = []
# One weights file per fold, located by the "<mri types>_fold<k>" fragment of its name.
weights_paths = [find_weight(f'{"_".join(mri_types)}_fold{fold}') for fold in range(n_folds)]
for fold, weights_path in enumerate(weights_paths):
    print(f'### Evaluate {mri_types} on fold {fold}: ###')
    # Only the third dataset returned by prepare_datasets is evaluated here.
    _, _, val_holdout_ds = pipeline.prepare_datasets(mri_types=mri_types, 
                                                     fold=fold,
                                                     cache_rate=0.0)
    pipeline.load_model(weights_path)
    val_metric, val_result = pipeline.evaluate(val_holdout_ds, batch_size=batch_size, verbose=True)
    metrics.append(val_metric)
    results.append(val_result)
# Pool the per-fold result frames so a single AUC can be computed over all of them.
results = pd.concat(results, ignore_index=True)
mean_auc = np.mean(metrics)
oof_auc = roc_auc_score(results['label'], results['probability'])
print('---')
print(f'{mri_types} holdout result:')
print(' Mean AUC: {:.5f}'.format(mean_auc))
print(' Out-of-fold AUC: {:.5f}'.format(oof_auc))
print('---')

### Evaluate ['T1wCE'] on fold 0: ###


100%|██████████| 372/372 [00:01<00:00, 286.85it/s]
100%|██████████| 372/372 [00:01<00:00, 223.14it/s]
100%|██████████| 94/94 [00:00<00:00, 292.32it/s]

 Hold out:





Val loss: 1.16614 - time: 0:00:05.415222
Patient AUC: 0.44136 - Slice AUC: 0.46616
### Evaluate ['T1wCE'] on fold 1: ###


100%|██████████| 373/373 [00:01<00:00, 284.12it/s]
100%|██████████| 373/373 [00:01<00:00, 297.83it/s]
100%|██████████| 93/93 [00:00<00:00, 301.64it/s]

 Hold out:





Val loss: 0.78218 - time: 0:00:05.008850
Patient AUC: 0.59647 - Slice AUC: 0.53258
### Evaluate ['T1wCE'] on fold 2: ###


100%|██████████| 373/373 [00:01<00:00, 298.51it/s]
100%|██████████| 373/373 [00:01<00:00, 313.40it/s]
100%|██████████| 93/93 [00:00<00:00, 291.29it/s]

 Hold out:





Val loss: 0.66818 - time: 0:00:05.598575
Patient AUC: 0.69063 - Slice AUC: 0.60524
### Evaluate ['T1wCE'] on fold 3: ###


100%|██████████| 373/373 [00:01<00:00, 293.50it/s]
100%|██████████| 373/373 [00:01<00:00, 308.08it/s]
100%|██████████| 93/93 [00:00<00:00, 311.77it/s]


 Hold out:
Val loss: 0.87276 - time: 0:00:05.563796
Patient AUC: 0.58442 - Slice AUC: 0.51715
### Evaluate ['T1wCE'] on fold 4: ###


100%|██████████| 373/373 [00:01<00:00, 293.79it/s]
100%|██████████| 373/373 [00:01<00:00, 317.84it/s]
100%|██████████| 93/93 [00:00<00:00, 310.52it/s]


 Hold out:
Val loss: 0.87275 - time: 0:00:05.222409
Patient AUC: 0.55566 - Slice AUC: 0.49975
---
['T1wCE'] holdout result:
 Mean AUC: 0.57371
 Out-of-fold AUC: 0.55820
---


Submission

In [None]:
# The test dataset is built from `mri_types` only (no fold argument), so prepare it
# once and reuse it for every fold's model instead of rebuilding it per iteration.
# NOTE(review): assumes pipeline.predict does not consume or mutate the dataset — confirm.
test_ds = pipeline.prepare_test_dataset(mri_types=mri_types, cache_rate=0.0)
test_results = []
for fold, weights_path in enumerate(weights_paths):
    print(f'### Inference {mri_types} on fold {fold}: ###')
    pipeline.load_model(weights_path)
    test_result = pipeline.predict(test_ds, batch_size=batch_size, verbose=True)
    test_results.append(test_result)

### Inference ['T1wCE'] on fold 0: ###


100%|██████████| 585/585 [00:07<00:00, 75.77it/s]


Elapsed time: 0:00:50.170949
### Inference ['T1wCE'] on fold 1: ###


100%|██████████| 585/585 [00:01<00:00, 369.05it/s]


Elapsed time: 0:00:30.761955
### Inference ['T1wCE'] on fold 2: ###


100%|██████████| 585/585 [00:01<00:00, 361.37it/s]


Elapsed time: 0:00:31.838027
### Inference ['T1wCE'] on fold 3: ###


100%|██████████| 585/585 [00:01<00:00, 368.84it/s]


Elapsed time: 0:00:31.019870
### Inference ['T1wCE'] on fold 4: ###


100%|██████████| 585/585 [00:01<00:00, 361.12it/s]


Elapsed time: 0:00:31.302674


In [None]:
# Line up every fold's predictions side by side on BraTS21ID, average across the
# folds row-wise, and expose the mean as an `MGMT_value` column next to the ID.
prediction = (
    pd.concat([fold_df.set_index('BraTS21ID') for fold_df in test_results], axis=1)
    .mean(axis=1)
    .to_frame(name='MGMT_value')
    .reset_index()
)
# prediction.to_csv('submission_T1wCE.csv',index=False)

In [None]:
# Display the fold-averaged prediction table.
prediction

Unnamed: 0,BraTS21ID,MGMT_value
0,0,0.735987
1,2,0.727254
2,3,0.775826
3,5,0.722720
4,6,0.716705
...,...,...
580,1005,0.513890
581,1007,0.356261
582,1008,0.536520
583,1009,0.493061


In [None]:
# Display the frame that the predictions are merged with in the next cell.
test_df

Unnamed: 0,BraTS21ID,MGMT_value
0,107,1
1,753,0
2,303,1
3,106,1
4,171,1
...,...,...
112,703,0
113,21,0
114,444,0
115,95,0


In [None]:
# Right join: keep every row of test_df. Both frames carry an `MGMT_value` column,
# so pandas suffixes them `_x` (from prediction) and `_y` (from test_df); an ID that
# is absent from `prediction` ends up with NaN in `MGMT_value_x`.
result = prediction.merge(test_df, on='BraTS21ID', how='right')

In [None]:
# Display the merged frame: MGMT_value_x comes from `prediction`, MGMT_value_y from `test_df`.
result

Unnamed: 0,BraTS21ID,MGMT_value_x,MGMT_value_y
0,107,0.397917,1
1,753,0.547810,0
2,303,0.492362,1
3,106,0.279580,1
4,171,0.473340,1
...,...,...,...
112,703,0.614482,0
113,21,0.595206,0
114,444,0.558625,0
115,95,0.695518,0


In [None]:
def get_confusion_matrix(result3, threshold=0.5):
    """Tally label/prediction combinations for a binary classifier.

    Args:
        result3: DataFrame with a ``MGMT_value_x`` column (score compared against
            ``threshold``) and a ``MGMT_value_y`` column (reference label,
            convertible to ``int`` and used as a row index, i.e. 0 or 1).
        threshold: a score strictly greater than this counts as a predicted 1;
            anything else counts as a predicted 0.

    Returns:
        A 2x2 nested list indexed as ``[reference_label][predicted_label]``.

    Note:
        A NaN score compares False against the threshold, so it is tallied as a
        predicted 0.
    """
    counts = [[0, 0], [0, 0]]

    for row in result3.itertuples(index=False):
        predicted = int(row.MGMT_value_x > threshold)
        actual = int(row.MGMT_value_y)
        counts[actual][predicted] += 1

    return counts

def get_acc_recall(arr):
    """Print accuracy and recall for a 2x2 confusion matrix.

    Args:
        arr: nested sequence indexed as ``arr[label][prediction]``; the diagonal
            holds the correct predictions and row 1 holds the label-1 samples.

    Accuracy is the diagonal sum divided by the total count; recall is
    ``arr[1][1]`` divided by the row-1 total. When a denominator is zero (an
    all-zero matrix, or no label-1 samples) the metric is undefined and is
    printed as ``nan`` instead of raising ``ZeroDivisionError``.

    Returns:
        None. The metrics are only printed.
    """
    undefined = float("nan")
    positives = sum(arr[1])
    total = sum(arr[0]) + positives
    correct = arr[0][0] + arr[1][1]

    acc = correct / total if total else undefined
    recall = arr[1][1] / positives if positives else undefined
    print(f"Acc: {acc} \t Recall: {recall}")

In [None]:
# Confusion matrix at the default 0.5 threshold, laid out as [label][prediction].
arr = get_confusion_matrix(result)
print(arr)
# Prints accuracy and recall derived from the matrix.
get_acc_recall(arr)

[[6, 50], [14, 47]]
Acc: 0.452991452991453 	 Recall: 0.7704918032786885
