In [None]:
!nvidia-smi

Sun Oct 16 07:27:43 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   38C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install monai
!pip install pydicom

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting monai
  Downloading monai-1.0.0-202209161346-py3-none-any.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 4.2 MB/s 
Installing collected packages: monai
Successfully installed monai-1.0.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pydicom
  Downloading pydicom-2.3.0-py3-none-any.whl (2.0 MB)
[K     |████████████████████████████████| 2.0 MB 4.2 MB/s 
[?25hInstalling collected packages: pydicom
Successfully installed pydicom-2.3.0


In [None]:
import sys
import os
import cv2
import glob
import pydicom
import random
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm
import time
import datetime
from dataclasses import dataclass, field
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import roc_auc_score
from copy import deepcopy

from monai.data import CacheDataset, DataLoader
from monai.transforms import *

In [None]:
def seed_everything(seed):
    """Seed every random number generator the notebook relies on.

    Sets the ``PYTHONHASHSEED`` environment variable, seeds Python's
    ``random``, NumPy and PyTorch (``torch.manual_seed`` and
    ``torch.cuda.manual_seed``), and switches cuDNN to deterministic mode.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seed_fn(seed)
    torch.backends.cudnn.deterministic = True


seed_everything(42)

class AverageMeter:
    """Tracks the latest value and the running weighted average of a metric.

    Attributes: ``val`` (last value passed to ``update``), ``sum`` (weighted
    sum of all values), ``count`` (total weight) and ``avg`` (``sum / count``).
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear all statistics back to zero."""
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        """Record ``val`` observed over ``n`` samples and refresh the average.

        While the accumulated weight is still 0 (for example after an empty
        batch with ``n=0``) the average stays at 0 instead of raising
        ``ZeroDivisionError``.
        """
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count if self.count else 0

In [None]:
# Root folder of the project data on the mounted Google Drive.
DATA_DIR = '/content/drive/MyDrive/PROJECT_kaggle'
# MRI sequence names; BrainTumorDataset checks requested types against this list.
MRI_TYPES = ["FLAIR", "T1w", "T2w", "T1wCE"]

In [None]:
# MRI sequence names accepted by get_all_image_paths.
TYPES = ["FLAIR", "T1w", "T2w", "T1wCE"]
# Patient IDs removed from the training table below (variable keeps its original spelling).
excluded_imgaes = [109, 123, 709]

# Root folder of the project data on the mounted Google Drive.
DATA_DIR_path = '/content/drive/MyDrive/PROJECT_kaggle'

# Per-patient tables; the displayed frames show BraTS21ID and MGMT_value columns.
train_df = pd.read_csv(DATA_DIR_path + "/train_df.csv")
test_df = pd.read_csv(DATA_DIR_path + '/test_df.csv')
# Drop the excluded patients from the training table.
train_df = train_df[~train_df.BraTS21ID.isin(excluded_imgaes)]

def load_dicom(path, size = 224):
    '''
    Read one DICOM slice and return it as a ``size`` x ``size`` uint8 image.

    Pixel values are divided by the slice maximum, rescaled to 0-255, cast to
    uint8 and resized with ``cv2.resize``. A slice whose maximum is 0 skips
    the division.

    Not super sure if this kind of scaling is appropriate, but everyone seems to do it.

    path: path of the DICOM file to read.
    size: side length, in pixels, of the square output image.
    '''
    # dcmread is the supported reader; the read_file alias is deprecated.
    dicom = pydicom.dcmread(path)
    data = dicom.pixel_array
    peak = np.max(data)  # computed once, reused for the zero check and the division
    if peak != 0:
        data = data / peak
    data = (data * 255).astype(np.uint8)
    return cv2.resize(data, (size, size))

def get_all_image_paths(brats21id, image_type, folder='train'):
    '''
    Collect the slice files of one MRI sequence for one patient.

    Files under <DATA_DIR_path>/<folder>/<zero-padded id>/<image_type>/ are
    ordered by the integer that follows the last "-" in the path (after the
    final four characters, e.g. ".dcm", are cut off). Only the middle half of
    the stack (25% to 75%) is kept, taking every 3rd file, or every file when
    the folder holds fewer than 10.

    Returns a numpy array of path strings.
    '''
    assert(image_type in TYPES)

    def slice_number(file_path):
        return int(file_path[:-4].split("-")[-1])

    patient_dir = os.path.join(DATA_DIR_path + "/%s/" % folder, str(brats21id).zfill(5))
    candidates = glob.glob(os.path.join(patient_dir, image_type, "*"))
    ordered = sorted(candidates, key=slice_number)

    total = len(ordered)
    step = 1 if total < 10 else 3
    first, last = int(total * 0.25), int(total * 0.75)

    return np.array(ordered[first:last:step])

def get_all_images(brats21id, image_type, folder='train', size=225):
    """Load every selected slice of one patient/sequence as a list of images.

    The slice paths come from ``get_all_image_paths`` and each one is read
    with ``load_dicom`` at the requested ``size``.
    """
    images = []
    for path in get_all_image_paths(brats21id, image_type, folder):
        images.append(load_dicom(path, size))
    return images


# Side length (pixels) that the bulk loaders pass to get_all_images.
IMAGE_SIZE = 128

def get_all_data_for_train(image_type):
    """Load the slices of every patient in ``train_df`` for one MRI sequence.

    Returns three numpy arrays aligned slice by slice: the images, the
    owning patient's ``MGMT_value`` label and the owning patient's
    ``BraTS21ID``.
    """
    images_all, labels_all, ids_all = [], [], []

    for row_index in tqdm(train_df.index):
        row = train_df.loc[row_index]
        patient_id = int(row['BraTS21ID'])
        patient_images = get_all_images(patient_id, image_type, 'train', IMAGE_SIZE)

        images_all.extend(patient_images)
        # every slice inherits its patient's label and id
        labels_all.extend([row['MGMT_value']] * len(patient_images))
        ids_all.extend([patient_id] * len(patient_images))
        assert(len(images_all) == len(labels_all))
    return np.array(images_all), np.array(labels_all), np.array(ids_all)

def get_all_data_for_test(image_type):
    """Load the slices of every patient in ``test_df`` for one MRI sequence.

    Images are read from the 'train' image folder. Returns two numpy arrays
    aligned slice by slice: the images and the owning patient's
    ``BraTS21ID``.
    """
    images_all = []
    ids_all = []

    for row_index in tqdm(test_df.index):
        patient_id = int(test_df.loc[row_index]['BraTS21ID'])
        patient_images = get_all_images(patient_id, image_type, 'train', IMAGE_SIZE)
        images_all.extend(patient_images)
        ids_all.extend([patient_id] * len(patient_images))

    return np.array(images_all), np.array(ids_all)


def get_all_data_for_val(image_type):
    """Return exactly what ``get_all_data_for_test`` returns.

    Kept as a separate public name; it delegates so the loading logic lives
    in one place.
    """
    return get_all_data_for_test(image_type)

# Load every selected FLAIR slice into memory (the recorded run took about 45 + 11 minutes).
X, y, trainidt = get_all_data_for_train('FLAIR')
X_test, testidt = get_all_data_for_test('FLAIR')
# Show the array shapes as the cell output.
X.shape, y.shape, trainidt.shape

100%|██████████| 466/466 [45:00<00:00,  5.80s/it]
100%|██████████| 117/117 [10:56<00:00,  5.61s/it]


((10011, 128, 128), (10011,), (10011,))

In [None]:
# Display the training table (after the excluded patients were dropped).
train_df

Unnamed: 0,BraTS21ID,MGMT_value
0,185,1
1,816,1
2,707,1
3,683,0
4,6,1
...,...,...
463,356,0
464,89,1
465,217,0
466,834,0


In [None]:
# Display the table loaded from test_df.csv.
test_df

Unnamed: 0,BraTS21ID,MGMT_value
0,107,1
1,753,0
2,303,1
3,106,1
4,171,1
...,...,...
112,703,0
113,21,0
114,444,0
115,95,0


In [None]:
# Slice-level 80/20 split of the in-memory arrays.
# NOTE(review): slices of one patient can land on both sides of this split,
# so the validation part is not patient-independent.
X_train, X_valid, y_train, y_valid, trainidt_train, trainidt_valid = train_test_split(X, y, trainidt, test_size=0.2, random_state=40)

# Number of slices in an 80% share; not referenced by the other visible cells.
split = int(X.shape[0] * 0.8)

# Convert to float tensors and append a trailing channel axis: (N, H, W) -> (N, H, W, 1).
# The tensors themselves are stored (not their `.size()`), so the image data
# stays available to later cells.
X_train = torch.unsqueeze(torch.Tensor(X_train), -1)
X_valid = torch.unsqueeze(torch.Tensor(X_valid), -1)

y_train = torch.Tensor(y_train)
y_valid = torch.Tensor(y_valid)

# Display the shapes rather than the full contents.
X_train.shape, y_train.shape, X_valid.shape, y_valid.shape, trainidt_train.shape, trainidt_valid.shape

(torch.Size([8008, 128, 128, 1]),
 tensor([0., 1., 1.,  ..., 0., 0., 1.]),
 torch.Size([2003, 128, 128, 1]),
 tensor([1., 1., 0.,  ..., 0., 1., 1.]),
 array([414,  33, 661, ..., 563, 477, 661]),
 array([ 89, 607, 392, ..., 589, 552,  28]))

Dataset

In [None]:
class BrainTumorDataset(CacheDataset):
    """Slice-level dataset: one item per selected DICOM file of each patient.

    Every item is a dict with the keys 'image' (path of the slice file),
    'label' (the patient's MGMT_value, or 0 when no annotations are given)
    and 'patient_id'.

    root_dir: folder containing one zero-padded sub-folder per patient.
    patient_ids: patients to include.
    mri_types: MRI sequence names to load; must be a subset of MRI_TYPES.
    annotations: DataFrame with BraTS21ID and MGMT_value columns, or None.
    section: None keeps every item; 'train' keeps the 80% part and any other
        value the 20% part of a fixed random split of the items.
    Remaining positional/keyword arguments are forwarded to CacheDataset.
    """

    def __init__(self, root_dir, patient_ids, mri_types, annotations, section, *args, **kwargs):
        self.root_dir = root_dir
        self.patient_ids = patient_ids
        self.mri_types = mri_types
        self.annotations = annotations
        data = self.get_data()
        if section is not None:
            # NOTE(review): this split is over slices, not patients, so one
            # patient's slices can appear in both parts.
            train_data, val_data = train_test_split(data, test_size=0.2, random_state=13)
            data = train_data if section=='train' else val_data
        super(BrainTumorDataset, self).__init__(data, *args, **kwargs)

    def get_data(self):
        """Build the list of item dicts for all patients in ``self.patient_ids``."""
        data = []
        for patient_id in tqdm(self.patient_ids):
            if self.annotations is not None:
                # .item() requires exactly one annotation row for the patient
                label = self.annotations[self.annotations['BraTS21ID']
                                         == int(patient_id)]['MGMT_value'].item()
            else:
                label = 0 # dummy value
            for slice_path in self.get_patient_slice_paths(patient_id):
                data.append({
                    'image': slice_path,
                    'label': label,
                    'patient_id': patient_id
                })
        return data

    @staticmethod
    def _slice_sort_key(path):
        """Sort key ordering slice files by the number after the last '-' in the file name.

        Files without a leading number in that position sort after the
        numbered ones (by path) instead of raising.
        """
        stem = os.path.splitext(os.path.basename(path))[0]
        token = stem.split("-")[-1]
        digits = ""
        for char in token:
            if char not in "0123456789":
                break
            digits += char
        if digits:
            return (0, int(digits), path)
        return (1, 0, path)

    def get_patient_slice_paths(self, patient_id):
        '''
        Returns a list of the selected slice files, over all requested MRI
        types, for a particular patient ID.

        Per MRI type the files are ordered numerically, the middle half of
        the stack (25% to 75%) is kept, and every 3rd file is taken (every
        file when there are fewer than 10).
        '''
        assert(set(self.mri_types) <= set(MRI_TYPES))
        patient_path = os.path.join(self.root_dir, str(patient_id).zfill(5))
        patient_slice_paths = []
        for mri_type in self.mri_types:
            # Numeric ordering matters: with a plain string key a name ending
            # in "-10" sorts before one ending in "-2", and the 25%-75%
            # window below would no longer be the middle of the stack.
            paths = sorted(
                glob.glob(os.path.join(patient_path, mri_type, "*.dcm")),
                key=self._slice_sort_key,
            )

            num_images = len(paths)
            start = int(num_images * 0.25)
            end = int(num_images * 0.75)

            interval = 3
            if num_images < 10:
                interval = 1
            patient_slice_paths.extend(paths[start:end:interval])
        return patient_slice_paths
    
class LoadDicomd(MapTransform):
    """Dictionary transform that replaces a DICOM path with its pixel data.

    For each configured key, the path stored in the input dict is read and
    replaced by the array returned from ``load_dicom``.

    img_size: side length, in pixels, of the square output image.
    Remaining arguments (e.g. ``keys``) are forwarded to MapTransform.
    """
    def __init__(self, img_size, *args, **kwargs):
        self.img_size = img_size
        super(LoadDicomd, self).__init__(*args, **kwargs)

    def __call__(self, data):
        d = dict(data)  # shallow copy so the caller's dict is not modified
        for key in self.keys:
            d[key] = self.load_dicom(d[key])
        return d

    def load_dicom(self, path):
        '''
        Reads a DICOM image, divides by its maximum so the pixel values are
        between 0 and 1, quantizes to 0-255 (uint8), resizes to
        ``img_size`` x ``img_size`` and scales back to [0, 1].

        Returns the image with a new leading axis of length 1
        (shape (1, img_size, img_size) for a 2-D slice).
        '''
        # dcmread is the supported reader; the read_file alias is deprecated.
        dicom = pydicom.dcmread(path)
        data = dicom.pixel_array
        peak = np.max(data)  # computed once, reused for the zero check and the division
        if peak != 0:
            data = data / peak
        data = (data * 255).astype(np.uint8)
        data = cv2.resize(data, (self.img_size, self.img_size)) / 255
        return np.expand_dims(data, axis=0)

Model

In [None]:
class Simple2dCNN(nn.Module):
    """Two-convolution CNN that maps a square image to ``n_classes`` logits.

    Layout: Conv(k=4) -> ReLU -> MaxPool(2) -> Conv(k=2) -> ReLU -> MaxPool(1)
    -> Dropout -> flatten -> Linear -> ReLU -> Linear.
    """

    def __init__(self,
                 input_channels=1,
                 n_classes=2,
                 img_size=32,
                 conv1_filters=128,
                 conv2_filters=64,
                 dropout_prob=0.1,
                 fc1_units=48):
        super().__init__()

        # Spatial side after conv1 (kernel 4), the 2x2 max-pool and conv2 (kernel 2).
        side = (img_size - 3) // 2 - 1
        flat_features = conv2_filters * side * side

        self.relu = nn.ReLU()

        self.conv1 = nn.Conv2d(input_channels, conv1_filters, kernel_size=4)
        self.maxpool1 = nn.MaxPool2d(kernel_size=2)

        self.conv2 = nn.Conv2d(conv1_filters, conv2_filters, kernel_size=2)
        self.maxpool2 = nn.MaxPool2d(kernel_size=1)  # kernel 1 leaves the map size unchanged

        self.dropout = nn.Dropout(p=dropout_prob)
        self.fc1 = nn.Linear(flat_features, fc1_units)
        self.fc2 = nn.Linear(fc1_units, n_classes)

    def forward(self, x):
        """Return the class logits for a batch ``x`` of shape (N, C, H, W)."""
        # Shape comments assume the defaults: img_size=32, 128 and 64 filters.
        features = self.maxpool1(self.relu(self.conv1(x)))         # (N, 128, 14, 14)
        features = self.maxpool2(self.relu(self.conv2(features)))  # (N, 64, 13, 13)
        features = self.dropout(features)
        flat = features.view(features.size(0), -1)                 # (N, 64 * 13 * 13)
        hidden = self.relu(self.fc1(flat))                         # (N, 48)
        return self.fc2(hidden)                                    # (N, n_classes)

Pipeline

In [None]:
@dataclass
class Config:
    """Settings consumed by Pipeline: data paths, model input size, loader and CV options."""
    # Folder with one sub-folder per patient, used for the train/val/hold-out datasets.
    train_dir: str = os.path.join(DATA_DIR_path, 'train')
    # Folder scanned by Pipeline.prepare_test_dataset; currently the same folder as train_dir.
    test_dir: str = os.path.join(DATA_DIR_path, 'train')
    # CSV read by Pipeline.load_annotations (BraTS21ID / MGMT_value columns are used).
    annotation_path: str = os.path.join(DATA_DIR_path, 'train_df.csv')
    # Number of output classes of the model.
    n_classes: int = 2
    # Side length (pixels) images are resized to before entering the model.
    img_size: int = 32
    # Worker count passed to the datasets and data loaders.
    n_workers: int = 4
    # Early-stopping threshold used by Pipeline.fit.
    early_stopping_rounds: int = 3
    # Number of StratifiedKFold splits created in Pipeline.load_annotations.
    n_folds: int = 5
        
        
class Pipeline:
    """Training / evaluation / inference driver for the slice-level 2D CNN.

    Holds the model, the device, the annotation table with its fold
    assignment, and the dictionary transforms applied to every item.

    config: a Config-like object; stored as ``self.args``.
    """

    def __init__(self, config):
        self.args = config
        self.device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
        self.annotations = None
        self.model = None
        self.load_model()
        # transforms
        self.preaugment_transform = [
            LoadDicomd(keys="image", img_size=self.args.img_size),
        ]
        self.augment_transform = [] # todo: add some augmentations
        self.postaugment_transform = [
            ToTensord(keys="image", dtype=torch.float),
            ToTensord(keys="label", dtype=torch.int64),
        ]

    def load_annotations(self):
        """Read the annotation CSV and assign every patient to a CV fold.

        Three patient IDs (109, 123, 709) are excluded. The 'fold' column
        holds the index of the stratified fold in which the patient is part
        of the hold-out set.
        """
        self.annotations = pd.read_csv(self.args.annotation_path)
        # exclude 3 cases
        self.annotations = self.annotations[~self.annotations['BraTS21ID'].isin([109, 123, 709])]
        # positional indices returned by the splitter must match the index labels
        self.annotations = self.annotations.reset_index(drop=True)
        skf = StratifiedKFold(n_splits=self.args.n_folds, shuffle=True, random_state=42)
        # split by patient, stratify based on target value
        folds = skf.split(self.annotations['BraTS21ID'].values, self.annotations['MGMT_value'].values)
        for i, (train_indices, val_indices) in enumerate(folds):
            self.annotations.loc[val_indices, 'fold'] = i
        self.annotations['fold'] = self.annotations['fold'].astype(int)

    def load_model(self, weights_path=None):
        """Create a fresh Simple2dCNN on ``self.device``; optionally load saved weights."""
        self.model = Simple2dCNN(input_channels=1,
                                 n_classes=self.args.n_classes,
                                 img_size=self.args.img_size).to(self.device)
        if weights_path:
            weights = torch.load(weights_path, map_location=self.device)
            self.model.load_state_dict(weights)

    def _make_dataset(self, root_dir, patient_ids, mri_types, annotations,
                      transform, section, cache_rate):
        """Build one BrainTumorDataset with the configured worker count."""
        return BrainTumorDataset(root_dir=root_dir,
                                 patient_ids=patient_ids,
                                 mri_types=mri_types,
                                 annotations=annotations,
                                 transform=transform,
                                 section=section,
                                 cache_rate=cache_rate,
                                 num_workers=self.args.n_workers)

    def _make_loader(self, dataset, batch_size, shuffle):
        """Wrap ``dataset`` in a DataLoader with the configured worker count."""
        return DataLoader(dataset,
                          batch_size=batch_size,
                          shuffle=shuffle,
                          num_workers=self.args.n_workers)

    @staticmethod
    def _as_python_ids(patient_ids):
        """Convert collated patient ids (tensor elements or plain values) to Python scalars."""
        return [pid.item() if hasattr(pid, 'item') else pid for pid in patient_ids]

    def prepare_datasets(self, mri_types, fold, cache_rate):
        """
        Build the train, validation and hold-out datasets for one CV fold.

        Patients outside ``fold`` feed the train and validation datasets
        (the 'train' / 'val' sections of BrainTumorDataset); patients inside
        ``fold`` form the hold-out dataset. Annotations are loaded on demand
        if ``load_annotations`` has not been called yet.

        Data format:
        {
            'image': torch tensor (batch_size, 1, 32, 32),
            'label': torch tensor (batch_size, )
            'patient_id'
        }
        Output: torch tensor (batch_size, 2)
        """
        if self.annotations is None:
            # the fold column is required below; load it rather than failing on None
            self.load_annotations()

        train_transform = Compose(
            self.preaugment_transform +
            self.augment_transform +
            self.postaugment_transform
        )
        val_transform = Compose(
            self.preaugment_transform +
            self.postaugment_transform
        )

        in_fold = self.annotations['fold'] == fold
        train_ids = self.annotations[~in_fold]['BraTS21ID'].values.tolist()
        val_holdout_ids = self.annotations[in_fold]['BraTS21ID'].values.tolist()

        train_ds = self._make_dataset(self.args.train_dir, train_ids, mri_types,
                                      self.annotations, train_transform, 'train', cache_rate)
        val_ds = self._make_dataset(self.args.train_dir, train_ids, mri_types,
                                    self.annotations, val_transform, 'val', cache_rate)
        val_holdout_ds = self._make_dataset(self.args.train_dir, val_holdout_ids, mri_types,
                                            self.annotations, val_transform, None, cache_rate)
        return train_ds, val_ds, val_holdout_ds

    def prepare_test_dataset(self, mri_types, cache_rate):
        """Build an unlabeled dataset from every numeric sub-folder of ``test_dir``.

        Directory entries whose name is not purely numeric are skipped
        instead of aborting the scan.
        """
        test_transform = Compose(
            self.preaugment_transform +
            self.postaugment_transform
        )
        test_ids = sorted(int(name) for name in os.listdir(self.args.test_dir) if name.isdigit())
        return self._make_dataset(self.args.test_dir, test_ids, mri_types,
                                  None, test_transform, None, cache_rate)

    def train_epoch(self, loader, loss_function, optimizer, verbose):
        """Run one optimisation pass over ``loader``; return the sample-weighted mean loss."""
        self.model.train()
        summary_loss = AverageMeter()
        start = time.time()
        n = len(loader)
        for step, batch_data in enumerate(loader):
            inputs, labels = (
                batch_data["image"].to(self.device), # (None, 1, 32, 32)
                batch_data["label"].to(self.device), # (None, )
            )
            batch_size = inputs.size(0)
            # back propagation
            optimizer.zero_grad()
            outputs = self.model(inputs) # (None, 2)
            loss = loss_function(outputs, labels)
            loss.backward()
            optimizer.step()
            # update stats
            summary_loss.update(loss.item(), batch_size)
            if verbose:
                print('Train step {}/{}, loss: {:.5f}'.format(step + 1, n,
                                                              summary_loss.avg), end='\r')
        elapsed_time = str(datetime.timedelta(seconds=time.time() - start))
        print('Train loss: {:.5f} - time: {}'.format(summary_loss.avg, elapsed_time))
        return summary_loss.avg

    def evaluate_epoch(self, loader, loss_function, verbose):
        """Score the model on a labeled loader.

        Returns ``(mean_loss, patient_auc, result)`` where ``result`` is a
        DataFrame with one row per patient holding the mean predicted
        probability of class 1 and the mean label over that patient's slices.
        """
        self.model.eval()
        summary_loss = AverageMeter()
        start = time.time()
        n = len(loader)
        patient_ids_all = []
        probabilities_all = []
        labels_all = []
        with torch.no_grad():
            for step, batch_data in enumerate(loader):
                inputs, labels, patient_ids = (
                    batch_data["image"].to(self.device), # (None, 1, 32, 32)
                    batch_data["label"].to(self.device), # (None, )
                    batch_data["patient_id"], # (None, )
                )
                batch_size = inputs.size(0)
                # forward pass only (no gradients are computed here)
                outputs = self.model(inputs) # (None, 2)
                loss = loss_function(outputs, labels)
                # update stats
                probabilities = F.softmax(outputs, dim=1)[:, 1].tolist()
                probabilities_all.extend(probabilities)
                labels_all.extend(labels.tolist())
                patient_ids_all.extend(patient_ids)

                summary_loss.update(loss.item(), batch_size)
                if verbose:
                    print('Val step {}/{}, loss: {:.5f}'.format(step + 1, n,
                                                                summary_loss.avg), end='\r')
        elapsed_time = str(datetime.timedelta(seconds=time.time() - start))
        print('Val loss: {:.5f} - time: {}'.format(summary_loss.avg, elapsed_time))
        result = pd.DataFrame({
            'BraTS21ID': self._as_python_ids(patient_ids_all),
            'probability': probabilities_all,
            'label': labels_all
        })
        slice_auc = roc_auc_score(result['label'], result['probability'])
        # aggregate slice predictions to one score per patient
        result = result.groupby("BraTS21ID", as_index=False).mean()
        patient_auc = roc_auc_score(result['label'], result['probability'])
        print('Patient AUC: {:.5f} - Slice AUC: {:.5f}'.format(patient_auc, slice_auc))

        return summary_loss.avg, patient_auc, result

    def infer_epoch(self, loader, verbose):
        """Predict on an unlabeled loader; return per-patient mean class-1 probabilities."""
        self.model.eval()
        start = time.time()
        n = len(loader)
        patient_ids_all = []
        probabilities_all = []
        with torch.no_grad():
            for step, batch_data in enumerate(loader):
                inputs, patient_ids = (
                    batch_data["image"].to(self.device), # (None, 1, 32, 32)
                    batch_data["patient_id"], # (None, )
                )
                # forward
                outputs = self.model(inputs) # (None, 2)
                # update stats
                probabilities = F.softmax(outputs, dim=1)[:, 1].tolist()
                probabilities_all.extend(probabilities)
                patient_ids_all.extend(patient_ids)
                if verbose:
                    print('Infer step {}/{}'.format(step + 1, n), end='\r')

        result = pd.DataFrame({
            'BraTS21ID': self._as_python_ids(patient_ids_all),
            'probability': probabilities_all,
        })
        result = result.groupby("BraTS21ID", as_index=False).mean()

        elapsed_time = str(datetime.timedelta(seconds=time.time() - start))
        print('Elapsed time: {}'.format(elapsed_time))

        return result

    def _save_best(self, state_dict, model_name, best_loss, best_metric):
        """Print the best validation scores and write ``state_dict`` to the working directory."""
        save_path = '{}_imgsize{}_valloss{:.3f}_valauc{:.3f}.pth'
        print('Val loss: {:.5f}, Val auc: {:.5f}'.format(best_loss, best_metric))
        print('Saving model...')
        torch.save(state_dict,
                   save_path.format(model_name,
                                    self.args.img_size,
                                    best_loss,
                                    best_metric))

    def fit(self, train_ds, val_ds, val_holdout_ds, batch_size, epochs, lr, model_name, verbose):
        """Train with Adam / cross-entropy and checkpoint the best epoch.

        The best epoch is the one with the highest validation patient AUC.
        Training stops early once ``early_stopping_rounds`` consecutive
        epochs have passed without improving it; the hold-out set is only
        reported, never used for model selection. The best weights are saved
        once, either at the early stop or after the final epoch.
        """
        train_loader = self._make_loader(train_ds, batch_size, shuffle=True)
        val_loader = self._make_loader(val_ds, batch_size, shuffle=False)
        val_holdout_loader = self._make_loader(val_holdout_ds, batch_size, shuffle=False)
        loss_function = nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(self.model.parameters(), lr=lr)

        current_metric = -np.inf
        current_loss = np.inf
        current_epoch = 1
        current_state_dict = None
        for epoch in range(1, epochs + 1):
            print('\nEpoch {}/{}:'.format(epoch, epochs))
            self.train_epoch(train_loader, loss_function, optimizer, verbose)
            print(' Validation:')
            val_loss, val_metric, _ = self.evaluate_epoch(val_loader, loss_function, verbose)
            print(' Hold out:')
            self.evaluate_epoch(val_holdout_loader, loss_function, verbose)

            if val_metric > current_metric:
                print('Val AUC improved from {:.5f} to {:.5f}'.format(current_metric, val_metric))
                current_metric = val_metric
                current_loss = val_loss
                current_epoch = epoch
                # snapshot: later epochs keep updating the live parameters
                current_state_dict = deepcopy(self.model.state_dict())

            # >= so that exactly `early_stopping_rounds` stale epochs trigger the stop
            elif (epoch - current_epoch) >= self.args.early_stopping_rounds:
                print('Early stopping. Best model is epoch {}'.format(current_epoch))
                self._save_best(current_state_dict, model_name, current_loss, current_metric)
                break
            if epoch == epochs:
                print('Finished training. Best model is epoch {}'.format(current_epoch))
                self._save_best(current_state_dict, model_name, current_loss, current_metric)

    def evaluate(self, val_holdout_ds, batch_size, verbose):
        """Score the current model on the hold-out dataset; return ``(patient_auc, result)``."""
        val_holdout_loader = self._make_loader(val_holdout_ds, batch_size, shuffle=False)
        loss_function = nn.CrossEntropyLoss()
        print(' Hold out:')
        _, val_holdout_metric, val_holdout_result = self.evaluate_epoch(val_holdout_loader,
                                                                        loss_function,
                                                                        verbose)
        return val_holdout_metric, val_holdout_result

    def predict(self, test_ds, batch_size, verbose):
        """Run inference on ``test_ds``; return the per-patient probability table."""
        test_loader = self._make_loader(test_ds, batch_size, shuffle=False)
        test_result = self.infer_epoch(test_loader, verbose)
        return test_result

In [None]:
# Settings for this run; they are passed to Config, prepare_datasets and fit in the cells below.
mri_types = ['FLAIR']
img_size = 16 #32
batch_size = 32
n_workers = 4
early_stopping_rounds = 3
n_folds = 5
epochs = 40 #50
lr = 1e-3

In [None]:
# Build the configuration from the settings above; fields not listed keep their Config defaults.
args = Config(img_size=img_size, 
              n_workers=n_workers, 
              early_stopping_rounds=early_stopping_rounds,
              n_folds=n_folds)
# Creating the Pipeline also instantiates the model on the available device.
pipeline = Pipeline(args)

Train

In [None]:
# Read the annotation table and assign the CV folds once, before the loop.
pipeline.load_annotations()
for fold in range(n_folds):
    print(f'### Train {mri_types} on fold {fold}: ###')
    # cache_rate=1.0 is forwarded to the datasets' cache setting.
    train_ds, val_ds, val_holdout_ds = pipeline.prepare_datasets(mri_types=mri_types, 
                                                                 fold=fold,
                                                                 cache_rate=1.0)
    # Start every fold from a freshly initialised model.
    pipeline.load_model()
    # model_name becomes the prefix of the checkpoint file written by fit.
    pipeline.fit(train_ds, val_ds, val_holdout_ds,
                 batch_size=batch_size, epochs=epochs, lr=lr, 
                 model_name=f'{"_".join(mri_types)}_fold{fold}',
                 verbose=True)

### Train ['FLAIR'] on fold 0: ###


100%|██████████| 372/372 [00:03<00:00, 100.95it/s]
Loading dataset: 100%|██████████| 6239/6239 [05:20<00:00, 19.49it/s]
100%|██████████| 372/372 [00:01<00:00, 362.96it/s]
Loading dataset: 100%|██████████| 1560/1560 [01:20<00:00, 19.46it/s]
100%|██████████| 94/94 [00:00<00:00, 127.01it/s]
Loading dataset: 100%|██████████| 2212/2212 [01:29<00:00, 24.63it/s]


Epoch 1/40:





Train loss: 0.64300 - time: 0:00:09.466342
 Validation:
Val loss: 0.62830 - time: 0:00:00.623816
Patient AUC: 0.65642 - Slice AUC: 0.62521
 Hold out:
Val loss: 0.76457 - time: 0:00:00.807568
Patient AUC: 0.43136 - Slice AUC: 0.52241
Val AUC improved from -inf to 0.65642

Epoch 2/40:
Train loss: 0.63502 - time: 0:00:02.025410
 Validation:
Val loss: 0.62654 - time: 0:00:00.632114
Patient AUC: 0.66169 - Slice AUC: 0.64499
 Hold out:
Val loss: 0.73343 - time: 0:00:00.783940
Patient AUC: 0.43136 - Slice AUC: 0.53915
Val AUC improved from 0.65642 to 0.66169

Epoch 3/40:
Train loss: 0.62448 - time: 0:00:02.114524
 Validation:
Val loss: 0.62195 - time: 0:00:00.642326
Patient AUC: 0.67001 - Slice AUC: 0.64132
 Hold out:
Val loss: 0.72422 - time: 0:00:00.827069
Patient AUC: 0.43409 - Slice AUC: 0.54427
Val AUC improved from 0.66169 to 0.67001

Epoch 4/40:
Train loss: 0.61476 - time: 0:00:02.086230
 Validation:
Val loss: 0.61972 - time: 0:00:00.642673
Patient AUC: 0.65423 - Slice AUC: 0.64043
 Ho

100%|██████████| 373/373 [00:01<00:00, 365.62it/s]
Loading dataset: 100%|██████████| 6308/6308 [00:23<00:00, 266.29it/s]
100%|██████████| 373/373 [00:01<00:00, 357.77it/s]
Loading dataset: 100%|██████████| 1578/1578 [00:06<00:00, 258.21it/s]
100%|██████████| 93/93 [00:00<00:00, 357.01it/s]
Loading dataset: 100%|██████████| 2125/2125 [00:07<00:00, 266.32it/s]



Epoch 1/40:
Train loss: 0.67371 - time: 0:00:02.215062
 Validation:
Val loss: 0.67420 - time: 0:00:00.671680
Patient AUC: 0.58556 - Slice AUC: 0.55465
 Hold out:
Val loss: 0.63517 - time: 0:00:00.773464
Patient AUC: 0.66466 - Slice AUC: 0.63268
Val AUC improved from -inf to 0.58556

Epoch 2/40:
Train loss: 0.66621 - time: 0:00:02.104331
 Validation:
Val loss: 0.67122 - time: 0:00:00.662026
Patient AUC: 0.60601 - Slice AUC: 0.58577
 Hold out:
Val loss: 0.64180 - time: 0:00:00.788265
Patient AUC: 0.67022 - Slice AUC: 0.63653
Val AUC improved from 0.58556 to 0.60601

Epoch 3/40:
Train loss: 0.66014 - time: 0:00:02.085740
 Validation:
Val loss: 0.66591 - time: 0:00:00.665879
Patient AUC: 0.60829 - Slice AUC: 0.57677
 Hold out:
Val loss: 0.62852 - time: 0:00:00.793537
Patient AUC: 0.64425 - Slice AUC: 0.62523
Val AUC improved from 0.60601 to 0.60829

Epoch 4/40:
Train loss: 0.65424 - time: 0:00:02.052915
 Validation:
Val loss: 0.66892 - time: 0:00:00.647539
Patient AUC: 0.60192 - Slice AUC

100%|██████████| 373/373 [00:01<00:00, 371.44it/s]
Loading dataset: 100%|██████████| 6232/6232 [00:22<00:00, 277.83it/s]
100%|██████████| 373/373 [00:00<00:00, 377.13it/s]
Loading dataset: 100%|██████████| 1558/1558 [00:05<00:00, 278.99it/s]
100%|██████████| 93/93 [00:00<00:00, 358.60it/s]
Loading dataset: 100%|██████████| 2221/2221 [00:08<00:00, 251.04it/s]



Epoch 1/40:
Train loss: 0.67266 - time: 0:00:02.236673
 Validation:
Val loss: 0.65683 - time: 0:00:00.678651
Patient AUC: 0.59066 - Slice AUC: 0.58152
 Hold out:
Val loss: 0.64267 - time: 0:00:00.820855
Patient AUC: 0.64564 - Slice AUC: 0.55686
Val AUC improved from -inf to 0.59066

Epoch 2/40:
Train loss: 0.66448 - time: 0:00:02.123719
 Validation:
Val loss: 0.64984 - time: 0:00:00.665084
Patient AUC: 0.60455 - Slice AUC: 0.59419
 Hold out:
Val loss: 0.63748 - time: 0:00:00.820565
Patient AUC: 0.68924 - Slice AUC: 0.59360
Val AUC improved from 0.59066 to 0.60455

Epoch 3/40:
Train loss: 0.65914 - time: 0:00:02.047226
 Validation:
Val loss: 0.64099 - time: 0:00:00.689261
Patient AUC: 0.61732 - Slice AUC: 0.61339
 Hold out:
Val loss: 0.65544 - time: 0:00:00.832367
Patient AUC: 0.67393 - Slice AUC: 0.60042
Val AUC improved from 0.60455 to 0.61732

Epoch 4/40:
Train loss: 0.65454 - time: 0:00:02.116159
 Validation:
Val loss: 0.63998 - time: 0:00:00.680039
Patient AUC: 0.63018 - Slice AUC

100%|██████████| 373/373 [00:01<00:00, 367.05it/s]
Loading dataset: 100%|██████████| 6544/6544 [00:23<00:00, 276.57it/s]
100%|██████████| 373/373 [00:01<00:00, 368.85it/s]
Loading dataset: 100%|██████████| 1637/1637 [00:06<00:00, 260.10it/s]
100%|██████████| 93/93 [00:00<00:00, 398.97it/s]
Loading dataset: 100%|██████████| 1830/1830 [00:07<00:00, 230.50it/s]



Epoch 1/40:
Train loss: 0.65752 - time: 0:00:02.255945
 Validation:
Val loss: 0.65736 - time: 0:00:00.689197
Patient AUC: 0.60205 - Slice AUC: 0.60733
 Hold out:
Val loss: 0.67657 - time: 0:00:00.743701
Patient AUC: 0.58071 - Slice AUC: 0.55032
Val AUC improved from -inf to 0.60205

Epoch 2/40:
Train loss: 0.64728 - time: 0:00:02.172149
 Validation:
Val loss: 0.65222 - time: 0:00:00.682086
Patient AUC: 0.61385 - Slice AUC: 0.60542
 Hold out:
Val loss: 0.68619 - time: 0:00:00.741524
Patient AUC: 0.56354 - Slice AUC: 0.51955
Val AUC improved from 0.60205 to 0.61385

Epoch 3/40:
Train loss: 0.64305 - time: 0:00:02.175517
 Validation:
Val loss: 0.64601 - time: 0:00:00.688644
Patient AUC: 0.62855 - Slice AUC: 0.62177
 Hold out:
Val loss: 0.69347 - time: 0:00:00.734568
Patient AUC: 0.55612 - Slice AUC: 0.53554
Val AUC improved from 0.61385 to 0.62855

Epoch 4/40:
Train loss: 0.63735 - time: 0:00:02.146590
 Validation:
Val loss: 0.64203 - time: 0:00:00.690016
Patient AUC: 0.63637 - Slice AUC

100%|██████████| 373/373 [00:01<00:00, 368.55it/s]
Loading dataset: 100%|██████████| 6710/6710 [00:24<00:00, 278.80it/s]
100%|██████████| 373/373 [00:01<00:00, 353.83it/s]
Loading dataset: 100%|██████████| 1678/1678 [00:06<00:00, 273.42it/s]
100%|██████████| 93/93 [00:00<00:00, 420.72it/s]
Loading dataset: 100%|██████████| 1623/1623 [00:06<00:00, 253.25it/s]



Epoch 1/40:
Train loss: 0.65554 - time: 0:00:02.347694
 Validation:
Val loss: 0.65922 - time: 0:00:00.696772
Patient AUC: 0.61588 - Slice AUC: 0.60258
 Hold out:
Val loss: 0.68093 - time: 0:00:00.674670
Patient AUC: 0.58673 - Slice AUC: 0.55773
Val AUC improved from -inf to 0.61588

Epoch 2/40:
Train loss: 0.64959 - time: 0:00:02.221899
 Validation:
Val loss: 0.65326 - time: 0:00:00.675981
Patient AUC: 0.61618 - Slice AUC: 0.60544
 Hold out:
Val loss: 0.67937 - time: 0:00:00.693027
Patient AUC: 0.58024 - Slice AUC: 0.54807
Val AUC improved from 0.61588 to 0.61618

Epoch 3/40:
Train loss: 0.64247 - time: 0:00:02.209633
 Validation:
Val loss: 0.66525 - time: 0:00:00.671889
Patient AUC: 0.61843 - Slice AUC: 0.61305
 Hold out:
Val loss: 0.69800 - time: 0:00:00.677612
Patient AUC: 0.60250 - Slice AUC: 0.54201
Val AUC improved from 0.61618 to 0.61843

Epoch 4/40:
Train loss: 0.63762 - time: 0:00:02.186776
 Validation:
Val loss: 0.64438 - time: 0:00:00.692766
Patient AUC: 0.63015 - Slice AUC

Evaluate

In [None]:
def find_weight(x):
    """Return the name of a file in the current directory whose name contains ``x``.

    ``os.listdir()`` returns entries in arbitrary order, so the candidates are
    sorted to make the choice reproducible when several files match.

    Raises:
        FileNotFoundError: If no file name in the current directory contains ``x``.
    """
    matches = sorted(w for w in os.listdir() if x in w)
    if not matches:
        raise FileNotFoundError(
            f'No weights file containing "{x}" found in {os.getcwd()}'
        )
    return matches[0]


# One checkpoint per fold, looked up by the "<mri types>_fold<k>" name fragment.
weights_paths = [find_weight(f'{"_".join(mri_types)}_fold{fold}') for fold in range(n_folds)]

# Evaluate each fold's checkpoint on the third dataset returned by
# prepare_datasets (bound to val_holdout_ds) and collect the metric and the
# per-sample result frame of every fold.
metrics = []
fold_results = []
for fold, weights_path in enumerate(weights_paths):
    print(f'### Evaluate {mri_types} on fold {fold}: ###')
    _, _, val_holdout_ds = pipeline.prepare_datasets(mri_types=mri_types,
                                                     fold=fold,
                                                     cache_rate=0.0)
    pipeline.load_model(weights_path)
    val_metric, val_result = pipeline.evaluate(val_holdout_ds, batch_size=batch_size, verbose=True)
    metrics.append(val_metric)
    fold_results.append(val_result)

# Pool all folds: the mean of the per-fold metrics vs. a single AUC computed
# over the concatenated 'label' / 'probability' columns.
results = pd.concat(fold_results, ignore_index=True)
mean_auc = np.mean(metrics)
oof_auc = roc_auc_score(results['label'], results['probability'])
print('---')
print(f'{mri_types} holdout result:')
print(' Mean AUC: {:.5f}'.format(mean_auc))
print(' Out-of-fold AUC: {:.5f}'.format(oof_auc))
print('---')

### Evaluate ['FLAIR'] on fold 0: ###


100%|██████████| 372/372 [00:00<00:00, 381.52it/s]
100%|██████████| 372/372 [00:00<00:00, 390.53it/s]
100%|██████████| 94/94 [00:00<00:00, 346.92it/s]

 Hold out:





Val loss: 1.20251 - time: 0:00:05.529822
Patient AUC: 0.43864 - Slice AUC: 0.48440
### Evaluate ['FLAIR'] on fold 1: ###


100%|██████████| 373/373 [00:00<00:00, 373.56it/s]
100%|██████████| 373/373 [00:00<00:00, 386.78it/s]
100%|██████████| 93/93 [00:00<00:00, 341.35it/s]

 Hold out:





Val loss: 0.82530 - time: 0:00:05.144009
Patient AUC: 0.59508 - Slice AUC: 0.56123
### Evaluate ['FLAIR'] on fold 2: ###


100%|██████████| 373/373 [00:00<00:00, 376.30it/s]
100%|██████████| 373/373 [00:00<00:00, 392.45it/s]
100%|██████████| 93/93 [00:00<00:00, 350.21it/s]

 Hold out:





Val loss: 1.17777 - time: 0:00:05.395374
Patient AUC: 0.57375 - Slice AUC: 0.52369
### Evaluate ['FLAIR'] on fold 3: ###


100%|██████████| 373/373 [00:01<00:00, 363.40it/s]
100%|██████████| 373/373 [00:00<00:00, 373.77it/s]
100%|██████████| 93/93 [00:00<00:00, 401.54it/s]

 Hold out:





Val loss: 1.02693 - time: 0:00:04.432219
Patient AUC: 0.51994 - Slice AUC: 0.51313
### Evaluate ['FLAIR'] on fold 4: ###


100%|██████████| 373/373 [00:01<00:00, 363.93it/s]
100%|██████████| 373/373 [00:01<00:00, 350.38it/s]
100%|██████████| 93/93 [00:00<00:00, 416.24it/s]

 Hold out:





Val loss: 0.84087 - time: 0:00:04.763394
Patient AUC: 0.53942 - Slice AUC: 0.51479
---
['FLAIR'] holdout result:
 Mean AUC: 0.53337
 Out-of-fold AUC: 0.52875
---


Submission

In [None]:
# Run every fold's checkpoint over the test dataset and keep one prediction
# frame per fold in `test_results`.
test_results = []
for fold_idx, checkpoint in enumerate(weights_paths):
    print(f'### Inference {mri_types} on fold {fold_idx}: ###')
    # The test dataset is rebuilt for each checkpoint before the weights are loaded.
    fold_test_ds = pipeline.prepare_test_dataset(mri_types=mri_types, cache_rate=0.0)
    pipeline.load_model(checkpoint)
    test_results.append(
        pipeline.predict(fold_test_ds, batch_size=batch_size, verbose=True)
    )

### Inference ['FLAIR'] on fold 0: ###


100%|██████████| 585/585 [00:04<00:00, 123.11it/s]


Elapsed time: 0:02:43.323466
### Inference ['FLAIR'] on fold 1: ###


100%|██████████| 585/585 [00:01<00:00, 446.32it/s]


Elapsed time: 0:00:28.469974
### Inference ['FLAIR'] on fold 2: ###


100%|██████████| 585/585 [00:01<00:00, 445.30it/s]


Elapsed time: 0:00:28.260193
### Inference ['FLAIR'] on fold 3: ###


100%|██████████| 585/585 [00:01<00:00, 448.59it/s]


Elapsed time: 0:00:28.217544
### Inference ['FLAIR'] on fold 4: ###


100%|██████████| 585/585 [00:01<00:00, 450.16it/s]


Elapsed time: 0:00:29.118069


In [None]:
# Ensemble the folds: align each fold's frame on BraTS21ID, place them side by
# side, and take the row-wise mean as the final MGMT_value.
fold_frames = [fold_df.set_index('BraTS21ID') for fold_df in test_results]
prediction = (
    pd.concat(fold_frames, axis=1)
    .mean(axis=1)
    .rename('MGMT_value')
    .reset_index()
)
# prediction.to_csv('submission_T1wCE.csv',index=False)

In [None]:
# Display the fold-averaged predictions (columns: BraTS21ID, MGMT_value).
prediction

Unnamed: 0,BraTS21ID,MGMT_value
0,0,0.799002
1,2,0.857802
2,3,0.466467
3,5,0.803545
4,6,0.741668
...,...,...
580,1005,0.552404
581,1007,0.366012
582,1008,0.566009
583,1009,0.483141


In [None]:
# Display test_df (defined earlier in the notebook); the rendered output shows
# a BraTS21ID column and a 0/1 MGMT_value column.
test_df

Unnamed: 0,BraTS21ID,MGMT_value
0,107,1
1,753,0
2,303,1
3,106,1
4,171,1
...,...,...
112,703,0
113,21,0
114,444,0
115,95,0


In [None]:
# Right join on BraTS21ID: every row of test_df is kept and gets its averaged
# score attached (NaN where `prediction` has no matching ID). The shared
# MGMT_value column is split into MGMT_value_x (from prediction) and
# MGMT_value_y (from test_df).
result = prediction.merge(test_df, how='right', on='BraTS21ID')

In [None]:
# Display the merged frame: MGMT_value_x comes from `prediction`,
# MGMT_value_y from `test_df`.
result

Unnamed: 0,BraTS21ID,MGMT_value_x,MGMT_value_y
0,107,0.417566,1
1,753,0.516087,0
2,303,0.292858,1
3,106,0.378623,1
4,171,0.472401,1
...,...,...,...
112,703,0.652167,0
113,21,0.544203,0
114,444,0.484825,0
115,95,0.823517,0


In [None]:
def get_confusion_matrix(result3, threshold=0.5):
    """Build a 2x2 confusion matrix from a frame of scores and binary labels.

    Args:
        result3: DataFrame with a ``MGMT_value_x`` column (predicted score)
            and a ``MGMT_value_y`` column (true label, 0 or 1).
        threshold: A score strictly greater than this value counts as a
            positive (1) prediction; everything else counts as 0.

    Returns:
        A nested list indexed as ``matrix[true_label][predicted_label]``,
        i.e. ``[[TN, FP], [FN, TP]]``.

    Raises:
        ValueError: If a label is not 0 or 1 after conversion to ``int``
            (this includes NaN labels, which ``int()`` rejects).
    """
    confusion_matrix = [[0, 0], [0, 0]]

    scores = result3["MGMT_value_x"].tolist()
    labels = result3["MGMT_value_y"].tolist()
    for score, label in zip(scores, labels):
        actual = int(label)
        # Guard against labels such as -1, which would otherwise silently
        # index the last row through Python's negative indexing.
        if actual not in (0, 1):
            raise ValueError(f"MGMT_value_y must be 0 or 1, got {label!r}")
        # A NaN score compares False here and is therefore counted as a
        # predicted 0.
        predicted = 1 if score > threshold else 0
        confusion_matrix[actual][predicted] += 1

    return confusion_matrix

def get_acc_recall(arr):
    """Print accuracy and recall for a 2x2 confusion matrix.

    Args:
        arr: Nested sequence indexed as ``arr[true_label][predicted_label]``,
            i.e. ``[[TN, FP], [FN, TP]]``.

    Returns:
        None. The metrics are printed, not returned.
    """
    total = sum((sum(arr[0]), sum(arr[1])))
    positives = sum(arr[1])
    # With no samples at all (or no positive samples) the ratios are
    # undefined; report NaN instead of raising ZeroDivisionError.
    acc = sum((arr[0][0], arr[1][1])) / total if total else float("nan")
    recall = arr[1][1] / positives if positives else float("nan")
    print(f"Acc: {acc} \t Recall: {recall}")

In [None]:
# Confusion matrix of the merged predictions at the default threshold,
# followed by the accuracy / recall derived from it.
confusion = get_confusion_matrix(result)
print(confusion)
get_acc_recall(confusion)

[[27, 29], [25, 36]]
Acc: 0.5384615384615384 	 Recall: 0.5901639344262295
