In [None]:
import warnings

warnings.filterwarnings('ignore')

In [None]:
#Google Drive

# from google.colab import drive
# drive.mount("/content/drive")

In [None]:
#Kaggle

"""Получить ссылку на архив с общим доступом из Google Диска.
Идентификатор файла для загрузки находится между /d/ и /view?
"""

url = 'https://drive.google.com/file/...../view?usp=sharing'


!conda install -y gdown &> /dev/null
!gdown --id 1nIywScpYSExIjpCR9tcl3lvwgwJXF8kw

In [None]:
# Перезапустить среду после обновления matplotlib!!
# Не требуется для kaggle
# !pip install --upgrade matplotlib &> /dev/null

# Data Load

In [None]:
import os
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
from PIL import Image
import pandas as pd
import cv2
import seaborn as sns

sns.set()

In [None]:
# Google Drive
# !unzip /content/drive/MyDrive/ml/Chest_Xray_segmentation.zip -d data/ &> /dev/null

# Kaggle

!unzip /kaggle/working/Chest_Xray_segmentation.zip -d data/ &> /dev/null

In [None]:
# File extensions. Neither name is referenced by the visible cells (the globs
# below hard-code '*.bmp'); they are kept for cells outside this view.
img_format, new_img_format = '.bmp', '.png'

# Root of the unpacked archive and the sub-folder names found in each split.
base_path = Path('data/SegmChest/')
mask_dirs = ['lung', 'ribs', 'col', 'heart']
pathes = ['xray', *mask_dirs]

# The train and val folders hold the initial data (in .bmp format) - upload
# your data as a zip archive into the "data" folder.
# The merged (preprocessed) masks are written into the folders created here.
for split_name in ('train', 'val'):
    (base_path / split_name / 'masks').mkdir(parents=True, exist_ok=True)

In [None]:
# Root folders of the training and validation splits.
train_dir, valid_dir = (base_path / split for split in ('train', 'val'))

In [None]:
# Build one table per split with the x-ray path, the path where the merged
# multi-class mask will be written, and one column per organ mask.
# NOTE(review): images and masks are paired purely by sorted order, which
# assumes matching file names across the sub-folders - confirm on the data.
data = []
for cur_dir in [train_dir, valid_dir]:
    images = sorted(cur_dir.glob('xray/*.bmp'))
    organ_paths = {d: sorted(cur_dir.glob(f'{d}/*.bmp')) for d in mask_dirs}

    # Fail early with a readable message instead of letting pd.concat pad a
    # count mismatch with NaN and crash later while opening a mask.
    for d, paths in organ_paths.items():
        if len(paths) != len(images):
            raise ValueError(
                f'{cur_dir}: found {len(images)} x-ray images but {len(paths)} "{d}" masks'
            )

    df = pd.DataFrame({'image_path': images})
    # Derive the output path from the path components; a plain string
    # replace of 'xray' would also rewrite any other 'xray' in the path.
    df['mask_path'] = [str(cur_dir / 'masks' / image.name) for image in images]
    masks = pd.DataFrame(organ_paths, columns=mask_dirs)
    data.append(pd.concat([df, masks], axis=1))

train_data, valid_data = data

In [None]:
train_data.head(2)

In [None]:
# Persist both path tables; the Dataset class reads them back from these files.
for frame, csv_name in ((train_data, 'train.csv'), (valid_data, 'valid.csv')):
    frame.to_csv(csv_name, index=False)

# Data Visualisation

In [None]:
def _flatten_axes(axs):
    """Return the Axes contained in ``axs`` as a flat, row-major list.

    ``axs`` may be a single Axes, a 1-D sequence/array of Axes, or a 2-D grid.
    """
    if hasattr(axs, 'imshow'):
        return [axs]
    flat = []
    for item in axs:
        flat.extend(_flatten_axes(item))
    return flat


def plot_images(imgs, names=None, axs=None, show=True, nrows=None, ncols=None, figsize=(8, 4)):
    """Show a list of images on a grid of axes.

    Args:
        imgs: sequence of images accepted by ``Axes.imshow``.
        names: optional sequence of titles; ``names[k]`` is used for ``imgs[k]``.
        axs: existing Axes (single, 1-D or 2-D collection) to draw on. When
            ``None`` a new figure with ``nrows`` x ``ncols`` axes is created.
        show: call ``plt.show()`` at the end when true.
        nrows, ncols: grid shape. If both are ``None`` a single row is used;
            if only one is given the other is derived from ``len(imgs)``.
        figsize: figure size used when a new figure is created.

    Every axis has its frame switched off; axes beyond ``len(imgs)`` stay empty.
    """
    from math import ceil

    if nrows is None and ncols is None:
        nrows, ncols = 1, len(imgs)
    elif nrows is None:
        nrows = ceil(len(imgs) / ncols)
    elif ncols is None:
        ncols = ceil(len(imgs) / nrows)

    if axs is None:
        _, axs = plt.subplots(ncols=ncols, nrows=nrows, figsize=figsize)

    # One code path for every grid shape: the old single-row/column branch
    # indexed imgs[j] unconditionally and raised IndexError whenever there
    # were more axes than images.
    for image_id, ax in enumerate(_flatten_axes(axs)):
        ax.set_axis_off()
        if image_id >= len(imgs):
            continue
        ax.imshow(imgs[image_id])
        if names and image_id < len(names):
            ax.set_title(names[image_id], fontsize=15)

    if show:
        plt.show()

In [None]:
# Merge the mask images

def merge_masks(data_frame, plot_sample=False):
    """Merge the per-organ masks of every row into one label image and save it.

    Pixel labels in the merged mask: 0 - nothing, 1 - lung, 2 - ribs,
    3 - col, 4 - heart. Labels are written in that order, so a later class
    overwrites an earlier one where masks overlap.

    Args:
        data_frame: table with ``lung``, ``ribs``, ``col``, ``heart`` columns
            (paths to the source masks) and a ``mask_path`` column (output path).
        plot_sample: when true, every merged mask is displayed with
            ``plot_images`` before it is saved.
    """
    def _read_mask(path):
        # Close the file handle deterministically instead of relying on GC.
        with Image.open(path) as img:
            return np.array(img)

    # itertuples is the supported row iterator; looping over ``.iloc`` only
    # works through the legacy __getitem__/IndexError protocol.
    for record in data_frame.itertuples(index=False):
        lungs_mask = _read_mask(record.lung)
        ribs_mask = _read_mask(record.ribs)
        col_mask = _read_mask(record.col)
        heart_mask = _read_mask(record.heart)

        mask = (lungs_mask > 0).astype('uint8')
        mask[ribs_mask > 0] = 2
        mask[col_mask > 0] = 3
        mask[heart_mask > 0] = 4

        if plot_sample:
            plot_images([mask])

        Image.fromarray(mask).save(record.mask_path)

In [None]:
merge_masks(train_data)

In [None]:
merge_masks(valid_data, True)

# Create DataLoader

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import cv2
import pandas as pd
import imageio

from PIL import Image
from statistics import stdev 
from sklearn.model_selection import train_test_split
import torch

In [None]:
from torch.utils.data import DataLoader
from torch.utils.data import Dataset as BaseDataset

In [None]:
class Dataset(BaseDataset):
    """Chest X-ray dataset. Reads images, applies augmentation and preprocessing.

    Args:
        path_to_csv (str): path to a csv with ``image_path`` and ``mask_path``
            columns.
        augmentation (albumentations.Compose): data transformation pipeline
            (e.g. flip, scale, etc.)
        preprocessing (albumentations.Compose): data preprocessing
            (e.g. normalization, shape manipulation, etc.)
        num_classes (int, optional): number of label values to one-hot encode.
            Defaults to ``len(mask_dirs) + 1`` (the organ classes plus label 0),
            read from the notebook-level ``mask_dirs`` list.

    Each item is an ``(image, mask)`` pair. Without ``preprocessing`` the mask
    is the label image as floats; with it, the mask becomes a float tensor
    holding one binary plane per class.
    """

    def __init__(
            self,
            path_to_csv,
            augmentation=None,
            preprocessing=None,
            num_classes=None,
    ):
        self.df = pd.read_csv(path_to_csv)

        self.augmentation = augmentation
        self.preprocessing = preprocessing
        self.num_classes = num_classes

    @staticmethod
    def _read_gray(path):
        """Read ``path`` as a single-channel image or raise a clear error."""
        img = cv2.imread(path, 0)
        if img is None:
            # cv2.imread reports failure by returning None instead of raising.
            raise FileNotFoundError(f'Could not read image: {path}')
        return img

    def __getitem__(self, i):
        record = self.df.iloc[i]

        # The image gets a trailing channel axis; the mask stays 2-D.
        sample = {'image': self._read_gray(record.image_path)[..., None],
                  'mask': self._read_gray(record.mask_path).astype('float')}

        if self.augmentation:
            sample = self.augmentation(**sample)

        if self.preprocessing:
            sample = self.preprocessing(**sample)
            n_classes = self.num_classes if self.num_classes is not None else len(mask_dirs) + 1
            # One boolean plane per label value, stacked along a new first axis.
            masks = [(sample['mask'] == v) for v in range(n_classes)]
            sample['mask'] = torch.stack(masks).type(torch.float)

        return sample['image'], sample['mask']

    def __len__(self):
        return self.df.shape[0]

In [None]:
!pip install git+https://github.com/albumentations-team/albumentations.git &> /dev/null

In [None]:
import albumentations as albu
from albumentations.pytorch.transforms import ToTensorV2

In [None]:
height = 512
width = 512

# Training-time augmentation.
# RandomContrast / RandomBrightness are deprecated aliases of
# RandomBrightnessContrast with the other limit fixed at 0; the explicit form
# keeps the same behaviour (default limit 0.2) on releases that dropped them.
# NOTE(review): albumentations is installed unpinned from master, so the
# RandomSizedCrop signature may also differ between releases - pin a version.
augmentations = albu.Compose([
    albu.HorizontalFlip(),
    albu.OneOf([
        albu.RandomBrightnessContrast(brightness_limit=0, contrast_limit=0.2),
        albu.RandomGamma(),
        albu.RandomBrightnessContrast(brightness_limit=0.2, contrast_limit=0),
    ], p=0.3),
    albu.OneOf([
        albu.ElasticTransform(),
        albu.GridDistortion(),
        albu.OpticalDistortion(),
    ]),
    albu.PadIfNeeded(height, width),
    albu.RandomSizedCrop(min_max_height=[int(height/8*7), height],
                    height=height, width=width)
])

# Applied to every sample: resize, normalise the single channel with
# mean 0.5 / std 0.5, convert to tensors.
preprocessing = albu.Compose([
    albu.Resize(height, width),
    albu.Normalize([0.5], [0.5]),
    ToTensorV2()
])

train_dataset = Dataset('train.csv', augmentation=augmentations, preprocessing=preprocessing)
valid_dataset = Dataset('valid.csv', preprocessing=preprocessing)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=1, shuffle=False)

# Loader dictionary in the form the training cell passes to the runner.
loaders = {
    'train' : train_loader,
    'valid' : valid_loader
}

In [None]:
# Preview a few augmented training samples. Without preprocessing the dataset
# returns the image and the label mask untouched by resize/normalisation.
dataset = Dataset('train.csv', augmentation=augmentations)

preview_titles = ['Image', 'Mask']
for i in range(4):
    sample = dataset[i]  # (image, mask) pair; augmentation is applied on every access
    plot_images(sample, names=preview_titles)

# Create and train model

In [None]:
#added 
!pip install segmentation_models_pytorch &> /dev/null

In [None]:
!pip install catalyst==21.03 &> /dev/null

In [None]:
from catalyst.callbacks.metrics.segmentation import (DiceCallback,
                                               IOUCallback)
from catalyst.callbacks.misc import EarlyStoppingCallback
from catalyst.callbacks.metrics.confusion_matrix import ConfusionMatrixCallback
from catalyst.callbacks.optimizer import OptimizerCallback

from catalyst.dl import SupervisedRunner

from catalyst.contrib.nn import OneCycleLRWithWarmup
from catalyst.contrib.nn.criterion.dice import DiceLoss
from catalyst.contrib.nn.optimizers.radam import RAdam

import torch
import numpy as np
import segmentation_models_pytorch as smp

In [None]:
encoder_name = 'timm-efficientnet-b0'
activation = 'softmax2d' # could be None for logits or 'softmax2d' for multiclass segmentation
# is_available must be *called*: the bare function object is always truthy,
# so 'cuda' was selected even on machines without a GPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Class names in mask-channel order. Dataset builds channel v from
# ``mask == v`` and merge_masks writes 1=lung, 2=ribs, 3=col, 4=heart, so
# channel 0 is the remaining x-ray area and must come first. The previous
# order labelled every per-class metric with the wrong organ.
classes = ['xray', 'lung', 'ribs', 'col', 'heart']

# create segmentation model with pretrained encoder
model = smp.Unet(
    encoder_name=encoder_name,
    classes=len(classes),
    activation=activation,
    in_channels=1,
)

In [None]:

"""
IOUCallback - обратный вызов метрики iou (сохраняет метрику для каждого класса/модели)

EarlyStoppingCallback - останавливает обучение, если после 'patience' эпох не происходит 
улучшения метрики 'metric_key' на основе валидации 'loader_key'

OptimizerCallback - обратный вызов оптимизатора (сохранение функции потерь)

ConfusionMatrixCallback - обратный вызов для матрицы ошибок.
"""

# Per-class IoU, computed from the model output with a 0.5 threshold.
iou_callback = IOUCallback(
    input_key="logits",
    target_key="targets",
    threshold=0.5,
    class_names=classes,
)

# Optimizer step driven by the 'loss' metric.
optimizer_callback = OptimizerCallback(metric_key='loss')

# Stop when the validation 'iou' has not improved for 5 epochs
# (minimize=False: higher is better).
early_stopping_callback = EarlyStoppingCallback(
    patience=5,
    loader_key='valid',
    metric_key='iou',
    minimize=False,
)

# Per-class Dice score.
dice_callback = DiceCallback(
    input_key="logits",
    target_key="targets",
    class_names=classes,
)

callbacks = [
    iou_callback,
    optimizer_callback,
    early_stopping_callback,
    dice_callback,
]

In [None]:
from torch.optim.lr_scheduler import StepLR

In [None]:
num_epochs = 50
learning_rate = 1e-4


# Criterion - the loss function.
# The misspelled name ``critetion`` is kept because the training cell
# references it.
critetion = DiceLoss()


# metrics = [
#     smp.utils.metrics.IoU(threshold=0.5),
# ]

# RAdam: a variant of the Adam optimizer whose adaptive learning rate is
# rectified. The rate comes from ``learning_rate`` above instead of a second
# hard-coded copy of the same value.
optimizer = RAdam([
    dict(params=model.parameters(), lr=learning_rate),
])

# StepLR - scheduler from PyTorch. Every ``step_size`` steps it applies
# lr = lr * gamma.
#
# Previously lr was 1e-4 and dropped sharply by a factor of 10 at epoch 25.
# Now it decays smoothly: 1e-4 * 0.956**50 is roughly 1e-5, assuming one
# scheduler step per epoch (TODO confirm against the runner). Tune gamma to
# change the decay rate.
scheduler = StepLR(
    optimizer,
    step_size=1,
    gamma=0.956
)

# Alternative scheduler from catalyst - worth exploring.

# scheduler = OneCycleLRWithWarmup(
#     optimizer,
#     num_steps=num_epochs,
#     lr_range=(1e-4, 1e-5),
#     init_lr = learning_rate,
#     warmup_steps=2
# )

In [None]:
#added
#
logdir = "./logs"

# Train the model as a single "pipeline" call: fewer hand-written loops and
# more readable code.
#
# The runner receives the model, the loss function, the optimizer, the
# scheduler, the callbacks, the loaders (with their datasets), the logging
# directory, the number of epochs, the loader used for validation, the
# validation metric, a flag that disables minimisation of that metric
# (IoU is maximised), and a flag that prints training progress.
runner = SupervisedRunner()

train_kwargs = dict(
    model=model,
    criterion=critetion,
    optimizer=optimizer,
    scheduler=scheduler,
    callbacks=callbacks,
    loaders=loaders,
    logdir=logdir,
    num_epochs=num_epochs,
    valid_loader="valid",
    valid_metric="iou",
    minimize_valid_metric=False,
    verbose=True,
)
runner.train(**train_kwargs)

# Tensorboard

In [None]:
def create_heatmap(data):
    """Draw ``data`` as an annotated confusion-matrix heatmap.

    seaborn draws the rows of ``data`` along the y-axis and its columns along
    the x-axis. The notebook fills the matrix as ``[actual, predicted]``, so
    the y-axis is the actual class and the x-axis the predicted class; the
    labels were previously swapped.

    Args:
        data: 2-D array-like with actual classes in rows and predicted
            classes in columns.
    """
    _, ax = plt.subplots(figsize=(12, 12))
    heatmap = sns.heatmap(data, annot=True, cbar=False, ax=ax)
    heatmap.yaxis.set_ticklabels(heatmap.yaxis.get_ticklabels(), rotation=0, ha='right', fontsize=12)
    heatmap.xaxis.set_ticklabels(heatmap.xaxis.get_ticklabels(), rotation=0, ha='right', fontsize=12)
    heatmap.set(ylabel='Actual Class', xlabel='Predicted Class', title="Матрица ошибок по классам")
    plt.show()

In [None]:
dataset = Dataset('valid.csv')

# Label values 0..len(mask_dirs), the same convention Dataset uses.
num_classes = len(mask_dirs) + 1

# Rows are actual classes, columns are predicted classes.
# The last row and the last column hold the column and row totals.
matrix_for_classes = np.zeros(shape=(num_classes + 1, num_classes + 1))

# Inference only: switch to eval mode and skip gradient tracking.
model.eval()
with torch.no_grad():
    for image, mask in dataset:
        sample = preprocessing(image=image, mask=mask)
        image_tensor = sample['image']

        prediction = model(image_tensor[None, ...].to(device)).argmax(dim=1)
        pred = prediction.cpu().numpy()[0]
        act = sample['mask'].cpu().numpy().astype(np.int64)

        # Vectorised count of (actual, predicted) pairs; replaces the
        # per-pixel Python loop and the hard-coded 512x512 reshape.
        np.add.at(matrix_for_classes, (act.ravel(), pred.ravel()), 1)

counts = matrix_for_classes[:num_classes, :num_classes]
matrix_for_classes[num_classes, :num_classes] = counts.sum(axis=0)
matrix_for_classes[:num_classes, num_classes] = counts.sum(axis=1)
# Grand total in the corner, which previously stayed at 0.
matrix_for_classes[num_classes, num_classes] = counts.sum()


create_heatmap(matrix_for_classes)

In [None]:
# Network architecture
from catalyst import dl, utils
features_batch = next(iter(loaders["valid"]))[0]
# Move the batch to wherever the trained model lives instead of hard-coding
# .cuda(), which fails on CPU-only machines.
model_device = next(runner.model.parameters()).device
utils.trace_model(model=runner.model, batch=features_batch.to(model_device))

Kaggle


In [1]:

"""
Следующую ячейку лучше не запускать сразу с этой.
Может появиться ошибка.
"""

!wget https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip &> /dev/null
!unzip ngrok-stable-linux-amd64.zip &> /dev/null

# Run TensorBoard and ngrok (the tunnel) as non-blocking processes.
# Each command string ends with '&', so the shell backgrounds it and
# os.system returns without waiting for the program to exit.
import os
import multiprocessing


# NOTE(review): the pool has 10 workers for two commands and is never
# closed or joined in the visible cells.
pool = multiprocessing.Pool(processes = 10)
# One async os.system call per command; TensorBoard serves ./logs/ on port
# 6006 and ngrok exposes that same port.
results_of_processes = [pool.apply_async(os.system, args=(cmd, ), callback = None )
                        for cmd in [
                        f"tensorboard --logdir ./logs/ --host 0.0.0.0 --port 6006 &",
                        "./ngrok http 6006 &"
                        ]]

^C


In [2]:
# Tensorboard

"""
По ссылке на 'доске' доступны все графики по метрикам 
для классов, модели, ошибки, изменения скорости обучения и т.д.
"""

! curl -s http://localhost:4040/api/tunnels | python3 -c \
    "import sys, json; print(json.load(sys.stdin)['tunnels'][0]['public_url'])"

https://5aaa3e490109.ngrok.io


Google Drive

In [None]:
%load_ext tensorboard

In [None]:
%tensorboard --logdir logs

# Save and Load Model

In [None]:
# Save model (Google Drive)
# NOTE(review): PATH is not used by the save call below.
PATH = "/content/drive/MyDrive/"

# Kaggle: write the whole model object into the working directory.
torch.save(obj=runner.model, f="model.pth")

In [None]:
# Load model
# map_location lets a checkpoint saved on a GPU be loaded on a CPU-only
# machine as well.
# NOTE(review): torch.load unpickles arbitrary objects - only load files you
# trust. Newer torch releases may also need weights_only=False to load a
# whole pickled model - confirm against the installed version.
model = torch.load("model.pth", map_location=device)

# Test model

In [None]:
dataset = Dataset('valid.csv')

# Inference only: eval mode fixes normalisation/dropout layers and no_grad
# avoids building the autograd graph for every image.
model.eval()
with torch.no_grad():
    for image, mask in dataset:
        sample = preprocessing(image=image, mask=mask)
        image_tensor = sample['image']
        mask = sample['mask']

        # Add a batch axis, predict, and take the most likely class per pixel.
        prediction = model(image_tensor[None, ...].to(device)).argmax(dim=1)
        prediction = prediction.cpu().numpy()[0]

        plot_images([image, mask, prediction], names=['Image', 'Mask', 'Predicted Mask'])
    