# UW-Madison GI Tract Image Segmentation

## Data analysis & visualisation

## Load libaries 📚

In [1]:
#!/usr/bin/env python

import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import os
import re
import cv2
import matplotlib.pyplot as plt
import plotly.express as px
from plotly.offline import init_notebook_mode
from glob import glob

import tensorflow as tf


In [2]:
%cd 'hpc_train_files'

/Users/viktor/Documents/BA/GIT/HealthyOrganTracker/hpc_train_files


In [3]:
from config import CFG
from dataloader import DataGenerator
from loss import dice_coef, iou_coef, dice_loss, bce_dice_loss
from datapreparation import extract_metadata, remove_faulties
from utility import rle_encode, rle_decode, build_masks, fix_empty_slices, is_isolated, fix_nc_slices

## Config ⚙️

In [4]:
BASE_PATH = 'input/uw-madison-gi-tract-image-segmentation/'
TRAIN_DIR =  BASE_PATH +'train'
TRAIN_CSV =  BASE_PATH +'train.csv'

"""
Backbones available: 
        'efficientnetb0'
        'efficientnetb1'
        'efficientnetb2'
        'efficientnetb3'
        'efficientnetb4'
        'efficientnetb5'
        'efficientnetb6'
        'efficientnetb7'
        'inceptionresnetv2'
        'inceptionv3'
        'resnet50'
        'resnext50'
        'resnext101'
        'seresnext50'
        'seresnext101'
        'densenet121'
        'densenet201'
        """

"\nBackbones available: \n        'efficientnetb0'\n        'efficientnetb1'\n        'efficientnetb2'\n        'efficientnetb3'\n        'efficientnetb4'\n        'efficientnetb5'\n        'efficientnetb6'\n        'efficientnetb7'\n        'inceptionresnetv2'\n        'inceptionv3'\n        'resnet50'\n        'resnext50'\n        'resnext101'\n        'seresnext50'\n        'seresnext101'\n        'densenet121'\n        'densenet201'\n        "

In [5]:
cfg = CFG(
    backbone            = 'efficientnetb0',
    img_dims            = (32,32,3),
    model               = 'UNet',
    batch_size          = 16, 
    epochs              = 1, 
    kaggle              = False, 
    use_fold_csv        = True,
    semi3d_data         = False,
    remove_faulty_cases = True,
    use_crop_data       = False)

In [6]:
df = pd.read_csv(cfg.train_csv)
#df = df.sample(n=90, random_state=1)

## Extract metadata

* Restructure frame
* Extract Case ID, Day and Slice
* Extract image path
* Extract image dimensions (width, height, pixelspacing)
* Extract amount of classes shown in slice

## 2.5D Data
* Adds depth to each slice
* Instead of a RGB channel we have: slice[0], slice[stride] amd slice[stride*2]

In [7]:
df_train = extract_metadata(df, TRAIN_DIR, channels=3, stride=2)
df_train.fillna('',inplace=True);

Frame merged. Shape: (38496, 22)
Remove faulty cases: True
Sucess. Shape: (38208, 22)


## Final table

In [8]:
df_train.sample(5)

Unnamed: 0,id,large_bowel,small_bowel,stomach,case,day,slice,case_day,path,width,...,pixel_y,count,path00,path01,path02,image_paths,rs,re,cs,ce
18528,case85_day21_slice_0113,25889 4 26041 3 26153 8 26305 7 26418 10 26570...,17457 21 17684 18 17721 25 17949 22 17985 28 1...,,85,21,113,case85_day21,input/uw-madison-gi-tract-image-segmentation/t...,266,...,1.5,2,input/uw-madison-gi-tract-image-segmentation/t...,input/uw-madison-gi-tract-image-segmentation/t...,input/uw-madison-gi-tract-image-segmentation/t...,[input/uw-madison-gi-tract-image-segmentation/...,40,10000,0,266
7929,case36_day8_slice_0026,,,,36,8,26,case36_day8,input/uw-madison-gi-tract-image-segmentation/t...,266,...,1.5,0,input/uw-madison-gi-tract-image-segmentation/t...,input/uw-madison-gi-tract-image-segmentation/t...,input/uw-madison-gi-tract-image-segmentation/t...,[input/uw-madison-gi-tract-image-segmentation/...,37,10000,0,266
19398,case88_day38_slice_0119,21889 4 22248 8 22607 15 22967 18 23326 21 236...,18198 7 18556 10 18914 13 19273 15 19632 16 19...,,88,38,119,case88_day38,input/uw-madison-gi-tract-image-segmentation/t...,360,...,1.5,2,input/uw-madison-gi-tract-image-segmentation/t...,input/uw-madison-gi-tract-image-segmentation/t...,input/uw-madison-gi-tract-image-segmentation/t...,[input/uw-madison-gi-tract-image-segmentation/...,18,10000,0,360
11649,case49_day15_slice_0002,,,,49,15,2,case49_day15,input/uw-madison-gi-tract-image-segmentation/t...,360,...,1.5,0,input/uw-madison-gi-tract-image-segmentation/t...,input/uw-madison-gi-tract-image-segmentation/t...,input/uw-madison-gi-tract-image-segmentation/t...,[input/uw-madison-gi-tract-image-segmentation/...,35,10000,0,289
22340,case108_day13_slice_0101,20846 2 21109 10 21373 14 21637 17 21902 18 22...,20647 3 20911 8 21176 11 21442 13 21461 6 2170...,,108,13,101,case108_day13,input/uw-madison-gi-tract-image-segmentation/t...,266,...,1.5,2,input/uw-madison-gi-tract-image-segmentation/t...,input/uw-madison-gi-tract-image-segmentation/t...,input/uw-madison-gi-tract-image-segmentation/t...,[input/uw-madison-gi-tract-image-segmentation/...,41,10000,0,263


## Exploratory Data Analysis

* 57% of the slices have no segmentation at all
* Left bowel is most segmented, stomach least

In [None]:
import plotly.graph_objects as go

if not os.path.exists("images"):
    os.mkdir("images")

dist = np.mean(df_train.iloc[:,1:4]!='',axis=0)
no_masks = np.sum(df_train['count'] == 0)/df_train.shape[0]
dist = np.round(np.append(dist.values, no_masks) * 100, 2)

fig = go.Figure([go.Bar(x=['Dickdarm', 'Dünndarm', 'Magen', 'None'], y=dist, text=dist, textposition='auto')])
fig.update_traces( textposition='outside')
fig.update_layout(title_text='Segmentierungs Verteilung in %', xaxis_title="Klassen")
fig.write_image("images/segmentation_distribution.png")
fig.show("svg")

In [None]:
dist.to_latex()

In [None]:
cases = df_train["case"].value_counts().reset_index()

fig = px.bar(cases, x='case', y='index', text_auto='.2s')
fig.update_layout(
    font_family="Courier New",
    xaxis_title="Fall Nr.",
    yaxis_title="Anzahl Slices",
)
fig.show("svg")
fig.write_image("images/case_slice.png")

In [None]:
days = df_train["day"].value_counts().reset_index()
days = days.loc[days['day'] > 700]
fig = px.pie(days, values='day', names='index', title='Verteilung der Slices an Tax X')
fig.show()
fig.write_image("images/day_slice_pie.png")

In [None]:
days_ = df_train["day"].value_counts().reset_index().iloc[1:,:]

fig = px.bar(days_, x='index', y='day', text_auto='.2s')
fig.update_layout(
    font_family="Courier New",
    xaxis_title="Tag Nr.",
    yaxis_title="Anzahl Slices",
)
fig.show()
fig.write_image("images/day_slice.png")

In [None]:
width = df_train["width"].value_counts().reset_index()

fig = px.bar(width, x='index', y='width', text_auto='.2s')
fig.update_layout(
    font_family="Courier New",
    xaxis_title="Breite",
    yaxis_title="Anzahl Slices",
)
fig.show("svg")
fig.write_image("images/width_slice.png")

In [None]:
height = df_train["height"].value_counts().reset_index()

fig = px.bar(height, x='index', y='height', text_auto='.2s')
fig.update_layout(
    font_family="Courier New",
    title="Which height do the slices have?",
    xaxis_title="Height",
    yaxis_title="Anzahl Slices",
)
fig.show("svg")
fig.write_image("images/height_slice.png")

In [None]:
df_train["pixel_x"].value_counts().reset_index()

In [None]:
df_train["pixel_y"].value_counts().reset_index()

In [None]:
def crop_image(x, img):
    rs = x['rs']
    re = x['re']
    cs = x['cs']
    ce = x['ce']
    return img[rs:re, cs:ce]

def ratio(df, class_name, crop=False):
    dff = pd.DataFrame()
    df['decoded'] = df.apply(lambda x: rle_decode(x[class_name], shape=(x['height'], x['width'])), axis = 1)
    dff['decoded'] = df['decoded'].copy()
    if crop:
        dff['decoded'] = df.apply(lambda x: crop_image(x, x['decoded']), axis=1)
        
    dff['foreground'] = dff['decoded'].apply(lambda x: np.count_nonzero(x))
    dff['background'] = dff.apply(lambda x: x.decoded.size - x.foreground, axis=1)
    
    ratio = dff['background'].sum() / dff['foreground'].sum()
    
    print("Ratio of background / foreground of class " + class_name + " is equal to " + str(ratio))
    
    return dff

In [None]:
sb = ratio(df_train, "small_bowel")

In [None]:
lb = ratio(df_train, "large_bowel")

In [None]:
stomach = ratio(df_train, "stomach")

In [None]:
background_average = (sb['background'].sum() + lb['background'].sum() + stomach['background'].sum() )/3

In [None]:
df['total'] = stomach['foreground'] + sb['foreground'] + lb['foreground']

total_ratio =  background_average / df['total'].sum()

In [None]:
total_ratio

In [None]:
sb_crop = ratio(df_train, "small_bowel", crop=True)

In [None]:
lb_crop = ratio(df_train, "large_bowel", crop=True)

In [None]:
stomach_crop = ratio(df_train, "stomach", crop=True)

In [None]:
background_average_crop = (sb_crop['background'].sum() + lb_crop['background'].sum() + stomach_crop['background'].sum() )/3

In [None]:
df['total'] = stomach_crop['foreground'] + sb_crop['foreground'] + lb_crop['foreground']

total_ratio_crop =  background_average_crop / df['total'].sum()

In [None]:
total_ratio_crop