In [2]:
import numpy as np
import pandas as pd
import random
from glob import glob
import os, shutil
from tqdm.notebook import tqdm
import time
import copy
import joblib
import gc
from IPython import display as ipd
from joblib import Parallel, delayed

# visualization
import cv2
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle

# import tensorflow as tf

# Config

In [3]:
NUM_LOG = 1000 # for WandB interactive Visualiztion
NO_EMPTY = True # set False to include images with empty mask in WandB

# RLE

In [4]:
# ref: https://www.kaggle.com/paulorzp/run-length-encode-and-decode
def rle_decode(mask_rle, shape):
    '''
    mask_rle: run-length as string formated (start length)
    shape: (height,width) of array to return 
    Returns numpy array, 1 - mask, 0 - background

    '''
    s = np.asarray(mask_rle.split(), dtype=int)
    starts = s[0::2] - 1
    lengths = s[1::2]
    ends = starts + lengths
    img = np.zeros(shape[0]*shape[1], dtype=np.uint8)
    for lo, hi in zip(starts, ends):
        img[lo:hi] = 1
    return img.reshape(shape)  # Needed to align to RLE direction


# ref.: https://www.kaggle.com/stainsby/fast-tested-rle
def rle_encode(img):
    '''
    img: numpy array, 1 - mask, 0 - background
    Returns run length as string formated
    '''
    pixels = img.flatten()
    pixels = np.concatenate([[0], pixels, [0]])
    runs = np.where(pixels[1:] != pixels[:-1])[0] + 1
    runs[1::2] -= runs[::2]
    return ' '.join(str(x) for x in runs)

# MetaData

In [9]:
def get_metadata(row):
    data = row['id'].split('_')
    case = int(data[0].replace('case',''))
    day = int(data[1].replace('day',''))
    slice_ = int(data[-1])
    row['case'] = case
    row['day'] = day
    row['slice'] = slice_
    return row

def path2info(row):
    path = row['image_path']
    data = path.split('/')
    slice_ = int(data[-1].split('_')[1])
    case = int(data[-3].split('_')[0].replace('case',''))
    day = int(data[-3].split('_')[1].replace('day',''))
    width = int(data[-1].split('_')[2])
    height = int(data[-1].split('_')[3])
    row['height'] = height
    row['width'] = width
    row['case'] = case
    row['day'] = day
    row['slice'] = slice_
    return row

## Metaの関数を探る

In [10]:
file = "/home/taku/Desktop/kaggle/kaggle/seg/preprocess/origin_dataset/train.csv"

df = pd.read_csv(file)
df.head()

# progress_apply : applyの進捗をtqdmで表示する



Unnamed: 0,id,class,segmentation
0,case123_day20_slice_0001,large_bowel,
1,case123_day20_slice_0001,small_bowel,
2,case123_day20_slice_0001,stomach,
3,case123_day20_slice_0002,large_bowel,
4,case123_day20_slice_0002,small_bowel,


In [12]:
# get_metadata関数
df_meta = pd.read_csv(file)

#  ------ get_metadata -----  #
"""
data ; [case123 day20 slice 0001]
case : case123 -> 123
day  : day20 -> 20
slice = 1, 2, ...

case, day, slice を追加
"""
meta = df_meta.apply(get_metadata, axis=1)
meta.head()

Exception ignored in: <function tqdm.__del__ at 0x7f353c3a2790>
Traceback (most recent call last):
  File "/home/taku/.pyenv/versions/3.8.10/envs/kaggle/lib/python3.8/site-packages/tqdm/std.py", line 1162, in __del__
    self.close()
  File "/home/taku/.pyenv/versions/3.8.10/envs/kaggle/lib/python3.8/site-packages/tqdm/notebook.py", line 287, in close
    self.disp(bar_style='danger', check_delay=False)
AttributeError: 'tqdm_notebook' object has no attribute 'disp'


Unnamed: 0,id,class,segmentation,case,day,slice
0,case123_day20_slice_0001,large_bowel,,123,20,1
1,case123_day20_slice_0001,small_bowel,,123,20,1
2,case123_day20_slice_0001,stomach,,123,20,1
3,case123_day20_slice_0002,large_bowel,,123,20,2
4,case123_day20_slice_0002,small_bowel,,123,20,2


## Path

In [25]:
paths = glob('origin_dataset/train/*/*/*/*') # 各画像へのpath
# print(len(paths)) 38496
# print(paths[:10])

path_df = pd.DataFrame(paths, columns=["image_path"])
# path_df.head()

# ------ path2info ------ #
"""
    data    : [origin_dataset train case114 case114_day17 scans slice_0074_360_310_1.50_1.50.png]
    slice_  : 74 1, 2, ...
    case    : 114
    day     : 17
    width   : 360
    height  : 310
"""

path_df = path_df.apply(path2info, axis=1)
df = df.merge(path_df, on=['case','day','slice'])
df.head()

KeyError: 'case'

# Mask

In [7]:
def id2mask(id_, df=None):
    idf = df[df['id']==id_]
    wh = idf[['height','width']].iloc[0]
    shape = (wh.height, wh.width, 3)
    mask = np.zeros(shape, dtype=np.uint8)
    for i, class_ in enumerate(['large_bowel', 'small_bowel', 'stomach']):
        cdf = idf[idf['class']==class_]
        rle = cdf.segmentation.squeeze()
        if len(cdf) and not pd.isna(rle):
            mask[..., i] = rle_decode(rle, shape[:2])
    return mask

def rgb2gray(mask):
    pad_mask = np.pad(mask, pad_width=[(0,0),(0,0),(1,0)])
    gray_mask = pad_mask.argmax(-1)
    return gray_mask

def gray2rgb(mask):
    rgb_mask = tf.keras.utils.to_categorical(mask, num_classes=4)
    return rgb_mask[..., 1:].astype(mask.dtype)

# Checkmask