# GI-Tract Image Segmentation

## Data Preprocessing - For Simplified Directory (e.g. 2.5D Data)

#### by Zhiyin (Steven) Lu

In [1]:
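# Optional (Google Colab only): uncomment the two lines below to mount
# Google Drive before running the rest of the notebook.
# from google.colab import drive
# drive.mount('/content/drive/')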

# %ls

In [2]:
import glob
import os

import pandas as pd

In [3]:
# Build one row per slice: pair every scan image with its mask file, parse the
# metadata encoded in the scan file name, attach the RLE segmentations from
# train.csv, and save the combined table as a CSV.
scan_dir = './data_simple_dir/scans/*.png'
mask_dir = './data_simple_dir/masks/*.npy'
save_dir = './data_simple_dir/data.csv'
save_debug_dir = './data_simple_dir/debug.csv'

# alternative configuration for the 2.5D data
# NOTE(review): unlike the active configuration, the debug CSV below lands
# inside masks/ -- confirm this is intended before switching.
# scan_dir = './2.5d_data/scans/*.npy' 
# mask_dir = './2.5d_data/masks/*.npy'
# save_dir = './2.5d_data/data.csv'
# save_debug_dir = './2.5d_data/masks/debug.csv'


def _slice_id(path):
    """Return the first four '_'-separated tokens of the file name in `path`.

    The directory part and the file extension are ignored, so a scan and a
    mask that share a name prefix yield the same id regardless of the path
    separator used by the operating system.
    """
    stem = os.path.splitext(os.path.basename(path))[0]
    return '_'.join(stem.split('_')[:4])


scan_path = sorted(glob.glob(scan_dir))
mask_path = sorted(glob.glob(mask_dir))

# Scans and masks are paired purely by their sorted position, so a missing or
# extra file would silently shift every following pair. Fail loudly instead.
if len(scan_path) != len(mask_path):
    raise ValueError(
        f'found {len(scan_path)} scan files but {len(mask_path)} mask files'
    )
mismatched = [
    (scan, mask)
    for scan, mask in zip(scan_path, mask_path)
    if _slice_id(scan) != _slice_id(mask)
]
if mismatched:
    raise ValueError(
        f'{len(mismatched)} scan/mask pairs do not share an id, '
        f'first mismatch: {mismatched[0]}'
    )

data = pd.DataFrame(data={'scan_path': scan_path, 'mask_path': mask_path})

# extract information into separate columns
# os.path.basename (rather than splitting on '/') also handles the backslash
# separators that glob returns on Windows.
file_name = data['scan_path'].map(os.path.basename)
data['id'] = data['scan_path'].map(_slice_id)
data['case'] = data['id'].map(lambda x: int(x.split('_')[0][4:]))   # strip the 'case' prefix
data['day'] = data['id'].map(lambda x: int(x.split('_')[1][3:]))    # strip the 'day' prefix
data['slice'] = data['id'].map(lambda x: int(x.split('_')[-1]))
# NOTE(review): the first size token of the file name is stored as 'height'
# and the second as 'width'. The original competition file names are
# documented as width-then-height -- confirm which order the files in this
# simplified directory use.
data['height'] = file_name.map(lambda x: int(x.split('_')[-4]))
data['width'] = file_name.map(lambda x: int(x.split('_')[-3]))

# get the segmentation information from train.csv and organize into a 'masks' dataframe
train_csv_dir = './data/train.csv'
train_csv = pd.read_csv(train_csv_dir)
# a missing segmentation becomes an empty string, i.e. an RLE length of 0
train_csv['segmentation'] = train_csv['segmentation'].fillna('')
train_csv['rle_len'] = train_csv['segmentation'].map(len)

# group segmentation into lists and calculate the length of the segmentation list
seg = train_csv.groupby(['id'])['segmentation'].agg(list).to_frame()
# .sum() instead of .agg(sum): passing the Python builtin is deprecated in
# newer pandas releases and yields the same result.
rle_len = train_csv.groupby(['id'])['rle_len'].sum().to_frame()

# merge and add segmentation list and rle_length to the 'mask_info' dataframe
mask_info = seg.merge(rle_len, on=['id'])
mask_info['empty'] = (mask_info['rle_len'] == 0)

# merge 'scans' and 'masks' into one dataframe based on IDs
# The merge is an inner join, so slices without an entry in train.csv are
# dropped; report that instead of letting rows vanish silently.
n_files = len(data)
data = data.merge(mask_info, on=['id'])
if len(data) != n_files:
    print(
        f'WARNING: {n_files - len(data)} of {n_files} slices have no entry '
        f'in {train_csv_dir} and were dropped'
    )

# Total: 38496
display(data)

# note: the list cells of the 'segmentation' column are written to the CSV as text
data.to_csv(save_dir, index=False)

Unnamed: 0,scan_path,mask_path,id,case,day,slice,height,width,segmentation,rle_len,empty
0,./data_simple_dir/scans/case101_day20_slice_00...,./data_simple_dir/masks/case101_day20_slice_00...,case101_day20_slice_0001,101,20,1,266,266,"[, , ]",0,True
1,./data_simple_dir/scans/case101_day20_slice_00...,./data_simple_dir/masks/case101_day20_slice_00...,case101_day20_slice_0002,101,20,2,266,266,"[, , ]",0,True
2,./data_simple_dir/scans/case101_day20_slice_00...,./data_simple_dir/masks/case101_day20_slice_00...,case101_day20_slice_0003,101,20,3,266,266,"[, , ]",0,True
3,./data_simple_dir/scans/case101_day20_slice_00...,./data_simple_dir/masks/case101_day20_slice_00...,case101_day20_slice_0004,101,20,4,266,266,"[, , ]",0,True
4,./data_simple_dir/scans/case101_day20_slice_00...,./data_simple_dir/masks/case101_day20_slice_00...,case101_day20_slice_0005,101,20,5,266,266,"[, , ]",0,True
...,...,...,...,...,...,...,...,...,...,...,...
38491,./data_simple_dir/scans/case9_day22_slice_0140...,./data_simple_dir/masks/case9_day22_slice_0140...,case9_day22_slice_0140,9,22,140,360,310,"[, , ]",0,True
38492,./data_simple_dir/scans/case9_day22_slice_0141...,./data_simple_dir/masks/case9_day22_slice_0141...,case9_day22_slice_0141,9,22,141,360,310,"[, , ]",0,True
38493,./data_simple_dir/scans/case9_day22_slice_0142...,./data_simple_dir/masks/case9_day22_slice_0142...,case9_day22_slice_0142,9,22,142,360,310,"[, , ]",0,True
38494,./data_simple_dir/scans/case9_day22_slice_0143...,./data_simple_dir/masks/case9_day22_slice_0143...,case9_day22_slice_0143,9,22,143,360,310,"[, , ]",0,True


In [4]:
##### RANDOM SAMPLING TO GET A SMALL DATASET FOR DEBUGGING PURPOSES #####
DEBUG_SAMPLE_SIZE = 20   # number of rows to keep in the debug subset
DEBUG_SAMPLE_SEED = 1    # fixed seed so the same rows are drawn on every run

# Clamp the sample size: DataFrame.sample raises when asked for more rows
# than the table holds (sampling is done without replacement).
debug = data.sample(
    n=min(DEBUG_SAMPLE_SIZE, len(data)),
    random_state=DEBUG_SAMPLE_SEED,
)

display(debug)

debug.to_csv(save_debug_dir, index=False)

Unnamed: 0,scan_path,mask_path,id,case,day,slice,height,width,segmentation,rle_len,empty
16139,./data_simple_dir/scans/case148_day20_slice_00...,./data_simple_dir/masks/case148_day20_slice_00...,case148_day20_slice_0028,148,20,28,360,310,"[, , ]",0,True
16022,./data_simple_dir/scans/case148_day0_slice_005...,./data_simple_dir/masks/case148_day0_slice_005...,case148_day0_slice_0055,148,0,55,360,310,"[, , ]",0,True
23096,./data_simple_dir/scans/case33_day21_slice_007...,./data_simple_dir/masks/case33_day21_slice_007...,case33_day21_slice_0073,33,21,73,266,266,"[, , 29459 5 29721 10 29982 16 30246 18 30510 ...",331,False
27187,./data_simple_dir/scans/case47_day0_slice_0004...,./data_simple_dir/masks/case47_day0_slice_0004...,case47_day0_slice_0004,47,0,4,266,266,"[, , ]",0,True
34168,./data_simple_dir/scans/case7_day19_slice_0073...,./data_simple_dir/masks/case7_day19_slice_0073...,case7_day19_slice_0073,7,19,73,266,266,"[, , 29692 5 29956 9 30221 11 30486 13 30751 1...",420,False
13452,./data_simple_dir/scans/case142_day16_slice_00...,./data_simple_dir/masks/case142_day16_slice_00...,case142_day16_slice_0013,142,16,13,266,266,"[, , ]",0,True
36255,./data_simple_dir/scans/case85_day29_slice_014...,./data_simple_dir/masks/case85_day29_slice_014...,case85_day29_slice_0144,85,29,144,360,310,"[, , ]",0,True
13918,./data_simple_dir/scans/case143_day23_slice_00...,./data_simple_dir/masks/case143_day23_slice_00...,case143_day23_slice_0047,143,23,47,266,266,[35819 8 36084 12 36349 15 36615 17 36881 17 3...,424,False
17681,./data_simple_dir/scans/case156_day10_slice_01...,./data_simple_dir/masks/case156_day10_slice_01...,case156_day10_slice_0130,156,10,130,266,266,[20067 4 20331 9 20348 11 20595 31 20860 33 21...,1310,False
20287,./data_simple_dir/scans/case22_day0_slice_0144...,./data_simple_dir/masks/case22_day0_slice_0144...,case22_day0_slice_0144,22,0,144,266,266,"[, , ]",0,True
