In [12]:
import numpy as np
import pandas as pd
import os
import glob
import pickle
from tqdm import tqdm

In [38]:
DATA_DIR = '../../data'
DATASET_DIR = os.path.join(DATA_DIR, 'dataset')

d = pickle.load(open(os.path.join(DATA_DIR, 'd.pkl'), 'rb'))

image_level_label = pd.read_csv(os.path.join(DATASET_DIR, 'image_level_labels.csv'))
inverse = pd.read_csv(os.path.join(DATA_DIR, 'inverse.csv'))
bbox = pd.read_csv(os.path.join(DATA_DIR, 'active_extravasation_bounding_boxes.csv')) # ian's label
dcm_number = pd.read_csv(os.path.join(DATA_DIR, 'dcm_number.csv'))
dcm_number['psid'] = dcm_number['patient_id'].astype(str) + '_' + dcm_number['series_id'].astype(str)

train = pd.read_csv(os.path.join(DATA_DIR, 'train_df.csv'))
df = train.copy()
df['psid'] = df['patient_id'].astype(str) + '_' + df['series_id'].astype(str)

psid = []
organ = []
healthy = []
injury = []

fold = []
for _, row in df.iterrows():
    for o in ['extravasation']:
        psid.append(row['psid'])
        organ.append(o)
        healthy.append(row[o + '_healthy'])
        injury.append(row[o + '_injury'])
        fold.append(row['fold'])
        
df = pd.DataFrame({
    'psid':psid,
    'organ':organ,
    'healthy':healthy,
    'injury':injury,
    'fold':fold
})

In [39]:
def preprocess_bbox_df(bbox):
    remove = 63618

    bbox = bbox[bbox['series_id'] != remove].reset_index(drop=True)
    bbox['psid'] = bbox['pid'].astype(str) + '_' + bbox['series_id'].astype(str)
    bbox = bbox.merge(inverse, how='left', on='psid')

    bbox2 = bbox[['psid','instance_number','x1','y1','x2','y2','width','height','inverse']].merge(dcm_number[['psid','start','end']], how='left', on='psid')
    bbox2['instance_number_inv'] = bbox2['end'] - (bbox2['instance_number']-bbox2['start'])
    bbox2['final_instance'] = np.where(bbox2['inverse'], bbox2['instance_number_inv'], bbox2['instance_number'])

    bbox2['png_start'] = bbox2['start'] - bbox2['start']
    bbox2['png_end'] = bbox2['end'] - bbox2['start']
    bbox2['png_instance'] = bbox2['final_instance'] - bbox2['start']

    bbox2['image_label'] = 1
    
    return bbox2

bbox = preprocess_bbox_df(bbox)
bbox

Unnamed: 0,psid,instance_number,x1,y1,x2,y2,width,height,inverse,start,end,instance_number_inv,final_instance,png_start,png_end,png_instance,image_label
0,54371_55647,96,175,130,213,172,38,42,True,47,224,175,175,0,177,128,1
1,54371_55647,94,187,137,223,174,36,37,True,47,224,177,177,0,177,130,1
2,54371_55647,95,187,138,216,170,29,32,True,47,224,176,176,0,177,129,1
3,54371_55647,93,193,145,218,171,25,26,True,47,224,178,178,0,177,131,1
4,54371_55647,97,172,130,211,171,39,41,True,47,224,174,174,0,177,127,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6343,15188_53257,88,33,245,66,277,33,32,True,1,632,545,545,0,631,544,1
6344,15188_53257,97,32,243,67,282,35,39,True,1,632,536,536,0,631,535,1
6345,15188_29661,324,44,205,176,266,132,61,True,174,576,426,426,0,402,252,1
6346,15188_29661,325,48,208,175,270,127,62,True,174,576,425,425,0,402,251,1


In [40]:
def get_image_slices():
    psids, png_folders = [], []
    for k, v in tqdm(d.items(), total=len(list(d.keys()))):
        psids.append([k] * len(v))
        png_folders.append(v)
    psids = np.concatenate(psids)
    png_folders = np.concatenate(png_folders)
    len(psids), len(png_folders)

    df_image = pd.DataFrame({'psid':psids, 'png_folder':png_folders})
    df_image['png_number'] = df_image['png_folder'].apply(lambda x: int(x.split("_")[-1].split('.')[0]))

    # df :: start, end : dcm / label_min, label_max : dcm
    df_image2 = df_image.merge(df, how='left', on='psid')
    return df_image2

df_image = get_image_slices()
df_image

100%|██████████| 4711/4711 [00:00<00:00, 124297.76it/s]


Unnamed: 0,psid,png_folder,png_number,organ,healthy,injury,fold
0,10005_18667,/home/superrich/programming/kaggle/rsna-abdomi...,0,extravasation,1,0,4
1,10005_18667,/home/superrich/programming/kaggle/rsna-abdomi...,1,extravasation,1,0,4
2,10005_18667,/home/superrich/programming/kaggle/rsna-abdomi...,2,extravasation,1,0,4
3,10005_18667,/home/superrich/programming/kaggle/rsna-abdomi...,3,extravasation,1,0,4
4,10005_18667,/home/superrich/programming/kaggle/rsna-abdomi...,4,extravasation,1,0,4
...,...,...,...,...,...,...,...
1500648,9813_24149,/home/superrich/programming/kaggle/rsna-abdomi...,140,extravasation,1,0,1
1500649,9813_24149,/home/superrich/programming/kaggle/rsna-abdomi...,141,extravasation,1,0,1
1500650,9813_24149,/home/superrich/programming/kaggle/rsna-abdomi...,142,extravasation,1,0,1
1500651,9813_24149,/home/superrich/programming/kaggle/rsna-abdomi...,143,extravasation,1,0,1


In [43]:
# make extra mask

def save_mask(bbox, save=False):
    """
        save mask only image_label == 1
    """
    print('save mask') if save else print('only check')
        

    output_dir = os.path.join(DATA_DIR, 'extra_mask')
    os.makedirs(output_dir, exist_ok=True)

    sz = 512
    for i, row in tqdm(bbox.iterrows(), total=len(bbox)):
        path = os.path.join(output_dir, f"{str(row['psid'])}_{str(row['png_instance'])}.npy")
        x1, y1, x2, y2 = row['x1'], row['y1'], row['x2'], row['y2']
        mask = np.zeros((512,512), dtype=np.uint8)
        mask[y1:y2, x1:x2] = 1.
        if save:
            np.save(path, mask)

save_mask(bbox, save=False)

only check


100%|██████████| 6348/6348 [00:00<00:00, 20472.67it/s]


In [44]:
def get_all_image_df(df_image, bbox):
    df_image = df_image.merge(bbox.rename(columns={'png_instance':'png_number'}), how='left', on=['psid','png_number'])
    df_image['image_label'] = df_image['image_label'].fillna(0)
    return df_image

df_image = get_all_image_df(df_image, bbox)
df_image

Unnamed: 0,psid,png_folder,png_number,organ,healthy,injury,fold,instance_number,x1,y1,...,width,height,inverse,start,end,instance_number_inv,final_instance,png_start,png_end,image_label
0,10005_18667,/home/superrich/programming/kaggle/rsna-abdomi...,0,extravasation,1,0,4,,,,...,,,,,,,,,,0.0
1,10005_18667,/home/superrich/programming/kaggle/rsna-abdomi...,1,extravasation,1,0,4,,,,...,,,,,,,,,,0.0
2,10005_18667,/home/superrich/programming/kaggle/rsna-abdomi...,2,extravasation,1,0,4,,,,...,,,,,,,,,,0.0
3,10005_18667,/home/superrich/programming/kaggle/rsna-abdomi...,3,extravasation,1,0,4,,,,...,,,,,,,,,,0.0
4,10005_18667,/home/superrich/programming/kaggle/rsna-abdomi...,4,extravasation,1,0,4,,,,...,,,,,,,,,,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1500648,9813_24149,/home/superrich/programming/kaggle/rsna-abdomi...,140,extravasation,1,0,1,,,,...,,,,,,,,,,0.0
1500649,9813_24149,/home/superrich/programming/kaggle/rsna-abdomi...,141,extravasation,1,0,1,,,,...,,,,,,,,,,0.0
1500650,9813_24149,/home/superrich/programming/kaggle/rsna-abdomi...,142,extravasation,1,0,1,,,,...,,,,,,,,,,0.0
1500651,9813_24149,/home/superrich/programming/kaggle/rsna-abdomi...,143,extravasation,1,0,1,,,,...,,,,,,,,,,0.0


## sliding slices   
to reduce training hours, get image slices with 5 stride, 5 channels with +- 2 chans.

In [45]:
# sliding slices (5 channels, 5 stride, +-2 chans)

def sliding_slices(df_image, save=False):
    n_s, unit = 5, 2 # 5, +-2
    psid_lst = df_image['psid'].unique()
    lst = []
    for _psid in tqdm(psid_lst, total=len(psid_lst)):
        tmp = df_image[df_image['psid']==_psid]
        if len(tmp) < 10:
            lst.append(tmp)
        else:
            tmp_ill = tmp['image_label'].values
            _lst = []
            for i, k in enumerate(tmp['png_number'].values):
                if i < unit or i >= len(tmp)-unit:
                    continue
                if k%n_s == unit:
                    row = tmp.iloc[k]
                    _lst.append(row)
            _lst = pd.DataFrame(_lst)
            lst.append(_lst)
    lst = pd.concat(lst, axis=0)
    
    if save:
        lst.reset_index(drop=True).to_csv(
            os.path.join(DATA_DIR, f"extra_sliding_{n_s}_bbox.csv"), index=False
        )
    return lst

extra_sliding_5_bbox = sliding_slices(df_image, save=False)
extra_sliding_5_bbox

100%|██████████| 4711/4711 [05:03<00:00, 15.55it/s]


Unnamed: 0,psid,png_folder,png_number,organ,healthy,injury,fold,instance_number,x1,y1,...,width,height,inverse,start,end,instance_number_inv,final_instance,png_start,png_end,image_label
2,10005_18667,/home/superrich/programming/kaggle/rsna-abdomi...,2,extravasation,1,0,4,,,,...,,,,,,,,,,0.0
7,10005_18667,/home/superrich/programming/kaggle/rsna-abdomi...,7,extravasation,1,0,4,,,,...,,,,,,,,,,0.0
12,10005_18667,/home/superrich/programming/kaggle/rsna-abdomi...,12,extravasation,1,0,4,,,,...,,,,,,,,,,0.0
17,10005_18667,/home/superrich/programming/kaggle/rsna-abdomi...,17,extravasation,1,0,4,,,,...,,,,,,,,,,0.0
22,10005_18667,/home/superrich/programming/kaggle/rsna-abdomi...,22,extravasation,1,0,4,,,,...,,,,,,,,,,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1500630,9813_24149,/home/superrich/programming/kaggle/rsna-abdomi...,122,extravasation,1,0,1,,,,...,,,,,,,,,,0.0
1500635,9813_24149,/home/superrich/programming/kaggle/rsna-abdomi...,127,extravasation,1,0,1,,,,...,,,,,,,,,,0.0
1500640,9813_24149,/home/superrich/programming/kaggle/rsna-abdomi...,132,extravasation,1,0,1,,,,...,,,,,,,,,,0.0
1500645,9813_24149,/home/superrich/programming/kaggle/rsna-abdomi...,137,extravasation,1,0,1,,,,...,,,,,,,,,,0.0


only get more positive slices images to train model because of imbalance dataset.  

In [51]:
idx = extra_sliding_5_bbox.query('image_label == 1').index

def get_only_pos_df(df_image, idx, save=False):
    df_image_pos = df_image[df_image['image_label']==1]
    df_image_pos = df_image_pos[~df_image_pos.index.isin(idx)]

    if save:
        df_image_pos.to_csv(os.path.join(DATA_DIR, 'extra_sliding_5_bbox.csv'), index=False)
    return df_image_pos

df_image_pos = get_only_pos_df(df_image, idx, save=False)
df_image_pos

Unnamed: 0,psid,png_folder,png_number,organ,healthy,injury,fold,instance_number,x1,y1,...,width,height,inverse,start,end,instance_number_inv,final_instance,png_start,png_end,image_label
8492,10292_14945,/home/superrich/programming/kaggle/rsna-abdomi...,63,extravasation,0,1,0,26.0,293.0,91.0,...,47.0,42.0,True,1.0,89.0,64.0,64.0,0.0,88.0,1.0
8493,10292_14945,/home/superrich/programming/kaggle/rsna-abdomi...,64,extravasation,0,1,0,25.0,294.0,95.0,...,40.0,38.0,True,1.0,89.0,65.0,65.0,0.0,88.0,1.0
8494,10292_14945,/home/superrich/programming/kaggle/rsna-abdomi...,65,extravasation,0,1,0,24.0,298.0,97.0,...,34.0,29.0,True,1.0,89.0,66.0,66.0,0.0,88.0,1.0
8495,10292_14945,/home/superrich/programming/kaggle/rsna-abdomi...,66,extravasation,0,1,0,23.0,290.0,96.0,...,44.0,37.0,True,1.0,89.0,67.0,67.0,0.0,88.0,1.0
8497,10292_14945,/home/superrich/programming/kaggle/rsna-abdomi...,68,extravasation,0,1,0,21.0,287.0,89.0,...,49.0,38.0,True,1.0,89.0,69.0,69.0,0.0,88.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1496931,64256_12102,/home/superrich/programming/kaggle/rsna-abdomi...,346,extravasation,0,1,0,298.0,360.0,272.0,...,45.0,48.0,True,62.0,644.0,408.0,408.0,0.0,582.0,1.0
1496933,64256_12102,/home/superrich/programming/kaggle/rsna-abdomi...,348,extravasation,0,1,0,296.0,357.0,263.0,...,60.0,58.0,True,62.0,644.0,410.0,410.0,0.0,582.0,1.0
1496934,64256_12102,/home/superrich/programming/kaggle/rsna-abdomi...,349,extravasation,0,1,0,295.0,356.0,258.0,...,64.0,72.0,True,62.0,644.0,411.0,411.0,0.0,582.0,1.0
1496935,64256_12102,/home/superrich/programming/kaggle/rsna-abdomi...,350,extravasation,0,1,0,294.0,360.0,258.0,...,60.0,65.0,True,62.0,644.0,412.0,412.0,0.0,582.0,1.0


In [54]:
if False:
    extra = pd.read_csv(os.path.join(DATA_DIR, 'extra_sliding_5_bbox.csv'))
    extra_pos = pd.read_csv(os.path.join(DATA_DIR, 'extra_sliding_5_bbox_pos.csv'))
    display(extra.shape, extra_pos.shape)