In [2]:
import os
import sys
import pandas as pd
from sklearn.model_selection import StratifiedKFold
import glob

In [3]:
# Seed handed to the fold splitter so the fold assignment is repeatable.
seed = 42

# Directory layout, relative to the notebook's working directory.
DATA_DIR = '../../data'
PNG_DIR = os.path.join(DATA_DIR, 'png_folder')
DATASET_DIR = os.path.join(DATA_DIR, 'dataset')
SEG_DIR = os.path.join(DATASET_DIR, 'segmentations')
DCM_DIR = os.path.join(DATASET_DIR, 'train_images')

# Tables shipped with the dataset: `train` is keyed by patient_id and
# `train_meta` by series_id (see how they are joined further down).
train, train_meta, image_level_label = (
    pd.read_csv(os.path.join(DATASET_DIR, csv_name))
    for csv_name in ('train.csv', 'train_series_meta.csv', 'image_level_labels.csv')
)

In [8]:
def to_seg_df(save=False):
    """Build the label/metadata table for every series that has a segmentation.

    Lists ``SEG_DIR``, reads the part of each file name before the first dot
    as an integer ``series_id``, keeps the ``train_meta`` rows for those
    series and left-joins them onto the ``train`` rows of the same patients.

    Args:
        save: When true, also write the result to ``seg_df.csv`` in
            ``DATA_DIR``.

    Returns:
        pandas.DataFrame holding the ``train`` columns followed by the
        ``train_meta`` columns, one row per matching patient/series pair.
    """
    # Entries whose stem is not a plain number (hidden files such as
    # ``.DS_Store`` or ``.ipynb_checkpoints``) are skipped; previously they
    # made ``int()`` raise ValueError and aborted the whole cell.
    stems = (name.split(".")[0] for name in os.listdir(SEG_DIR))
    seg_series_ids = sorted(int(stem) for stem in stems if stem.isdecimal())
    print(len(seg_series_ids))

    seg_meta = train_meta[train_meta['series_id'].isin(seg_series_ids)].reset_index(drop=True)
    # Restrict the patient-level labels to patients owning a segmented series,
    # then fan them out to one row per series.
    seg_patients = train[train['patient_id'].isin(seg_meta['patient_id'].unique())]
    seg_df = pd.merge(seg_patients, seg_meta, how='left', on='patient_id')

    if save:
        seg_df.to_csv(os.path.join(DATA_DIR, 'seg_df.csv'), index=False)
        print('save [seg_df.csv]')
    return seg_df

# Keep the table in memory only; `save` defaults to False, so no CSV is written.
seg_df = to_seg_df()
seg_df

206


Unnamed: 0,patient_id,bowel_healthy,bowel_injury,extravasation_healthy,extravasation_injury,kidney_healthy,kidney_low,kidney_high,liver_healthy,liver_low,liver_high,spleen_healthy,spleen_low,spleen_high,any_injury,series_id,aortic_hu,incomplete_organ
0,10004,1,0,0,1,0,1,0,1,0,0,0,0,1,1,21057,146.00,0
1,10004,1,0,0,1,0,1,0,1,0,0,0,0,1,1,51033,454.75,0
2,10217,1,0,0,1,1,0,0,0,1,0,0,0,1,1,16066,208.00,0
3,10228,1,0,1,0,1,0,0,0,1,0,0,1,0,1,30522,145.00,0
4,10228,1,0,1,0,1,0,0,0,1,0,0,1,0,1,40471,291.00,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
201,65504,1,0,1,0,1,0,0,0,1,0,0,0,1,1,55928,144.00,0
202,7642,0,1,1,0,1,0,0,0,1,0,0,1,0,1,778,183.00,0
203,8848,1,0,1,0,1,0,0,0,1,0,0,1,0,1,41663,238.00,0
204,8848,1,0,1,0,1,0,0,0,1,0,0,1,0,1,7384,367.00,0


In [9]:
def to_df_seg(seg_df, save=False):
    """Attach file paths and a patient-level CV fold to the segmentation table.

    Args:
        seg_df: Table with at least ``patient_id`` and ``series_id`` columns
            (the output of ``to_seg_df``). Pass ``None`` to load the cached
            ``seg_df.csv`` from ``DATA_DIR`` instead.
        save: When true, also write the result to ``df_seg.csv`` in
            ``DATA_DIR``.

    Returns:
        pandas.DataFrame with the rows of ``seg_df`` that have a mask file in
        ``SEG_DIR``, plus the columns ``mask_file``, ``png_suffix``,
        ``dcm_folder``, ``count`` (number of such series for the patient) and
        ``fold`` (0-4, identical for all series of one patient).
    """
    # Use the table that was passed in. The argument used to be ignored in
    # favour of seg_df.csv, which made this cell depend on a file that an
    # earlier run may or may not have written.
    if seg_df is None:
        df_seg = pd.read_csv(os.path.join(DATA_DIR, 'seg_df.csv'))
    else:
        df_seg = seg_df

    # One mask file per series_id. Names are parsed like in to_seg_df (text
    # before the first dot); non-numeric entries such as hidden files are
    # skipped instead of crashing int().
    mask_names = [
        name for name in os.listdir(SEG_DIR)
        if name.split('.')[0].isdecimal()
    ]
    df_mask = pd.DataFrame({
        'mask_file': [os.path.join(SEG_DIR, name) for name in mask_names],
        'series_id': [int(name.split('.')[0]) for name in mask_names],
    })

    df = df_seg.merge(df_mask, how='left', on='series_id')
    df['png_suffix'] = PNG_DIR + '/' + df['patient_id'].astype(str) + '_' + df['series_id'].astype(str)
    df['dcm_folder'] = DCM_DIR + '/' + df['patient_id'].astype(str) + '/' + df['series_id'].astype(str)

    # Drop series without a mask. This replaces a chained
    # `df[col].fillna('', inplace=True)` + query, which stops updating `df`
    # under pandas Copy-on-Write and would then keep the unmatched rows.
    df = df.dropna(subset=['mask_file']).reset_index(drop=True)

    # Folds are drawn per patient (so all series of a patient share a fold),
    # stratified by how many segmented series the patient has.
    per_patient = df.groupby('patient_id')['series_id'].agg(['count']).reset_index()
    skf = StratifiedKFold(5, shuffle=True, random_state=seed)
    per_patient['fold'] = -1
    for fold, (_, valid_idx) in enumerate(skf.split(per_patient, per_patient['count'])):
        # valid_idx is positional; per_patient has a fresh RangeIndex, so
        # label-based .loc addresses the same rows.
        per_patient.loc[valid_idx, 'fold'] = fold

    df = df.merge(per_patient, how='left', on='patient_id')
    if save:
        df.to_csv(os.path.join(DATA_DIR, 'df_seg.csv'), index=False)
        print('save [df_seg.csv]')
    return df

# Add paths and folds without writing df_seg.csv (`save` defaults to False).
df_seg = to_df_seg(seg_df)
df_seg

Unnamed: 0,patient_id,bowel_healthy,bowel_injury,extravasation_healthy,extravasation_injury,kidney_healthy,kidney_low,kidney_high,liver_healthy,liver_low,...,spleen_low,spleen_high,any_injury,series_id,aortic_hu,incomplete_organ,mask_file,png_suffix,dcm_folder,fold
0,10004,1,0,0,1,0,1,0,1,0,...,0,1,1,21057,146.00,0,../../data/dataset/segmentations/21057.nii,../../data/png_folder/10004_21057,../../data/dataset/train_images/10004/21057,2
1,10004,1,0,0,1,0,1,0,1,0,...,0,1,1,51033,454.75,0,../../data/dataset/segmentations/51033.nii,../../data/png_folder/10004_51033,../../data/dataset/train_images/10004/51033,4
2,10217,1,0,0,1,1,0,0,0,1,...,0,1,1,16066,208.00,0,../../data/dataset/segmentations/16066.nii,../../data/png_folder/10217_16066,../../data/dataset/train_images/10217/16066,2
3,10228,1,0,1,0,1,0,0,0,1,...,1,0,1,30522,145.00,0,../../data/dataset/segmentations/30522.nii,../../data/png_folder/10228_30522,../../data/dataset/train_images/10228/30522,3
4,10228,1,0,1,0,1,0,0,0,1,...,1,0,1,40471,291.00,0,../../data/dataset/segmentations/40471.nii,../../data/png_folder/10228_40471,../../data/dataset/train_images/10228/40471,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
201,65504,1,0,1,0,1,0,0,0,1,...,0,1,1,55928,144.00,0,../../data/dataset/segmentations/55928.nii,../../data/png_folder/65504_55928,../../data/dataset/train_images/65504/55928,0
202,7642,0,1,1,0,1,0,0,0,1,...,1,0,1,778,183.00,0,../../data/dataset/segmentations/778.nii,../../data/png_folder/7642_778,../../data/dataset/train_images/7642/778,4
203,8848,1,0,1,0,1,0,0,0,1,...,1,0,1,41663,238.00,0,../../data/dataset/segmentations/41663.nii,../../data/png_folder/8848_41663,../../data/dataset/train_images/8848/41663,1
204,8848,1,0,1,0,1,0,0,0,1,...,1,0,1,7384,367.00,0,../../data/dataset/segmentations/7384.nii,../../data/png_folder/8848_7384,../../data/dataset/train_images/8848/7384,1
