In [9]:
import os
import pandas as pd
import numpy as np

In [10]:
# To get the features, please run '../src/radiomics_extraction' first.

## (1) Get 'pid', 'series', and 'label' for the feature dataframes.

In [11]:
# Directory holding the two feature CSVs read below.
# NOTE(review): this is an absolute, machine-specific path; a later cell reads the
# same file names from '../src/Data_features' -- confirm both point to the same place.
feas_file_dir = '/media/tx-deepocean/Data/2022/chongfu1/Model/src/Data_features'
df_dicom = pd.read_csv('{}/data_feas_dicom.csv'.format(feas_file_dir))
df_nii = pd.read_csv('{}/data_feas_nii.csv'.format(feas_file_dir))
# Stack the DICOM-derived and NIfTI-derived tables row-wise (concat's default axis).
df_feas = pd.concat([df_dicom, df_nii])

## (2) Split the dataset into train and test data.

In [12]:
# Metadata tables: the full cohort and the ICC data info.
df_all = pd.read_csv('./dataset_info/all_data_info.csv')
df_icc = pd.read_csv('./dataset_info/icc_data_info.csv')

# Split settings.
data_split_ratio = 0.7     # intended train fraction (train : held-out = 0.7 : 0.3)
split_pid = 'pid'          # column used as the split unit (patient-based)

# Row subsets by binary label.
ndf = df_all.loc[df_all['label'] == 0]
pdf = df_all.loc[df_all['label'] == 1]

# Row subsets by 'RD' group, one frame per group value.
R0df, R1df, R2df = (df_all[df_all['RD'] == grade] for grade in ('R0', 'R1', 'R2'))

In [13]:
# Split by the pid
def df_split_bypid(df, train_df=None, ratio=0.5, split_col='pid'):
    """Split ``df`` into train/test by unique ``split_col`` value, pinning some ids to train.

    Every id of ``df`` that also occurs in ``train_df[split_col]`` is placed in the
    train part. The remaining ids are split randomly by ``df_split_bypid1`` with a
    ratio adjusted so that, overall, about ``ratio`` of the unique ids end up in train.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to split; must contain ``split_col``.
    train_df : pandas.DataFrame or None
        Frame whose ``split_col`` values are forced into train. ``None`` or an
        empty frame means a plain random split.
    ratio : float
        Target fraction of unique ids assigned to train.
    split_col : str
        Column holding the id that must not straddle train and test.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(train_rows, test_rows)``.
    """
    # Nothing to pin: fall back to the plain random split on the same frame.
    if train_df is None or train_df.empty:
        return df_split_bypid1(df, ratio, split_col)

    all_ids = df[split_col].value_counts().index
    # Only ids that actually occur in `df` can be pinned.
    pinned_ids = train_df.loc[train_df[split_col].isin(all_ids), split_col].value_counts().index
    is_pinned = df[split_col].isin(pinned_ids)
    df_pinned = df[is_pinned]
    df_rest = df[~is_pinned]

    n_rest = len(all_ids) - len(pinned_ids)
    if n_rest == 0:
        # Every id is pinned: no ids are left to split, so test is empty.
        # (The unguarded formula below would divide by zero here.)
        return df_pinned, df_rest.iloc[:0]

    # Fraction of the remaining ids still needed in train to reach `ratio` overall.
    rest_ratio = (ratio * len(all_ids) - len(pinned_ids)) / n_rest
    # Pinned ids may already exceed the target; a negative ratio would become a
    # negative slice index in the helper and send most remaining ids to train.
    rest_ratio = min(max(rest_ratio, 0.0), 1.0)

    df_rest_train, df_test = df_split_bypid1(df_rest, rest_ratio, split_col)
    return pd.concat([df_pinned, df_rest_train], axis=0), df_test
        
def df_split_bypid1(df, ratio=0.5, split_col='pid'):
    """Randomly split ``df`` into train/test so that no ``split_col`` value is shared.

    The unique values of ``split_col`` are shuffled with NumPy's global RNG; the
    first ``int(n_unique * ratio)`` of them define the train rows and the rest the
    test rows. Row order within each returned frame follows ``df``.

    Returns ``(train_rows, test_rows)``.
    """
    unique_ids = df[split_col].value_counts().index
    shuffled_ids = np.random.permutation(unique_ids)    # uses the global NumPy RNG state
    n_train = int(len(shuffled_ids) * ratio)
    train_ids, test_ids = shuffled_ids[:n_train], shuffled_ids[n_train:]
    in_train = df[split_col].isin(train_ids)
    in_test = df[split_col].isin(test_ids)
    return df[in_train], df[in_test]

# Split each RD group (R0, R1, R2) into train/test by patient, using data_split_ratio.
# Ids that appear in df_icc are forced into the train part by df_split_bypid.

# Seed NumPy's global RNG so the random patient split is the same on every
# Restart & Run All (df_split_bypid1 draws from np.random.permutation).
SPLIT_SEED = 42
np.random.seed(SPLIT_SEED)

R0_train, R0_test = df_split_bypid(R0df, df_icc, ratio=data_split_ratio, split_col=split_pid)
print('R0: train len = {}, test len = {}'.format(len(R0_train), len(R0_test)))
R1_train, R1_test = df_split_bypid(R1df, df_icc, ratio=data_split_ratio, split_col=split_pid)
print('R1: train len = {}, test len = {}'.format(len(R1_train), len(R1_test)))
R2_train, R2_test = df_split_bypid(R2df, df_icc, ratio=data_split_ratio, split_col=split_pid)
print('R2: train len = {}, test len = {}'.format(len(R2_train), len(R2_test)))

# Recombine the groups and tag every row with its partition; assign() returns new
# frames rather than mutating the concatenated results in place.
df_train = pd.concat([R0_train, R1_train, R2_train], axis=0).assign(dataset='train')
df_test = pd.concat([R0_test, R1_test, R2_test], axis=0).assign(dataset='test')

# df_all is rebound to the tagged table (now carrying a 'dataset' column) and saved.
df_all = pd.concat([df_train, df_test], axis=0)
df_all.to_csv('./dataset_info/data_train_test.csv', index=False)

Pos: train len = 165, test len = 72
Pos: train len = 96, test len = 45
Neg: train len = 213, test len = 93


In [14]:
# Read both feature tables and stack them row-wise.
df_feas_dicom = pd.read_csv('../src/Data_features/data_feas_dicom.csv')
df_feas_nii = pd.read_csv('../src/Data_features/data_feas_nii.csv')
df_feas = pd.concat([df_feas_dicom, df_feas_nii])

# Order the split table by patient then series, and keep only the metadata columns.
df_all = df_all.sort_values(by=['pid', 'series'])
df_merge = df_all.loc[:, ['pid', 'label', 'series', 'RD', 'mask', 'dataset']]

# Row counts of both inputs, then of the inner join on 'mask'; matching numbers
# mean no row was dropped or duplicated by the join.
print(len(df_merge), len(df_feas), sep='\n')
df_feas_all = df_merge.merge(df_feas, on='mask')
print(len(df_feas_all))
df_feas_all.to_csv('./dataset_info/data_feas_original.csv', index=False)

# ICC feature tables: stacked and written out unchanged.
icc_feas_dicom = pd.read_csv('../src/Data_features/icc_feas_dicom.csv')
icc_feas_nii = pd.read_csv('../src/Data_features/icc_feas_nii.csv')
icc_feas = pd.concat([icc_feas_dicom, icc_feas_nii])
icc_feas.to_csv('./dataset_info/icc_feas_original.csv', index=False)

684
684
684


In [15]:
# Preview the first five rows of the merged metadata + feature table.
df_feas_all.head(n=5)

Unnamed: 0,pid,label,series,RD,mask,dataset,image,shape_Elongation_original,shape_Flatness_original,shape_LeastAxisLength_original,...,glszm_SmallAreaHighGrayLevelEmphasis_exponential,glszm_SmallAreaLowGrayLevelEmphasis_exponential,glszm_ZoneEntropy_exponential,glszm_ZonePercentage_exponential,glszm_ZoneVariance_exponential,ngtdm_Busyness_exponential,ngtdm_Coarseness_exponential,ngtdm_Complexity_exponential,ngtdm_Contrast_exponential,ngtdm_Strength_exponential
0,1,0,2,R0,/media/tx-deepocean/Data/2022/chongfu1/Dataset...,train,/media/tx-deepocean/Data/2022/chongfu1/Dataset...,0.612454,0.479558,60.283309,...,184.474007,0.016484,7.148733,0.077864,171642.8,50.467099,5.3e-05,7010.061457,0.027314,1.483758
1,1,0,3,R0,/media/tx-deepocean/Data/2022/chongfu1/Dataset...,train,/media/tx-deepocean/Data/2022/chongfu1/Dataset...,0.617942,0.457688,58.350101,...,40.630952,0.02161,5.615794,0.007615,1910399.0,33.006826,0.000726,59.243156,0.000727,1.619471
2,1,0,4,R0,/media/tx-deepocean/Data/2022/chongfu1/Dataset...,train,/media/tx-deepocean/Data/2022/chongfu1/Dataset...,0.577315,0.479415,59.09594,...,32.766934,0.037426,6.993332,0.004338,29918220.0,159.820055,8.3e-05,125.424714,0.000345,0.473312
3,2,1,2,R2,/media/tx-deepocean/Data/2022/chongfu1/Dataset...,train,/media/tx-deepocean/Data/2022/chongfu1/Dataset...,0.530179,0.506282,47.871153,...,154.724619,0.029147,6.675069,0.015894,1229567.0,17.509281,0.000487,514.589173,0.003183,4.349297
4,2,1,3,R2,/media/tx-deepocean/Data/2022/chongfu1/Dataset...,train,/media/tx-deepocean/Data/2022/chongfu1/Dataset...,0.54044,0.470662,42.822993,...,140.88784,0.029401,5.847722,0.038482,81040.74,11.219188,0.000997,680.728086,0.007924,10.225813


## Cleaning the external features

In [16]:
# External cohort: feature table and metadata table.
df_feas_external = pd.read_csv('../src/Data_features/external_feas.csv')
df_external_info = pd.read_csv('./dataset_info/external_data_info.csv')

# Order by patient then series, and keep only the metadata columns.
df_external_info = df_external_info.sort_values(by=['pid', 'series'])
ex_df_merge = df_external_info.loc[:, ['pid', 'label', 'series', 'RD', 'mask']]

# Row counts of both inputs, then of the inner join on 'mask'.
print(len(ex_df_merge), len(df_feas_external), sep='\n')
ex_df_feas = ex_df_merge.merge(df_feas_external, on='mask')
print(len(ex_df_feas))
ex_df_feas.to_csv('./dataset_info/external_feas_original.csv', index=False)

90
90
90
