In [1]:
import pandas as pd


**Split training set into training and validation set.**

In [3]:
training_file = './train_listfile_full.csv'
df = pd.read_csv(training_file)
patient_id = df['stay'].str.split("_", n=1, expand=True)
patient_id

Unnamed: 0,0,1
0,61624,episode1_timeseries.csv
1,32476,episode1_timeseries.csv
2,8415,episode1_timeseries.csv
3,59877,episode1_timeseries.csv
4,48374,episode1_timeseries.csv
...,...,...
2704619,93525,episode1_timeseries.csv
2704620,22564,episode1_timeseries.csv
2704621,66907,episode1_timeseries.csv
2704622,45583,episode1_timeseries.csv


In [4]:
patient_id.sample(frac=0.15) #use 15% of patients as the validation set

Unnamed: 0,0,1
1081137,11417,episode1_timeseries.csv
840612,27472,episode4_timeseries.csv
553299,28298,episode1_timeseries.csv
1072006,25862,episode1_timeseries.csv
1530302,26408,episode1_timeseries.csv
...,...,...
981541,55844,episode1_timeseries.csv
826647,2081,episode3_timeseries.csv
1818384,75752,episode1_timeseries.csv
1216082,11464,episode1_timeseries.csv


In [5]:
patient_id_val = patient_id.sample(frac=0.15)
patient_id_val

Unnamed: 0,0,1
53667,21280,episode6_timeseries.csv
2610491,32253,episode1_timeseries.csv
943903,77973,episode1_timeseries.csv
2047746,54636,episode1_timeseries.csv
1763012,23744,episode1_timeseries.csv
...,...,...
861633,75431,episode1_timeseries.csv
2543404,46728,episode1_timeseries.csv
1715588,4802,episode1_timeseries.csv
1673853,24444,episode1_timeseries.csv


In [5]:
#find unique patient ids
val_patients = patient_id[0].unique()
val_patients.size

1154

In [6]:
val_patients_id = pd.Series(val_patients).sample(frac=0.1)
val_patients_id

423      827
1110    2230
922     1861
1011    2032
912     1837
        ... 
972     1965
134      262
338      654
935     1888
857     1716
Length: 115, dtype: object

In [7]:
df['patient_id'] = patient_id[0] # add patient_id col to the original df
df

Unnamed: 0,stay,period_length,y_true,patient_id
0,3_episode1_timeseries.csv,127.0,0,3
1,3_episode1_timeseries.csv,104.0,0,3
2,3_episode1_timeseries.csv,36.0,0,3
3,3_episode1_timeseries.csv,128.0,0,3
4,3_episode1_timeseries.csv,65.0,0,3
...,...,...,...,...
135226,2322_episode2_timeseries.csv,1016.0,0,2322
135227,2322_episode2_timeseries.csv,208.0,0,2322
135228,2322_episode2_timeseries.csv,550.0,0,2322
135229,2322_episode2_timeseries.csv,747.0,0,2322


In [8]:
#filter df with validation set patient ids
df_val = df.loc[df['patient_id'].isin(val_patients_id)]
df_val

Unnamed: 0,stay,period_length,y_true,patient_id
1822,53_episode1_timeseries.csv,22.0,0,53
1823,53_episode1_timeseries.csv,37.0,0,53
1824,53_episode1_timeseries.csv,30.0,0,53
1825,53_episode1_timeseries.csv,39.0,0,53
1826,53_episode1_timeseries.csv,35.0,0,53
...,...,...,...,...
132614,2289_episode1_timeseries.csv,27.0,0,2289
132615,2289_episode1_timeseries.csv,49.0,0,2289
132616,2289_episode1_timeseries.csv,51.0,0,2289
132617,2289_episode1_timeseries.csv,62.0,0,2289


In [9]:
# save validation df to csv file
df_val = df_val[['stay', 'period_length', 'y_true']]
df_val.to_csv('val_listfile.csv', index=False)

In [10]:
# remove validation patients from original training df and save it as a new file
df_train = df.loc[~df['patient_id'].isin(val_patients_id)]
df_train

Unnamed: 0,stay,period_length,y_true,patient_id
0,3_episode1_timeseries.csv,127.0,0,3
1,3_episode1_timeseries.csv,104.0,0,3
2,3_episode1_timeseries.csv,36.0,0,3
3,3_episode1_timeseries.csv,128.0,0,3
4,3_episode1_timeseries.csv,65.0,0,3
...,...,...,...,...
135226,2322_episode2_timeseries.csv,1016.0,0,2322
135227,2322_episode2_timeseries.csv,208.0,0,2322
135228,2322_episode2_timeseries.csv,550.0,0,2322
135229,2322_episode2_timeseries.csv,747.0,0,2322


In [11]:
df_train = df_train[['stay', 'period_length', 'y_true']]
df_train.to_csv('train_listfile.csv', index=False)