In [1]:
import json
import pandas as pd
# Load the MIMIC-IV-Note top-50 file; lines=True parses it as one JSON record per line.
with open('/home/weisi/Temporal/data/MIMIC-IV-Note/mimic-top50.json', mode='r', encoding='utf-8') as notes_file:
    df = pd.read_json(notes_file, lines=True)


In [None]:
# Number of rows for each distinct value of the 'time' column.
period_counts = df.groupby('time').size()
print(period_counts)

In [3]:
import os
from sklearn.model_selection import train_test_split
# Seed passed to the sampling and splitting calls below; it also names the output folder.
seed=2

# The seed is embedded in the path, so each seed value gets its own output directory.
folder_path ='/home/weisi/Temporal/data/MIMIC-IV-Note/seed{}/'.format(seed)

# exist_ok=True keeps the cell re-runnable and avoids the check-then-create race
# of testing os.path.exists() before calling os.makedirs().
os.makedirs(folder_path, exist_ok=True)

# Divide the dataset into four time periods, selected by the label in the 'time' column.
time_label = df['time']
df_2008_2010 = df.loc[time_label == '2008 - 2010']
df_2011_2013 = df.loc[time_label == '2011 - 2013']
df_2014_2016 = df.loc[time_label == '2014 - 2016']
df_2017_2019 = df.loc[time_label == '2017 - 2019']

# Downsample every period to the row count of the smallest one so all four have equal size.
period_frames = (df_2008_2010, df_2011_2013, df_2014_2016, df_2017_2019)
min_size = min(len(frame) for frame in period_frames)

# Same n and same random_state for every period; unpacking order matches period_frames.
(
    df_2008_2010_sampled,
    df_2011_2013_sampled,
    df_2014_2016_sampled,
    df_2017_2019_sampled,
) = (frame.sample(n=min_size, random_state=seed) for frame in period_frames)


def save_datasets(df, period,seed):
    """Split ``df`` into train/validation/test (0.7 / 0.15 / 0.15) and write each part to disk.

    The three parts are written into the notebook-level ``folder_path`` as
    ``mimic_{period}_train.json``, ``mimic_{period}_validation.json`` and
    ``mimic_{period}_test.json``, one JSON record per line.

    Args:
        df: Data to split; must support ``.to_json`` after splitting
            (the callers in this notebook pass pandas DataFrames).
        period: Label inserted into the output file names.
        seed: ``random_state`` used for both ``train_test_split`` calls.
    """
    # Hold out 30% first, then cut that hold-out in half: 15% validation, 15% test.
    train_part, holdout = train_test_split(df, test_size=0.3, random_state=seed)
    validation_part, test_part = train_test_split(holdout, test_size=0.5, random_state=seed)

    # (file name, frame) pairs, written in train -> validation -> test order.
    outputs = (
        (f'mimic_{period}_train.json', train_part),
        (f'mimic_{period}_validation.json', validation_part),
        (f'mimic_{period}_test.json', test_part),
    )
    for filename, part in outputs:
        part.to_json(os.path.join(folder_path, filename), orient='records', lines=True)


# Write the train/validation/test files for each size-balanced period, oldest first.
for period_frame, period_tag in (
    (df_2008_2010_sampled, 'T1_2008-2010'),
    (df_2011_2013_sampled, 'T2_2011-2013'),
    (df_2014_2016_sampled, 'T3_2014-2016'),
    (df_2017_2019_sampled, 'T4_2017-2019'),
):
    save_datasets(period_frame, period_tag, seed)


In [None]:
samples_per_period = min_size // 4  # floor

# Create an all-year dataset that draws the same number of rows from each of the
# four time periods. Its total size is 4 * (min_size // 4), i.e. equal to the
# per-period size only when min_size is divisible by 4 (otherwise up to 3 rows smaller).
# NOTE(review): sampling here uses a fixed random_state=1 rather than `seed`, so the
# drawn rows are the same for every seed folder; confirm this is intended.
allyear_sampled = pd.concat([
    df_2008_2010.sample(n=samples_per_period, random_state=1),
    df_2011_2013.sample(n=samples_per_period, random_state=1),
    df_2014_2016.sample(n=samples_per_period, random_state=1),
    df_2017_2019.sample(n=samples_per_period, random_state=1)
])
# Randomize the row order of the all-year data so the periods are interleaved.
allyear_sampled = allyear_sampled.sample(frac=1, random_state=1).reset_index(drop=True)
# save_datasets takes the seed as a required third argument; calling it with only
# two arguments raised TypeError. Pass the notebook seed, as the per-period calls do.
save_datasets(allyear_sampled, 'Allyear_sampled', seed)

In [None]:
# Small 1000-row subset of the all-year data, split and saved like the other datasets.
df_test = allyear_sampled.sample(n=1000, random_state=1)
# save_datasets requires the seed as its third argument; omitting it raised TypeError.
save_datasets(df_test, 'test_sample_1000', seed)

In [None]:
from tqdm import tqdm
# Draw 35,000 rows from the 2008-2010 period. The draw is seeded so that the exported
# file is reproducible; without random_state every run produced a different sample.
df_08_10 = df[df['time'] == '2008 - 2010'].sample(n=35000, random_state=seed)
# Registers tqdm's progress_* methods on pandas objects.
# NOTE(review): no progress_* call is visible in this cell; confirm this is still needed.
tqdm.pandas()

# Written to the current working directory, one JSON record per line.
df_08_10.to_json('mimic-top50_2008-2010_sample35k.json', orient='records', lines=True)

In [None]:
# Draw 4,000 rows from the 2017-2019 period. The draw is seeded so that the exported
# file is reproducible; without random_state every run produced a different sample.
df_17_19 = df[df['time'] == '2017 - 2019'].sample(n=4000, random_state=seed)
# Registers tqdm's progress_* methods on pandas objects (`tqdm` is imported in the previous cell).
# NOTE(review): no progress_* call is visible in this cell; confirm this is still needed.
tqdm.pandas()

# Written to the current working directory, one JSON record per line.
df_17_19.to_json('mimic-top50_2017-2019_sample4k.json', orient='records', lines=True)