In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import os

# Load the complete BioASQ dataset from a JSON Lines file (one record per line).
# Later cells filter this frame on its 'type' and 'year' columns.
alldf = pd.read_json('/home/weisi/TemporalAssessment/data/BIOASQ/BioASQ.json', lines=True)


def split_and_save_datasets(df,period,seed,folder_path):
    """Split ``df`` into train/validation/test (0.6/0.2/0.2) and save each part.

    Each part is written to ``folder_path`` as a JSON Lines file named
    ``{period}_train.json``, ``{period}_validation.json`` and
    ``{period}_test.json``.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to split.
    period : str
        Prefix used for the three output file names.
    seed : int
        ``random_state`` passed to both ``train_test_split`` calls, so the
        same seed always yields the same partition.
    folder_path : str
        Output directory. It is created if it does not exist yet.

    Returns
    -------
    None
        The function is used for its side effect of writing three files.
    """
    # Do not rely on the caller having created the output directory.
    os.makedirs(folder_path, exist_ok=True)

    # First cut: 60% train, 40% held out.
    train, rest = train_test_split(df, test_size=0.4, random_state=seed)
    # Second cut: halve the held-out 40% into validation and test (20% each).
    validation, test = train_test_split(rest, test_size=0.5, random_state=seed)

    for split_name, split_df in (('train', train), ('validation', validation), ('test', test)):
        out_path = os.path.join(folder_path, f'{period}_{split_name}.json')
        split_df.to_json(out_path, orient='records', lines=True)



In [None]:
# split the factoid questions data
factoid_df=alldf[alldf['type']=='factoid']

# Each period is an INCLUSIVE year range. Passing only the two endpoints to
# isin() (e.g. [2013, 2015]) would silently drop the years in between.
factoid_df_2013_2015 = factoid_df[factoid_df['year'].isin(list(range(2013, 2016)))]
factoid_df_2016_2018 = factoid_df[factoid_df['year'].isin(list(range(2016, 2019)))]
factoid_df_2019_2020 = factoid_df[factoid_df['year'].isin(list(range(2019, 2021)))]
factoid_df_2021_2022 = factoid_df[factoid_df['year'].isin(list(range(2021, 2023)))]
# "All year" pool covers 2013-2020, matching the 'AY_2013-2020' output name.
factoid_df_all_year= factoid_df[factoid_df['year'].isin(list(range(2013, 2021)))]

# Every period is down-sampled to the size of the smallest one so that all
# saved datasets contain the same number of rows.
min_size_factoid = min(len(factoid_df_2013_2015), len(factoid_df_2016_2018),len(factoid_df_2019_2020), len(factoid_df_2021_2022))

# (source frame, output file prefix) for every dataset written per seed.
factoid_periods = [
    (factoid_df_2013_2015, 'bioasq_factoid_T1_2013-2015'),
    (factoid_df_2016_2018, 'bioasq_factoid_T2_2016-2018'),
    (factoid_df_2019_2020, 'bioasq_factoid_T3_2019-2020'),
    (factoid_df_2021_2022, 'bioasq_factoid_T4_2021-2022'),
    (factoid_df_all_year, 'bioasq_factoid_AY_2013-2020'),
]

for seed in range(1, 6):  # randomly split 5 times (seeds 1..5)
    folder_path ='/home/weisi/TemporalAssessment/data/BIOASQ/seed{}/'.format(seed)
    os.makedirs(folder_path, exist_ok=True)
    for period_df, period_name in factoid_periods:
        # The same seed drives both the down-sampling and the split.
        period_sampled = period_df.sample(n=min_size_factoid, random_state=seed)
        split_and_save_datasets(period_sampled, period_name, seed, folder_path)

In [None]:
# split the list questions data
list_df=alldf[alldf['type']=='list']

# Each period is an INCLUSIVE year range. Passing only the two endpoints to
# isin() (e.g. [2013, 2015]) would silently drop the years in between.
list_df_2013_2015 = list_df[list_df['year'].isin(list(range(2013, 2016)))]
list_df_2016_2018 = list_df[list_df['year'].isin(list(range(2016, 2019)))]
list_df_2019_2020 = list_df[list_df['year'].isin(list(range(2019, 2021)))]
list_df_2021_2022 = list_df[list_df['year'].isin(list(range(2021, 2023)))]
# "All year" pool covers 2013-2020, matching the 'AY_2013-2020' output name.
list_df_all_year= list_df[list_df['year'].isin(list(range(2013, 2021)))]

# Every period is down-sampled to the size of the smallest one so that all
# saved datasets contain the same number of rows.
min_size_list = min(len(list_df_2013_2015), len(list_df_2016_2018),len(list_df_2019_2020), len(list_df_2021_2022))

# (source frame, output file prefix) for every dataset written per seed.
list_periods = [
    (list_df_2013_2015, 'bioasq_list_T1_2013-2015'),
    (list_df_2016_2018, 'bioasq_list_T2_2016-2018'),
    (list_df_2019_2020, 'bioasq_list_T3_2019-2020'),
    (list_df_2021_2022, 'bioasq_list_T4_2021-2022'),
    (list_df_all_year, 'bioasq_list_AY_2013-2020'),
]

# NOTE(review): only seed 1 is generated here, whereas the factoid cell uses
# seeds 1..5. Widen to range(1, 6) if five list splits are wanted.
for seed in range(1, 2):
    folder_path ='/home/weisi/TemporalAssessment/data/BIOASQ/list/seed{}/'.format(seed)
    os.makedirs(folder_path, exist_ok=True)
    for period_df, period_name in list_periods:
        # The same seed drives both the down-sampling and the split.
        period_sampled = period_df.sample(n=min_size_list, random_state=seed)
        split_and_save_datasets(period_sampled, period_name, seed, folder_path)
