In [1]:
import pandas as pd

import os

import json

input_path = '/home/weisi/TemporalAssessment/data/BIOASQ/BioASQ.json'
output_path = '/home/weisi/TemporalAssessment/data/BIOASQ/BioASQ_formatted.json'


with open(input_path, 'r') as file:
    data = json.load(file)


for item in data:
    if item['type'] == 'list':
        # change exact_answer: ["A","B"]  in list type to ["A, B"] 
        item['exact_answer'] = [', '.join([answer[0] for answer in item['exact_answer']])]
    elif item['type'] == 'yesno':
        # change exact_answer:"yes" to ["yes"]
        item['exact_answer'] = [item['exact_answer']]
    elif item['type'] == 'summary':
        # exact_answer in summary type is null, copy it from ideal_answer
        item['exact_answer'] = item['ideal_answer']


with open(output_path, 'w') as file:
    json.dump(data, file, indent=4)


JSONDecodeError: Extra data: line 2 column 1 (char 4285)

In [6]:
import pandas as pd
import json
from itertools import product

output_file_path = '/home/weisi/TemporalAssessment/data/BIOASQ/BioASQ_formatted.json'  
df = pd.read_json('/home/weisi/TemporalAssessment/data/BIOASQ/BioASQ.json', lines=True)


def process_row(row):
    if row['type'] == 'list':
        #row['exact_answer'] = [', '.join(sum(row['exact_answer'], []))] #this simply connect all answers
        #change "exact_answer": [["a1", "a2", "a3"], ["b"], ["c1", "c2"], ["d"]] to:
        # ["a1, b, c1, d", "a2, b, c1, d", "a3, b, c1, d", "a1, b, c2, d", "a2, b, c2, d", "a3, b, c2, d"]
        # use itertools.product to get all combinations 
        all_combinations = list(product(*row['exact_answer']))
        # change each combination to string and connect with commas
        row['exact_answer'] = [', '.join(combination) for combination in all_combinations]
    elif row['type'] == 'yesno':
        row['exact_answer'] = [row['exact_answer']]
    elif row['type'] == 'summary':
        row['exact_answer'] = row['ideal_answer']
    return row

df = df.apply(process_row, axis=1)
df .to_json(output_file_path, orient='records', lines=True)




In [8]:
from sklearn.model_selection import train_test_split
import json

df = pd.read_json('/home/weisi/TemporalAssessment/data/BIOASQ/BioASQ_formatted.json', lines=True)

def split_and_save_datasets(df,period,seed,folder_path):
    # split train, validation and test datasets by ratio 0.6 0.2 0.2
    train, rest = train_test_split(df, test_size=0.4, random_state=seed)  
    validation, test = train_test_split(rest, test_size=0.5, random_state=seed)  
    # save files
    train_filename = f'{period}-train.json'
    validation_filename = f'{period}-validation.json'
    test_filename = f'{period}-test.json'
    train.to_json(os.path.join(folder_path, train_filename), orient='records', lines=True)
    validation.to_json(os.path.join(folder_path, validation_filename), orient='records', lines=True)
    test.to_json(os.path.join(folder_path, test_filename), orient='records', lines=True)

def split_and_save_datasets_no_valid(df,period,seed,folder_path):
    # split train, validation and test datasets by ratio 0.8 0.2
    train, test = train_test_split(df, test_size=0.2, random_state=seed)  
    #validation, test = train_test_split(rest, test_size=0.5, random_state=seed)  
    # save files
    train_filename = f'{period}-train.json'
    #validation_filename = f'{period}-validation.json'
    test_filename = f'{period}-test.json'
    train.to_json(os.path.join(folder_path, train_filename), orient='records', lines=True)
    #validation.to_json(os.path.join(folder_path, validation_filename), orient='records', lines=True)
    test.to_json(os.path.join(folder_path, test_filename), orient='records', lines=True)

    

In [9]:
df_2013_2015 = df[df['year'].isin([2013, 2015])]
df_2016_2018 = df[df['year'].isin([2016, 2018])]
df_2019_2020 = df[df['year'].isin([2019, 2020])]
df_2021_2022 = df[df['year'].isin([2021, 2022])]
#df_all_year= df[df['year'].isin([2013, 2020])]

min_size = min(len(df_2013_2015), len(df_2016_2018),len(df_2019_2020), len(df_2021_2022))

for seed in range(1, 6):  # range(1,6):randomly split 5 times
    folder_path ='/home/weisi/TemporalAssessment/data/BIOASQ_formatted/alltypes/seed{}/'.format(seed)
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)
    df_2013_2015_sampled = df_2013_2015.sample(n=min_size, random_state=seed)
    df_2016_2018_sampled = df_2016_2018.sample(n=min_size, random_state=seed)
    df_2019_2020_sampled = df_2019_2020.sample(n=min_size, random_state=seed)
    df_2021_2022_sampled = df_2021_2022.sample(n=min_size, random_state=seed)
    #all_year_sampled = df_all_year.sample(n=min_size, random_state=seed)
    df_sampled = df.sample(n=min_size, random_state=seed)
    split_and_save_datasets_no_valid(df_2013_2015_sampled, 'bioasq-alltypes-T1_2013_2015',seed,folder_path)
    split_and_save_datasets_no_valid(df_2016_2018_sampled, 'bioasq-alltypes-T2_2016_2018',seed,folder_path)
    split_and_save_datasets_no_valid(df_2019_2020_sampled, 'bioasq-alltypes-T3_2019_2020',seed,folder_path)
    split_and_save_datasets_no_valid(df_2021_2022_sampled, 'bioasq-alltypes-T4_2021_2022',seed,folder_path)
    split_and_save_datasets_no_valid(df_sampled, 'bioasq-alltypes-ALL_2013_2022',seed,folder_path)

In [11]:
# split the factoid questions data
factoid_df=df[df['type']=='factoid']
factoid_df_2013_2015 = factoid_df[factoid_df['year'].isin([2013, 2015])]
factoid_df_2016_2018 = factoid_df[factoid_df['year'].isin([2016, 2018])]
factoid_df_2019_2020 = factoid_df[factoid_df['year'].isin([2019, 2020])]
factoid_df_2021_2022 = factoid_df[factoid_df['year'].isin([2021, 2022])]
#factoid_df_all_year= factoid_df[factoid_df['year'].isin([2013, 2020])]

min_size= min(len(factoid_df_2013_2015), len(factoid_df_2016_2018),len(factoid_df_2019_2020), len(factoid_df_2021_2022))


for seed in range(1, 6):  # range(1,6):randomly split 5 times
    folder_path ='/home/weisi/TemporalAssessment/data/BIOASQ_formatted/factoid/seed{}/'.format(seed)
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)
    df_2013_2015_sampled = factoid_df_2013_2015.sample(n=min_size, random_state=seed)
    df_2016_2018_sampled = factoid_df_2016_2018.sample(n=min_size, random_state=seed)
    df_2019_2020_sampled = factoid_df_2019_2020.sample(n=min_size, random_state=seed)
    df_2021_2022_sampled = factoid_df_2021_2022.sample(n=min_size, random_state=seed)
    #all_year_sampled = df_all_year.sample(n=min_size, random_state=seed)
    df_sampled = factoid_df.sample(n=min_size, random_state=seed)
    split_and_save_datasets_no_valid(df_2013_2015_sampled, 'bioasq-factoid-T1_2013_2015',seed,folder_path)
    split_and_save_datasets_no_valid(df_2016_2018_sampled, 'bioasq-factoid-T2_2016_2018',seed,folder_path)
    split_and_save_datasets_no_valid(df_2019_2020_sampled, 'bioasq-factoid-T3_2019_2020',seed,folder_path)
    split_and_save_datasets_no_valid(df_2021_2022_sampled, 'bioasq-factoid-T4_2021_2022',seed,folder_path)
    split_and_save_datasets_no_valid(df_sampled, 'bioasq-factoid-ALL_2013_2022',seed,folder_path)

In [12]:
# split the list questions data
list_df=df[df['type']=='list']
list_df_2013_2015 = list_df[list_df['year'].isin([2013, 2015])]
list_df_2016_2018 = list_df[list_df['year'].isin([2016, 2018])]
list_df_2019_2020 = list_df[list_df['year'].isin([2019, 2020])]
list_df_2021_2022 = list_df[list_df['year'].isin([2021, 2022])]


min_size = min(len(list_df_2013_2015), len(list_df_2016_2018),len(list_df_2019_2020), len(list_df_2021_2022))


for seed in range(1, 6):  # range(1,6):randomly split 5 times
    folder_path ='/home/weisi/TemporalAssessment/data/BIOASQ_formatted/list/seed{}/'.format(seed)
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)
    df_2013_2015_sampled = list_df_2013_2015.sample(n=min_size, random_state=seed)
    df_2016_2018_sampled = list_df_2016_2018.sample(n=min_size, random_state=seed)
    df_2019_2020_sampled = list_df_2019_2020.sample(n=min_size, random_state=seed)
    df_2021_2022_sampled = list_df_2021_2022.sample(n=min_size, random_state=seed)
    #all_year_sampled = df_all_year.sample(n=min_size, random_state=seed)
    df_sampled = list_df.sample(n=min_size, random_state=seed)
    split_and_save_datasets_no_valid(df_2013_2015_sampled, 'bioasq-list-T1_2013_2015',seed,folder_path)
    split_and_save_datasets_no_valid(df_2016_2018_sampled, 'bioasq-list-T2_2016_2018',seed,folder_path)
    split_and_save_datasets_no_valid(df_2019_2020_sampled, 'bioasq-list-T3_2019_2020',seed,folder_path)
    split_and_save_datasets_no_valid(df_2021_2022_sampled, 'bioasq-list-T4_2021_2022',seed,folder_path)
    split_and_save_datasets_no_valid(df_sampled, 'bioasq-list-ALL_2013_2022',seed,folder_path)


In [13]:
# split the yesno questions data
yesno_df=df[df['type']=='yesno']
yesno_df_2013_2015 = yesno_df[yesno_df['year'].isin([2013, 2015])]
yesno_df_2016_2018 = yesno_df[yesno_df['year'].isin([2016, 2018])]
yesno_df_2019_2020 = yesno_df[yesno_df['year'].isin([2019, 2020])]
yesno_df_2021_2022 = yesno_df[yesno_df['year'].isin([2021, 2022])]

min_size = min(len(yesno_df_2013_2015), len(yesno_df_2016_2018),len(yesno_df_2019_2020), len(yesno_df_2021_2022))

for seed in range(1, 6):  # range(1,6):randomly split 5 times
    folder_path ='/home/weisi/TemporalAssessment/data/BIOASQ_formatted/yesno/seed{}/'.format(seed)
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)
    df_2013_2015_sampled = yesno_df_2013_2015.sample(n=min_size, random_state=seed)
    df_2016_2018_sampled = yesno_df_2016_2018.sample(n=min_size, random_state=seed)
    df_2019_2020_sampled = yesno_df_2019_2020.sample(n=min_size, random_state=seed)
    df_2021_2022_sampled = yesno_df_2021_2022.sample(n=min_size, random_state=seed)
    #all_year_sampled = df_all_year.sample(n=min_size, random_state=seed)
    df_sampled = yesno_df.sample(n=min_size, random_state=seed)
    split_and_save_datasets_no_valid(df_2013_2015_sampled, 'bioasq-yesno-T1_2013_2015',seed,folder_path)
    split_and_save_datasets_no_valid(df_2016_2018_sampled, 'bioasq-yesno-T2_2016_2018',seed,folder_path)
    split_and_save_datasets_no_valid(df_2019_2020_sampled, 'bioasq-yesno-T3_2019_2020',seed,folder_path)
    split_and_save_datasets_no_valid(df_2021_2022_sampled, 'bioasq-yesno-T4_2021_2022',seed,folder_path)
    split_and_save_datasets_no_valid(df_sampled, 'bioasq-yesno-ALL_2013_2022',seed,folder_path)


In [14]:
# split the summary questions data
summary_df=df[df['type']=='summary']
summary_df_2013_2015 = summary_df[summary_df['year'].isin([2013, 2015])]
summary_df_2016_2018 = summary_df[summary_df['year'].isin([2016, 2018])]
summary_df_2019_2020 = summary_df[summary_df['year'].isin([2019, 2020])]
summary_df_2021_2022 = summary_df[summary_df['year'].isin([2021, 2022])]

min_size = min(len(summary_df_2013_2015), len(summary_df_2016_2018),len(summary_df_2019_2020), len(summary_df_2021_2022))

for seed in range(1, 6):  # range(1,6):randomly split 5 times
    folder_path ='/home/weisi/TemporalAssessment/data/BIOASQ_formatted/summary/seed{}/'.format(seed)
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)
    df_2013_2015_sampled = summary_df_2013_2015.sample(n=min_size, random_state=seed)
    df_2016_2018_sampled = summary_df_2016_2018.sample(n=min_size, random_state=seed)
    df_2019_2020_sampled = summary_df_2019_2020.sample(n=min_size, random_state=seed)
    df_2021_2022_sampled = summary_df_2021_2022.sample(n=min_size, random_state=seed)
    #all_year_sampled = df_all_year.sample(n=min_size, random_state=seed)
    df_sampled = summary_df.sample(n=min_size, random_state=seed)
    split_and_save_datasets_no_valid(df_2013_2015_sampled, 'bioasq-summary-T1_2013_2015',seed,folder_path)
    split_and_save_datasets_no_valid(df_2016_2018_sampled, 'bioasq-summary-T2_2016_2018',seed,folder_path)
    split_and_save_datasets_no_valid(df_2019_2020_sampled, 'bioasq-summary-T3_2019_2020',seed,folder_path)
    split_and_save_datasets_no_valid(df_2021_2022_sampled, 'bioasq-summary-T4_2021_2022',seed,folder_path)
    split_and_save_datasets_no_valid(df_sampled, 'bioasq-summary-ALL_2013_2022',seed,folder_path)
