In [1]:
from datasets import load_dataset
import pandas as pd
import json


# Load the WIESP2022-NER corpus through the Hugging Face `datasets` library.
dataset = load_dataset("adsabs/WIESP2022-NER")

splits = ["train", "test", "validation"]
all_data = []  # records from every available split, each with a "year" key added

for split in splits:
    if split not in dataset:
        print(f"'{split}' subset not found in the dataset.")
        continue

    # One JSON Lines file per split, written to the current working directory.
    out_name = f"WIESP2022-NER-{split}.jsonl"
    with open(out_name, "w", encoding="utf-8") as out_file:
        for record in dataset[split]:
            # Add key "year": the leading four characters of the bibcode.
            record["year"] = record["bibcode"][:4]
            all_data.append(record)
            out_file.write(json.dumps(record, ensure_ascii=False) + "\n")

# Pool all splits into one frame; "year" is extracted as a string above, so cast
# it to int to allow numeric comparisons in the period splits that follow.
df = pd.DataFrame(all_data).astype({"year": int})

# Number of records per year across all splits.
print(df.groupby("year").size())



  from .autonotebook import tqdm as notebook_tqdm
Found cached dataset parquet (/HDD16TB/models/huggingface/datasets/adsabs___parquet/fgrezes--WIESP2022-NER-7ede0456e2865c5d/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
100%|██████████| 3/3 [00:00<00:00, 25.01it/s]


year
2015     495
2016    1033
2017    1014
2018    1027
2019    1099
2020     623
2021     333
dtype: int64


In [3]:
# this is the 3 interval split version

import os
from sklearn.model_selection import train_test_split

# Output directory for the three-period files; created only if nothing exists there yet.
folder_path = '/home/weisi/Temporal/data/WIESP'
output_dir_present = os.path.exists(folder_path)
if not output_dir_present:
    os.makedirs(folder_path)

# Divide the dataset into 3 time periods by the integer "year" column.
year_column = df['year']
df_2015_2016 = df[year_column <= 2016]
df_2017_2018 = df[year_column.isin([2017, 2018])]
df_2019_2021 = df[year_column >= 2019]

# Down-sample every period to the size of the smallest one so all three have
# the same number of rows.
min_size = min(map(len, (df_2015_2016, df_2017_2018, df_2019_2021)))

# Fixed random_state keeps the samples reproducible across runs.
df_2015_2016_sampled = df_2015_2016.sample(random_state=1, n=min_size)
df_2017_2018_sampled = df_2017_2018.sample(random_state=1, n=min_size)
df_2019_2021_sampled = df_2019_2021.sample(random_state=1, n=min_size)


def save_datasets(df, period):
    """Split one period's frame into train/validation parts and write both to disk.

    The frame is split 80/20 with a fixed random_state of 1. Each part is
    written with ``to_json(orient='records')`` into the directory named by the
    module-level ``folder_path`` (looked up when the function is called).

    Args:
        df: frame holding the records of a single period.
        period: label inserted into the output file names
            (``wiesp_<period>_train.json`` / ``wiesp_<period>_validation.json``).
    """
    train_part, validation_part = train_test_split(df, test_size=0.2, random_state=1)

    # Pair each output file name with the partition that goes into it.
    outputs = (
        (f'wiesp_{period}_train.json', train_part),
        (f'wiesp_{period}_validation.json', validation_part),
    )
    for out_name, part in outputs:
        part.to_json(os.path.join(folder_path, out_name), orient='records')


# Write the train/validation files for each balanced period sample, in period order.
for sampled_frame, period_label in (
    (df_2015_2016_sampled, 'T1_2015_2016'),
    (df_2017_2018_sampled, 'T2_2017_2018'),
    (df_2019_2021_sampled, 'T3_2019_2021'),
):
    save_datasets(sampled_frame, period_label)

In [None]:
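# Four-interval version: four publication-year periods plus a pooled multi-year set, re-sampled with five random seeds.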
import os
from sklearn.model_selection import train_test_split

# Base output directory; created (with parents) only if nothing exists at this path yet.
# NOTE(review): the per-seed loop later in this cell reassigns folder_path to a
# directory under '/home/weisi/TemporalAssessment/...', so the files produced by
# this cell are not written here - confirm whether this directory is still needed.
folder_path = '/home/weisi/Temporal/data/WIESP'
base_dir_present = os.path.exists(folder_path)
if not base_dir_present:
    os.makedirs(folder_path)
    

def split_and_save_datasets(df, period, seed, folder_path):
    """Split a frame into train/validation/test parts and write each as JSON Lines.

    The split is done in two steps: 40% of the rows are held out first, then
    that held-out part is halved, giving a 0.6 / 0.2 / 0.2 ratio. Both steps
    use ``seed`` as ``random_state``.

    Args:
        df: frame holding the records of a single period.
        period: prefix of the output file names (``<period>-train.json``,
            ``<period>-validation.json``, ``<period>-test.json``).
        seed: random_state passed to both ``train_test_split`` calls.
        folder_path: directory the three files are written into.
    """
    train_part, held_out = train_test_split(df, test_size=0.4, random_state=seed)
    validation_part, test_part = train_test_split(held_out, test_size=0.5, random_state=seed)

    # Pair each output file name with the partition that goes into it.
    outputs = (
        (f'{period}-train.json', train_part),
        (f'{period}-validation.json', validation_part),
        (f'{period}-test.json', test_part),
    )
    for out_name, part in outputs:
        # lines=True writes one JSON record per line.
        part.to_json(os.path.join(folder_path, out_name), orient='records', lines=True)

# Split the WIESP NER records into four publication-year periods, plus one pooled
# multi-year ("AY") set that is sampled and saved the same way.
t1 = df[df['year'].isin([2015, 2016])]
t2 = df[df['year'].isin([2017, 2018])]
t3 = df[df['year'] == 2019]
t4 = df[df['year'].isin([2020, 2021])]

# The pooled set covers every year from 2015 through 2019 inclusive, matching the
# 'AY_2015_2019' label used in its file names. isin([2015, 2019]) would keep only
# the two endpoint years, so an inclusive range test is used instead.
# NOTE(review): if "all years" was meant to include 2020-2021 as well, widen the
# range and rename the output label accordingly.
df_all_year = df[df['year'].between(2015, 2019)]

# Every set is down-sampled to the size of the smallest of the four periods so
# that all saved sets have the same number of rows.
min_size = min(len(t1), len(t2), len(t3), len(t4))

# (source frame, file-name prefix) for every set written per seed.
period_frames = [
    (t1, 'wiesp-T1_2015_2016'),
    (t2, 'wiesp-T2_2017_2018'),
    (t3, 'wiesp-T3_2019'),
    (t4, 'wiesp-T4_2020_2021'),
    (df_all_year, 'wiesp-AY_2015_2019'),
]

for seed in range(1, 6):  # randomly sample and split 5 times, one folder per seed
    folder_path = '/home/weisi/TemporalAssessment/data/WIESP/seed{}/'.format(seed)
    # exist_ok avoids the separate exists() check and the race between check and create.
    os.makedirs(folder_path, exist_ok=True)
    for period_frame, period_label in period_frames:
        # The seed drives both the down-sampling here and the split inside the helper.
        sampled = period_frame.sample(n=min_size, random_state=seed)
        split_and_save_datasets(sampled, period_label, seed, folder_path)