In [2]:
from datasets import load_dataset
import pandas as pd
import json


# Fetch the WIESP2022-NER corpus from the Hugging Face hub (or the local cache).
dataset = load_dataset("adsabs/WIESP2022-NER")

splits = ["train", "test", "validation"]
all_data = []  # every record from every split, each with an added "year" key

for split in splits:
    # Guard clause: report a missing split and move on to the next one.
    if split not in dataset:
        print(f"'{split}' subset not found in the dataset.")  
        continue

    out_path = f"WIESP2022-NER-{split}.jsonl"
    with open(out_path, "w", encoding="utf-8") as out_file:
        for record in dataset[split]:
            # Add a "year" key taken from the first four characters of the bibcode.
            record["year"] = record["bibcode"][:4]
            all_data.append(record)
            # One JSON object per line (JSON Lines), non-ASCII characters kept as-is.
            out_file.write(json.dumps(record, ensure_ascii=False) + "\n")

# Gather all records into one frame; "year" was stored as a string slice of the
# bibcode, so cast it to int before it is used for numeric comparisons.
df = pd.DataFrame(all_data).astype({'year': int})

# Number of records per year.
records_per_year = df.groupby("year").size()
print(records_per_year)



  from .autonotebook import tqdm as notebook_tqdm
Found cached dataset parquet (/HDD16TB/models/huggingface/datasets/adsabs___parquet/fgrezes--WIESP2022-NER-7ede0456e2865c5d/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
100%|██████████| 3/3 [00:00<00:00, 803.76it/s]


year
2015     495
2016    1033
2017    1014
2018    1027
2019    1099
2020     623
2021     333
dtype: int64


In [3]:
import os
from sklearn.model_selection import train_test_split

# Directory that receives the per-period train/validation JSON files.
folder_path = '/home/weisi/Temporal/data/WIESP'
# exist_ok=True creates any missing parent directories and is a no-op when the
# folder already exists, which avoids the check-then-create race of a separate
# os.path.exists() test.
os.makedirs(folder_path, exist_ok=True)

# Divide the dataset into three consecutive time periods based on the integer
# "year" column: up to 2016, 2017-2018, and 2019 onwards.
publication_year = df['year']
df_2015_2016 = df[publication_year <= 2016]
df_2017_2018 = df[publication_year.between(2017, 2018)]
df_2019_2021 = df[publication_year >= 2019]

# Size of the smallest period; every period is downsampled to this many rows
# so that all three end up the same size.
min_size = min(map(len, (df_2015_2016, df_2017_2018, df_2019_2021)))

# Same sample size and same fixed seed for every period (reproducible draws).
sampling_options = dict(n=min_size, random_state=1)
df_2015_2016_sampled = df_2015_2016.sample(**sampling_options)
df_2017_2018_sampled = df_2017_2018.sample(**sampling_options)
df_2019_2021_sampled = df_2019_2021.sample(**sampling_options)


def save_datasets(df, period, test_size=0.2, random_state=1, output_dir=None):
    """Split one period's frame into train/validation parts and save both as JSON.

    Two files are written, ``wiesp_{period}_train.json`` and
    ``wiesp_{period}_validation.json``, each with ``orient='records'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Records of a single time period.
    period : str
        Label inserted into the output file names (e.g. ``'T1_2015_2016'``).
    test_size : float or int, default 0.2
        Passed to ``train_test_split``; the share (or count) of rows that goes
        to the validation file.
    random_state : int, default 1
        Seed passed to ``train_test_split`` so the split is reproducible.
    output_dir : str or None, default None
        Directory to write into. ``None`` falls back to the notebook-level
        ``folder_path``, which is the original behaviour.

    Returns
    -------
    None
    """
    target_dir = folder_path if output_dir is None else output_dir
    # Make the function usable on its own: ensure the target directory exists.
    os.makedirs(target_dir, exist_ok=True)

    # Split into train and validation subsets.
    train, validation = train_test_split(
        df, test_size=test_size, random_state=random_state
    )

    # Save both subsets as JSON arrays of row objects.
    train_filename = f'wiesp_{period}_train.json'
    validation_filename = f'wiesp_{period}_validation.json'
    train.to_json(os.path.join(target_dir, train_filename), orient='records')
    validation.to_json(os.path.join(target_dir, validation_filename), orient='records')


# Write the train/validation files for each balanced period, in chronological order.
for period_label, period_frame in (
    ('T1_2015_2016', df_2015_2016_sampled),
    ('T2_2017_2018', df_2017_2018_sampled),
    ('T3_2019_2021', df_2019_2021_sampled),
):
    save_datasets(period_frame, period_label)