In [1]:
import pandas as pd

# Load the GMB sequence-tagging corpus (one JSON object per line) and add a
# calendar-year column derived from each record's 'time' field.
df = pd.read_json(
    '/home/weisi/Temporal/data/GMB/gmb-2.2.0-seqtag.json', lines=True
).assign(year=lambda frame: pd.to_datetime(frame['time']).dt.year)

# Print the number of records per year and per source.
for label, column in (('year:', 'year'), ('source:', 'source')):
    print(label, df.groupby(column).size())

year: year
2002       1
2004     651
2005    3300
2006    1888
2007    1081
2008    1033
2009     501
2010     677
2011      33
dtype: int64
source: source
http    9165
dtype: int64


In [5]:
# Collect every distinct tag that appears in any record's 'ner_tags' sequence.
unique_labels = {tag for tag_sequence in df['ner_tags'] for tag in tag_sequence}
print(unique_labels)

{'tim-nam', 'org-leg', 'tim-dow', 'art-nam', 'per-fam', 'per-giv', 'org-nam', 'tim-clo', 'gpe-nam', 'art-add', 'per-ini', 'nat-nam', 'per-nam', 'geo-nam', 'eve-nam', 'tim-yoc', 'eve-ord', 'tim-dat', 'per-tit', 'per-mid', 'tim-dom', 'tim-moy', 'per-ord', 'O'}


In [6]:
# Number of distinct tag strings gathered in unique_labels.
print(len(unique_labels))

24


In [2]:
# Print the number of records per genre and per subcorpus.
for label, column in (('genre:', 'genre'), ('subcorpus:', 'subcorpus')):
    print(label, df.groupby(column).size())

genre: genre
newspaper    9165
dtype: int64
subcorpus: subcorpus
Voice of America    9165
dtype: int64


In [4]:
import os

folder_path = '/home/weisi/Temporal/data/GMB'
# exist_ok=True creates the folder when it is missing and does nothing when it
# is already there, avoiding a separate exists() check that could race.
os.makedirs(folder_path, exist_ok=True)

# Export one file per year for 2005 through 2008 (the range's upper bound is exclusive).
for year in range(2005, 2009):
    df_year = df[df['year'] == year]
    # File names follow the pattern 'gmb_<year>.json', e.g. 'gmb_2006.json'.
    filename = f'gmb_{year}.json'
    # orient='records' without lines=True writes the rows as one JSON array
    # (not JSON Lines), so these files must be read back without lines=True.
    df_year.to_json(os.path.join(folder_path, filename), orient='records')
   

In [3]:
import os
from sklearn.model_selection import train_test_split

folder_path = '/home/weisi/Temporal/data/GMB'
# exist_ok=True creates the folder when it is missing and does nothing when it
# is already there, avoiding a separate exists() check that could race.
os.makedirs(folder_path, exist_ok=True)

# Divide the dataset into three time periods.
# NOTE: the variable names are looser than the filters they hold:
#   df_before_2005 -> every record with year <= 2005 (so any record dated
#                     before 2004 is included as well)
#   df_2006_2007   -> records from 2006 and 2007
#   df_after_2008  -> every record with year >= 2008
df_before_2005 = df[df['year'] < 2006]
df_2006_2007 = df[df['year'].isin([2006, 2007])]
df_after_2008 = df[df['year'] > 2007]

# Reduce the three periods to the same size: the smallest period sets the
# sample size for all of them.
min_size = min(len(df_before_2005), len(df_2006_2007), len(df_after_2008))
if min_size == 0:
    # An empty period would otherwise only fail later, inside the train/test
    # split, with a message that does not point at the real cause.
    raise ValueError('At least one time period contains no records; cannot build balanced samples.')

# A fixed random_state keeps the drawn samples reproducible across runs.
df_before_2005_sampled = df_before_2005.sample(n=min_size, random_state=1)
df_2006_2007_sampled = df_2006_2007.sample(n=min_size, random_state=1)
df_after_2008_sampled = df_after_2008.sample(n=min_size, random_state=1)

def save_datasets(df, period):
    """Split ``df`` into train/validation/test parts and write each as JSON Lines.

    30% of the rows are held out first, and that hold-out is then halved, which
    gives a 0.7 / 0.15 / 0.15 train / validation / test ratio. Both splits use
    ``random_state=1`` so the partition is reproducible.

    Args:
        df: The frame to split.
        period: Label inserted into the output file names
            (``gmb_<period>_train.json`` and so on).

    The three files are written into the module-level ``folder_path`` with
    ``orient='records', lines=True`` (one JSON object per line).
    """
    train_part, holdout = train_test_split(df, test_size=0.3, random_state=1)
    validation_part, test_part = train_test_split(holdout, test_size=0.5, random_state=1)

    # Pair each output file name with the partition that belongs in it.
    outputs = (
        (f'gmb_{period}_train.json', train_part),
        (f'gmb_{period}_validation.json', validation_part),
        (f'gmb_{period}_test.json', test_part),
    )
    for filename, part in outputs:
        part.to_json(os.path.join(folder_path, filename), orient='records', lines=True)

# Write the train/validation/test files for each balanced period sample.
for period_df, period_label in (
    (df_before_2005_sampled, 'T1_2004-2005'),
    (df_2006_2007_sampled, 'T2_2006-2007'),
    (df_after_2008_sampled, 'T3_2008-2011'),
):
    save_datasets(period_df, period_label)