In [1]:
import pandas as pd
import json
import os
from tqdm import tqdm
from collections import Counter, defaultdict
import pickle
import csv
import argparse

In [2]:
# Load the tab-separated event table. EventCode is read as text so that
# codes keep their leading zeros (e.g. '057').
column_types = {'EventCode': str}
raw_data = pd.read_csv(
    './EGIRIS_ce_final.csv',
    sep='\t',
    dtype=column_types,
)

In [3]:
# Show the loaded event table as this cell's output.
raw_data

Unnamed: 0,Actor1Name,EventCode,Relation_choice,Actor2Name,Md5,day,ce_id,timid
0,US,057,Sign formal agreement,Iran,0b6cd037f3fc22ed82e0c147839ba9f9,20150219,0,0
1,Egypt,021,Appeal for material cooperation,Qatari government,022b9f92c6a790cb1f1e1b7e6d5986e9,20150219,-1,0
2,Iran,031,Express intent to engage in material cooperation,Russia,3a4bb7378ab66388eee2ab5657ae794f,20150219,-1,0
3,King Hussein,013,Make optimistic comment,Israel,b8c6ea9a1d38101ce4d5f8ea673b404a,20150219,-1,0
4,Hezbollah,1431,Conduct strike or boycott for leadership change,Qatar,2604cc86179d28276f48d69594531be3,20150219,-1,0
...,...,...,...,...,...,...,...,...
455872,Dmitry Peskov,1222,Reject request for military aid,Ukraine,9592dd1b296aa51f5b2053e8311659e2,20220317,4395,2583
455873,Ukraine,093,Investigate military action,Russia,dafe15f7bc95e33b171b5663fc55b735,20220317,4395,2583
455874,Russia,1421,Conduct hunger strike for leadership change,Ukraine,0647d97fcb81a6f13c8b606058e4109a,20220317,4395,2583
455875,Ukraine,150,Exhibit military or police power,Russia,c43e61753acf9cd4c4617109f7130ae5,20220317,4396,2583


In [5]:
# Build the mapping from calendar day to a contiguous index and persist it.
unique_time = raw_data['day'].unique()
# `unique()` preserves first-appearance order, so the indices below are only
# chronological if the rows are already sorted by day; fail loudly otherwise.
assert all(unique_time == sorted(unique_time)), "rows of raw_data are not sorted by 'day'"
tim2id = {name: idx for idx, name in enumerate(unique_time)}

# Make sure the output directory exists so a fresh checkout can run this cell.
os.makedirs('./data_tkg_EGIRIS', exist_ok=True)
with open('./data_tkg_EGIRIS/date2id.txt', 'w') as f:
    # One "<day>\t<index>" line per distinct day.
    for k, v in tim2id.items():
        f.write(f"{k}\t{v}\n")

In [6]:
# Split boundaries, expressed as indices from `tim2id`.
# Subtracting 10000 from an integer day changes its fifth digit from the right;
# for the YYYYMMDD values in the 'day' column that is one calendar year.
max_date = raw_data['day'].max()
valid_cutoff = max_date - 10000   # one year before the last day
train_cutoff = max_date - 20000   # two years before the last day

# The exact cutoff day may be absent from the data (no events that day, or an
# invalid date such as Feb 29 minus one year). Fall back to the latest recorded
# day that is not after the cutoff instead of raising KeyError. When the cutoff
# day is present, the result is identical to a direct lookup.
valid_split = tim2id[max(day for day in tim2id if day <= valid_cutoff)]
train_split = tim2id[max(day for day in tim2id if day <= train_cutoff)]

In [7]:
# Chronological split on the 'timid' column:
#   train: timid <= train_split
#   valid: train_split < timid <= valid_split
#   test : timid > valid_split
timid_col = raw_data['timid']
is_train = timid_col <= train_split
is_valid = (timid_col > train_split) & (timid_col <= valid_split)
is_test = timid_col > valid_split

train_df = raw_data[is_train]
val_df = raw_data[is_valid]
test_df = raw_data[is_test]

In [9]:
# Vocabulary seen during training: every actor (in either role) and every
# event code. `pd.concat` is the public replacement for the private
# `Series._append`; values and order are the same.
anchor_ent = pd.concat([train_df['Actor1Name'], train_df['Actor2Name']]).unique()
anchor_rel = train_df['EventCode'].unique()

In [10]:
# Row counts of the train / valid / test splits, in that order.
for split_df in (train_df, val_df, test_df):
    print(len(split_df))

360190
49125
46562


In [11]:
def _restrict_to_anchors(split_df):
    """Keep only rows whose event code and both actors occur in the training split."""
    rel_seen = split_df['EventCode'].isin(anchor_rel)
    actor1_seen = split_df['Actor1Name'].isin(anchor_ent)
    actor2_seen = split_df['Actor2Name'].isin(anchor_ent)
    return split_df[rel_seen & actor1_seen & actor2_seen]


# Drop validation/test rows that mention an entity or relation unseen in training.
val_df = _restrict_to_anchors(val_df)
test_df = _restrict_to_anchors(test_df)

In [12]:
# Row counts again, after restricting valid/test to the training vocabulary.
for split_df in (train_df, val_df, test_df):
    print(len(split_df))

360190
49125
46562


In [15]:
# Number of distinct Md5 values in each split (train, valid, test).
for split_df in (train_df, val_df, test_df):
    distinct_md5 = split_df['Md5'].unique()
    print(len(distinct_md5))

201925
26254
25657


In [16]:
# Total number of rows versus number of distinct Md5 values in the full table.
distinct_md5_all = raw_data['Md5'].unique()
print(len(raw_data))
print(len(distinct_md5_all))

455877
253836


In [17]:
# Number of distinct actors (Actor1Name and Actor2Name pooled) in train, valid,
# test and the full table. `pd.concat` replaces the private `Series._append`.
for frame in (train_df, val_df, test_df, raw_data):
    actors = pd.concat([frame['Actor1Name'], frame['Actor2Name']])
    print(len(actors.unique()))

2794
1790
1772
2794


In [19]:
# Number of distinct event codes in train, valid, test and the full table.
for frame in (train_df, val_df, test_df, raw_data):
    distinct_codes = frame['EventCode'].unique()
    print(len(distinct_codes))

234
192
196
234


In [14]:
# Re-assemble the filtered splits and assign integer ids to entities/relations.
curate_df = pd.concat([train_df, val_df, test_df], axis=0)

# Ids follow first-appearance order: all Actor1Name values first, then any
# Actor2Name values not seen yet. `pd.concat` replaces the private
# `Series._append` and yields the same order.
unique_entity = pd.concat([curate_df['Actor1Name'], curate_df['Actor2Name']]).unique()
unique_relation = curate_df['EventCode'].unique()

ent2id = {name: idx for idx, name in enumerate(unique_entity)}
rel2id = {name: idx for idx, name in enumerate(unique_relation)}

# Quadruples (head id, relation id, tail id, time index), built in one step
# rather than by adding columns to an empty frame. Explicit lookups are kept
# so that an unmapped name raises KeyError instead of silently becoming NaN.
output_df = pd.DataFrame({
    'actor1id': curate_df['Actor1Name'].map(lambda x: ent2id[x]),
    'eventid': curate_df['EventCode'].map(lambda x: rel2id[x]),
    'actor2id': curate_df['Actor2Name'].map(lambda x: ent2id[x]),
    'timid': curate_df['timid'],
})

# Same chronological boundaries as the split of the raw table.
output_train = output_df[output_df['timid'] <= train_split]
output_val = output_df[(output_df['timid'] > train_split) & (output_df['timid'] <= valid_split)]
output_test = output_df[output_df['timid'] > valid_split]

In [21]:
def _assign_sparsity_buckets(sorted_counts, split_num):
    """Label each id with a frequency bucket based on cumulative training mass.

    Parameters
    ----------
    sorted_counts : iterable of (id, freq)
        Occurrence counts sorted by descending frequency.
    split_num : int
        One third of the total number of occurrences.

    Returns
    -------
    dict
        Maps each id to 'high', 'mid', 'relatively low', 'moderately low' or
        'extremely low'. The bucket is chosen from the cumulative frequency of
        the ids that came BEFORE the current one, so the id that crosses a
        boundary still belongs to the earlier bucket.
    """
    # Upper bounds (inclusive) on the running total for each bucket.
    upper_bounds = (
        (split_num, 'high'),
        (2 * split_num, 'mid'),
        (int(7 / 3 * split_num), 'relatively low'),
        (int(8 / 3 * split_num), 'moderately low'),
    )
    buckets = {}
    running_total = 0
    for ontology_id, freq in sorted_counts:
        label = 'extremely low'
        for bound, name in upper_bounds:
            if running_total <= bound:
                label = name
                break
        buckets[ontology_id] = label
        running_total += freq
    return buckets


# Occurrence counts in the training split: entities in either role, and relations.
train_stat_ent = Counter(pd.concat([output_train['actor1id'], output_train['actor2id']]))
train_stat_rel = Counter(output_train['eventid'])

# One third of the total occurrence mass defines the bucket width.
split_num_ent = sum(train_stat_ent.values()) // 3
split_num_rel = sum(train_stat_rel.values()) // 3

# Most frequent first; the stable sort keeps the Counter's order among ties.
train_stat_ent = sorted(train_stat_ent.items(), key=lambda item: item[1], reverse=True)
train_stat_rel = sorted(train_stat_rel.items(), key=lambda item: item[1], reverse=True)

sparsity_split_ent = _assign_sparsity_buckets(train_stat_ent, split_num_ent)
sparsity_split_rel = _assign_sparsity_buckets(train_stat_rel, split_num_rel)

# Number of entities that landed in each bucket.
stat_dict = defaultdict(int)
for k, v in sparsity_split_ent.items():
    stat_dict[v] += 1

In [23]:
# Persist the dataset: id quadruples per split, id maps, sizes and sparsity labels.
output_path = './data_tkg_EGIRIS'
os.makedirs(output_path, exist_ok=True)

# Overwrite rather than append: with mode='a' every re-run of this cell would
# add the same triples again, while the id maps below are rewritten from scratch.
output_train.to_csv(os.path.join(output_path, 'train.txt'), header=False, index=False, sep='\t', mode='w')
output_val.to_csv(os.path.join(output_path, 'valid.txt'), header=False, index=False, sep='\t', mode='w')
output_test.to_csv(os.path.join(output_path, 'test.txt'), header=False, index=False, sep='\t', mode='w')

# newline='' is what the csv module requires of files it writes to; without it
# extra carriage returns are emitted on platforms that translate '\n'.
with open(os.path.join(output_path, 'entity2id.txt'), 'w', newline='') as f:
    temp_csv_writer = csv.writer(f, delimiter='\t')
    for k, v in ent2id.items():
        temp_csv_writer.writerow([k, v])

with open(os.path.join(output_path, 'relation2id.txt'), 'w', newline='') as f:
    temp_csv_writer = csv.writer(f, delimiter='\t')
    for k, v in rel2id.items():
        temp_csv_writer.writerow([k, v])

# "<number of entities>\t<number of relations>"
with open(os.path.join(output_path, 'stat.txt'), 'w') as f:
    f.write(str(len(ent2id)) + '\t' + str(len(rel2id)))

# Pickled as a two-element list: [entity buckets, relation buckets].
with open(os.path.join(output_path, 'sparsity.pkl'), 'wb') as f:
    pickle.dump([sparsity_split_ent, sparsity_split_rel], f)

In [24]:
# Also save the id maps as JSON. Context managers make sure each file is
# flushed and closed; a bare open() inside the call leaves the handle dangling.
with open('./data_tkg_EGIRIS/ent2id_dict.json', 'w') as f:
    json.dump(ent2id, f, indent=4)
with open('./data_tkg_EGIRIS/rel2id_dict.json', 'w') as f:
    json.dump(rel2id, f, indent=4)