In [1]:
import pandas as pd
import json
import os
from tqdm import tqdm
from collections import Counter, defaultdict
import pickle
import csv
import argparse

In [2]:
# Parse EventCode as text so codes with leading zeros (e.g. '057') are kept
# verbatim instead of being coerced to integers.
column_types = dict(EventCode=str)
raw_data = pd.read_csv(
    './EGIRIS_ce_final.csv',
    sep='\t',
    dtype=column_types,
)

In [3]:
# Inspect the combined event table loaded from EGIRIS_ce_final.csv.
raw_data

Unnamed: 0,Actor1Name,EventCode,Relation_choice,Actor2Name,Md5,day,ce_id,timid,Md5_list
0,US,057,Sign formal agreement,Iran,0b6cd037f3fc22ed82e0c147839ba9f9,20150219,0,0,0b6cd037f3fc22ed82e0c147839ba9f9
1,Canada government,011,Refuse to comment,Palestine,18918b73f1e5f11eb8734defabe9a804,20150219,-1,0,18918b73f1e5f11eb8734defabe9a804
2,European Union,057,Sign formal agreement,Egypt,edf3db2729f3e81b20e446b94f7a6b6a,20150219,-1,0,edf3db2729f3e81b20e446b94f7a6b6a
3,Egypt,190,Use conventional military force,Libya,1f6264640f54e7feb8e38e6968bf2dcf,20150219,-1,0,"1f6264640f54e7feb8e38e6968bf2dcf, 64af5061461b..."
4,EU,070,Provide aid,US,72dcb54c9b49830caefdad4bb7453d3c,20150219,-1,0,72dcb54c9b49830caefdad4bb7453d3c
...,...,...,...,...,...,...,...,...,...
455872,Dmitry Peskov,1222,Reject request for military aid,Ukraine,9592dd1b296aa51f5b2053e8311659e2,20220317,4395,2583,9592dd1b296aa51f5b2053e8311659e2
455873,Ukraine,093,Investigate military action,Russia,dafe15f7bc95e33b171b5663fc55b735,20220317,4395,2583,dafe15f7bc95e33b171b5663fc55b735
455874,Russia,1421,Conduct hunger strike for leadership change,Ukraine,0647d97fcb81a6f13c8b606058e4109a,20220317,4395,2583,0647d97fcb81a6f13c8b606058e4109a
455875,Ukraine,150,Exhibit military or police power,Russia,c43e61753acf9cd4c4617109f7130ae5,20220317,4396,2583,c43e61753acf9cd4c4617109f7130ae5


In [4]:
# Build the day -> contiguous index mapping and persist it as "day<TAB>id" lines.
unique_time = raw_data['day'].unique()
# unique() returns values in order of first appearance, so the ids are only
# chronological when the rows are already sorted by day; fail early otherwise.
assert all(unique_time == sorted(unique_time)), "raw_data rows are not sorted by 'day'"
tim2id = {name: idx for idx, name in enumerate(unique_time)}
# Create the output directory if needed so a fresh checkout does not fail in open().
os.makedirs('./data_ce_EGIRIS', exist_ok=True)
with open('./data_ce_EGIRIS/date2id.txt', 'w') as f:
    for k, v in tim2id.items():
        f.write(f"{k}\t{v}\n")

In [5]:
# Load the train / validation / test / outlier splits with the same parsing
# options as the combined file (tab-separated, EventCode kept as text).
# NOTE(review): the train file name has no '_final' suffix, unlike the other
# three files — confirm this is the intended input.
split_read_options = {'dtype': column_types, 'sep': '\t'}
train_df = pd.read_csv('./EGIRIS_ce_train.csv', **split_read_options)
val_df = pd.read_csv('./EGIRIS_ce_val_final.csv', **split_read_options)
test_df = pd.read_csv('./EGIRIS_ce_test_final.csv', **split_read_options)
outlier_df = pd.read_csv('./EGIRIS_ce_outliers_final.csv', **split_read_options)

In [8]:
# Reuse the entity / relation id vocabularies of the TKG dataset so both
# datasets share the same ids.
# Context managers close the JSON files deterministically instead of leaving
# the handles to the garbage collector.
with open('./data_tkg_EGIRIS/ent2id_dict.json') as ent_file:
    ent2id = json.load(ent_file)
with open('./data_tkg_EGIRIS/rel2id_dict.json') as rel_file:
    rel2id = json.load(rel_file)

In [9]:
# Number of entries in the entity vocabulary loaded from ent2id_dict.json.
len(ent2id)

2794

In [10]:
# Number of entries in the relation vocabulary loaded from rel2id_dict.json.
len(rel2id)

234

In [11]:
# Bind the output names to the loaded split frames (aliases, not copies).
# The cells that follow rebind each of these names to a fresh id-encoded frame.
output_train, output_val, output_test = train_df, val_df, test_df

In [12]:
# Encode the training split as (actor1id, eventid, actor2id, timid, ceid).
# The lookups use plain dict indexing, so an actor name or event code missing
# from the vocabularies raises KeyError rather than producing NaN.
output_train = pd.DataFrame().assign(
    actor1id=train_df['Actor1Name'].map(lambda name: ent2id[name]),
    eventid=train_df['EventCode'].map(lambda code: rel2id[code]),
    actor2id=train_df['Actor2Name'].map(lambda name: ent2id[name]),
    timid=train_df['timid'],
    ceid=train_df['ce_id'],
)

In [13]:
# Encode the validation split as (actor1id, eventid, actor2id, timid, ceid).
# The lookups use plain dict indexing, so an actor name or event code missing
# from the vocabularies raises KeyError rather than producing NaN.
output_val = pd.DataFrame().assign(
    actor1id=val_df['Actor1Name'].map(lambda name: ent2id[name]),
    eventid=val_df['EventCode'].map(lambda code: rel2id[code]),
    actor2id=val_df['Actor2Name'].map(lambda name: ent2id[name]),
    timid=val_df['timid'],
    ceid=val_df['ce_id'],
)

In [14]:
# Encode the test split as (actor1id, eventid, actor2id, timid, ceid).
# The lookups use plain dict indexing, so an actor name or event code missing
# from the vocabularies raises KeyError rather than producing NaN.
output_test = pd.DataFrame().assign(
    actor1id=test_df['Actor1Name'].map(lambda name: ent2id[name]),
    eventid=test_df['EventCode'].map(lambda code: rel2id[code]),
    actor2id=test_df['Actor2Name'].map(lambda name: ent2id[name]),
    timid=test_df['timid'],
    ceid=test_df['ce_id'],
)

In [15]:
# Encode the outlier split as (actor1id, eventid, actor2id, timid, ceid).
# The lookups use plain dict indexing, so an actor name or event code missing
# from the vocabularies raises KeyError rather than producing NaN.
output_outlier = pd.DataFrame().assign(
    actor1id=outlier_df['Actor1Name'].map(lambda name: ent2id[name]),
    eventid=outlier_df['EventCode'].map(lambda code: rel2id[code]),
    actor2id=outlier_df['Actor2Name'].map(lambda name: ent2id[name]),
    timid=outlier_df['timid'],
    ceid=outlier_df['ce_id'],
)

In [61]:
# Write the id-encoded splits as headerless, tab-separated text files.
output_path = './data_ce_EGIRIS'
# Make sure the target directory exists before writing.
os.makedirs(output_path, exist_ok=True)
# Overwrite (to_csv's default mode='w') instead of appending: with mode='a',
# every re-run of this cell added another full copy of each split to the
# existing file.
output_train.to_csv(output_path + '/train.txt', header=None, index=None, sep='\t')
output_val.to_csv(output_path + '/valid.txt', header=None, index=None, sep='\t')
output_test.to_csv(output_path + '/test.txt', header=None, index=None, sep='\t')
output_outlier.to_csv(output_path + '/outliers.txt', header=None, index=None, sep='\t')

In [None]:
# Generate the validation and test query sets

In [17]:
# Validation queries: for every complex event (ceid) keep all rows except the
# ones at that event's earliest timid, then stack the per-event pieces.
output_val_tkgid = output_val.copy()
ceids = list(output_val_tkgid['ceid'].unique())
events_by_ce = (output_val_tkgid[output_val_tkgid['ceid'] == ce] for ce in ceids)
output_val_tkgid_query_dfs = [
    ce_events[ce_events['timid'] != ce_events['timid'].min()]
    for ce_events in events_by_ce
]
output_val_tkgid_query_df = pd.concat(output_val_tkgid_query_dfs, ignore_index=True)

In [18]:
# Inspect the validation queries before the timid window is applied.
output_val_tkgid_query_df

Unnamed: 0,actor1id,eventid,actor2id,timid,ceid
0,300,43,9,1832,3452
1,9,36,300,1832,3452
2,9,3,300,1833,3452
3,807,3,9,1835,3452
4,9,9,300,1835,3452
...,...,...,...,...,...
23154,14,9,9,2216,3943
23155,178,9,9,2218,3943
23156,316,8,9,2218,3943
23157,178,9,9,2219,3943


In [19]:
# Keep only validation queries with timid >= 1854 (the training queries are
# capped at timid 1853).
output_val_tkgid_query_df = output_val_tkgid_query_df.loc[
    output_val_tkgid_query_df['timid'].ge(1854)
]

In [20]:
# Row count after dropping validation queries with timid < 1854.
len(output_val_tkgid_query_df)

23092

In [21]:
# Keep only validation queries with timid <= 2218 (the test queries start at
# timid 2219).
output_val_tkgid_query_df = output_val_tkgid_query_df.loc[
    output_val_tkgid_query_df['timid'].le(2218)
]

In [22]:
# Row count after also dropping validation queries with timid > 2218.
len(output_val_tkgid_query_df)

22887

In [23]:
# Order the validation queries by time index and renumber the rows from 0.
output_val_tkgid_query_df_sorted = output_val_tkgid_query_df.sort_values(
    'timid',
    ignore_index=True,
)

In [62]:
# Inspect the time-sorted validation queries.
output_val_tkgid_query_df_sorted

Unnamed: 0,actor1id,eventid,actor2id,timid,ceid
0,242,36,681,1854,3452
1,9,72,81,1854,3460
2,2589,72,81,1854,3460
3,81,9,2589,1854,3460
4,81,3,2589,1854,3460
...,...,...,...,...,...
22882,10,0,214,2218,3904
22883,400,9,281,2218,3902
22884,162,9,281,2218,3902
22885,242,74,41,2218,3896


In [63]:
# Persist the validation queries (including ceid) as headerless, tab-separated rows.
output_val_tkgid_query_df_sorted.to_csv(
    './data_ce_EGIRIS/valid_query.txt',
    sep='\t',
    header=None,
    index=None,
)

In [None]:
# Quadruple version for the TKG dataset: the same rows without the ceid column.
output_val_tkgid_query_df_sorted_quad = output_val_tkgid_query_df_sorted.drop(columns='ceid')
output_val_tkgid_query_df_sorted_quad.to_csv(
    './data_tkg_EGIRIS/valid_query.txt',
    sep='\t',
    header=None,
    index=None,
)

In [52]:
# Test queries: for every complex event (ceid) keep all rows except the ones
# at that event's earliest timid, then stack the per-event pieces.
output_test_tkgid = output_test.copy()
ceids = list(output_test_tkgid['ceid'].unique())
events_by_ce = (output_test_tkgid[output_test_tkgid['ceid'] == ce] for ce in ceids)
output_test_tkgid_query_dfs = [
    ce_events[ce_events['timid'] != ce_events['timid'].min()]
    for ce_events in events_by_ce
]
output_test_tkgid_query_df = pd.concat(output_test_tkgid_query_dfs, ignore_index=True)

In [53]:
# Inspect the test queries before the timid lower bound is applied.
output_test_tkgid_query_df

Unnamed: 0,actor1id,eventid,actor2id,timid,ceid
0,9,88,1151,1853,3944
1,2,74,2712,1853,3944
2,15,5,282,1853,3944
3,2,88,20,1853,3944
4,219,108,1806,1853,3944
...,...,...,...,...,...
20948,89,10,55,2583,4396
20949,89,113,55,2583,4396
20950,89,129,55,2583,4396
20951,89,74,55,2583,4396


In [54]:
# Keep only test queries with timid >= 2219 (the validation queries end at
# timid 2218).
output_test_tkgid_query_df = output_test_tkgid_query_df.loc[
    output_test_tkgid_query_df['timid'].ge(2219)
]

In [55]:
# Order the test queries by time index and renumber the rows from 0.
output_test_tkgid_query_df_sorted = output_test_tkgid_query_df.sort_values(
    'timid',
    ignore_index=True,
)

In [56]:
# Inspect the time-sorted test queries.
output_test_tkgid_query_df_sorted

Unnamed: 0,actor1id,eventid,actor2id,timid,ceid
0,0,30,70,2219,3955
1,22,72,9,2219,3957
2,1958,54,70,2219,3955
3,1958,30,70,2219,3955
4,1958,27,70,2219,3955
...,...,...,...,...,...
20691,9,17,89,2583,4394
20692,2,44,0,2583,4392
20693,0,9,2,2583,4392
20694,89,20,55,2583,4394


In [64]:
# Persist the test queries (including ceid) as headerless, tab-separated rows.
output_test_tkgid_query_df_sorted.to_csv(
    './data_ce_EGIRIS/test_query.txt',
    sep='\t',
    header=None,
    index=None,
)

In [None]:
# Quadruple version for the TKG dataset: the same rows without the ceid column.
output_test_tkgid_query_df_sorted_quad = output_test_tkgid_query_df_sorted.drop('ceid', axis=1)
# Write into data_tkg_EGIRIS, the directory that receives the valid and train
# quadruple query files; the previous 'data_tkg_GDELT' target was a copy-paste
# slip that sent the EGIRIS test queries to another dataset's directory.
output_test_tkgid_query_df_sorted_quad.to_csv('./data_tkg_EGIRIS/test_query.txt', header=None, index=None, sep='\t')

In [65]:
# Training queries: for every complex event (ceid) keep all rows except the
# ones at that event's earliest timid, then stack the per-event pieces.
output_train_tkgid = output_train.copy()
ceids = list(output_train_tkgid['ceid'].unique())
events_by_ce = (output_train_tkgid[output_train_tkgid['ceid'] == ce] for ce in ceids)
output_train_tkgid_query_dfs = [
    ce_events[ce_events['timid'] != ce_events['timid'].min()]
    for ce_events in events_by_ce
]
output_train_tkgid_query_df = pd.concat(output_train_tkgid_query_dfs, ignore_index=True)

In [68]:
# Inspect the training queries ordered by timid (display only; the sorted
# frame is not stored).
output_train_tkgid_query_df.sort_values(by=['timid'])

Unnamed: 0,actor1id,eventid,actor2id,timid,ceid
0,53,57,47,1,0
449,76,9,16,1,6
194,32,10,179,1,3
94,2,36,43,1,1
95,2,48,43,1,1
...,...,...,...,...,...
144162,984,5,520,1874,3425
144166,10,30,520,1874,3425
144170,0,15,520,1879,3425
144171,984,42,520,1879,3425


In [67]:
# Look up the time index of day 20200317; the training queries are capped at
# this index (1853) in the filter that follows.
tim2id[20200317]

1853

In [69]:
# Keep only training queries with timid <= 1853, the index that
# tim2id[20200317] evaluates to in this notebook.
output_train_tkgid_query_df = output_train_tkgid_query_df.loc[
    output_train_tkgid_query_df['timid'].le(1853)
]

In [70]:
# Order the training queries by time index and renumber the rows from 0.
output_train_tkgid_query_df_sorted = output_train_tkgid_query_df.sort_values(
    'timid',
    ignore_index=True,
)

In [71]:
# Inspect the time-sorted training queries.
output_train_tkgid_query_df_sorted

Unnamed: 0,actor1id,eventid,actor2id,timid,ceid
0,53,57,47,1,0
1,2,36,43,1,1
2,2,48,43,1,1
3,76,9,16,1,6
4,77,26,1177,1,6
...,...,...,...,...,...
145653,2,1,428,1853,3431
145654,2,2,0,1853,3431
145655,2,20,428,1853,3431
145656,403,0,1,1853,3445


In [72]:
# Persist the training queries (including ceid) as headerless, tab-separated rows.
output_train_tkgid_query_df_sorted.to_csv(
    './data_ce_EGIRIS/train_query.txt',
    sep='\t',
    header=None,
    index=None,
)

In [73]:
# Quadruple version for the TKG dataset: the same rows without the ceid column.
output_train_tkgid_query_df_sorted_quad = output_train_tkgid_query_df_sorted.drop(
    columns='ceid',
)

In [74]:
# Inspect the training quadruples (ceid column removed).
output_train_tkgid_query_df_sorted_quad

Unnamed: 0,actor1id,eventid,actor2id,timid
0,53,57,47,1
1,2,36,43,1
2,2,48,43,1
3,76,9,16,1
4,77,26,1177,1
...,...,...,...,...
145653,2,1,428,1853
145654,2,2,0,1853
145655,2,20,428,1853
145656,403,0,1,1853


In [75]:
# Persist the training quadruples as headerless, tab-separated rows in the TKG data directory.
output_train_tkgid_query_df_sorted_quad.to_csv(
    './data_tkg_EGIRIS/train_query.txt',
    sep='\t',
    header=None,
    index=None,
)