In [1]:
import pandas as pd
import json
import os
from tqdm import tqdm
from collections import Counter, defaultdict
import pickle
import csv
import argparse

In [2]:
# EventCode is read as text rather than a number: it is later used as a key
# into rel2id, which is loaded from JSON and therefore has string keys.
column_types = dict(EventCode=str)
# The full event table is stored as a tab-separated file.
raw_data = pd.read_csv('./GDELT_ce_final.csv', sep='\t', dtype=column_types)

In [3]:
# Display the loaded event table as this cell's output.
raw_data

Unnamed: 0,Actor1Name,Actor2Name,EventCode,Actor1ADM1Code,Actor2ADM1Code,EventADM1Code,day,timid,year,Md5,ce_id,Md5_list
0,IRAN,WENDY SHERMAN,46,IR,IR,US,20150219,0,2015,c109d1e16caa8aff4fced57a8c4045a0,0,c109d1e16caa8aff4fced57a8c4045a0
1,ISRAEL,IRAN,50,US,IS,US,20150219,0,2015,6d40d295cd67747ee9b56cc2c8099cfd,-1,6d40d295cd67747ee9b56cc2c8099cfd
2,ITALIAN,EGYPT,40,EG,EG,EG,20150219,0,2015,edf3db2729f3e81b20e446b94f7a6b6a,-1,edf3db2729f3e81b20e446b94f7a6b6a
3,THE US,CITIZEN,10,IS,IS,IS,20150219,0,2015,72dcb54c9b49830caefdad4bb7453d3c,-1,72dcb54c9b49830caefdad4bb7453d3c
4,KINGDOM,EGYPT,57,IZ,IZ,EG,20150219,0,2015,fea4f048e58509eb5ce3a95a356d3f3f,-1,fea4f048e58509eb5ce3a95a356d3f3f
...,...,...,...,...,...,...,...,...,...,...,...,...
1201876,UKRAINIAN,ISRAEL,51,UP,IS,IS,20220317,2583,2022,87a046d58f0a05e2c9c1bb9bd826b594,4532,87a046d58f0a05e2c9c1bb9bd826b594
1201877,UKRAINE,ISRAEL,42,UP,IS,UP,20220317,2583,2022,7b4bab0e38720b867a170944cf233c9b,4532,7b4bab0e38720b867a170944cf233c9b
1201878,UKRAINE,BRITISH,84,IR,IR,IR,20220317,2583,2022,3f48fb321a6a8e00f5f8a2c1fd756b87,4479,3f48fb321a6a8e00f5f8a2c1fd756b87
1201879,IRAN,BRITAIN,841,IR,IR,IR,20220317,2583,2022,079db1b369ae3926f797e3e88e072877,-1,079db1b369ae3926f797e3e88e072877


In [4]:
# Give every distinct day a consecutive integer id (in order of first
# appearance in raw_data) and persist the mapping as "<day>\t<id>" lines.
unique_time = raw_data['day'].unique()
# The ids only form a timeline if the days already appear in ascending
# order, so stop here if they do not.
assert list(unique_time) == sorted(unique_time)
tim2id = dict(zip(unique_time, range(len(unique_time))))
with open('./data_ce_GDELT/date2id.txt', 'w') as f:
    f.writelines(f"{k}\t{v}\n" for k, v in tim2id.items())

In [5]:
# Load the train / validation / test / outlier splits. Each file is
# tab-separated and uses the same dtype override as the full event table.
train_df = pd.read_csv('./GDELT_ce_train.csv', sep='\t', dtype=column_types)
val_df = pd.read_csv('./GDELT_ce_val_final.csv', sep='\t', dtype=column_types)
test_df = pd.read_csv('./GDELT_ce_test_final.csv', sep='\t', dtype=column_types)
outlier_df = pd.read_csv('./GDELT_ce_outliers_final.csv', sep='\t', dtype=column_types)

In [6]:
# Reuse the entity / relation id dictionaries from the tkg dataset so both
# datasets share the same ids.
# The files are opened in `with` blocks so the handles are closed
# deterministically instead of being left to the garbage collector.
with open('./data_tkg_GDELT/ent2id_dict.json') as f:
    ent2id = json.load(f)
with open('./data_tkg_GDELT/rel2id_dict.json') as f:
    rel2id = json.load(f)

In [7]:
# Number of entries in the entity -> id dictionary.
len(ent2id)

1555

In [8]:
# Number of entries in the relation (event code) -> id dictionary.
len(rel2id)

239

In [9]:
# Encode the train split as integer ids: actor names go through ent2id and
# event codes through rel2id; timid and ce_id are carried over unchanged.
# The dict lookups raise KeyError for any name/code missing from the
# dictionaries, so unknown values cannot slip through silently.
output_train = pd.DataFrame({
    'actor1id': train_df['Actor1Name'].map(lambda name: ent2id[name]),
    'eventid': train_df['EventCode'].map(lambda code: rel2id[code]),
    'actor2id': train_df['Actor2Name'].map(lambda name: ent2id[name]),
    'timid': train_df['timid'],
    'ceid': train_df['ce_id'],
})

In [10]:
# Encode the validation split as integer ids: actor names go through ent2id
# and event codes through rel2id; timid and ce_id are carried over unchanged.
# The dict lookups raise KeyError for any name/code missing from the
# dictionaries, so unknown values cannot slip through silently.
output_val = pd.DataFrame({
    'actor1id': val_df['Actor1Name'].map(lambda name: ent2id[name]),
    'eventid': val_df['EventCode'].map(lambda code: rel2id[code]),
    'actor2id': val_df['Actor2Name'].map(lambda name: ent2id[name]),
    'timid': val_df['timid'],
    'ceid': val_df['ce_id'],
})

In [11]:
# Encode the test split as integer ids: actor names go through ent2id and
# event codes through rel2id; timid and ce_id are carried over unchanged.
# The dict lookups raise KeyError for any name/code missing from the
# dictionaries, so unknown values cannot slip through silently.
output_test = pd.DataFrame({
    'actor1id': test_df['Actor1Name'].map(lambda name: ent2id[name]),
    'eventid': test_df['EventCode'].map(lambda code: rel2id[code]),
    'actor2id': test_df['Actor2Name'].map(lambda name: ent2id[name]),
    'timid': test_df['timid'],
    'ceid': test_df['ce_id'],
})

In [12]:
# Encode the outlier split as integer ids: actor names go through ent2id and
# event codes through rel2id; timid and ce_id are carried over unchanged.
# The dict lookups raise KeyError for any name/code missing from the
# dictionaries, so unknown values cannot slip through silently.
output_outlier = pd.DataFrame({
    'actor1id': outlier_df['Actor1Name'].map(lambda name: ent2id[name]),
    'eventid': outlier_df['EventCode'].map(lambda code: rel2id[code]),
    'actor2id': outlier_df['Actor2Name'].map(lambda name: ent2id[name]),
    'timid': outlier_df['timid'],
    'ceid': outlier_df['ce_id'],
})

In [13]:
# Write the id-encoded splits as headerless, index-free TSV files
# (columns: actor1id, eventid, actor2id, timid, ceid).
# The files are overwritten (to_csv's default mode) rather than appended to:
# with mode='a' every re-run of this cell silently duplicated all rows in
# the output files, unlike every other export in this notebook.
output_path = './data_ce_GDELT'
output_train.to_csv(output_path + '/train.txt', header=None, index=None, sep='\t')
output_val.to_csv(output_path + '/valid.txt', header=None, index=None, sep='\t')
output_test.to_csv(output_path + '/test.txt', header=None, index=None, sep='\t')
output_outlier.to_csv(output_path + '/outliers.txt', header=None, index=None, sep='\t')

In [None]:
# Generate the validation, test, and train query sets.

In [14]:
# Build the validation query set: for every ceid, discard the events that
# fall on that ceid's earliest timid and keep all later ones.
output_val_tkgid = output_val.copy()
output_val_tkgid_query_dfs = []
ceids = list(output_val_tkgid['ceid'].unique())
# One groupby pass replaces a full-frame boolean scan per ceid.
# sort=False yields the groups in first-appearance order (the order of
# `ceids`) with rows in their original order, so the concatenated result is
# row-for-row what per-ceid masking produces.
for ceid, query_df in tqdm(output_val_tkgid.groupby('ceid', sort=False), total=len(ceids)):
    min_timid = query_df['timid'].min()
    query_df = query_df[query_df['timid'] != min_timid]
    output_val_tkgid_query_dfs.append(query_df)
# ignore_index=True renumbers the rows 0..n-1 across all ceids.
output_val_tkgid_query_df = pd.concat(output_val_tkgid_query_dfs, ignore_index=True)

100%|███████████████████████████████████████████████████| 500/500 [00:00<00:00, 2208.62it/s]


In [15]:
# Display the validation queries before any time-range filtering.
output_val_tkgid_query_df

Unnamed: 0,actor1id,eventid,actor2id,timid,ceid
0,1,23,328,1832,3561
1,1,8,1,1832,3561
2,1,23,1,1832,3561
3,1,3,1,1832,3561
4,241,8,1,1832,3561
...,...,...,...,...,...
60687,898,17,1,2217,4060
60688,68,3,131,2217,4060
60689,44,3,131,2217,4060
60690,216,37,44,2217,4060


In [16]:
# Drop validation queries earlier than time id 1854, i.e. one past the train
# cut-off 1853 that tim2id[20200317] resolves to further down in this notebook.
output_val_tkgid_query_df = output_val_tkgid_query_df.loc[
    lambda frame: frame['timid'] >= 1854
]

In [17]:
# Row count after applying the lower bound (timid >= 1854).
len(output_val_tkgid_query_df)

60624

In [18]:
# Drop validation queries later than time id 2218. The test queries are
# filtered to timid >= 2219, so 2218 is presumably the last time id of the
# validation window -- TODO confirm which date it corresponds to.
output_val_tkgid_query_df = output_val_tkgid_query_df.loc[
    lambda frame: frame['timid'] <= 2218
]

In [19]:
# Row count after also applying the upper bound (timid <= 2218).
len(output_val_tkgid_query_df)

59897

In [20]:
# Order the validation queries chronologically by time id and renumber the
# rows 0..n-1.
output_val_tkgid_query_df_sorted = output_val_tkgid_query_df.sort_values(
    'timid',
    ignore_index=True,
)

In [21]:
# Display the time-sorted validation queries.
output_val_tkgid_query_df_sorted

Unnamed: 0,actor1id,eventid,actor2id,timid,ceid
0,1,23,9,1854,3561
1,0,40,199,1854,3571
2,443,8,0,1854,3571
3,44,52,16,1854,3571
4,6,34,495,1854,3571
...,...,...,...,...,...
59892,0,0,28,2218,4003
59893,0,56,34,2218,4013
59894,13,22,1,2218,4014
59895,16,36,16,2218,4048


In [22]:
# Export the sorted validation queries, including the ceid column, as a
# headerless, index-free TSV file for the ce dataset.
output_val_tkgid_query_df_sorted.to_csv(
    './data_ce_GDELT/valid_query.txt',
    sep='\t',
    index=None,
    header=None,
)

In [23]:
# Strip the ceid column so each row is a plain
# (actor1id, eventid, actor2id, timid) quadruple.
output_val_tkgid_query_df_sorted_quad = output_val_tkgid_query_df_sorted.drop(columns=['ceid'])

In [24]:
# Display the validation quadruples (ceid column removed).
output_val_tkgid_query_df_sorted_quad

Unnamed: 0,actor1id,eventid,actor2id,timid
0,1,23,9,1854
1,0,40,199,1854
2,443,8,0,1854
3,44,52,16,1854
4,6,34,495,1854
...,...,...,...,...
59892,0,0,28,2218
59893,0,56,34,2218
59894,13,22,1,2218
59895,16,36,16,2218


In [25]:
# Export the validation quadruples (no ceid) as a headerless, index-free
# TSV file for the tkg dataset.
output_val_tkgid_query_df_sorted_quad.to_csv(
    './data_tkg_GDELT/valid_query.txt',
    sep='\t',
    index=None,
    header=None,
)

In [26]:
# Build the test query set: for every ceid, discard the events that fall on
# that ceid's earliest timid and keep all later ones.
output_test_tkgid = output_test.copy()
output_test_tkgid_query_dfs = []
ceids = list(output_test_tkgid['ceid'].unique())
# One groupby pass replaces a full-frame boolean scan per ceid.
# sort=False yields the groups in first-appearance order (the order of
# `ceids`) with rows in their original order, so the concatenated result is
# row-for-row what per-ceid masking produces.
for ceid, query_df in output_test_tkgid.groupby('ceid', sort=False):
    min_timid = query_df['timid'].min()
    query_df = query_df[query_df['timid'] != min_timid]
    output_test_tkgid_query_dfs.append(query_df)
# ignore_index=True renumbers the rows 0..n-1 across all ceids.
output_test_tkgid_query_df = pd.concat(output_test_tkgid_query_dfs, ignore_index=True)

In [27]:
# Display the test queries before any time-range filtering.
output_test_tkgid_query_df

Unnamed: 0,actor1id,eventid,actor2id,timid,ceid
0,671,13,0,1835,4061
1,671,20,16,1835,4061
2,89,8,0,1843,4061
3,67,36,0,1843,4061
4,265,36,0,1843,4061
...,...,...,...,...,...
60746,88,60,1,2583,4534
60747,276,83,1,2583,4534
60748,1,23,88,2583,4534
60749,88,8,1,2583,4534


In [28]:
# Drop test queries earlier than time id 2219, i.e. one past the 2218 upper
# bound applied to the validation queries.
output_test_tkgid_query_df = output_test_tkgid_query_df.loc[
    lambda frame: frame['timid'] >= 2219
]

In [29]:
# Order the test queries chronologically by time id and renumber the rows
# 0..n-1.
output_test_tkgid_query_df_sorted = output_test_tkgid_query_df.sort_values(
    'timid',
    ignore_index=True,
)

In [30]:
# Display the time-sorted test queries.
output_test_tkgid_query_df_sorted

Unnamed: 0,actor1id,eventid,actor2id,timid,ceid
0,82,19,9,2219,4073
1,1,16,180,2219,4072
2,361,18,108,2219,4066
3,26,23,988,2219,4072
4,72,20,671,2219,4072
...,...,...,...,...,...
60447,7,1,72,2583,4510
60448,446,8,1,2583,4510
60449,73,23,446,2583,4510
60450,13,120,1,2583,4500


In [31]:
# Export the sorted test queries, including the ceid column, as a
# headerless, index-free TSV file for the ce dataset.
output_test_tkgid_query_df_sorted.to_csv(
    './data_ce_GDELT/test_query.txt',
    sep='\t',
    index=None,
    header=None,
)

In [32]:
# Strip the ceid column so each row is a plain
# (actor1id, eventid, actor2id, timid) quadruple.
output_test_tkgid_query_df_sorted_quad = output_test_tkgid_query_df_sorted.drop(columns=['ceid'])

In [33]:
# Display the test quadruples (ceid column removed).
output_test_tkgid_query_df_sorted_quad

Unnamed: 0,actor1id,eventid,actor2id,timid
0,82,19,9,2219
1,1,16,180,2219
2,361,18,108,2219
3,26,23,988,2219
4,72,20,671,2219
...,...,...,...,...
60447,7,1,72,2583
60448,446,8,1,2583
60449,73,23,446,2583
60450,13,120,1,2583


In [34]:
# Export the test quadruples (no ceid) as a headerless, index-free TSV file
# for the tkg dataset.
output_test_tkgid_query_df_sorted_quad.to_csv(
    './data_tkg_GDELT/test_query.txt',
    sep='\t',
    index=None,
    header=None,
)

In [35]:
# Build the train query set: for every ceid, discard the events that fall
# on that ceid's earliest timid and keep all later ones.
output_train_tkgid = output_train.copy()
output_train_tkgid_query_dfs = []
ceids = list(output_train_tkgid['ceid'].unique())
# One groupby pass replaces a full-frame boolean scan per ceid, which
# matters most here because train is the largest split.
# sort=False yields the groups in first-appearance order (the order of
# `ceids`) with rows in their original order, so the concatenated result is
# row-for-row what per-ceid masking produces.
for ceid, query_df in output_train_tkgid.groupby('ceid', sort=False):
    min_timid = query_df['timid'].min()
    query_df = query_df[query_df['timid'] != min_timid]
    output_train_tkgid_query_dfs.append(query_df)
# ignore_index=True renumbers the rows 0..n-1 across all ceids.
output_train_tkgid_query_df = pd.concat(output_train_tkgid_query_dfs, ignore_index=True)

In [36]:
# Preview the train queries in time order. The result is only displayed,
# not stored, and the original row labels are kept.
output_train_tkgid_query_df.sort_values('timid')

Unnamed: 0,actor1id,eventid,actor2id,timid,ceid
0,16,20,21,1,0
1263,56,25,212,1,7
1813,14,23,1132,1,10
1812,15,0,7,1,10
1811,7,0,15,1,10
...,...,...,...,...,...
392433,7,23,34,1885,3516
392434,549,8,7,1885,3516
392436,7,23,549,1885,3516
392432,20,23,538,1885,3516


In [37]:
# Look up the time id assigned to day 20200317; the value shown (1853) is
# the upper bound used to filter the train queries in the next cell.
tim2id[20200317]

1853

In [38]:
# Keep only train queries up to time id 1853, the id that
# tim2id[20200317] resolves to in the preceding cell.
output_train_tkgid_query_df = output_train_tkgid_query_df.loc[
    lambda frame: frame['timid'] <= 1853
]

In [39]:
# Order the train queries chronologically by time id and renumber the rows
# 0..n-1.
output_train_tkgid_query_df_sorted = output_train_tkgid_query_df.sort_values(
    'timid',
    ignore_index=True,
)

In [40]:
# Display the time-sorted train queries.
output_train_tkgid_query_df_sorted

Unnamed: 0,actor1id,eventid,actor2id,timid,ceid
0,16,20,21,1,0
1,20,8,159,1,5
2,20,76,20,1,5
3,159,23,20,1,5
4,20,19,160,1,5
...,...,...,...,...,...
397837,159,100,79,1853,3556
397838,57,66,81,1853,3556
397839,7,8,159,1853,3556
397840,99,15,21,1853,3545


In [41]:
# Export the sorted train queries, including the ceid column, as a
# headerless, index-free TSV file for the ce dataset.
output_train_tkgid_query_df_sorted.to_csv(
    './data_ce_GDELT/train_query.txt',
    sep='\t',
    index=None,
    header=None,
)

In [42]:
# Strip the ceid column so each row is a plain
# (actor1id, eventid, actor2id, timid) quadruple.
output_train_tkgid_query_df_sorted_quad = output_train_tkgid_query_df_sorted.drop(columns=['ceid'])

In [43]:
# Display the train quadruples (ceid column removed).
output_train_tkgid_query_df_sorted_quad

Unnamed: 0,actor1id,eventid,actor2id,timid
0,16,20,21,1
1,20,8,159,1
2,20,76,20,1
3,159,23,20,1
4,20,19,160,1
...,...,...,...,...
397837,159,100,79,1853
397838,57,66,81,1853
397839,7,8,159,1853
397840,99,15,21,1853


In [44]:
# Export the train quadruples (no ceid) as a headerless, index-free TSV
# file for the tkg dataset.
output_train_tkgid_query_df_sorted_quad.to_csv(
    './data_tkg_GDELT/train_query.txt',
    sep='\t',
    index=None,
    header=None,
)