In [1]:
import pandas as pd
import json
import os
from tqdm import tqdm
from collections import Counter, defaultdict
import pickle
import csv
import argparse

In [2]:
# Force EventCode to be parsed as text rather than letting pandas infer a
# numeric dtype (presumably to keep the codes verbatim -- confirm).
column_types = dict(EventCode=str)

# The input file is tab-separated despite its .csv extension.
raw_data = pd.read_csv(
    './GDELT_ce_final.csv',
    sep='\t',
    dtype=column_types,
)

In [3]:
# Show the loaded frame as the cell output to eyeball its columns and row count.
raw_data

Unnamed: 0,Actor1Name,Actor2Name,EventCode,Actor1ADM1Code,Actor2ADM1Code,EventADM1Code,day,timid,year,Md5,ce_id,Md5_list
0,IRAN,WENDY SHERMAN,46,IR,IR,US,20150219,0,2015,c109d1e16caa8aff4fced57a8c4045a0,0,c109d1e16caa8aff4fced57a8c4045a0
1,ISRAEL,IRAN,50,US,IS,US,20150219,0,2015,6d40d295cd67747ee9b56cc2c8099cfd,-1,6d40d295cd67747ee9b56cc2c8099cfd
2,ITALIAN,EGYPT,40,EG,EG,EG,20150219,0,2015,edf3db2729f3e81b20e446b94f7a6b6a,-1,edf3db2729f3e81b20e446b94f7a6b6a
3,THE US,CITIZEN,10,IS,IS,IS,20150219,0,2015,72dcb54c9b49830caefdad4bb7453d3c,-1,72dcb54c9b49830caefdad4bb7453d3c
4,KINGDOM,EGYPT,57,IZ,IZ,EG,20150219,0,2015,fea4f048e58509eb5ce3a95a356d3f3f,-1,fea4f048e58509eb5ce3a95a356d3f3f
...,...,...,...,...,...,...,...,...,...,...,...,...
1201876,UKRAINIAN,ISRAEL,51,UP,IS,IS,20220317,2583,2022,87a046d58f0a05e2c9c1bb9bd826b594,4532,87a046d58f0a05e2c9c1bb9bd826b594
1201877,UKRAINE,ISRAEL,42,UP,IS,UP,20220317,2583,2022,7b4bab0e38720b867a170944cf233c9b,4532,7b4bab0e38720b867a170944cf233c9b
1201878,UKRAINE,BRITISH,84,IR,IR,IR,20220317,2583,2022,3f48fb321a6a8e00f5f8a2c1fd756b87,4479,3f48fb321a6a8e00f5f8a2c1fd756b87
1201879,IRAN,BRITAIN,841,IR,IR,IR,20220317,2583,2022,079db1b369ae3926f797e3e88e072877,-1,079db1b369ae3926f797e3e88e072877


In [4]:
# Map every distinct `day` value to a consecutive integer id, in order of
# first appearance in the file.
unique_time = raw_data['day'].unique()

# The ids are only meaningful as a timeline if the days already appear in
# ascending order, so fail loudly otherwise.
assert all(unique_time == sorted(unique_time)), "rows of raw_data are not sorted by 'day'"
tim2id = {name: idx for idx, name in enumerate(unique_time)}

# Make sure the output directory exists so a fresh checkout can run this
# cell without a FileNotFoundError.
os.makedirs('./data_tkg_GDELT', exist_ok=True)

# Persist the lookup as "<day>\t<id>" lines.
with open('./data_tkg_GDELT/date2id.txt', 'w') as f:
    for k, v in tim2id.items():
        f.write(f"{k}\t{v}\n")

In [5]:
max_date = raw_data['day'].max()


def _last_time_id_on_or_before(cutoff_day):
    """Return the id of the latest day in `tim2id` that is <= `cutoff_day`.

    When `cutoff_day` itself is a key of `tim2id` this is exactly
    `tim2id[cutoff_day]`. Otherwise (the day is absent from the data, or the
    integer arithmetic produced a non-existent date such as 20190229) the
    closest earlier day is used instead of raising KeyError.

    Raises:
        ValueError: if every known day is later than `cutoff_day`.
    """
    candidate_ids = [idx for day, idx in tim2id.items() if day <= cutoff_day]
    if not candidate_ids:
        raise ValueError(f"no day on or before {cutoff_day} in tim2id")
    return max(candidate_ids)


# `day` is an integer of the form YYYYMMDD, so subtracting 10000 moves the
# date back by exactly one calendar year.
# NOTE(review): these ids are positions in `tim2id`; the split cell compares
# them against the `timid` column, which assumes both use the same numbering
# -- confirm.
valid_split = _last_time_id_on_or_before(max_date - 10000)
train_split = _last_time_id_on_or_before(max_date - 20000)

In [6]:
# Chronological split on `timid`:
#   train: timid <= train_split
#   valid: train_split < timid <= valid_split
#   test : timid > valid_split
event_time = raw_data['timid']

train_df = raw_data.loc[event_time.le(train_split)]
val_df = raw_data.loc[event_time.gt(train_split) & event_time.le(valid_split)]
test_df = raw_data.loc[event_time.gt(valid_split)]

In [7]:
# Entities (either actor position) and relations that occur in the training
# split; the evaluation splits are later restricted to this vocabulary.
# pd.concat is the public replacement for the private Series._append.
anchor_ent = pd.concat([train_df['Actor1Name'], train_df['Actor2Name']]).unique()
anchor_rel = train_df['EventCode'].unique()

In [8]:
# Row counts of the train / valid / test splits, one per line.
for split_df in (train_df, val_df, test_df):
    print(len(split_df))

942124
125522
134235


In [9]:
# Keep only evaluation rows whose relation and both actors were seen in
# training (i.e. are present in anchor_rel / anchor_ent).
val_known = (
    val_df['EventCode'].isin(anchor_rel)
    & val_df['Actor1Name'].isin(anchor_ent)
    & val_df['Actor2Name'].isin(anchor_ent)
)
test_known = (
    test_df['EventCode'].isin(anchor_rel)
    & test_df['Actor1Name'].isin(anchor_ent)
    & test_df['Actor2Name'].isin(anchor_ent)
)

val_df = val_df[val_known]
test_df = test_df[test_known]

In [10]:
# Row counts again, after restricting valid/test to the training vocabulary.
print(*(len(split_df) for split_df in (train_df, val_df, test_df)), sep='\n')

942124
125522
134235


In [11]:
# Vocabulary sizes taken from the training split: entities, then relations.
print(len(anchor_ent), len(anchor_rel), sep='\n')

1555
239


In [12]:
# Number of distinct `Md5` values per split (missing values, if any, count
# as one value, exactly like len(.unique())).
for split_df in (train_df, val_df, test_df):
    print(split_df['Md5'].nunique(dropna=False))

216430
27659
27018


In [14]:
# `Md5_list` holds one or more hashes joined by ', '; count the distinct
# individual hashes referenced by each split.
for df in (train_df, val_df, test_df):
    md5s = {
        single_md5
        for joined in df['Md5_list']
        for single_md5 in joined.split(', ')
    }
    print(len(md5s))

218700
27931
27214


In [15]:
# Distinct entity names (both actor positions pooled) in each split and in
# the full data set. pd.concat replaces the private Series._append.
for frame in (train_df, val_df, test_df, raw_data):
    pooled_actors = pd.concat([frame['Actor1Name'], frame['Actor2Name']])
    print(len(pooled_actors.unique()))

1555
1328
1342
1555


In [16]:
# Distinct EventCode values in each split and in the full data set.
for frame in (train_df, val_df, test_df, raw_data):
    print(frame['EventCode'].nunique(dropna=False))

239
215
213
239


In [17]:
# Re-assemble the (filtered) splits into one frame so ids are assigned over
# exactly the rows that will be written out.
curate_df = pd.concat([train_df, val_df, test_df], axis=0)

# Vocabulary in order of first appearance; pd.concat replaces the private
# Series._append that was used to pool both actor columns.
unique_entity = pd.concat([curate_df['Actor1Name'], curate_df['Actor2Name']]).unique()
unique_relation = curate_df['EventCode'].unique()

ent2id = {name: idx for idx, name in enumerate(unique_entity)}
rel2id = {name: idx for idx, name in enumerate(unique_relation)}

# Encode every event as (actor1id, eventid, actor2id, timid). The explicit
# lookups raise KeyError on an unknown name instead of silently yielding NaN.
output_df = pd.DataFrame({
    'actor1id': curate_df['Actor1Name'].map(lambda x: ent2id[x]),
    'eventid': curate_df['EventCode'].map(lambda x: rel2id[x]),
    'actor2id': curate_df['Actor2Name'].map(lambda x: ent2id[x]),
    'timid': curate_df['timid'],
})

# Same chronological boundaries as the name-level split.
output_train = output_df[output_df['timid'] <= train_split]
output_val = output_df[(output_df['timid'] > train_split) & (output_df['timid'] <= valid_split)]
output_test = output_df[output_df['timid'] > valid_split]

In [18]:
def _sparsity_labels(ranked_counts):
    """Assign a frequency bucket to every id.

    Args:
        ranked_counts: list of ``(id, freq)`` pairs sorted by descending
            ``freq``.

    Returns:
        dict mapping each id to one of 'high', 'mid', 'relatively low',
        'moderately low' or 'extremely low'.

    The bucket is decided by the cumulative frequency of all ids ranked
    *before* the current one, compared against fractions of the total mass
    (1/3, 2/3, 7/9 and 8/9 of it, using integer thirds).
    """
    third = sum(freq for _, freq in ranked_counts) // 3
    # Inclusive upper bounds on the running total, checked in order.
    buckets = (
        (third, 'high'),
        (2 * third, 'mid'),
        (int(7 / 3 * third), 'relatively low'),
        (int(8 / 3 * third), 'moderately low'),
    )

    labels = {}
    seen_so_far = 0
    for ontology_id, freq in ranked_counts:
        labels[ontology_id] = next(
            (label for upper, label in buckets if seen_so_far <= upper),
            'extremely low',
        )
        # The current id's own frequency only affects the ids after it.
        seen_so_far += freq
    return labels


# Training-set frequencies, most frequent first. Entities are counted over
# both actor positions; pd.concat replaces the private Series._append.
# Counter.most_common() orders ties by first occurrence, like a stable sort.
train_stat_ent = Counter(
    pd.concat([output_train['actor1id'], output_train['actor2id']])
).most_common()
train_stat_rel = Counter(output_train['eventid']).most_common()

sparsity_split_ent = _sparsity_labels(train_stat_ent)
sparsity_split_rel = _sparsity_labels(train_stat_rel)

# Number of entities that landed in each bucket.
stat_dict = defaultdict(int)
for bucket_label in sparsity_split_ent.values():
    stat_dict[bucket_label] += 1

In [19]:
output_path = './data_tkg_GDELT'
# Create the target directory so the cell works on a clean checkout.
os.makedirs(output_path, exist_ok=True)

# Write the id-encoded quadruples as headerless, index-free TSV files.
# The files are overwritten (default mode 'w') rather than appended to, so
# re-running the notebook does not duplicate every row.
for split_name, split_frame in (
    ('train', output_train),
    ('valid', output_val),
    ('test', output_test),
):
    split_frame.to_csv(
        os.path.join(output_path, f'{split_name}.txt'),
        header=False,
        index=False,
        sep='\t',
    )

# name -> id lookups, one "<name>\t<id>" row each. newline='' is what the csv
# module requires of its file objects; without it rows get an extra '\r' on
# platforms that translate line endings.
for file_name, mapping in (('entity2id.txt', ent2id), ('relation2id.txt', rel2id)):
    with open(os.path.join(output_path, file_name), 'w', newline='') as f:
        csv.writer(f, delimiter='\t').writerows(mapping.items())

# "<number of entities>\t<number of relations>" on a single line.
with open(os.path.join(output_path, 'stat.txt'), 'w') as f:
    f.write(str(len(ent2id)) + '\t' + str(len(rel2id)))

# Frequency buckets for entities and relations, stored as a two-element list.
with open(os.path.join(output_path, 'sparsity.pkl'), 'wb') as f:
    pickle.dump([sparsity_split_ent, sparsity_split_rel], f)

In [20]:
# Also store the lookups as pretty-printed JSON. The context managers make
# sure each file is flushed and closed; a bare open() inside json.dump()
# leaves the handle open until garbage collection.
with open('./data_tkg_GDELT/ent2id_dict.json', 'w') as f:
    json.dump(ent2id, f, indent=4)

with open('./data_tkg_GDELT/rel2id_dict.json', 'w') as f:
    json.dump(rel2id, f, indent=4)