In [90]:
import json
import pandas as pd
from tqdm import tqdm
import os
from collections import Counter
import pickle as pkl

In [None]:
# Load the raw tab-separated table. Every column is read as a pandas string so
# zero-padded values are kept exactly as written in the file.
vicuna_raw = pd.read_csv(
    filepath_or_buffer='./EGIRIS_raw.csv',
    sep='\t',
    dtype='string',
)

In [None]:
# Load the relation-id hierarchy. The code below treats it as a nested mapping:
# level-1 id -> {level-2 id -> iterable of level-3 ids}.
# The context manager closes the file handle, which the bare open() call never did.
with open('../data/dict_hier_id.json', 'r') as hier_file:
    dict_hier_id = json.load(hier_file)

# Flatten the hierarchy into one list of ids per level.
level1s = list(dict_hier_id.keys())
level2s, level3s = [], []
for level1, info in dict_hier_id.items():
    for level2, level3 in info.items():
        level2s.append(level2)
        level3s.extend(level3)  # `level3` holds all level-3 ids under this level-2 id

# Same ids after an int round-trip, i.e. without any leading zeros.
level1s_int = [str(int(x)) for x in level1s]
level2s_int = [str(int(x)) for x in level2s]
level3s_int = [str(int(x)) for x in level3s]

### Drop rows containing NA values (e.g. a missing entity)

In [34]:
# Remove every row that has a missing value in any column
# (axis=0 / how='any' are the dropna defaults, spelled out here).
vicuna_dropna = vicuna_raw.dropna(axis=0, how='any')

In [56]:
# Display the table left after dropna() for a visual check.
vicuna_dropna

Unnamed: 0,Subject,Relation_id,Relation_choice,Object,Md5,Date
0,ISIS-linked militants,011,Refuse to comment,Libya,c35549216da33b38cc5a73602fa0591a,20150219
1,United Kingdom foreign secretary Selwyn Lloyd,013,Make optimistic comment,United States president Dwight D. Eisenhower,3552e2c49ee502ec23019bfb07ba6fb6,20150219
2,European Union,071,Provide economic aid,Palestinian Authority,029d9dc7a91b489457e962f4d6bc0fe3,20150219
3,Defeat,060,Engage in material cooperation,Defeat,0941ffd343441274e465a9499d3c9db3,20150219
4,"Syria, Lebanon",050,Engage in diplomatic cooperation,Hamas political leadership,859fc96157484e73268e105e6222ce53,20150219
...,...,...,...,...,...,...
1417556,Ukraine,080,Yield or concede,Israel,33469c03a101913442242e89f18899ce,20220317
1417557,ICC Prosecutor Karim Khan,046,Engage in negotiation,Palestinian Authority Minister of Foreign Affa...,d52355726f828f51f7deaa065f4fa451,20220317
1417558,Israel,061,Cooperate economically,Ukraine,9e24beca44f3a9d8a966625f97b55814,20220317
1417559,Ukraine,194,Fight with artillery and tanks,Russia,cac2d7644682d7109ca0d2d7a5aac8d5,20220317


In [36]:
# Number of distinct Md5 values in the cleaned table.
print(vicuna_dropna['Md5'].nunique(dropna=False))
# Number of distinct entities, i.e. values seen in either the Subject or the Object column.
print(pd.concat([vicuna_dropna['Subject'], vicuna_dropna['Object']]).nunique(dropna=False))

446657
323822


In [37]:
# Persist the cleaned table as a tab-separated file without the index column.
vicuna_dropna.to_csv('./EGIRIS_dropna.csv', index=False, sep='\t')

In [38]:
# Count the rows whose Relation_id belongs to each hierarchy level (level 1, 2, 3 in turn).
for level_ids in (level1s, level2s, level3s):
    print(int(vicuna_dropna['Relation_id'].isin(level_ids).sum()))

267922
972194
176539


### Filter entities by term frequency (TF) and document frequency (DF)

In [80]:
def calculate_frequency(vicuna_dropna):
    """Count how often each entity occurs and in how many documents.

    An entity is any value found in the ``Subject`` or ``Object`` column.

    Args:
        vicuna_dropna: DataFrame with ``Subject``, ``Object`` and ``Md5`` columns.

    Returns:
        A ``(term_freq, doc_freq)`` tuple of dicts keyed by entity.
        ``term_freq`` maps an entity to the number of Subject/Object cells
        holding it; ``doc_freq`` maps it to the number of distinct ``Md5``
        values among the rows it appears in. Both are ordered by ascending count.
    """
    # Entity frequency. pd.concat is the public way to stack the two columns;
    # the private Series._append is not part of the supported pandas API.
    entities = pd.concat([vicuna_dropna['Subject'], vicuna_dropna['Object']])
    term_freq = dict(sorted(Counter(entities).items(), key=lambda item: item[1]))

    # Document frequency: stack (entity, Md5) pairs from both roles under one
    # column name so a single groupby covers subjects and objects alike.
    subject_docs = vicuna_dropna[['Subject', 'Md5']]
    object_docs = vicuna_dropna[['Object', 'Md5']].rename(columns={'Object': 'Subject'})
    entity_docs = pd.concat([subject_docs, object_docs])
    # nunique() counts each Md5 once per entity, so a drop_duplicates pass
    # beforehand would not change the result.
    doc_freq = (
        entity_docs.groupby('Subject')['Md5']
        .nunique()
        .sort_values(ascending=True)
        .to_dict()
    )

    return term_freq, doc_freq

In [81]:
# Minimum counts an entity must reach for its rows to be kept.
m = 10  # term-frequency threshold (occurrences as Subject or Object)
n = 10  # document-frequency threshold (distinct Md5 values)
data = vicuna_dropna.copy()
prev_num_rows = data.shape[0]

# Removing rows lowers the counts of the entities that remain, so the filter is
# re-applied on the shrunken table until a pass removes no rows.
while True:
    term_freq, doc_freq = calculate_frequency(data)

    # A row survives only if BOTH its Subject and its Object meet both thresholds.
    mask_tf = data['Subject'].map(term_freq).ge(m) & data['Object'].map(term_freq).ge(m)
    mask_df = data['Subject'].map(doc_freq).ge(n) & data['Object'].map(doc_freq).ge(n)
    data = data[mask_tf & mask_df]

    if data.shape[0] == prev_num_rows:
        break
    prev_num_rows = data.shape[0]

# Summary of the converged table. pd.concat replaces the private Series._append
# and matches how the other cells stack Subject and Object.
entity_counter = Counter(pd.concat([data['Subject'], data['Object']]))
total_entity_filter = dict(sorted(entity_counter.items(), key=lambda item: item[1]))

md5_counter = Counter(data['Md5'])
total_md5_filter = dict(sorted(md5_counter.items(), key=lambda item: item[1]))

print(f"filter version {m}_{n}: number of events: {len(data)}")
print(f"filter version {m}_{n}: number of entities: {len(total_entity_filter)}, number of documents: {len(total_md5_filter)}")
print()

filter version 10_10: number of events: 834197
filter version 10_10: number of entities: 6322, number of documents: 339233


In [82]:
# Bind the filtered frame to a stage-specific name. This is the same object as
# `data`; no copy is made.
vicuna_tfdf = data

In [83]:
# Persist the filtered table as a tab-separated file without the index column.
vicuna_tfdf.to_csv('./EGIRIS_tfdf.csv', index=False, sep='\t')

In [87]:
# Reload the filtered table from disk, reading every column as a pandas string.
vicuna_tfdf = pd.read_csv(
    filepath_or_buffer='./EGIRIS_tfdf.csv',
    sep='\t',
    dtype='string',
)

In [88]:
# Display the tf/df-filtered table for a visual check.
vicuna_tfdf

Unnamed: 0,Subject,Relation_id,Relation_choice,Object,Md5,Date
0,European Union,071,Provide economic aid,Palestinian Authority,029d9dc7a91b489457e962f4d6bc0fe3,20150219
1,Democratic Party,0241,Appeal for leadership change,U.S. House of Representatives,7ab2ea40ced6f66d6335fc2da99021cc,20150219
2,US,057,Sign formal agreement,Iran,0b6cd037f3fc22ed82e0c147839ba9f9,20150219
3,United States,051,Praise or endorse,Israel,0e3390e4d38e679fc25d764528a781e4,20150219
4,Hamas,0212,Appeal for military cooperation,"Syria, Lebanon",859fc96157484e73268e105e6222ce53,20150219
...,...,...,...,...,...,...
834192,Philippines,057,Sign formal agreement,US Vice President Kamala Harris,1b65e3f5cfb24e5883b13ddde0d6fa92,20220317
834193,Ukraine,080,Yield or concede,Israel,33469c03a101913442242e89f18899ce,20220317
834194,Israel,061,Cooperate economically,Ukraine,9e24beca44f3a9d8a966625f97b55814,20220317
834195,Ukraine,194,Fight with artillery and tanks,Russia,cac2d7644682d7109ca0d2d7a5aac8d5,20220317


In [84]:
# Number of distinct Md5 values in the filtered table.
print(vicuna_tfdf['Md5'].nunique(dropna=False))
# Number of distinct entities, i.e. values seen in either the Subject or the Object column.
print(pd.concat([vicuna_tfdf['Subject'], vicuna_tfdf['Object']]).nunique(dropna=False))

339233
6322


In [89]:
# Count the rows whose Relation_id belongs to each hierarchy level (level 1, 2, 3 in turn).
for level_ids in (level1s, level2s, level3s):
    print(int(vicuna_tfdf['Relation_id'].isin(level_ids).sum()))

167188
559936
107073
