In [1]:
from bertopic import BERTopic
import hdbscan
from umap import UMAP
import argparse
import json
import os
from tqdm import tqdm
import pandas as pd
import numpy as np

# Seed handed to the UMAP model as its `random_state` when the model is built.
RANDOM_STATE = 2023

  @numba.jit()
  @numba.jit()
  @numba.jit()
  from .autonotebook import tqdm as notebook_tqdm
  @numba.jit()


In [2]:
# Run configuration. The options are declared argparse-style so the same code
# can be used as a script; inside the notebook an empty argument list is parsed,
# which means every option takes its default value.
parser = argparse.ArgumentParser()
parser.add_argument('--n_neighbors', type=int, default=200,
                    help='how UMAP balances local versus global structure in the data')
parser.add_argument('--n_components', type=int, default=32,
                    help='n_dimension that UMAP will reduce to')
parser.add_argument('--min_dist', type=float, default=0.0,
                    help='point min distance after UMAP')
parser.add_argument('--min_topic_size', type=int, default=10,
                    help='minimum number of atomic events in a complex event after HDBSCAN')
parser.add_argument('--top_n_words', type=int, default=20,
                    help='topic n words for each cluster (complex event)')
parser.add_argument('--input_path', type=str, default='/storage_fast/ccye/FastChat/clustering/entlink/',
                    help='input path holding docs.json, doc_embs.npy and time_embs.npy')
parser.add_argument('--output_path', type=str, default='/storage_fast/ccye/FastChat/clustering/entlink/outputs_noprob/',
                    help='output path of bertopic model')
parser.add_argument('--time_weight', type=float, default=1.0,
                    help='time feature weight')
parser.add_argument('--umap_emb_path', type=str, default='/storage_fast/ccye/FastChat/clustering/entlink/outputs/time_t1.0_n200_m10/',
                    help='umap embedding path')

# Explicit empty argv: ignore the kernel's own command line and use defaults.
# When running as a script, call parser.parse_args() without arguments instead.
args = parser.parse_args([])
input_path = args.input_path

# Each (time weight, n_neighbors, min topic size) combination gets its own
# output sub-directory. The trailing '' keeps a trailing path separator, and
# os.path.join makes this work whether or not --output_path ends with one.
curr_setting = 'time_t{}_n{}_m{}'.format(args.time_weight, args.n_neighbors, args.min_topic_size)
output_path = os.path.join(args.output_path, curr_setting, '')

print('start loading documents')
# Context manager so the file handle is closed deterministically.
with open(os.path.join(input_path, 'docs.json'), 'r') as docs_file:
    docs = json.load(docs_file)
print('end loading documents, length: ')
print(len(docs))

print('start loading document embeddings')
doc_embs = np.load(os.path.join(input_path, 'doc_embs.npy'))
print('end loading document embeddings, shape: ')
print(doc_embs.shape)

print('start loading time embeddings')
time_embs = np.load(os.path.join(input_path, 'time_embs.npy'))
# Scale the time feature by the configured weight before it is used downstream.
time_embs = time_embs * args.time_weight
print('end loading time embeddings with weight {}, shape: '.format(args.time_weight))
print(time_embs.shape)

# Pre-computed UMAP embeddings are loaded from disk rather than recomputed here.
umap_emb_file = os.path.join(args.umap_emb_path, 'umap_embeddings.npy')
umap_embs = np.load(umap_emb_file)
print('Umap embedding loaded from: ' + umap_emb_file + ', shape: ')
print(umap_embs.shape)

# All four inputs are passed together to the fit call, one entry per document;
# fail early with a clear message if they are out of sync.
assert len(docs) == doc_embs.shape[0] == time_embs.shape[0] == umap_embs.shape[0], \
    'docs, doc_embs, time_embs and umap_embs must all have the same number of rows'

start loading documents
end loading documents, length: 
275406
start loading document embeddings
end loading document embeddings, shape: 
(275406, 1024)
start loading time embeddings
end loading time embeddings with weight 1.0, shape: 
(275406,)
Umap embedding loaded from: /storage_fast/ccye/FastChat/clustering/entlink/outputs/time_t1.0_n200_m10/umap_embeddings.npy, shape: 
(275406, 32)


In [3]:
# UMAP settings, driven by the parsed arguments.
umap_kwargs = dict(
    n_neighbors=args.n_neighbors,    # local-vs-global structure trade-off
    n_components=args.n_components,  # target dimensionality
    min_dist=args.min_dist,
    metric='cosine',
    low_memory=False,
    random_state=RANDOM_STATE,
)
umap_model = UMAP(**umap_kwargs)

# HDBSCAN settings; min_cluster_size is the minimum number of atomic events
# that make up a complex event (CE).
hdbscan_kwargs = dict(
    min_cluster_size=args.min_topic_size,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)
hdbscan_model = hdbscan.HDBSCAN(**hdbscan_kwargs)

# Topic model wiring both sub-models together.
# NOTE(review): `save_path` and `fit_time_umap` are not part of the upstream
# BERTopic API as far as I can tell -- presumably a project fork; confirm.
bertopic_kwargs = dict(
    language="english",
    top_n_words=args.top_n_words,
    min_topic_size=args.min_topic_size,  # minimum number of atomic events in a CE
    calculate_probabilities=False,  # only calculate the prob that a document belongs to its assigned CE
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    verbose=True,
    save_path=output_path,
)
bertopic_model = BERTopic(**bertopic_kwargs)

# Fit using the documents, their embeddings, the weighted time feature and the
# pre-computed UMAP embeddings loaded earlier.
topic_model = bertopic_model.fit_time_umap(
    docs,
    embeddings=doc_embs,
    time_embs=time_embs,
    umap_embs=umap_embs,
)

2023-06-05 21:51:24,237 - BERTopic - Making new dir: /storage_fast/ccye/FastChat/clustering/entlink/outputs_noprob/time_t1.0_n200_m10/
2023-06-05 21:51:24,243 - BERTopic - Set save path: /storage_fast/ccye/FastChat/clustering/entlink/outputs_noprob/time_t1.0_n200_m10/
2023-06-05 21:51:24,770 - BERTopic - Umap embedding loaded and saved at: /storage_fast/ccye/FastChat/clustering/entlink/outputs_noprob/time_t1.0_n200_m10/umap_embeddings.npy
2023-06-05 21:51:24,771 - BERTopic - shape: (275406, 32)
2023-06-05 21:51:24,772 - BERTopic - start umap emb normalization
2023-06-05 21:51:24,795 - BERTopic - Umap emb l2 normalized
2023-06-05 21:51:24,796 - BERTopic - start adding time feature
2023-06-05 21:51:24,813 - BERTopic - Time feature added, saved at: /storage_fast/ccye/FastChat/clustering/entlink/outputs_noprob/time_t1.0_n200_m10/umap_embeddings_time.npy
2023-06-05 21:51:24,814 - BERTopic - shape: (275406, 33)
2023-06-05 21:51:24,815 - BERTopic - start hdbscan clustering
2023-06-05 21:52:24

In [4]:
# Show the per-topic size mapping (topic id -> count); id -1 is the outlier bucket.
topic_model.topic_sizes_

{-1: 144644,
 0: 2219,
 1: 975,
 2: 797,
 3: 753,
 4: 684,
 5: 569,
 6: 569,
 7: 561,
 8: 538,
 9: 485,
 10: 483,
 11: 462,
 12: 427,
 13: 391,
 14: 389,
 15: 335,
 16: 333,
 17: 332,
 18: 317,
 19: 311,
 20: 305,
 21: 301,
 22: 299,
 23: 298,
 24: 297,
 25: 283,
 26: 273,
 27: 269,
 28: 240,
 29: 236,
 30: 235,
 31: 235,
 32: 228,
 33: 228,
 34: 227,
 35: 223,
 36: 222,
 37: 218,
 38: 214,
 39: 206,
 40: 204,
 41: 200,
 42: 197,
 43: 194,
 44: 193,
 45: 191,
 46: 191,
 47: 185,
 48: 183,
 49: 182,
 50: 182,
 51: 182,
 52: 181,
 53: 177,
 54: 176,
 55: 172,
 57: 171,
 56: 171,
 58: 170,
 59: 168,
 60: 167,
 61: 165,
 62: 160,
 63: 159,
 64: 159,
 65: 155,
 66: 155,
 67: 154,
 68: 154,
 69: 153,
 71: 152,
 70: 152,
 72: 148,
 73: 147,
 74: 143,
 75: 142,
 76: 142,
 77: 140,
 78: 140,
 79: 140,
 80: 139,
 81: 138,
 82: 138,
 84: 137,
 85: 137,
 83: 137,
 87: 136,
 86: 136,
 88: 134,
 89: 133,
 90: 133,
 91: 132,
 92: 132,
 93: 130,
 94: 129,
 95: 129,
 96: 128,
 97: 128,
 98: 127,
 100: 

In [5]:
# Summary table with one row per topic: id, count, name, top words, representative docs.
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,144644,-1_is_of_are_and,"[is, of, are, and, their, the, this, that, for...",[Iran Uses 'Suicide Drones' to Surveil U.S. Mi...
1,0,2219,0_coronavirus_virus_cases_health,"[coronavirus, virus, cases, health, infected, ...",[Iran reports 2 more coronavirus deaths as Leb...
2,1,975,1_shot_stabbing_iof_stabbed,"[shot, stabbing, iof, stabbed, pchr, stab, wou...",[2 Palestinians shot dead in the West Bank aft...
3,2,797,2_vaccine_vaccines_vaccination_doses,"[vaccine, vaccines, vaccination, doses, pfizer...",[Israel said to be paying average of $47 per p...
4,3,753,3_qatar_doha_qatars_qatari,"[qatar, doha, qatars, qatari, emir, bahrain, d...",[Key developments in Qatar-Gulf crisis\nA gene...
...,...,...,...,...,...
3746,3745,10,3745_pa_frej_financial_unemployment,"[pa, frej, financial, unemployment, pas, ahcl,...","[UN urges Israel, PA to jointly address ‘dire’..."
3747,3746,10,3746_condensate_fuji_buyers_imports,"[condensate, fuji, buyers, imports, resume, cr...","[Japan, South Korea plan to resume Iran oil im..."
3748,3747,10,3747_tatour_tamimi_ahed_poems,"[tatour, tamimi, ahed, poems, khater, addameer...",[Palestinian poet jailed for online incitement...
3749,3748,10,3748_albu_kamal_tanf_mobilisation,"[albu, kamal, tanf, mobilisation, alharra, usl...",[Syria state media says U.S. bombs military po...


In [6]:
# Number of real topics, i.e. every topic id except the outlier label -1.
# Counting ids explicitly (instead of `len(...) - 1`) stays correct even when
# the clustering produced no outlier topic at all.
# Assumes get_topics() returns a mapping keyed by topic id -- TODO confirm for this fork.
n_topics = sum(1 for topic_id in topic_model.get_topics() if topic_id != -1)

In [7]:
# Display the topic count computed in the previous cell.
n_topics

3750

In [8]:
# Load the event table (every column read as string) and annotate each event
# with the topic of its source document and with its day offset.
event_df = pd.read_csv('/storage_fast/ccye/FastChat/data_final/EGIRIS_entlink.csv', sep='\t', dtype='string')

# Context managers so the JSON file handles are closed deterministically.
with open('/storage_fast/ccye/FastChat/clustering/entlink/md5_list.json') as md5_list_file:
    md5_list = json.load(md5_list_file)
with open('/storage_fast/ccye/FastChat/clustering/entlink/md52nday.json') as md52nday_file:
    md52nday = json.load(md52nday_file)

# md5_list is used positionally against topic_model.topics_ (entry i is the md5
# of document i). A shorter md5_list previously surfaced as an opaque
# IndexError mid-loop; report it explicitly instead.
if len(md5_list) < len(topic_model.topics_):
    raise ValueError(
        'md5_list has {} entries but the model assigned {} topics'.format(
            len(md5_list), len(topic_model.topics_)))
md52topicid = dict(zip(md5_list, topic_model.topics_))

# Plain dict lookups on purpose: an md5 missing from either mapping raises
# KeyError rather than silently producing a missing value.
event_df['Topic'] = [md52topicid[md5] for md5 in event_df['Md5']]
event_df['Nday'] = [md52nday[md5] for md5 in event_df['Md5']]

In [9]:
# Inspect the event table with its newly added Topic and Nday columns.
event_df

Unnamed: 0,Subject,Relation_id,Relation_choice,Object,Md5,Date,Topic,Nday
0,European Union,071,Provide economic aid,Palestinian Authority,029d9dc7a91b489457e962f4d6bc0fe3,20150219,-1,0
1,US,057,Sign formal agreement,Iran,0b6cd037f3fc22ed82e0c147839ba9f9,20150219,15,0
2,United States,051,Praise or endorse,Israel,0e3390e4d38e679fc25d764528a781e4,20150219,-1,0
3,United States,111,Criticize or denounce,Islam,0b79f78beb4ffba834ab552c7ef020ea,20150219,-1,0
4,Hamas,1831,Carry out suicide bombing,Israel,859fc96157484e73268e105e6222ce53,20150219,852,0
...,...,...,...,...,...,...,...,...
531286,Russia,061,Cooperate economically,Ukraine,3f48fb321a6a8e00f5f8a2c1fd756b87,20220317,1203,2583
531287,Ukraine,080,Yield or concede,Israel,33469c03a101913442242e89f18899ce,20220317,28,2583
531288,Israel,061,Cooperate economically,Ukraine,9e24beca44f3a9d8a966625f97b55814,20220317,28,2583
531289,Ukraine,194,Fight with artillery and tanks,Russia,cac2d7644682d7109ca0d2d7a5aac8d5,20220317,24,2583


In [16]:
# Persist the annotated event table as a tab-separated file without the index.
raw_csv_path = '/storage_fast/ccye/FastChat/clustering/entlink/ce/t1_m10_raw.csv'
# to_csv does not create missing parent directories, so make sure it exists.
os.makedirs(os.path.dirname(raw_csv_path), exist_ok=True)
event_df.to_csv(path_or_buf=raw_csv_path, sep='\t', index=False)

In [10]:
# Re-display the topic count for reference next to the per-topic statistics below.
n_topics

3750

In [11]:
# Per-topic summary of the Nday column: number of events, latest / earliest /
# average day offset, and the day span (max - min) the topic covers.
nday_by_topic = event_df.groupby('Topic')['Nday']
results = nday_by_topic.agg(
    n_events='count',
    nday_max='max',
    nday_min='min',
    nday_mean='mean',
)
results['nday_range'] = results['nday_max'] - results['nday_min']

In [12]:
# Inspect the per-topic statistics table (indexed by Topic; row -1 is the outlier bucket).
results

Unnamed: 0_level_0,n_events,nday_max,nday_min,nday_mean,nday_range
Topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
-1,279035,2583,0,1219.129403,2583
0,4541,2015,1799,1873.764369,216
1,1821,410,215,308.075783,195
2,1633,2272,2070,2170.091243,202
3,1882,953,825,861.784803,128
...,...,...,...,...,...
3745,22,2474,2444,2459.818182,30
3746,21,1392,1359,1373.000000,33
3747,13,1319,1286,1304.230769,33
3748,13,1228,1215,1216.307692,13


In [13]:
# Column-wise mean over real topics only (ids >= 0, i.e. without the -1 outlier row).
results[results.index >= 0].mean()

n_events        67.268267
nday_max      1264.983467
nday_min      1226.736800
nday_mean     1245.151007
nday_range      38.246667
dtype: float64

In [14]:
# Column-wise maximum over real topics only (ids >= 0, i.e. without the -1 outlier row).
results[results.index >= 0].max()

n_events      4541.000000
nday_max      2583.000000
nday_min      2574.000000
nday_mean     2578.113636
nday_range    2567.000000
dtype: float64

In [15]:
# Column-wise minimum over real topics only (ids >= 0, i.e. without the -1 outlier row).
results[results.index >= 0].min()

n_events      10.000000
nday_max      18.000000
nday_min       0.000000
nday_mean      6.448276
nday_range     1.000000
dtype: float64