In [1]:
import pandas as pd
import os 
import glob
import json
import numpy as np
import copy
import re
import random

In [94]:
import stanza
from tqdm.notebook import tqdm

## Load the Korea Herald Dataset
- Store the dataset in `df`
- Copy `df` into two different versions:
- `df_month` and `df_topic`
- `df_month` is used to reduce the time frame into months **(i.e. 2017-07-23 17:24:31 -> 2017-07-01)**.
- `df_topic` is used to group articles by the assigned topics.

In [9]:
PATH="./data"
df_temp = []
if os.path.exists(PATH):
    for file in glob.glob(os.path.join(PATH, "*.json")):
        with open(file, "r") as f:
            data=json.load(f)
            df_temp.append(pd.DataFrame.from_dict(data))

df = pd.concat(df_temp, ignore_index=True)

In [5]:
df

Unnamed: 0,title,author,time,description,body,section
0,A snapshot of multiculturalism in South Korea,Lee Sun-young,2018-01-01 17:07:00,With birthrates persistently low and the senio...,With birthrates persistently low and the senio...,Social affairs
1,[Weekender] Korea’s dynamic 2017,Choi He-suk,2018-01-01 13:22:00,From North Korea’s nuclear weapons program nea...,From North Korea’s nuclear weapons program nea...,Social affairs
2,People's Party members support Ahn's push for ...,Yonhap,2017-12-31 16:18:00,The leader of the center-left People's Party g...,The leader of the center-left People's Party g...,Politics
3,[Newsmaker] Panamanian vessel probed over susp...,Yonhap,2017-12-31 14:55:00,PYEONGTAEK -- South Korea has seized and insp...,PYEONGTAEK -- South Korea has seized and insp...,North Korea
4,Hong Kong ship crew questioned in S. Korea for...,AFP,2017-12-30 15:44:00,The crew of a Hong Kong-registered ship have b...,The crew of a Hong Kong-registered ship have b...,North Korea
...,...,...,...,...,...,...
23764,Korea Navy to build 1st dedicated training shi...,KH디지털2,2016-07-19 13:29:00,Korea's Navy will build a dedicated training s...,Korea's Navy will build a dedicated training s...,Defense
23765,Korean man gets 4-year jail term for Yasukuni ...,KH디지털2,2016-07-19 13:16:00,A Tokyo court handed down a four-year jail ter...,A Tokyo court handed down a four-year jail ter...,Social affairs
23766,N. Korean ferry to sail on 3-country tour rout...,KH디지털2,2016-07-19 13:13:00,The North Korean passenger ferry Mangyongbong ...,The North Korean passenger ferry Mangyongbong ...,North Korea
23767,Presidential office denounces allegations invo...,KH디지털2,2016-07-19 13:09:00,The presidential office Cheong Wa Dae on Tuesd...,The presidential office Cheong Wa Dae on Tuesd...,Politics


In [6]:
df_month = copy.deepcopy(df)  # Used to generalize time frame to "months" instead of hours and days.
df_topic = copy.deepcopy(df)  # Used to append topic column after running BERTopic

## Replacing datetime in `time` from the dataset
- To group the data instances in `df_month` by months.
- This allows us to effectively visualize `topic_model.visualize_topics_over_time(topics_over_time)`

In [7]:
pattern = re.compile(r"(\d{4})-(\d{2})-(\d{2}) (\d{2}):(\d{2}):(\d{2})")
for i, row in tqdm(df_month.iterrows(), total=df_month.shape[0]):
    parsed_datetime = list(pattern.findall(df_month.iloc[i][" time"])[0])[:3]
    parsed_datetime[2] = "01"
    df_month.iloc[i][" time"] = "-".join(parsed_datetime)

  0%|          | 0/23769 [00:00<?, ?it/s]

In [12]:
df_month

Unnamed: 0,title,author,time,description,body,section
0,A snapshot of multiculturalism in South Korea,Lee Sun-young,2018-01-01,With birthrates persistently low and the senio...,With birthrates persistently low and the senio...,Social affairs
1,[Weekender] Korea’s dynamic 2017,Choi He-suk,2018-01-01,From North Korea’s nuclear weapons program nea...,From North Korea’s nuclear weapons program nea...,Social affairs
2,People's Party members support Ahn's push for ...,Yonhap,2017-12-01,The leader of the center-left People's Party g...,The leader of the center-left People's Party g...,Politics
3,[Newsmaker] Panamanian vessel probed over susp...,Yonhap,2017-12-01,PYEONGTAEK -- South Korea has seized and insp...,PYEONGTAEK -- South Korea has seized and insp...,North Korea
4,Hong Kong ship crew questioned in S. Korea for...,AFP,2017-12-01,The crew of a Hong Kong-registered ship have b...,The crew of a Hong Kong-registered ship have b...,North Korea
...,...,...,...,...,...,...
23764,Korea Navy to build 1st dedicated training shi...,KH디지털2,2016-07-01,Korea's Navy will build a dedicated training s...,Korea's Navy will build a dedicated training s...,Defense
23765,Korean man gets 4-year jail term for Yasukuni ...,KH디지털2,2016-07-01,A Tokyo court handed down a four-year jail ter...,A Tokyo court handed down a four-year jail ter...,Social affairs
23766,N. Korean ferry to sail on 3-country tour rout...,KH디지털2,2016-07-01,The North Korean passenger ferry Mangyongbong ...,The North Korean passenger ferry Mangyongbong ...,North Korea
23767,Presidential office denounces allegations invo...,KH디지털2,2016-07-01,The presidential office Cheong Wa Dae on Tuesd...,The presidential office Cheong Wa Dae on Tuesd...,Politics


## Using [BERTopic](https://maartengr.github.io/BERTopic/tutorial/algorithm/algorithm.html) for topic modeling & clustering
- BERTopic utilizes Sentence-BERT to create document embeddings.
- We use it to:
    1. Clustering articles according to their topic.
    2. Identify events per time frame.
    3. Calculate intra-topic pairwise document similarity and sort by time to discvoer **On-Issue** articles.
    4. Discover **Related-Issue** articles that are not **On-Issue** articles.

In [14]:
from bertopic import BERTopic

In [16]:
# Korea herald
model_path = "./bertopic_khearld_model.pth"
if not os.path.exists(model_path):
    '''
    BERTopic()
    
    - set `calculate_probabilities` to True to return the `probabilities`
    - set `nr_topics` to `auto` for automatic topic reduction with similarity over 0.9
    '''
    topic_model = BERTopic(calculate_probabilities=True)# BERTopic(calculate_probabilities=True, nr_topics="auto")
    topics, topic_prob = topic_model.fit_transform(body)
    topic_model.save(model_path)
else:
    print("Model already exists. Loading model from {} ...".format(model_path))
    topic_model = BERTopic.load(model_path)
    topics, topic_prob= topic_model.transform(df[" body"])

Model already exists. Loading model from ./bertopic_khearld_model.pth ...


In [18]:
print("topics (len): ", len(topics))
print(topics)
print("topic_prob (shape): ", topic_prob.shape)

topics (len):  23769
[ -1  -1  -1 ...  -1  -1 228]
topic_prob (shape):  (23769, 267)


## Create `df_topic` to group_by `topic`
- Assign the topics from BERTopic to every article

In [19]:
df_topic["topic"] = topics
print("Number of topics: ", len(df_topic.topic.unique()))
df_topic

Unnamed: 0,title,author,time,description,body,section,topic
0,A snapshot of multiculturalism in South Korea,Lee Sun-young,2018-01-01 17:07:00,With birthrates persistently low and the senio...,With birthrates persistently low and the senio...,Social affairs,-1
1,[Weekender] Korea’s dynamic 2017,Choi He-suk,2018-01-01 13:22:00,From North Korea’s nuclear weapons program nea...,From North Korea’s nuclear weapons program nea...,Social affairs,-1
2,People's Party members support Ahn's push for ...,Yonhap,2017-12-31 16:18:00,The leader of the center-left People's Party g...,The leader of the center-left People's Party g...,Politics,-1
3,[Newsmaker] Panamanian vessel probed over susp...,Yonhap,2017-12-31 14:55:00,PYEONGTAEK -- South Korea has seized and insp...,PYEONGTAEK -- South Korea has seized and insp...,North Korea,-1
4,Hong Kong ship crew questioned in S. Korea for...,AFP,2017-12-30 15:44:00,The crew of a Hong Kong-registered ship have b...,The crew of a Hong Kong-registered ship have b...,North Korea,210
...,...,...,...,...,...,...,...
23764,Korea Navy to build 1st dedicated training shi...,KH디지털2,2016-07-19 13:29:00,Korea's Navy will build a dedicated training s...,Korea's Navy will build a dedicated training s...,Defense,-1
23765,Korean man gets 4-year jail term for Yasukuni ...,KH디지털2,2016-07-19 13:16:00,A Tokyo court handed down a four-year jail ter...,A Tokyo court handed down a four-year jail ter...,Social affairs,194
23766,N. Korean ferry to sail on 3-country tour rout...,KH디지털2,2016-07-19 13:13:00,The North Korean passenger ferry Mangyongbong ...,The North Korean passenger ferry Mangyongbong ...,North Korea,-1
23767,Presidential office denounces allegations invo...,KH디지털2,2016-07-19 13:09:00,The presidential office Cheong Wa Dae on Tuesd...,The presidential office Cheong Wa Dae on Tuesd...,Politics,-1


In [22]:
grouped_df = df_topic.groupby("topic")
print("topic: ", topic_model.get_topic(249))
print("topic_info: ", topic_model.get_topic_info(249))
grouped_df.get_group(249)

topic:  [('chung', 0.02572411034434645), ('daughter', 0.022459757141647354), ('choi', 0.02139059824953283), ('kyunghee', 0.015205122127697902), ('professors', 0.014865357205059956), ('chungs', 0.012596970783841435), ('chois', 0.009715327577183018), ('equestrian', 0.009033034636995506), ('geunhyes', 0.008581712224391578), ('academic', 0.008194653224262912)]
topic_info:       Topic  Count                              Name
187    249     17  249_chung_daughter_choi_kyunghee


Unnamed: 0,title,author,time,description,body,section,topic
279,Professors pick 'fight for right' as words sym...,Yonhap,2017-12-17 11:24:00,A group of college professors has picked the f...,A group of college professors has picked the f...,Social affairs,249
1104,High court keeps 3-yr jail term for Park confi...,Yonhap,2017-11-14 11:34:00,An appellate court on Tuesday maintained a low...,An appellate court on Tuesday maintained a low...,Social affairs,249
4355,Reporter indicted over defaming ruling party l...,최희석,2016-05-23 12:50:00,Prosecutors said Monday they have indicted a l...,Prosecutors said Monday they have indicted a l...,Social affairs,249
7595,High school cancels diploma of Choi Soon-sil's...,KH디지털2,2017-03-08 12:45:00,A local high school on Wednesday nullified the...,A local high school on Wednesday nullified the...,Social affairs,249
8162,Ex-university chief arrested in corruption probe,KH디지털2,2017-02-15 09:18:00,Special prosecutors on Wednesday formally arre...,Special prosecutors on Wednesday formally arre...,Social affairs,249
8551,Independent counsel questions three officials ...,손지영,2017-01-27 10:59:00,Three officials from Seoul-based Ewha Womans ...,Three officials from Seoul-based Ewha Womans U...,National,249
8609,Ex-Ewha Univ. chief faces arrest over Chung Yo...,Bak Se-hwan,2017-01-24 15:45:00,The special prosecutor’s investigation into ad...,The special prosecutor’s investigation into ad...,Social affairs,249
8614,Court to decide on arrest of ex-university chi...,KH디지털2,2017-01-24 11:45:00,A Seoul court was set to decide Tuesday on th...,A Seoul court was set to decide Tuesday on the...,Social affairs,249
8632,Park mentioned friend's daughter in ordering n...,KH디지털2,2017-01-23 17:26:00,President Park Geun-hye cited her friend's dau...,President Park Geun-hye cited her friend's dau...,Politics,249
8749,Ex-president of university grilled over influe...,KH디지털2,2017-01-18 11:02:00,A former president of a local university was q...,A former president of a local university was q...,Social affairs,249


In [23]:
grouped_df = df_topic.groupby("topic")
print("topic: ", topic_model.get_topic(22))
grouped_df.get_group(22)

topic:  [('kerry', 0.04392059739688497), ('kerrys', 0.011029036711967278), ('yun', 0.009754421236940567), ('talks', 0.008053554635578455), ('un', 0.0063385573271796366), ('beijing', 0.005611623179630056), ('midjune', 0.005421284650970458), ('discuss', 0.005369820623276402), ('washington', 0.005355711487655685), ('china', 0.004345671920401354)]


Unnamed: 0,title,author,time,description,body,section,topic
7751,Ex-US Secretary of State Kerry joins Carnegie ...,KH디지털2,2017-03-03 09:47:00,Former US Secretary of State John Kerry has jo...,Former US Secretary of State John Kerry has jo...,International,22
10340,Kerry has no plans to meet with N. Korean FM: ...,KH디지털2,2015-08-05 09:32:00,U.S. Secretary of State John Kerry has no plan...,U.S. Secretary of State John Kerry has no plan...,North Korea,22
11824,Kerry's remark about THAAD meant 'internal U.S...,KH디지털2,2015-05-21 09:52:00,U.S. Secretary of State John Kerry was referri...,U.S. Secretary of State John Kerry was referri...,Defense,22
11864,Kerry thanks lawmaker for helping U.S. ambassa...,KH디지털2,2015-05-19 17:28:00,U.S. Secretary of State John Kerry has thanked...,U.S. Secretary of State John Kerry has thanked...,Politics,22
11900,Park meets with Kerry on N. Korea,KH디지털2,2015-05-18 10:52:00,President Park Geun-hye met with U.S. Secretar...,President Park Geun-hye met with U.S. Secretar...,International,22
11904,"Yun, Kerry reaffirm strong alliance against N....",KH디지털2,2015-05-18 09:21:00,Foreign Minister Yun Byung-se and U.S. Secreta...,Foreign Minister Yun Byung-se and U.S. Secreta...,Foreign Policy,22
11905,N.K. missile test unacceptable: Kerry,Shin Hyon-hee,2015-05-17 20:40:00,U.S. Secretary of State John Kerry on Saturday...,U.S. Secretary of State John Kerry on Saturday...,North Korea,22
11915,"Kerry in South Korea to talk security, cyber i...",김연세,2015-05-17 18:43:00,U.S. Secretary of State John Kerry is in South...,U.S. Secretary of State John Kerry is in South...,Foreign Policy,22
11917,"Kerry due in Seoul for talks on N. Korea, alli...",KH디지털2,2015-05-17 10:49:00,U.S. Secretary of State John Kerry is schedule...,U.S. Secretary of State John Kerry is schedule...,North Korea,22
11998,N. Korea's missile test to be on agenda for Yu...,KH디지털2,2015-05-12 17:10:00,South Korean Foreign Minister Yun Byung-se and...,South Korean Foreign Minister Yun Byung-se and...,International,22


## On-Issue Example (an example between 12014 -> 11900)
- The On-Issue documents are from the same **topic**.
- This example shows that the relationship between ARG0 and ARG1 is important in identifying the relation between two "On-Issue" documents
- Here, we do two things:
    1. Identify the named entities using Stanza NER tool (Just to see whether there are overlapping entities).
    2. Build document embeddings for the two documents and evaluate their similarity.
    3. Check if **document similarity** is related to **On-Issue**.
    4. Calculate the pairwise **Sentence similarity** between these two documents (i.e., articles) using [Sentence-BERT](https://www.sbert.net/docs/quickstart.html#comparing-sentence-similarities).

**In this example, we use `topic 22` for the On-Issue analysis (the number 22 is subject to change as the topic index is not fixed)** 

In [24]:
grouped_df = df_topic.groupby("topic")
print("topic: ", topic_model.get_topic(22))
grouped_df.get_group(22)

topic:  [('kerry', 0.04392059739688497), ('kerrys', 0.011029036711967278), ('yun', 0.009754421236940567), ('talks', 0.008053554635578455), ('un', 0.0063385573271796366), ('beijing', 0.005611623179630056), ('midjune', 0.005421284650970458), ('discuss', 0.005369820623276402), ('washington', 0.005355711487655685), ('china', 0.004345671920401354)]


Unnamed: 0,title,author,time,description,body,section,topic
7751,Ex-US Secretary of State Kerry joins Carnegie ...,KH디지털2,2017-03-03 09:47:00,Former US Secretary of State John Kerry has jo...,Former US Secretary of State John Kerry has jo...,International,22
10340,Kerry has no plans to meet with N. Korean FM: ...,KH디지털2,2015-08-05 09:32:00,U.S. Secretary of State John Kerry has no plan...,U.S. Secretary of State John Kerry has no plan...,North Korea,22
11824,Kerry's remark about THAAD meant 'internal U.S...,KH디지털2,2015-05-21 09:52:00,U.S. Secretary of State John Kerry was referri...,U.S. Secretary of State John Kerry was referri...,Defense,22
11864,Kerry thanks lawmaker for helping U.S. ambassa...,KH디지털2,2015-05-19 17:28:00,U.S. Secretary of State John Kerry has thanked...,U.S. Secretary of State John Kerry has thanked...,Politics,22
11900,Park meets with Kerry on N. Korea,KH디지털2,2015-05-18 10:52:00,President Park Geun-hye met with U.S. Secretar...,President Park Geun-hye met with U.S. Secretar...,International,22
11904,"Yun, Kerry reaffirm strong alliance against N....",KH디지털2,2015-05-18 09:21:00,Foreign Minister Yun Byung-se and U.S. Secreta...,Foreign Minister Yun Byung-se and U.S. Secreta...,Foreign Policy,22
11905,N.K. missile test unacceptable: Kerry,Shin Hyon-hee,2015-05-17 20:40:00,U.S. Secretary of State John Kerry on Saturday...,U.S. Secretary of State John Kerry on Saturday...,North Korea,22
11915,"Kerry in South Korea to talk security, cyber i...",김연세,2015-05-17 18:43:00,U.S. Secretary of State John Kerry is in South...,U.S. Secretary of State John Kerry is in South...,Foreign Policy,22
11917,"Kerry due in Seoul for talks on N. Korea, alli...",KH디지털2,2015-05-17 10:49:00,U.S. Secretary of State John Kerry is schedule...,U.S. Secretary of State John Kerry is schedule...,North Korea,22
11998,N. Korea's missile test to be on agenda for Yu...,KH디지털2,2015-05-12 17:10:00,South Korean Foreign Minister Yun Byung-se and...,South Korean Foreign Minister Yun Byung-se and...,International,22


In [35]:
'''
On-Issue document examples

df.iloc[12014] -> df.iloc[11900]

'''
print("[ Date: {} ] \n[ Body ] : {}".format(df.iloc[12014][" time"], df.iloc[12014][" body"]))
print("\n[ Date: {} ] \n[ Body ] : {}".format(df.iloc[11900][" time"], df.iloc[11900][" body"]))

[ Date: 2015-05-12 09:45:00 ] 
[ Body ] : U.S. Secretary of State John Kerry will visit South Korea next week to discuss pending bilateral issues as well as President Park Geun-hye's trip to Washington, the State Department announced Monday.During the May 17-18 visit, Kerry will meet with President Park and Foreign Minister Yun Byung-se to "discuss a range of global, regional, and bilateral issues, as well as President Park's upcoming visit to the United States," the department said in a statement.Kerry's discussions in Seoul are expected to include the assessment of the situation on the Korean Peninsula, and Park's planned visit to the U.S. in mid-June, among other bilateral issues, a diplomatic source in Seoul said earlier in the day.According to government sources, Park plans to make an "official working visit" to the U.S. in mid-June for talks with Barack Obama to discuss ways to strengthen alliances and boost coordination to better deter the belligerent North Korea. Kerry last vis

In [34]:
# Document Similarity (between two positive, On-Issue documents)
on_issue1 = df.iloc[11900][" body"]
on_issue2 = df.iloc[12014][" body"]
on_issue_docs = [on_issue1, on_issue2]

from sentence_transformers import SentenceTransformer, util
sentence_model = SentenceTransformer("distilbert-base-nli-mean-tokens")
embeddings = sentence_model.encode(on_issue_docs, show_progress_bar=True)

print("Embedding (shape): ", embeddings.shape)
cos_sim = util.pytorch_cos_sim(embeddings[0], embeddings[1])

print("Document Similarity: ", cos_sim)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Embedding (shape):  (2, 768)
Document Similarity:  tensor([[0.9066]])


In [41]:
# Document Similarity (between a document above and a "negative" document) (randomly sampled negative document)
neg_doc = df.iloc[1192][" body"]
neg_embeddings = sentence_model.encode([neg_doc, on_issue1], show_progress_bar=True)
cos_sim_neg = util.pytorch_cos_sim(neg_embeddings[0], neg_embeddings[1])
print("Document Similarity (negative): ", cos_sim_neg)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Document Similarity (negative):  tensor([[0.7320]])


In [120]:
group_df_sample = grouped_df.get_group(22)
doc_id, sample_docs = zip(*dict(group_df_sample[" body"]).items())

- Check to see if document similarity >= 0.9 represents On-Issue events

In [49]:
#Encode all sentences
sample_embeddings = sentence_model.encode(sample_docs)

#Compute cosine similarity between all pairs
cos_sim = util.pytorch_cos_sim(sample_embeddings, sample_embeddings)
# print("Sample Topic Inter-Doc Similarity: \n", cos_sim >= 0.9)

sim_docs = []
related_issue = []
for i in range(cos_sim.shape[0]):
    for j in range(cos_sim.shape[1]):
        if i != j and cos_sim[i][j].item() >= 0.90:
            sim_docs.append((i, j))  # Save the index for documents in `group_df_sample`
        elif i != j and cos_sim[i][j].item() < 0.90:
            related_issue.append((i, j))

In [50]:
sim_docs

[(2, 7),
 (4, 7),
 (4, 8),
 (4, 9),
 (4, 10),
 (4, 11),
 (4, 19),
 (7, 2),
 (7, 4),
 (7, 8),
 (7, 16),
 (7, 17),
 (8, 4),
 (8, 7),
 (8, 9),
 (8, 10),
 (8, 11),
 (8, 14),
 (8, 15),
 (8, 17),
 (8, 18),
 (8, 22),
 (9, 4),
 (9, 8),
 (9, 10),
 (9, 11),
 (9, 14),
 (9, 15),
 (9, 17),
 (9, 18),
 (9, 22),
 (10, 4),
 (10, 8),
 (10, 9),
 (10, 11),
 (10, 14),
 (10, 15),
 (10, 17),
 (10, 18),
 (10, 22),
 (11, 4),
 (11, 8),
 (11, 9),
 (11, 10),
 (11, 14),
 (11, 15),
 (11, 18),
 (14, 8),
 (14, 9),
 (14, 10),
 (14, 11),
 (14, 15),
 (14, 17),
 (14, 18),
 (14, 20),
 (15, 8),
 (15, 9),
 (15, 10),
 (15, 11),
 (15, 14),
 (15, 17),
 (15, 18),
 (15, 20),
 (16, 7),
 (17, 7),
 (17, 8),
 (17, 9),
 (17, 10),
 (17, 14),
 (17, 15),
 (17, 22),
 (18, 8),
 (18, 9),
 (18, 10),
 (18, 11),
 (18, 14),
 (18, 15),
 (19, 4),
 (20, 14),
 (20, 15),
 (22, 8),
 (22, 9),
 (22, 10),
 (22, 17)]

In [52]:
sim_docs[10]

(7, 16)

In [53]:
sample_docs[sim_docs[10][0]]

"U.S. Secretary of State John Kerry is in South Korea where he will be discussing security issues amid fresh fears of North Korean belligerence.Kerry arrived in Seoul on Sunday from Beijing and will see top South Korean officials on Monday, less than a week after South Korea's spy agency said North Korean leader Kim Jong-un ordered his defense chief executed with an anti-aircraft gun for complaining about the young ruler. The allegation, if true, adds to concerns about the erratic nature of Kim's rule, particularly after Pyongyang claimed last weekend it successfully test-fired a newly developed ballistic missile from a submarine.Those actions come despite a recent U.S. diplomatic overture to North Korea to discuss resuming denuclearization talks that have been stalled for the last three years. (AP)"

In [54]:
sample_docs[sim_docs[10][1]]

'U.S. Secretary of State John Kerry said Monday North Korean leader Kim Jong-un is "questionable in terms of judgment" and called his nuclear pursuit "one of the most serious issues on the planet today."Kerry made the remark at a press roundtable during a visit to the Laotian capital of Vientiane, talking about his upcoming trip to China aimed at discussing how to deal with North Korea in the wake of its fourth nuclear test, according to a State Department transcript."I look forward to having solid conversations -- serious conversations -- about one of the most serious issues on the planet today, which is a clearly reckless and dangerous, evolving security threat in the hands of somebody who is questionable in terms of judgment and has proven thus to China," Kerry said.Kerry declined to be more specific about what he\'ll be talking about with the Chinese, saying those talks should be private and he doesn\'t want to predetermine the discussions."We need to have the talks. That\'s why I\

- Check to see if document similarity < 0.9 represents Related-Issue events

In [57]:
related_issue

[(0, 1),
 (0, 2),
 (0, 3),
 (0, 4),
 (0, 5),
 (0, 6),
 (0, 7),
 (0, 8),
 (0, 9),
 (0, 10),
 (0, 11),
 (0, 12),
 (0, 13),
 (0, 14),
 (0, 15),
 (0, 16),
 (0, 17),
 (0, 18),
 (0, 19),
 (0, 20),
 (0, 21),
 (0, 22),
 (1, 0),
 (1, 2),
 (1, 3),
 (1, 4),
 (1, 5),
 (1, 6),
 (1, 7),
 (1, 8),
 (1, 9),
 (1, 10),
 (1, 11),
 (1, 12),
 (1, 13),
 (1, 14),
 (1, 15),
 (1, 16),
 (1, 17),
 (1, 18),
 (1, 19),
 (1, 20),
 (1, 21),
 (1, 22),
 (2, 0),
 (2, 1),
 (2, 3),
 (2, 4),
 (2, 5),
 (2, 6),
 (2, 8),
 (2, 9),
 (2, 10),
 (2, 11),
 (2, 12),
 (2, 13),
 (2, 14),
 (2, 15),
 (2, 16),
 (2, 17),
 (2, 18),
 (2, 19),
 (2, 20),
 (2, 21),
 (2, 22),
 (3, 0),
 (3, 1),
 (3, 2),
 (3, 4),
 (3, 5),
 (3, 6),
 (3, 7),
 (3, 8),
 (3, 9),
 (3, 10),
 (3, 11),
 (3, 12),
 (3, 13),
 (3, 14),
 (3, 15),
 (3, 16),
 (3, 17),
 (3, 18),
 (3, 19),
 (3, 20),
 (3, 21),
 (3, 22),
 (4, 0),
 (4, 1),
 (4, 2),
 (4, 3),
 (4, 5),
 (4, 6),
 (4, 12),
 (4, 13),
 (4, 14),
 (4, 15),
 (4, 16),
 (4, 17),
 (4, 18),
 (4, 20),
 (4, 21),
 (4, 22),
 (5, 0),
 (

In [59]:
print("\n", sample_docs[related_issue[12][0]], "\n")
print("\n", sample_docs[related_issue[12][1]], "\n")


 Former US Secretary of State John Kerry has joined the Carnegie Endowment for International Peace as its inaugural visiting distinguished statesman, the think tank announced Thursday.(Yonhap)"John Kerry's lifelong commitment to international peace and understanding embodies the mission and purpose of the Carnegie Endowment, "said Carnegie President William J. Burns in a statement. "His experience, wisdom, and belief in the power of diplomacy are needed now more than ever."Kerry will focus on conflict resolution and global environmental challenges, the think tank said. (Yonhap) 


 KING KHALID MILITARY CITY, Saudi Arabia (AFP) -- Secretary of State John Kerry said US and Russian officials would meet later Saturday on Syrian opposition complaints of truce violations but that peace talks should go ahead as planned.Asked at the end of a visit to Saudi Arabia whether indirect dialogue in Geneva between the Syrian regime and opposition could go ahead as planned on Monday, Kerry said: "Yes,

## On-Issue vs. Related-Issue Event Extraction 
Using `SRL-BERT (AllenNLP )` and `TransformerSummarizer (!pip install bert-extractive-summarizer)`, we do the following:
1. Divide them into `On-Issue` and `Related-Issue` groups.
2. For the `On-Issue` group, we sort them by time and apply `SRL-BERT` to see if there is a switch between ARG0 and ARG1 in document1 and document2. Then, we apply `TransformerSummarizer` to summarize the entire document, and use the first sentence as the **Event**. If the events overlap, leave the first one and delete the others.
3. For the `Related-Issue` group, we simply apply `TransformerSummarizer` to summarize the entire document.

In [169]:
from allennlp.predictors.predictor import Predictor
import allennlp_models.tagging

# SRL-BERT from Alllenlp - (Example)
predictor = Predictor.from_path("https://storage.googleapis.com/allennlp-public-models/structured-prediction-srl-bert.2020.12.15.tar.gz")
predictor.predict(
    sentence="Did Uriah honestly think he could beat the game in under three hours?."
)

{'verbs': [{'verb': 'Did',
   'description': '[V: Did] Uriah honestly think he could beat the game in under three hours ? .',
   'tags': ['B-V',
    'O',
    'O',
    'O',
    'O',
    'O',
    'O',
    'O',
    'O',
    'O',
    'O',
    'O',
    'O',
    'O',
    'O']},
  {'verb': 'think',
   'description': 'Did [ARG0: Uriah] [ARGM-ADV: honestly] [V: think] [ARG1: he could beat the game in under three hours] ? .',
   'tags': ['O',
    'B-ARG0',
    'B-ARGM-ADV',
    'B-V',
    'B-ARG1',
    'I-ARG1',
    'I-ARG1',
    'I-ARG1',
    'I-ARG1',
    'I-ARG1',
    'I-ARG1',
    'I-ARG1',
    'I-ARG1',
    'O',
    'O']},
  {'verb': 'could',
   'description': 'Did Uriah honestly think he [V: could] beat the game in under three hours ? .',
   'tags': ['O',
    'O',
    'O',
    'O',
    'O',
    'B-V',
    'O',
    'O',
    'O',
    'O',
    'O',
    'O',
    'O',
    'O',
    'O']},
  {'verb': 'beat',
   'description': 'Did Uriah honestly think [ARG0: he] [ARGM-MOD: could] [V: beat] [ARG1:

In [192]:
import pickle
a = {"a": {(1, 10): 0.987, (2, 10): 0.911}, "b": {(3,8): 0.789, (9, 8): 0.889}}
with open("dict_demo", "wb") as fp:
    pickle.dump(a, fp, protocol=pickle.HIGHEST_PROTOCOL)
with open("dict_demo", "rb") as fp:    
    b = pickle.load(fp)
print(b)

{'a': {(1, 10): 0.987, (2, 10): 0.911}, 'b': {(3, 8): 0.789, (9, 8): 0.889}}


In [None]:
topic_idx_on_issue = {}  # Dict[topic_id1 : {Document1: sim_score, ...}, topic_id2 : {Document1: sim_score, ...}, ..., topic_idn]
topic_idx_related_issue = {}

for topic_idx in tqdm(topics[topics != -1]):
    group_df_sample = grouped_df.get_group(topic_idx) # Document instances for those in the same topic (i.e., `topic_idx`)
    topic_idx2doc = dict(group_df_sample[" body"])
    doc_idx, docs = zip(*topic_idx2doc.items())
    key2idx = dict(zip(list(range(len(doc_idx))), doc_idx))  # `key2idx` is Dict[key: document_idx]

    #Encode all documents
    doc_embeddings = sentence_model.encode(docs)
    #Compute cosine similarity between all pairs
    cos_sim = util.pytorch_cos_sim(doc_embeddings, doc_embeddings)
    
    on_issue = {}
    related_issue = {}
    for i in range(cos_sim.shape[0]):
        for j in range(cos_sim.shape[1]):
            if i != j and cos_sim[i][j].item() >= 0.90:
                if (i, j) not in on_issue.keys() and (j, i) not in on_issue.keys():
                    on_issue[(key2idx[i], key2idx[j])] = cos_sim[i][j].item()
            elif i != j and cos_sim[i][j].item() < 0.90:
                if (i, j) not in related_issue.keys() and (j, i) not in related_issue.keys():
                    related_issue[(key2idx[i], key2idx[j])] = cos_sim[i][j].item()
                    
    # TODO: Store the `on_issue` and `related_issue` per topic at `topic_idx_on_issue` and `topic_idx_related_issue`
    topic_idx_on_issue[topic_idx] = on_issue
    topic_idx_related_issue[topic_idx] = related_issue

path_onissue = "topic_idx_on_issue.pkl"
path_relissue = "topic_idx_related_issue.pkl"
if not os.path.exists(path_onissue):
    with open(path_onissue, "wb") as fp:
        pickle.dump(topic_idx_on_issue, fp, protocol=pickle.HIGHEST_PROTOCOL)
        
if not os.path.exists(path_relissue):
    with open(path_relissue, "wb") as fp:
        pickle.dump(topic_idx_related_issue, fp, protocol=pickle.HIGHEST_PROTOCOL)

  0%|          | 0/13016 [00:00<?, ?it/s]

### On-Issue example

1. Sort the articles in the **same topic** in time (ascending order).
2. Apply `SRL-BERT` to observe the switch between ARG0 and ARG1 in the $K$ documents

In [182]:
# On-Issue example
on_issue_pairs = list(on_issue.keys())
rand_idx = np.random.randint(0, len(on_issue))
on_issue_sample1 = on_issue_pairs[rand_idx][0]
on_issue_sample2 = on_issue_pairs[rand_idx][1]

# TODO: Sort the similar pairs by time

print("[ Date: {} ] \n[ Body ] : {}".format(df.iloc[on_issue_sample1][" time"], df.iloc[on_issue_sample1][" body"]))
print("\n[ Date: {} ] \n[ Body ] : {}".format(df.iloc[on_issue_sample2][" time"], df.iloc[on_issue_sample2][" body"]))

[ Date: 2016-03-06 15:00:00 ] 
[ Body ] : South Korea will announce its own North Korea sanctions this week after the United Nations Security Council voted to impose stronger sanctions on the belligerent nation, a South Korean official said Sunday.The sanctions will likely include banning the entry of ships to South Korean ports from third-party countries that have been to North Korea and blacklisting more organizations and personnel related to the North's weapons of mass destruction, the official said on the condition of anonymity.The measures will be made public by the Prime Minister's Office of the South early this week, the official added.The UNSC unanimously adopted Resolution 2270 on Thursday, tightening the screws on the communist nation that sparked global outrage with its fourth nuclear test on Jan. 6 and its long-range missile launch on Feb. 7 in violation of U.N. rules.South Korea has used practically all its cards that could pressure the North toward abandoning its nuclear 

- Using `SRL-BERT` to extract ARG0 and ARG1 in On-Issue article 1 and article 2 to see if they **switched** places.

In [166]:
ent_types = ["GPE", "PERSON", "ORG", "LOC"]

doc1 = nlp(df.iloc[on_issue_sample1][" body"])
doc2 = nlp(df.iloc[on_issue_sample2][" body"])
doc1ents = set()
doc2ents = set()
for ent in doc1.ents:
    if ent.type in ent_types:
        doc1ents.add(ent.text)
for ent in doc2.ents:
    if ent.type in ent_types:
        doc2ents.add(ent.text)

print("[ doc1ents ]\n", doc1ents)
print("\n[ doc2ents ]\n", doc2ents)

[ doc1ents ]
 {'Beijing', 'Liberty Korea Party', 'South Korea', 'UN', 'US', 'Russia', 'Washington', 'CNN', 'McCain', 'North Korea', 'the Armed Services Committee', 'China', 'UNSC', 'Hawaii', 'Seoul', 'Pyongyang', 'the North Korean Foreign Ministry', 'Sen', 'John McCain', 'Choi He-suk', 'UN Security Council', 'YonhapAccording', 'Ministry of Foreign Affairs', 'North Korea`s'}

[ doc2ents ]
 {'Korean Central News Agency', 'Seoul', 'the United Nations Security Council', 'Pyongyang', 'DPRK', 'South Korea', 'UN', 'North Korea', 'ICBM', 'US', "Democratic People's Republic of Korea", 'North', 'UNSC'}


In [177]:
# Check to see if the entities from NER exist as ARG0 or ARG1 in the given docs
doc1ents_list = list(doc1ents)
doc2ents_list = list(doc2ents)
doc1verbs = set()  # TODO: What's the use of VERBs?
doc2verbs = set()

print(df.iloc[on_issue_sample1][" body"])

max_seq_len = 512
try:
    on_issue1_pred = predictor.predict(df.iloc[on_issue_sample1][" body"])  # TODO: max_seq_length = 512
    on_issue2_pred = predictor.predict(df.iloc[on_issue_sample2][" body"])
except RuntimeError:
    print("RuntimeError: The size of tensor a ({}) must match the size of tensor b ({}) at non-singleton dimension 1".format(len(on_issue1_pred), max_seq_len))
    from transformers import BertTokenizer
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    tok1 = tokenizer.tokenize(df.iloc[on_issue_sample1][" body"])
    tok2 = tokenizer.tokenize(df.iloc[on_issue_sample2][" body"])
    if len(tok1) > max_seq_len:
        doc_seq1 = tokenizer.convert_tokens_to_string(tok1[:max_seq_len - 5])
        on_issue1_pred = predictor.predict(doc_seq1)  # TODO: max_seq_length = 512
    if len(tok2) > max_seq_len:
        doc_seq2 = tokenizer.convert_tokens_to_string(tok2[:max_seq_len - 5])
        on_issue2_pred = predictor.predict(doc_seq2)

arg0_doc1 = []
arg1_doc1 = []
arg0_doc2 = []
arg1_doc2 = []
for i, pred in enumerate(on_issue1_pred["verbs"]):
    doc1verbs.add(pred["verb"])
    arg_list = re.findall(r"\[(ARG\d[A-Za-z0-9_.: ]+)\]", pred["description"])
    for arg in arg_list:
        if "ARG0" in arg and any(ent in arg for ent in doc1ents_list):
            arg0_doc1.append(arg)
        elif "ARG1" in arg and any(ent in arg for ent in doc1ents_list):
            arg1_doc1.append(arg)
    
for i, pred in enumerate(on_issue2_pred["verbs"]):
    doc2verbs.add(pred["verb"])
    arg_list = re.findall(r"\[(ARG\d[A-Za-z0-9_.: ]+)\]", pred["description"])
    for arg in arg_list:
        if "ARG0" in arg and any(ent in arg for ent in doc2ents_list):
            arg0_doc2.append(arg)
        elif "ARG1" in arg and any(ent in arg for ent in doc2ents_list):
            arg1_doc2.append(arg)

print("[ doc1verbs ]\n", doc1verbs)
print("\n[ doc2verbs ]\n", doc2verbs)
print("\n[ ARG0 in doc1 ]", arg0_doc1)
print("\n[ ARG1 in doc1 ]", arg1_doc1)
print("\n[ ARG0 in doc2 ]", arg0_doc2)
print("\n[ ARG1 in doc2 ]", arg1_doc2)

North Korea on Monday warned of a “last resort” attack against the US, ahead of UN Security Council’s deliberation on the latest US-led sanctions against the regime. The UNSC is set to deliberate on the resolution drafted by the US on Monday, and Washington and Beijing are said to be wrangling over clauses that would limit the supply of oil to North Korea.  UN Security Council meeting held on Sept. 4 following North Korea`s nuclear test on the previous day. YonhapAccording to local news reports citing unnamed diplomatic sources, the latest sanctions are likely to include some measures limiting the inflow of crude oil into North Korea.One unnamed source claimed that “effectual” measures for controlling the oil supply are likely to be included, while another was quoted as saying that China and Russia were considered unlikely to exercise their rights to veto a resolution. Cutting off North Korea’s oil supplies has recently gained support as the most effective means to restrain Pyongyang, 

### Related-Issue example

In [160]:
# Related-Issue example
related_issue_pairs = list(related_issue.keys())
rand_idx = np.random.randint(0, len(related_issue))
related_issue_sample1 = related_issue_pairs[rand_idx][0]
related_issue_sample2 = related_issue_pairs[rand_idx][1]

print("[ Date: {} ] \n[ Body ] : {}".format(df.iloc[related_issue_sample1][" time"], df.iloc[related_issue_sample1][" body"]))
print("\n[ Date: {} ] \n[ Body ] : {}".format(df.iloc[related_issue_sample2][" time"], df.iloc[related_issue_sample2][" body"]))

[ Date: 2017-10-06 15:34:00 ] 
[ Body ] : US Customs and Border Protection says it is ready to block US imports of seafood _ as well as any other goods _ produced by North Korean laborers who work in China.An Associated Press investigation tracked salmon, squid and cod processed by North Koreans working at Chinese factories and shipped to American stores, including Walmart and ALDI. The North Korean workers found in Chinese factories aren't allowed to leave, and receive only a fraction of their pay _ most goes straight to the North Korean state. This means that American consumers buying seafood labeled "Caught in the USA, Processed in China" may inadvertently be subsidizing the government of Kim Jong Un as it builds nuclear weapons, and also supporting forced labor. In this Sept. 2, 2017, photo, a worker stacks crates at the Yanbian Shenghai Industry & Trade Co. Ltd., which hires some North Korean workers to process seafood in the city of Hunchun in northeastern China's Jilin province.

## `TransformerSummarizer` for Text Summarization
- `TransformerSummarizer` using XLNet-base model to produce a summarized version of the On-Issue and Related-Issue articles.

In [161]:
from summarizer import TransformerSummarizer

In [163]:
model = TransformerSummarizer(transformer_type="XLNet",transformer_model_key="xlnet-base-cased")
full = ''.join(model(df.iloc[on_issue_sample1][" body"], min_length=60))
print(full)

North Korea on Monday warned of a “last resort” attack against the US, ahead of UN Security Council’s deliberation on the latest US-led sanctions against the regime. Cutting off North Korea’s oil supplies has recently gained support as the most effective means to restrain Pyongyang, and to prevent further nuclear and missile provocations. ”Claiming that North Korea is “ready to take any final resort,” the North Korean Foreign Ministry hinted at a series of provocations. The move, which has largely been discredited in the US, may be gaining some traction in the US. McCain also stated that the US must make it known to North Korea that the price of aggression against the US is “extinction.”


## NER using Stanza (for PERSON, ORG and GPE)
- Here, we apply named entity recognition (NER) tool from Stanford's Stanza to extract PERSON, ORG, GPE, EVENT

## Stanza Example
- **Stanza** will later be used to extract named entities and POS tags from the articles in the Korea Herald
- The following block simply deals with the basic NER tutorial example from the official [website](https://stanfordnlp.github.io/stanza/).

In [93]:
# stanza.download('en')
nlp = stanza.Pipeline(lang='en', processors='tokenize,ner')
doc = nlp("Chris Manning teaches at Stanford University. He lives in the Bay Area.")
print(*[f'entity: {ent.text}\ttype: {ent.type}' for ent in doc.ents], sep='\n')

2021-05-29 09:44:00 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | combined  |
| ner       | ontonotes |

2021-05-29 09:44:00 INFO: Use device: gpu
2021-05-29 09:44:00 INFO: Loading: tokenize
2021-05-29 09:44:00 INFO: Loading: ner
2021-05-29 09:44:01 INFO: Done loading processors!


entity: Chris Manning	type: PERSON
entity: Stanford University	type: ORG
entity: the Bay Area	type: LOC


## Visualizing Topics
- `topic_model.get_topic_info()` identifies the topics, count of documents that are associated with each topic, and the topic's name
- `topic_model.get_topic(topic_id)` returns the word-wise probability for a given topic_id

In [60]:
freq = topic_model.get_topic_info()
print("Number of topics: ", len(freq["Topic"].unique()))
freq

Number of topics:  268


Unnamed: 0,Topic,Count,Name
0,-1,10753,-1_presidential_opposition_moon_political
1,57,821,57_japanese_japan_japans_tokyo
2,170,663,170_talks_meeting_discuss_beijing
3,265,617,265_prosecutors_choi_prosecution_scandal
4,194,561,194_victim_suspect_sexual_murder
...,...,...,...
263,160,10,160_dinner_melania_trump_chopsticks
264,89,10,89_train_rajin_railway_coal
265,103,10,103_care_patients_centers_sex
266,75,10,75_universiade_gymnastics_unification_secretar...


In [63]:
topic_model.get_topic(freq.iloc[10]["Topic"])

[('ballistic', 0.013203544875101546),
 ('missile', 0.013021940785550155),
 ('missiles', 0.011689697625838059),
 ('launch', 0.01121248576969845),
 ('rocket', 0.010134248867486694),
 ('icbm', 0.006965266830722244),
 ('intercontinental', 0.0050852210293444305),
 ('launches', 0.004922212570814694),
 ('flew', 0.0047412265128027155),
 ('pyongyang', 0.0038268246850366996)]

In [64]:
vis_topic_path = "./topic_korea_herald.html"
if not os.path.exists(vis_topic_path):
    fig = topic_model.visualize_topics()
    fig.write_html(vis_topic_path)

# Visualizing probabilities of a topic found in a document
- Need to set `calculate_probabilities` to **True** in `BERTopic` (computationally expensive).
- This step visualizes the probability of an article being assigned to the topic.

In [68]:
print("Topic (c-TF-IDF value for each word): ", topic_model.get_topics())

Topic (c-TF-IDF value for each word):  {-1: [('presidential', 0.001669799636024878), ('opposition', 0.0016505037767788543), ('moon', 0.0016191178083979846), ('political', 0.0015263583880778349), ('lee', 0.001489438602868024), ('ruling', 0.0014674341720740567), ('office', 0.0014325065543177597), ('missile', 0.0013755320550805528), ('year', 0.0013734002512777797), ('security', 0.0013683473649563716)], 0: [('evacuees', 0.0), ('evacuations', 0.0), ('evacuationhe', 0.0), ('evacuationfrom', 0.0), ('evacuationdesigned', 0.0), ('evacuationand', 0.0), ('evacuation', 0.0), ('evacuateddozens', 0.0), ('evacuated', 0.0), ('evacuators', 0.0)], 1: [('volkswagen', 0.07619518163210456), ('audi', 0.029499080378632046), ('carmaker', 0.02765257363886671), ('cars', 0.026592631535137082), ('german', 0.020427711793358717), ('software', 0.018232759382865827), ('diesel', 0.014114912414917115), ('volkswagens', 0.012469647502568596), ('automaker', 0.009888111012597725), ('carmakers', 0.008921018612331171)], 2: [

In [74]:
sample_idx = np.random.randint(df.shape[0])
print("\n[ Topic Number ]: ", topics[sample_idx])
topic_num = topics[sample_idx]
print("\n[ Words in topic] : ", topic_model.get_topic(topic_num))
print("\n[ Description] : ", df.iloc[sample_idx][" description"])
print("\n[ Topic probability (sample_idx) ] :", max(topic_prob[sample_idx]))
topic_model.visualize_distribution(topic_prob[sample_idx])


[ Topic Number ]:  88

[ Words in topic] :  [('virus', 0.0289962236592729), ('mosquitoes', 0.011910911910097728), ('mosquito', 0.009788019083350374), ('infected', 0.009114711895432013), ('pigs', 0.0089400169379566), ('mosquitoborne', 0.008823750575277803), ('abortion', 0.008363750527617812), ('livestock', 0.008006824466419973), ('farm', 0.007854267450622739), ('infection', 0.00671693079871792)]

[ Description] :  South Korea convened an emergency meeting Tuesday to assess and formulate a response to the Zika virus threat after the World Health Organization declared it an international health hazard.At the gathering, chaired by the Minister of Health and Welfare Chung Chin-youb in Seoul, the government said there is no need to be overly concerned about the virus at present, but said countermeasures will be set up.The meeting c...

[ Topic probability (sample_idx) ] : 0.9975137872449615


In [123]:
timestamps = df_month[" time"].tolist()
topics_over_time = topic_model.topics_over_time(body, topics, timestamps, nr_bins=20)
topics_over_time


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



Unnamed: 0,Topic,Words,Frequency,Timestamp,Name
0,-1,"year, years, saenuri, political, presidential",532,2014-12-30 21:41:45.600,-1_presidential_opposition_moon_politica...
1,0,"evacuees, evacuations, evacuationhe, evacuatio...",1,2014-12-30 21:41:45.600,0_evacuees_evacuations_evacuationhe_evac...
2,2,"australia, australian, australias, australians...",2,2014-12-30 21:41:45.600,2_australian_australia_australias_julie
3,7,"germany, german, munich, berlin, unification",11,2014-12-30 21:41:45.600,7_german_germany_berlin_hamburg
4,8,"lunar, realmeterthose, weekrealmeter, poll, ho...",1,2014-12-30 21:41:45.600,8_percentage_rating_poll_polling
...,...,...,...,...,...
2994,248,"ice, kma, kilju, earthquake, freezing",2,2017-11-07 04:48:00.000,248_punggyeri_tunnel_magnitude_bomb
2995,249,"professors, choi, influencepeddling, jaemok, h...",1,2017-11-07 04:48:00.000,249_chung_daughter_choi_kyunghee
2996,255,"bill, budget, bills, revision, referendum",7,2017-11-07 04:48:00.000,255_bill_bills_parties_budget
2997,262,"hosseinioun, trial, detention, lawyers, gaddafi",1,2017-11-07 04:48:00.000,262_court_justices_constitutional_justic...


In [122]:
topic_time_path = "./topics_over_time_topic_top10.html"
if not os.path.exists(topic_time_path):
    fig_time = topic_model.visualize_topics_over_time(topics_over_time, top_n=10)
    fig_time.write_html(topic_time_path)
fig_time


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



## Update Topics (To be updated)
When you have trained a model and viewed the topics and the words that represent them,
you might not be satisfied with the representation. Perhaps you forgot to remove
stopwords or you want to try out a different `n_gram_range`. We can use the function `update_topics` to update 
the topic representation with new parameters for `c-TF-IDF`: 

In [75]:
# topic_model.update_topics(docs, topics, n_gram_range=(1, 2))

In [217]:
on_issue1 = df.iloc[12014][" body"]
on_issue2 = df.iloc[11900][" body"]

In [219]:
on_issue1

'U.S. Secretary of State John Kerry will visit South Korea next week to discuss pending bilateral issues as well as President Park Geun-hye\'s trip to Washington, the State Department announced Monday.During the May 17-18 visit, Kerry will meet with President Park and Foreign Minister Yun Byung-se to "discuss a range of global, regional, and bilateral issues, as well as President Park\'s upcoming visit to the United States," the department said in a statement.Kerry\'s discussions in Seoul are expected to include the assessment of the situation on the Korean Peninsula, and Park\'s planned visit to the U.S. in mid-June, among other bilateral issues, a diplomatic source in Seoul said earlier in the day.According to government sources, Park plans to make an "official working visit" to the U.S. in mid-June for talks with Barack Obama to discuss ways to strengthen alliances and boost coordination to better deter the belligerent North Korea. Kerry last visited South Korea in February 2014.Bef

In [223]:
# on_issue1_pred = predictor.predict(on_issue1)
on_issue1_pred

{'verbs': [{'verb': 'will',
   'description': 'U.S. Secretary of State John Kerry [V: will] visit South Korea next week to discuss pending bilateral issues as well as President Park Geun - hye \'s trip to Washington , the State Department announced Monday . During the May 17 - 18 visit , Kerry will meet with President Park and Foreign Minister Yun Byung - se to " discuss a range of global , regional , and bilateral issues , as well as President Park \'s upcoming visit to the United States , " the department said in a statement . Kerry \'s discussions in Seoul are expected to include the assessment of the situation on the Korean Peninsula , and Park \'s planned visit to the U.S. in mid - June , among other bilateral issues , a diplomatic source in Seoul said earlier in the day . According to government sources , Park plans to make an " official working visit " to the U.S. in mid - June for talks with Barack Obama to discuss ways to strengthen alliances and boost coordination to better d

In [224]:
on_issue2_pred = predictor.predict(on_issue2)

In [236]:
on_issue2_pred["verbs"]

[{'verb': 'met',
  'description': '[ARG0: President Park Geun - hye] [V: met] [ARG1: with U.S. Secretary of State John Kerry] [ARGM-ADV: on North Korea and bilateral issues] [ARGM-TMP: Monday] , an official said , amid growing threats from the communist nation . The presidential official did not give any further details . Kerry was to meet with Foreign Minister Yun Byung - se later in the day . Kerry flew into Seoul on Sunday afternoon from Beijing , where he voiced hope that a nuclear deal with Iran will send a positive message to North Korea . President Park Geun - hye ( right ) meets U.S. Secretary of State John Kerry in Seoul , Monday . ( Yonhap)In recent weeks , the North has committed a series of provocative acts , including an alleged test - launch of a ballistic missile from a submarine and the firing of artillery shells into the waters just north of the Yellow Sea border with the South . In their talks , the second round this year , Yun and Kerry are expected to reaffirm that 

In [228]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
lemmatizer.lemmatize("meets")

'meet'

In [241]:
doc1 = nlp(on_issue1)
doc2 = nlp(on_issue2)

ent_types = ["GPE", "PERSON", "ORG", "LOC"]
doc1ents = set()
doc2ents = set()
for ent in doc1.ents:
    if ent.type in ent_types:
        doc1ents.add(ent.text)

for ent in doc2.ents:
    if ent.type in ent_types:
        doc2ents.add(ent.text)

print("[ doc1ents ]\n", doc1ents)
print("\n[ doc2ents ]\n", doc2ents)

[ doc1ents ]
 {'Kerry', 'the United States', 'Barack Obama', 'North Korea', 'Yun Byung', 'Washington', 'the Korean Peninsula', 'Northeast Asia', 'Seoul', 'State', 'Park', "Park Geun-hye's", 'Beijing', 'Japan', 'the State Department', 'Xi Jinping', 'U.S.', 'South Korea', 'John Kerry'}

[ doc2ents ]
 {'Kerry', 'Barack Obama', 'Yongsan', 'North Korea', 'Iran', 'Washington', 'Park Geun-hye', 'Yun Byung-se', 'Seoul', 'Yun', 'State', 'Yellow Sea', 'Park', 'North', 'Beijing', 'Korea University', 'Pyongyang', 'U.S.', 'South', 'John Kerry'}


In [251]:
# Check to see if the entities from NER exist as ARG0 or ARG1 in the given docs
doc1ents_list = list(doc1ents)
doc2ents_list = list(doc2ents)
doc1verbs = set()  # TODO: What's the use of VERBs?
doc2verbs = set()

arg0_doc1 = []
arg1_doc1 = []
arg0_doc2 = []
arg1_doc2 = []
for i, pred in enumerate(on_issue1_pred["verbs"]):
    doc1verbs.add(pred["verb"])
    arg_list = re.findall(r"\[(ARG\d[A-Za-z0-9_.: ]+)\]", pred["description"])
    for arg in arg_list:
        if "ARG0" in arg and any(ent in arg for ent in doc1ents_list):
            arg0_doc1.append(arg)
        elif "ARG1" in arg and any(ent in arg for ent in doc1ents_list):
            arg1_doc1.append(arg)
    
for i, pred in enumerate(on_issue2_pred["verbs"]):
    doc2verbs.add(pred["verb"])
    arg_list = re.findall(r"\[(ARG\d[A-Za-z0-9_.: ]+)\]", pred["description"])
    for arg in arg_list:
        if "ARG0" in arg and any(ent in arg for ent in doc2ents_list):
            arg0_doc2.append(arg)
        elif "ARG1" in arg and any(ent in arg for ent in doc2ents_list):
            arg1_doc2.append(arg)

print("[ doc1verbs ]\n", doc1verbs)
print("\n[ doc2verbs ]\n", doc2verbs)
print("\n[ ARG0 in doc1 ]", arg0_doc1)
print("\n[ ARG1 in doc1 ]", arg1_doc1)
print("\n[ ARG0 in doc2 ]", arg0_doc2)
print("\n[ ARG1 in doc2 ]", arg1_doc2)

[ doc1verbs ]
 {'expected', 'pending', 'meet', 'said', 'include', 'visit', 'advance', 'According', 'plans', 'make', 'boost', 'Jinping', 'deter', 'discuss', 'are', 'travel', 'strengthen', 'announced', 'visited', 'will', 'planned', 'does', 'visiting'}

[ doc2verbs ]
 {'focus', 'expected', 'heading', 'has', 'meet', 'said', 'visit', 'flew', 'Yonhap)In', 'voiced', 'plans', 'send', 'make', 'was', 'alleged', 'meets', 'did', 'give', 'are', 'growing', 'is', 'met', 'will', 'reaffirm', 'preparing', 'committed', 'dealing', 'including', 'deliver'}

[ ARG0 in doc1 ] ['ARG0: U.S. Secretary of State John Kerry', 'ARG0: U.S. Secretary of State John Kerry', 'ARG0: the State Department', 'ARG0: Kerry', 'ARG0: Kerry', 'ARG0: Park', 'ARG0: Park', 'ARG0: Kerry', 'ARG0: Kerry', 'ARG0: Kerry', 'ARG0: Kerry', 'ARG0: the State Department', 'ARG0: the source in Seoul']

[ ARG1 in doc1 ] ['ARG1: South Korea', 'ARG1: the planned visit to the U.S. of President Xi', 'ARG1: ways to strengthen alliances and boost coor

In [248]:
# Find all patterns that are contained with the square bracket [] - for verb
re.findall(r"\[([A-Za-z0-9_.: ]+)\]", on_issue1_pred["verbs"][1]["description"])

['ARG0: U.S. Secretary of State John Kerry', 'V: visit', 'ARG1: South Korea']

In [247]:
# Find all patterns that ar contained with the square bracket [ARG#: ...]
re.findall(r"\[(ARG\d[A-Za-z0-9_.: ]+)\]", on_issue1_pred["verbs"][1]["description"])

['ARG0: U.S. Secretary of State John Kerry', 'ARG1: South Korea']

In [258]:
# North Korea nuclear provocation vs. Trump tweets about missile launch

In [None]:
# Document Similarity (between two positive, On-Issue documents)
on_issue1 = df.iloc[11900][" body"]
on_issue2 = df.iloc[12014][" body"]
on_issue_docs = [on_issue1, on_issue2]

from sentence_transformers import SentenceTransformer, util
sentence_model = SentenceTransformer("distilbert-base-nli-mean-tokens")
embeddings = sentence_model.encode(on_issue_docs, show_progress_bar=True)

print("Embedding (shape): ", embeddings.shape)
cos_sim = util.pytorch_cos_sim(embeddings[0], embeddings[1])

print("Document Similarity: ", cos_sim)