In [618]:
import torch
import pandas as pd
import os 
import glob
import json
import numpy as np
import copy
import re
import random
import pickle
from collections import defaultdict, OrderedDict

In [2]:
import stanza
from tqdm.notebook import tqdm

## Load the Korea Herald Dataset
- Store the dataset in `df`
- Copy `df` into two different versions:
- `df_month` and `df_topic`
- `df_month` is used to reduce the time frame into months **(i.e. 2017-07-23 17:24:31 -> 2017-07-01)**.
- `df_topic` is used to group articles by the assigned topics.

In [524]:
PATH="./data"
df_temp = []
if os.path.exists(PATH):
    for file in glob.glob(os.path.join(PATH, "*.json")):
        with open(file, "r") as f:
            data=json.load(f)
            df_temp.append(pd.DataFrame.from_dict(data))

df = pd.concat(df_temp, ignore_index=True)

In [540]:
df.head(5)

Unnamed: 0,title,author,time,description,body,section
0,A snapshot of multiculturalism in South Korea,Lee Sun-young,2018-01-01 17:07:00,With birthrates persistently low and the senio...,With birthrates persistently low and the senio...,Social affairs
1,[Weekender] Korea’s dynamic 2017,Choi He-suk,2018-01-01 13:22:00,From North Korea’s nuclear weapons program nea...,From North Korea’s nuclear weapons program nea...,Social affairs
2,People's Party members support Ahn's push for ...,Yonhap,2017-12-31 16:18:00,The leader of the center-left People's Party g...,The leader of the center-left People's Party g...,Politics
3,[Newsmaker] Panamanian vessel probed over susp...,Yonhap,2017-12-31 14:55:00,PYEONGTAEK -- South Korea has seized and insp...,PYEONGTAEK -- South Korea has seized and insp...,North Korea
4,Hong Kong ship crew questioned in S. Korea for...,AFP,2017-12-30 15:44:00,The crew of a Hong Kong-registered ship have b...,The crew of a Hong Kong-registered ship have b...,North Korea


In [537]:
df_month = copy.deepcopy(df)  # Used to generalize time frame to "months" instead of hours and days.
df_topic = copy.deepcopy(df)  # Used to append topic column after running BERTopic

## Replacing datetime in `time` from the dataset
- To group the data instances in `df_month` by months.
- This allows us to effectively visualize `topic_model.visualize_topics_over_time(topics_over_time)`

In [539]:
pattern = re.compile(r"(\d{4})-(\d{2})-(\d{2}) (\d{2}):(\d{2}):(\d{2})")
for i, row in tqdm(df.iterrows(), total=df_month.shape[0]):
    parsed_datetime = list(pattern.findall(df.iloc[i][" time"])[0])[:3]
    parsed_datetime[2] = "01"
    df_month.iloc[i][" time"] = "-".join(parsed_datetime)
df_month.head(5)

  0%|          | 0/23769 [00:00<?, ?it/s]

Unnamed: 0,title,author,time,description,body,section
0,A snapshot of multiculturalism in South Korea,Lee Sun-young,2018-01-01,With birthrates persistently low and the senio...,With birthrates persistently low and the senio...,Social affairs
1,[Weekender] Korea’s dynamic 2017,Choi He-suk,2018-01-01,From North Korea’s nuclear weapons program nea...,From North Korea’s nuclear weapons program nea...,Social affairs
2,People's Party members support Ahn's push for ...,Yonhap,2017-12-01,The leader of the center-left People's Party g...,The leader of the center-left People's Party g...,Politics
3,[Newsmaker] Panamanian vessel probed over susp...,Yonhap,2017-12-01,PYEONGTAEK -- South Korea has seized and insp...,PYEONGTAEK -- South Korea has seized and insp...,North Korea
4,Hong Kong ship crew questioned in S. Korea for...,AFP,2017-12-01,The crew of a Hong Kong-registered ship have b...,The crew of a Hong Kong-registered ship have b...,North Korea


## Using [BERTopic](https://maartengr.github.io/BERTopic/tutorial/algorithm/algorithm.html) for topic modeling & clustering
- BERTopic utilizes Sentence-BERT to create document embeddings.
- We use it to:
    1. Clustering articles according to their topic.
    2. Identify events per time frame.
    3. Calculate intra-topic pairwise document similarity and sort by time to discvoer **On-Issue** articles.
    4. Discover **Related-Issue** articles that are not **On-Issue** articles.

In [10]:
from bertopic import BERTopic

In [11]:
# Korea herald
model_path = "./bertopic_khearld_model.pth"
if not os.path.exists(model_path):
    '''
    BERTopic()
    
    - set `calculate_probabilities` to True to return the `probabilities`
    - set `nr_topics` to `auto` for automatic topic reduction with similarity over 0.9
    '''
    topic_model = BERTopic(calculate_probabilities=True)# BERTopic(calculate_probabilities=True, nr_topics="auto")
    topics, topic_prob = topic_model.fit_transform(body)
    topic_model.save(model_path)
else:
    print("Model already exists. Loading model from {} ...".format(model_path))
    topic_model = BERTopic.load(model_path)
    topics, topic_prob= topic_model.transform(df[" body"])

Model already exists. Loading model from ./bertopic_khearld_model.pth ...


In [12]:
print("topics (len): ", len(topics))
print(topics)
print("topic_prob (shape): ", topic_prob.shape)

topics (len):  23769
[ -1  -1  -1 ...  -1  -1 228]
topic_prob (shape):  (23769, 267)


## Create `df_topic` to group_by `topic`
- Assign the topics from BERTopic to every article

In [590]:
df_topic["topic"] = topics
group_df = df_topic.groupby("topic")
group_df.get_group(22).head(5)

Unnamed: 0,title,author,time,description,body,section,topic
7751,Ex-US Secretary of State Kerry joins Carnegie ...,KH디지털2,2017-03-03 09:47:00,Former US Secretary of State John Kerry has jo...,Former US Secretary of State John Kerry has jo...,International,22
10340,Kerry has no plans to meet with N. Korean FM: ...,KH디지털2,2015-08-05 09:32:00,U.S. Secretary of State John Kerry has no plan...,U.S. Secretary of State John Kerry has no plan...,North Korea,22
11824,Kerry's remark about THAAD meant 'internal U.S...,KH디지털2,2015-05-21 09:52:00,U.S. Secretary of State John Kerry was referri...,U.S. Secretary of State John Kerry was referri...,Defense,22
11864,Kerry thanks lawmaker for helping U.S. ambassa...,KH디지털2,2015-05-19 17:28:00,U.S. Secretary of State John Kerry has thanked...,U.S. Secretary of State John Kerry has thanked...,Politics,22
11900,Park meets with Kerry on N. Korea,KH디지털2,2015-05-18 10:52:00,President Park Geun-hye met with U.S. Secretar...,President Park Geun-hye met with U.S. Secretar...,International,22


In [559]:
# Randomly sample some topic_idx
topic_idx = random.sample(list(topic_idx_on_issue.keys()), 1)[0]
print("Number of article pairs: {}".format(len(topic_idx_on_issue[topic_idx])))
print("[ Topic_idx : {} ]".format(topic_idx))
print(">> TOPIC INFO >> {}".format(topic_model.get_topic(topic_idx)), end="\n")

Number of article pairs: 2162
[ Topic_idx : 210 ]
>> TOPIC INFO >> [('sanctions', 0.0177671787659192), ('coal', 0.012351948965624646), ('entities', 0.0066345724897857415), ('unsc', 0.006430243703105032), ('oil', 0.005954842057650002), ('imports', 0.00583182142390142), ('treasury', 0.004771416630785954), ('currency', 0.004246924392019113), ('iron', 0.0040253793251685595), ('blacklisted', 0.0038836375205140265)]


# On-Issue & Related-Issue Event Extraction
This part explains the extraction of On-Issue and Related-Issue events

1. Using `SentenceTransformer`, we generate the Seed article embedding (`seed_embedding`) and the intra-topic article embeddings (`doc_embeddings`).
2. Using the document vectors, we calculate cosine similarity for every `doc_embedding` against the `seed_embedding` to see its relevance.
3. After filtering doc_embeddings that have cosine similarity < 0.9, we sort them by time and select the closest one.
4. After Step 3, we have `on_issue_df`, a collection of **On-Issue articles**
5. Using `on_issue_df`, we select `related_issue_df`, which are a collection of articles that are NOT On-Issue. We treat these articles as the **Related-Issue articles**.

In [608]:
from sentence_transformers import SentenceTransformer, util
from summarizer import TransformerSummarizer
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
sentence_model = SentenceTransformer("distilbert-base-nli-mean-tokens")
summarizer_model = TransformerSummarizer(transformer_type="XLNet",transformer_model_key="xlnet-base-cased")

### Construct new dataframe for topic based on similarity score > 0.90 (Seed Article)

In [502]:
def on_issue_select(seed_idx, group_df, sentence_model):
    '''
    Greedily chooses On-Issue events given a seed article after constructing a cosine-similarity based graph
    
    Args:
    
    seed_idx - seed document index from the given topic (i.e., Issue)
    group_df- contains info about the docs and seed doc within the same topic
    sentence_model - Sentence encoder model (e.g., SentenceTransformer("distilbert-base-nli-mean-tokens"))
    
    Return:
    
    onissue_list : pd.DataFrame
    
    '''
    # Termination criteria
    if group_df.shape[0] < 1:
        return None
    
    else:
        topic_idx2doc = dict(group_df[" body"])
        doc_idx, docs = zip(*topic_idx2doc.items())
        key2idx = dict(zip(list(range(len(doc_idx))), doc_idx))  # `key2idx` is Dict[key: document_idx]
        idx2key = {v: k for k, v in key2idx.items()}

        doc_embeddings = torch.from_numpy(sentence_model.encode(docs))
        seed_embedding = doc_embeddings[idx2key[seed_idx]].unsqueeze(0)
        cos_sim = util.pytorch_cos_sim(seed_embedding, doc_embeddings)

        sim_doc_idx = []
        for i, sim_score in enumerate(cos_sim[0]):
            if sim_score.item() > 0.9 and sim_score.item() < 0.98:  # Prevent same articles from being included
                sim_doc_idx.append(key2idx[i])
        if len(sim_doc_idx) < 1:  # No document with cosine similiarity > 0.9, then terminate
            return None
        group_df_seed = group_df.loc[sim_doc_idx]
        group_df_seed.sort_values(by=[" time"], inplace=True)  # Sort by time
        next_seed_doc = group_df_seed.iloc[0] # Greedily select the "closest" article in time
        next_seed_idx = group_df_seed.index[0]

        # Remove documents that are earlier in time
        next_group_df = group_df[group_df[" time"] >= group_df.loc[next_seed_idx][" time"]]
        next_events = on_issue_select(next_seed_idx, next_group_df, sentence_model)
        
    onissue_list = []
    onissue_list.append(next_seed_doc)  # The next event selected
    if next_events is not None:
        onissue_list.extend(next_events)
    
    return onissue_list

In [511]:
def related_issue_select(on_issue_df, group_df, sentence_model):
    '''
    Choose Related-Issue articles based on On-Issue articles from the same topic.
    
    Args:
    
    on_issue_df - On-Issue articles retrieved by `on_issue_select()` method.
    group_df- contains info about the docs and seed doc within the same topic.
    sentence_model - Sentence encoder model (e.g., SentenceTransformer("distilbert-base-nli-mean-tokens")).
    
    Return:
    
    related_issue_df : pd.DataFrame
    
    '''
    on_issue_idx = list(on_issue_df.index)
    rel_issue_idx = [rel_idx for rel_idx in group_df.index if rel_idx not in on_issue_idx]
    related_issue_df = group_df.loc[rel_issue_idx]
    
    return related_issue_df

In [653]:
def select_seed(topic_model, topic_idx, group_df, summarizer, tokenizer):
    '''
    Select seeds based on the following criteria:
    1. Choose the top-3 most similar documents to the given Issue (ie., topic from BERTopic).
    (By choosing the documents with the most top-10 terms for the chosen topic.)
    2. Summarize the documents using XLNet summarizer to reduce time complexity.
    3. Choose the article with the earliest (i.e., least) date (heuristic to ensure that there are subsequent articles coming along).
    '''
    idx2overlap = {}
    for i, row in group_df.iterrows():
        overlap = 0
        topic_terms, c_tfidf = map(list, zip(*topic_model.get_topic(topic_idx)))
        summ_doc = summarizer(row[" body"], min_length=60).lower()  # Summarize the body
        summ_doc_tok = tokenizer.tokenize(summ_doc)  # Tokenize the temrs to match the terms from BERTopic
        overlap = len([term for term in topic_terms if term in summ_doc_tok])
        idx2overlap[i] = overlap
    idx2overlap_sorted = OrderedDict(sorted(idx2overlap.items(), key=lambda item: item[1], reverse=True))
    candidate_seeds, _ = map(list, zip(*list(idx2overlap_sorted.items())[:3]))
    candidate_docs = group_df.loc[candidate_seeds]
    candidate_docs.sort_values(by=[" time"], inplace=True)

    return candidate_docs.iloc[0]

In [661]:
# Example of `select_seed()`
seed_sample = select_seed(topic_model, 22, group_df.get_group(22), summarizer_model, tokenizer)
seed_sample.name

11905

In [662]:
def generate_issue_df(topic_idx, group_df, sentence_model):
    '''
    Args
    
    topic_idx - Topic index from the List of topics from BERTopic() for each article
    group_df - using pandas.DataFrame for dataset that has been grouped by `topics`
    issue_type - Decides if issue_type is either `onissue` or `relissue`
    '''
    # Group articles by topic
    group_df_sample = group_df.get_group(topic_idx)
    group_df_sample.sort_values(by=[" time"], inplace=True)  # Sort by time (Greedy search according to minimum time distance)
    
    seed_doc = select_seed(topic_model, topic_idx, group_df_sample, summarizer_model, tokenizer)
    seed_idx = seed_doc.name
    
    on_issue_docs = on_issue_select(seed_idx, group_df_sample, sentence_model)
    on_issue_df = pd.DataFrame(on_issue_docs)
    related_issue_docs = related_issue_select(on_issue_df, group_df_sample, sentence_model)
    related_issue_df = related_issue_docs
        
    return on_issue_df, related_issue_df

- Try to generate **On-Issue** and **Related-Issue** articles from topic-grouped DataFrames

In [663]:
for topic_idx in tqdm(topics[topics != -1]):
    topic_idx = 22

    # Generate On-Issue and Related-Issue articles
    on_issue_df, related_issue_df = generate_issue_df(topic_idx, group_df, sentence_model)
    
    break

  0%|          | 0/13016 [00:00<?, ?it/s]



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [664]:
on_issue_df

Unnamed: 0,title,author,time,description,body,section,topic
12019,Kerry to visit S. Korea next week: source,배지숙,2015-05-11 21:30:00,U.S. Secretary of State John Kerry will visit ...,U.S. Secretary of State John Kerry will visit ...,North Korea,22
11998,N. Korea's missile test to be on agenda for Yu...,KH디지털2,2015-05-12 17:10:00,South Korean Foreign Minister Yun Byung-se and...,South Korean Foreign Minister Yun Byung-se and...,International,22
11917,"Kerry due in Seoul for talks on N. Korea, alli...",KH디지털2,2015-05-17 10:49:00,U.S. Secretary of State John Kerry is schedule...,U.S. Secretary of State John Kerry is schedule...,North Korea,22
11915,"Kerry in South Korea to talk security, cyber i...",김연세,2015-05-17 18:43:00,U.S. Secretary of State John Kerry is in South...,U.S. Secretary of State John Kerry is in South...,Foreign Policy,22
11900,Park meets with Kerry on N. Korea,KH디지털2,2015-05-18 10:52:00,President Park Geun-hye met with U.S. Secretar...,President Park Geun-hye met with U.S. Secretar...,International,22
19149,Kerry discusses N.K. nuclear test with Russian FM,KH디지털2,2016-01-12 09:31:00,U.S. Secretary of State John Kerry spoke by ph...,U.S. Secretary of State John Kerry spoke by ph...,North Korea,22


In [665]:
related_issue_df

Unnamed: 0,title,author,time,description,body,section,topic
14594,Kerry heads to India seeking economic gains,최희석,2015-01-10 14:51:00,U.S. Secretary of State John Kerry left late F...,U.S. Secretary of State John Kerry left late F...,International,22
12014,Kerry to visit S. Korea next week,KH디지털2,2015-05-12 09:45:00,U.S. Secretary of State John Kerry will visit ...,U.S. Secretary of State John Kerry will visit ...,International,22
11905,N.K. missile test unacceptable: Kerry,Shin Hyon-hee,2015-05-17 20:40:00,U.S. Secretary of State John Kerry on Saturday...,U.S. Secretary of State John Kerry on Saturday...,North Korea,22
11904,"Yun, Kerry reaffirm strong alliance against N....",KH디지털2,2015-05-18 09:21:00,Foreign Minister Yun Byung-se and U.S. Secreta...,Foreign Minister Yun Byung-se and U.S. Secreta...,Foreign Policy,22
11864,Kerry thanks lawmaker for helping U.S. ambassa...,KH디지털2,2015-05-19 17:28:00,U.S. Secretary of State John Kerry has thanked...,U.S. Secretary of State John Kerry has thanked...,Politics,22
11824,Kerry's remark about THAAD meant 'internal U.S...,KH디지털2,2015-05-21 09:52:00,U.S. Secretary of State John Kerry was referri...,U.S. Secretary of State John Kerry was referri...,Defense,22
10340,Kerry has no plans to meet with N. Korean FM: ...,KH디지털2,2015-08-05 09:32:00,U.S. Secretary of State John Kerry has no plan...,U.S. Secretary of State John Kerry has no plan...,North Korea,22
19056,Kerry to visit China for talks on N. Korea,Cho Chung-un,2016-01-16 10:44:00,U.S. Secretary of State John Kerry will visit ...,U.S. Secretary of State John Kerry will visit ...,,22
18902,"FM Yun, Kerry reaffirm strong response to N. K...",이우영,2016-01-24 20:38:00,SEOUL (Yonhap) – South Korean Foreign Minister...,SEOUL (Yonhap) – South Korean Foreign Minister...,,22
18866,Kerry: N.K. leader 'questionable in terms of j...,KH디지털2,2016-01-26 09:16:00,U.S. Secretary of State John Kerry said Monday...,U.S. Secretary of State John Kerry said Monday...,North Korea,22


In [518]:
def print_onissue(onissue_df, on_issue_name=""):
    '''
    Args
    
    onissue_df - pandas.DataFrame for On-Issue articles
    on_issue_name - String that summarizes the issue
    
    '''
    print("[ Issue ]\n")
    print(on_issue_name, end="\n\n")

    print("[ On-Issue Events]\n")
    detailed_info_list = {}
    for i, (idx, row) in enumerate(onissue_df.iterrows()):
        detailed_info = defaultdict(list)
        doc_summ = nlp(row[" body"])
        print(row["title"], end="  -->  ")
        for ent in doc_summ.ents:
            if ent.type == "PERSON":
                detailed_info["Person"].append(ent.text)
            elif ent.type == "ORG":
                detailed_info["Organization"].append(ent.text)
            elif ent.type == "LOC" or ent.type == "GPE":
                detailed_info["Place"].append(ent.text)

        detailed_info_list[idx] = detailed_info

    print("\n\n[ Detailed Information (per event)]")
    for doc_idx, detail_dict in detailed_info_list.items():
        print("\nEvent: {}\n".format(df.iloc[doc_idx]["title"]))
        for ent_type, ent_list in detail_dict.items():
            print("\t- {} : ".format(ent_type), end=" ")
            for ent in set(ent_list):
                print(ent, end=", ")
            print()

In [519]:
print_onissue(on_issue_df, on_issue_name="U.S. and South Korea on North Korea Missile Test")

[ Issue ]

U.N. Sanctions on North Korea

[ On-Issue Events]

Kerry to visit S. Korea next week: source  -->  N. Korea's missile test to be on agenda for Yun, Kerry talks  -->  Kerry due in Seoul for talks on N. Korea, alliance  -->  Kerry in South Korea to talk security, cyber issues  -->  Park meets with Kerry on N. Korea  -->  Kerry discusses N.K. nuclear test with Russian FM  -->  

[ Detailed Information (per event)]

Event: Kerry to visit S. Korea next week: source

	- Place :  Beijing, Japan, North Korea, Washington, the Korean Peninsula, South Korea, U.S., Seoul, Northeast Asia, 
	- Organization :  State, 
	- Person :  Barack Obama, Kerry, John Kerry, Yun Byung, Park Geun-hye's, Park, 

Event: N. Korea's missile test to be on agenda for Yun, Kerry talks

	- Person :  Kerry, Noh, John Kerry, Kim Jong-un, Yun, Noh Kwang, Park Geun-hye's, Park, Yun Byung-se, 
	- Place :  Asia, Beijing, North Korea, North Korea's, South Korea, U.S., Seoul, 
	- Organization :  U.N. Security Council,

In [517]:
def print_relissue(relissue_df, rel_issue_name=""):
    '''
    Args
    
    relissue_df - pandas.DataFrame for Related-Issue articles
    rel_issue_name - String that summarizes the issue
    
    '''
    print("[ Issue ]\n")
    print(rel_issue_name, end="\n\n")

    print("[ Related-Issue Events]\n")
    detailed_info_list = {}
    for i, (idx, row) in enumerate(relissue_df.iterrows()):
        detailed_info = defaultdict(list)
        doc_summ = nlp(row[" body"])
        print(row["title"], end=" , ")
        for ent in doc_summ.ents:
            if ent.type == "PERSON":
                detailed_info["Person"].append(ent.text)
            elif ent.type == "ORG":
                detailed_info["Organization"].append(ent.text)
            elif ent.type == "LOC" or ent.type == "GPE":
                detailed_info["Place"].append(ent.text)

        detailed_info_list[idx] = detailed_info

    print("\n\n[ Detailed Information (per event)]")
    for doc_idx, detail_dict in detailed_info_list.items():
        print("\nEvent: {}\n".format(df.iloc[doc_idx]["title"]))
        for ent_type, ent_list in detail_dict.items():
            print("\t- {} : ".format(ent_type), end=" ")
            for ent in set(ent_list):
                print(ent, end=", ")
            print()

In [520]:
print_relissue(related_issue_df, rel_issue_name="U.S. and South Korea on North Korea Missile Test")

[ Issue ]

U.S. on North Korea

[ Related-Issue Events]

Kerry heads to India seeking economic gains , Kerry to visit S. Korea next week , N.K. missile test unacceptable: Kerry , Yun, Kerry reaffirm strong alliance against N. Korea , Kerry thanks lawmaker for helping U.S. ambassador during knife attack , Kerry's remark about THAAD meant 'internal U.S. discussions': State Department , Kerry has no plans to meet with N. Korean FM: State Department , Kerry to visit China for talks on N. Korea , FM Yun, Kerry reaffirm strong response to N. Korea's nuke test , Kerry: N.K. leader 'questionable in terms of judgment' , FM Yun, Kerry agree on closer coordination over N.K. sanctions , FM Yun, Kerry agree on closer coordination over N.K. sanctions , Kerry: U.S., Russia to meet Saturday on Syria ceasefire , Response to NK nuclear test key focus of trilateral talks: State Department , Top US, Chinese diplomats hold talks in New York , FM Yun, Kerry reaffirm strong alliance amid uncertainties , Ex-U

### Naive Summarization of each article using XLNet TextSummarizer
- Without SRL-BERT and ARG0 <-> ARG1 switch analysis

In [186]:
from summarizer import TransformerSummarizer
nlp = stanza.Pipeline(lang='en', processors='tokenize,ner')  # Tokenize to retrieve sentence-level information

2021-06-04 06:28:08 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | combined  |
| ner       | ontonotes |

2021-06-04 06:28:08 INFO: Use device: gpu
2021-06-04 06:28:08 INFO: Loading: tokenize
2021-06-04 06:28:08 INFO: Loading: ner
2021-06-04 06:28:09 INFO: Done loading processors!


### `TransformerSummarizer` for Text Summarization
- `TransformerSummarizer` using XLNet-base model to produce a summarized version of the On-Issue and Related-Issue articles.

In [133]:
summarizer_model = TransformerSummarizer(transformer_type="XLNet",transformer_model_key="xlnet-base-cased")
# Example
full = ''.join(summarizer_model(df.iloc[on_issue_sample1][" body"], min_length=60))
print(full)

North Korea criticized South Korea Tuesday for reaching a deal with Japan over Tokyo's wartime sex slavery, calling it a "humiliating agreement" that missed holding the Japanese state responsible for such atrocities.


In [None]:
stop_iter = 10
print("[ Issue ]\n")
print("South Korea Comfort Women Issue regarding Japan's War Crimes\n")

print("[ On-Issue Events]\n")
detailed_info = defaultdict(list)
detailed_info_list = {}
for i, (idx, row) in enumerate(df_onissue.iterrows()):
    text_summ = ''.join(summarizer_model(row[" body"]))
    doc_summ = nlp(text_summ)
    # first_sent = doc_summ.sentences[0].text
    # print(first_sent, end="\n --> \n")
    print(row["title"], end="\n --> \n")
    for ent in doc_summ.ents:
        if ent.type == "PERSON":
            detailed_info["Person"].append(ent.text)
        elif ent.type == "ORG":
            detailed_info["Organization"].append(ent.text)
        elif ent.type == "LOC" or ent.type == "GPE":
            detailed_info["Place"].append(ent.text)
    
    detailed_info_list[idx] = detailed_info
    
    if i > stop_iter:
        break

print("[ Detailed Information (per event)]")
for doc_idx, detail_dict in detailed_info_list.items():
    print("\nEvent: {}\n".format(df.iloc[doc_idx]["title"]))
    for ent_type, ent_list in detail_dict.items():
        print("\t- {} : {}".format(ent_type, set(ent_list)))

## Visualizing Topics
- `topic_model.get_topic_info()` identifies the topics, count of documents that are associated with each topic, and the topic's name
- `topic_model.get_topic(topic_id)` returns the word-wise probability for a given topic_id

In [60]:
freq = topic_model.get_topic_info()
print("Number of topics: ", len(freq["Topic"].unique()))
freq

Number of topics:  268


Unnamed: 0,Topic,Count,Name
0,-1,10753,-1_presidential_opposition_moon_political
1,57,821,57_japanese_japan_japans_tokyo
2,170,663,170_talks_meeting_discuss_beijing
3,265,617,265_prosecutors_choi_prosecution_scandal
4,194,561,194_victim_suspect_sexual_murder
...,...,...,...
263,160,10,160_dinner_melania_trump_chopsticks
264,89,10,89_train_rajin_railway_coal
265,103,10,103_care_patients_centers_sex
266,75,10,75_universiade_gymnastics_unification_secretar...


In [63]:
topic_model.get_topic(freq.iloc[10]["Topic"])

[('ballistic', 0.013203544875101546),
 ('missile', 0.013021940785550155),
 ('missiles', 0.011689697625838059),
 ('launch', 0.01121248576969845),
 ('rocket', 0.010134248867486694),
 ('icbm', 0.006965266830722244),
 ('intercontinental', 0.0050852210293444305),
 ('launches', 0.004922212570814694),
 ('flew', 0.0047412265128027155),
 ('pyongyang', 0.0038268246850366996)]

In [64]:
vis_topic_path = "./topic_korea_herald.html"
if not os.path.exists(vis_topic_path):
    fig = topic_model.visualize_topics()
    fig.write_html(vis_topic_path)

# Visualizing probabilities of a topic found in a document
- Need to set `calculate_probabilities` to **True** in `BERTopic` (computationally expensive).
- This step visualizes the probability of an article being assigned to the topic.

In [570]:
sample_idx = np.random.randint(df.shape[0])
print("\n[ Topic Number ]: ", topics[sample_idx])
topic_num = topics[sample_idx]
print("\n[ Words in topic] : ", topic_model.get_topic(topic_num))
print("\n[ Description] : ", df.iloc[sample_idx][" description"])
print("\n[ Topic probability (sample_idx) ] :", max(topic_prob[sample_idx]))
topic_model.visualize_distribution(topic_prob[sample_idx])


[ Topic Number ]:  228

[ Words in topic] :  [('deployment', 0.008912433461129935), ('armament', 0.007390715914431122), ('defense', 0.006806755180168765), ('tactical', 0.00581316304826488), ('nuclear', 0.005508305437870392), ('missile', 0.004542051017076492), ('terminal', 0.004320222663567678), ('nukes', 0.003861616867407047), ('deploy', 0.0036770829583211016), ('allies', 0.002893668542359177)]

[ Description] :  North Koreans are feeling insecure about infrastructure damage caused by the reclusive country's latest nuclear test but remain indifferent to its outcome, U.S.-based media Radio Free Asia said Sunday, citing Japanese media outlet Asia Press.A resident in Ryanggang Province bordering China told Asia Press that a number of apartments located some 150 kilometers away from the Punggye-ri nuclear test site in the country...

[ Topic probability (sample_idx) ] : 0.9637397343767966


In [123]:
timestamps = df_month[" time"].tolist()
topics_over_time = topic_model.topics_over_time(body, topics, timestamps, nr_bins=20)
topics_over_time


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



Unnamed: 0,Topic,Words,Frequency,Timestamp,Name
0,-1,"year, years, saenuri, political, presidential",532,2014-12-30 21:41:45.600,-1_presidential_opposition_moon_politica...
1,0,"evacuees, evacuations, evacuationhe, evacuatio...",1,2014-12-30 21:41:45.600,0_evacuees_evacuations_evacuationhe_evac...
2,2,"australia, australian, australias, australians...",2,2014-12-30 21:41:45.600,2_australian_australia_australias_julie
3,7,"germany, german, munich, berlin, unification",11,2014-12-30 21:41:45.600,7_german_germany_berlin_hamburg
4,8,"lunar, realmeterthose, weekrealmeter, poll, ho...",1,2014-12-30 21:41:45.600,8_percentage_rating_poll_polling
...,...,...,...,...,...
2994,248,"ice, kma, kilju, earthquake, freezing",2,2017-11-07 04:48:00.000,248_punggyeri_tunnel_magnitude_bomb
2995,249,"professors, choi, influencepeddling, jaemok, h...",1,2017-11-07 04:48:00.000,249_chung_daughter_choi_kyunghee
2996,255,"bill, budget, bills, revision, referendum",7,2017-11-07 04:48:00.000,255_bill_bills_parties_budget
2997,262,"hosseinioun, trial, detention, lawyers, gaddafi",1,2017-11-07 04:48:00.000,262_court_justices_constitutional_justic...


In [122]:
topic_time_path = "./topics_over_time_topic_top10.html"
if not os.path.exists(topic_time_path):
    fig_time = topic_model.visualize_topics_over_time(topics_over_time, top_n=10)
    fig_time.write_html(topic_time_path)
fig_time


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.

