In [9]:
pip install transformers==3.0.2 sentence_transformers==0.3.3

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [10]:
import re
import os
import json
import pickle
import numpy as np
import pandas as pd
import random
import torch 
from torch import nn
import seaborn as sns
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [11]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [12]:
cd /content/drive/MyDrive/Capstone - Causal Narratives Extraction/Phrase BERT code/phrase-bert-topic-model-master/phrase-topic-model/

/content/drive/.shortcut-targets-by-id/1zZCiCdXv6HLRAkEj-TagubvK1e49OsAz/Capstone - Causal Narratives Extraction/Phrase BERT code/phrase-bert-topic-model-master/phrase-topic-model


In [13]:
from model.dae_model import DictionaryAutoencoder
from model_utils import run_epoch, text_to_topic, rank_topics_by_percentage

In [14]:
cd /content/drive/MyDrive/Capstone - Causal Narratives Extraction/Phrase BERT code/

/content/drive/.shortcut-targets-by-id/1zZCiCdXv6HLRAkEj-TagubvK1e49OsAz/Capstone - Causal Narratives Extraction/Phrase BERT code


not changing the variable name "headline" from now on to save time, but the file is actually ones_NBC_2019.csv

In [15]:
headline = pd.read_csv('ones_NBC_2019.csv')
data_args = headline["args"].str.split("', '", n = 1, expand = True)
headline["arg0"]= data_args[0].apply(lambda x : '' if x is None else x.strip("('").strip())
headline["arg1"]= data_args[1].apply(lambda x : '' if x is None else x.strip("')"))
headline.loc[headline["arg1"] != headline["arg1"],"arg1"] = 'NA'
headline.loc[headline["arg0"] != headline["arg0"],"arg0"] = 'NA'

In [16]:
headline.head(5)

Unnamed: 0,id,index,outlet,political_leaning,date_publish,text,args,year,arg0,arg1
0,55295569,polusa_55295569_2_0,NBC News,CENTER,2019-01-01 01:01:00,/ Updated / Source: Reuters SEOUL - North Kore...,('the United States continues to demand unilat...,2019,the United States continues to demand unilater...,"but he may have to seek a ""new path"""
1,55295569,polusa_55295569_3_0,NBC News,CENTER,2019-01-01 01:01:00,"In his New Year address, Kim said there would ...",('the United States takes corresponding action...,2019,the United States takes corresponding action.,there would be faster progress on denucleariza...
2,55295569,polusa_55295569_4_0,NBC News,CENTER,2019-01-01 01:01:00,He added that he is willing to meet U.S. Presi...,('to produce results that the international co...,2019,to produce results that the international comm...,to meet U.S. President Donald Trump at any time
3,55295569,polusa_55295569_5_0,NBC News,CENTER,2019-01-01 01:01:00,"North Korea however would have ""no option but ...","('States', 'North Korea')",2019,States,North Korea
4,55295569,polusa_55295569_5_3,NBC News,CENTER,2019-01-01 01:01:00,"North Korea however would have ""no option but ...","('States', 'our sovereignty""')",2019,States,"our sovereignty"""


In [17]:
# list of arg0s and arg1s
headline_arg0s = headline['arg0']
headline_arg1s = headline['arg1']
headline_arg0s = headline_arg0s.tolist()
headline_arg1s = headline_arg1s.tolist()
# list of all args for POLUSA headlines
headline_args = headline_arg0s + headline_arg1s

In [18]:
# construct text_list
headline_text = headline['text']
headline_text_list = headline_text.tolist()

In [19]:
# construct dictionaries of word2id
headline_word2id = {val : idx for idx, val in enumerate(set(headline_args))}
len(headline_word2id.keys()), len(headline_args) # duplicates

(73735, 123370)

In [20]:
# construct dictionaries of id2word
headline_id2word = {val: key for key, val in headline_word2id.items()}

In [22]:
# contruct dictionaries of id2freq
headline_id2freq = headline_id2word.copy()
headline_freq = [(arg, headline_args.count(arg)) for arg in set(headline_args)]
i = 0
for key, val in headline_id2freq.items():
    headline_id2freq[key] = headline_freq[i][1]
    i += 1

In [23]:
# load the Phrase-BERT model through the sentence-BERT interface
model_path = "/content/drive/MyDrive/Capstone - Causal Narratives Extraction/Phrase BERT code/pooled_context_para_triples_p=0.8/"
model = SentenceTransformer(model_path)

In [None]:
# compute phrase embeddings using Phrase-BERT
headline_phrase_embs = model.encode(set(headline_args), batch_size=8, show_progress_bar=True)
headline_embs = np.asarray(headline_phrase_embs)

In [None]:
# save the results
topic_model_data_path = "/content/drive/MyDrive/Capstone - Causal Narratives Extraction/Phrase BERT code/"
np.save(os.path.join(topic_model_data_path, 'ones_NBC_2019_embs_matrix_np'), headline_embs)

In [24]:
# set seed
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

<torch._C.Generator at 0x7f9f5d0e7370>

In [25]:
topic_model_data_path = "/content/drive/MyDrive/Capstone - Causal Narratives Extraction/Phrase BERT code/"

In [26]:
headline_embs_matrix_np = np.load(os.path.join(topic_model_data_path, f"ones_NBC_2019_embs_matrix_np.npy"))
print(f"Loaded headline word embedding from {topic_model_data_path}")
print(f"Loaded vocab size of {len(headline_word2id)} (including phrases)")

Loaded headline word embedding from /content/drive/MyDrive/Capstone - Causal Narratives Extraction/Phrase BERT code/
Loaded vocab size of 73735 (including phrases)


In [27]:
len(headline_embs_matrix_np)

73735

Below code is adapted from Phrase-Bert: https://github.com/sf-wa-326/phrase-bert-topic-model

In [28]:
# word frequency and filter info

# compute the length (in n-grams)
# setting word_threshould really high to include every phrase, reset to lower value to remove longer phrases
word_threshold = 100

headline_len_words = [0] * len(headline_id2word)
for (id, word) in headline_id2word.items():
    headline_len_words[id] = len(word.split(' '))
# setting word_len to 10 because 
# and args contain empty spaces at the beginning and end of the strings
headline_indices_to_remove_based_on_len = [id 
                                           for id, word_len 
                                           in enumerate(headline_len_words) 
                                           if (word_len > word_threshold )]

print(len(headline_indices_to_remove_based_on_len)) # 0

0


In [29]:
# keeping every token / word, reset freq_threshold to remove lower frequency words
freq_threshold = 0

headline_sorted_ids = [k for k, v in sorted(headline_id2freq.items(), key=lambda item: item[1])]
headline_sorted_ids.reverse()
headline_indices_to_remove_based_on_freq = [k for k, v in headline_id2freq.items() if v <= freq_threshold ]
headline_to_be_removed = list(set(headline_indices_to_remove_based_on_freq + headline_indices_to_remove_based_on_len))

In [30]:
# encode the text_lists
headline_text_rep_list = model.encode(headline_text, batch_size = 8, show_progress_bar = True)

Batches:   0%|          | 0/7711 [00:00<?, ?it/s]

In [31]:
emb_model = "phrase-bert"
print(f"Building sentence model by using {emb_model} as embedding model")

headline_uid_input_vector_list = [(i, headline_text_rep_list[i]) for i in range(len(headline_text_rep_list))]
print(f"Computed {len(headline_uid_input_vector_list)} positive examples")

Building sentence model by using phrase-bert as embedding model
Computed 61685 positive examples


In [32]:
# setting the argument num_negative_samples for negative sampling
num_neg_samples = 10 # default in the original model

headline_uid_input_vector_list_neg = []
indices = list(range(len(headline_uid_input_vector_list)))
for idx in range(len(headline_uid_input_vector_list)):
    indices_candidate = indices
    neg_indices = random.sample(indices_candidate, num_neg_samples)
    neg_samples = [headline_uid_input_vector_list[neg_i][1] for neg_i in neg_indices]
    neg_vector = np.mean(neg_samples, axis=0)
    headline_uid_input_vector_list_neg.append(neg_vector)
print(f"Computed {len(headline_uid_input_vector_list_neg)} negative examples")

Computed 61685 negative examples


In [33]:
# set up hyperparameters
headline_net_params = {}
headline_net_params["mode"] = "bert"
headline_net_params["embedding"] = headline_embs_matrix_np
headline_net_params["d_hid"] = 100
headline_net_params["num_rows"] = 100  # number of topics
headline_net_params["num_sub_topics"] = 0
headline_net_params["word_dropout_prob"] = 0.2
headline_net_params["vrev"] = headline_id2word  # idx to word map
headline_net_params["device"] = 'cuda'
headline_net_params["pred_world"] = False

In [34]:
headline_net = DictionaryAutoencoder(net_params=headline_net_params)
headline_net.to('cuda')

DictionaryAutoencoder(
  (embeddings): Embedding(73735, 768)
  (W_proj): Linear(in_features=768, out_features=100, bias=True)
  (act): ReLU()
  (dropout): Dropout(p=0.2, inplace=False)
  (W_att): Linear(in_features=100, out_features=768, bias=True)
  (W_out): Linear(in_features=768, out_features=73735, bias=True)
)

In [35]:
# training specs (default from original code)
num_epochs = 500
batch_size = 1024
ortho_weight = 1e-5
world_clas_weight = 0.0
headline_optim = torch.optim.Adam(headline_net.parameters(), lr=1e-4)
interpret_interval = int(np.ceil(num_epochs / 10))
h_model = 2

In [36]:
# iterating through batches
headline_batch_intervals = [
    (start, start + batch_size)
    for start in range(0, len(headline_uid_input_vector_list), batch_size)]
    # batch_intervals = batch_intervals[:100]
headline_split = int(np.ceil(len(headline_batch_intervals) * 0.9))
headline_batch_intervals_train = headline_batch_intervals[:headline_split]
headline_batch_intervals_valid = headline_batch_intervals[headline_split:]

In [37]:
import argparse
parser = argparse.ArgumentParser()
"""parser.add_argument("--lr", type=float, default=1e-4)
"""
args = parser.parse_args(args=[])
args.device = 'cuda:' + '0'
args.triplet_loss_margin = 1.0
args.triplet_loss_weight = 1.0
args.ortho_weight = 1e-5
args.neighbour_loss_weight = 1e-7
args.offset_loss_weight = 1e-4

In [38]:
# headline training
print("\n" + "=" * 70)
for epoch in range(num_epochs):
    # training
    headline_net.train()
    train_mode = True
    print(f"Epoch {epoch}")
    run_epoch(headline_net, headline_optim, headline_batch_intervals_train,
              headline_uid_input_vector_list, headline_uid_input_vector_list_neg,
              args, train_mode, h_model, epoch, 100)

    # validation
    headline_net.eval()
    train_mode = False
    with torch.no_grad():
        run_epoch(
                headline_net,
                headline_optim,
                headline_batch_intervals_valid,
                headline_uid_input_vector_list,
                headline_uid_input_vector_list_neg,
                args, 
                train_mode,
                h_model,
                epoch,
                200
        )

    if (epoch + 1) % interpret_interval == 0:
        print("Topics with probability argmax")
        topics_print_list = headline_net.rank_vocab_for_topics(
            word_embedding_matrix=headline_embs_matrix_np,
            to_be_removed=headline_to_be_removed)
        print("=" * 70)

    print()
    print()
    print()
    print("=" * 70)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
topic 95 : he still supports Trump, to start a business to help individuals get the coveted permits., remarks, themselves,, us that we’re behind the curve”, who at the Trump campaign directed Roger Stone, he can run, at least a little bit, against both parties,', "It's all the better"), a car plowed into people on a busy sidewalk,, their prospects are brightening in the Rust Belt., the military does not consent.
[24557 37005 52359 33828 55434 67255 13823  5487 10383 53035]
topic 96 : we have some f---ing issues\',"', "the final franchise agreement with Comcast 'was not good for me,"), she’ll be quick to tell you it comes in all different shapes and sizes., on and, cultural shift away from overworking., PrEP, is more than 99 percent effective in preventing infection, serious investigations can actually take, their operation,", governments do not meet their responsibility to investigate and prosecute atrocities., her quick 

In [39]:
# headline print topic list
print("Finally after training")
headline_net.eval()

print("Topics with probability argmax")
prob_over_vocab_np, topics_print_list = headline_net.rank_vocab_for_topics(
            word_embedding_matrix=headline_embs_matrix_np, to_be_removed=headline_to_be_removed
    )
print("=" * 70)

Finally after training
Topics with probability argmax
100
100
[16201 44520 26595 25551  8191 26113 31811 28324 30354 53901]
topic 0 : Detectives from the Victor Valley Sheriff’s Station and investigators from the San Bernardino County Coroner’s Division responded to the scene, "Mueller hasn't looked at Trump's relationship the German bank,", 'But if'), the murder, and grant the, take on the most powerful industry in California, Both parties should embrace this opportunity, and boosting production at five existing factories., "Facebook's $5 billion 'sweetheart deal'", 'question:'), of the worst attacks in years against India., and has put off some key regulations until 2022.
[10989 32843 70260 21244 33441 25723 10010 19376 61330 29364]
topic 1 : said, providing temporary refuge, the risk of breast cancer., that I find so profound about the book and profound about the exercise of thinking about it,, colluding with Russians, airstrikes in 1999, made even more chaotic, we could not call th

In [34]:
pd.DataFrame(topics_print_list).to_csv(f"ones_TopicsToWords_NBC_2019_100.csv",index=False)
np.save(f"ones_argsToTopics_NBC_2019_100.npy",prob_over_vocab_np)

In [40]:
# after training we evaluate all the topics percentage in the dataset and rank the topics by percentage
uid_list, vector_list = zip(*headline_uid_input_vector_list)
topic_pred_list = text_to_topic(vector_list, headline_net, 'cuda')

topic_id_ranked, topic_percentage_ranked = rank_topics_by_percentage( topic_pred_list )

for rank, (topic_id, topic_percentage) in enumerate( zip(topic_id_ranked, topic_percentage_ranked)):
    print(
            f"Rank: {rank}, Topic_id: {topic_id}, Topic Words: {topics_print_list[topic_id]}, \
            Topic Percentage: {topic_percentage}"
        )

100%|██████████| 155/155 [00:00<00:00, 669.71it/s]

Rank: 0, Topic_id: 94, Topic Words: topic 94 : and the situation is only expected to accelerate as, They left Virginia, which was a record-breaking year,”, I don’t know how I’m going to live,", it highly unlikely that the government will be able to issue the fourth-quarter GDP report due, How can you be the president’s lawyer and surrogate on TV, having "welcomed" the help of a "hostile" foreign government and having obstructed the probe into an attack on an American election., His bill would automatically end future emergency declarations after 30 days, 5 dead, 21 injured, I’ve decided to sever all ties with the fascist government of Colombia.,             Topic Percentage: 8.68
Rank: 1, Topic_id: 61, Topic Words: topic 61 : to discuss options for Venezuela., the U.S. focus in the business community has been very much on China, on steel tariffs, and to a certain degree the possibility of U.S. auto tariffs,, who they say, I can tell you that most people want to be paid enough to live."




In [41]:
prob_over_vocab_df = pd.DataFrame(prob_over_vocab_np)
prob_over_vocab_df.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,73725,73726,73727,73728,73729,73730,73731,73732,73733,73734
0,1e-05,1.3e-05,1.6e-05,1.4e-05,1.2e-05,2e-05,1.3e-05,1.5e-05,1.5e-05,1.6e-05,...,1.5e-05,1.6e-05,1.7e-05,1.2e-05,1.7e-05,1.3e-05,1.5e-05,1.1e-05,1.3e-05,1.4e-05
1,1.2e-05,1.3e-05,1.5e-05,1.3e-05,1.2e-05,9e-06,1.4e-05,1.2e-05,2.9e-05,1.2e-05,...,1.2e-05,9e-06,1.4e-05,1.2e-05,1.3e-05,1.4e-05,1.6e-05,1.5e-05,1.3e-05,1.3e-05
2,1.1e-05,1.2e-05,1.4e-05,1.3e-05,1.1e-05,1.1e-05,9e-06,8e-06,1e-05,1.1e-05,...,1e-05,9e-06,1.2e-05,1.2e-05,9e-06,1.6e-05,1.5e-05,1.8e-05,1e-05,1.7e-05
3,1.3e-05,1.7e-05,1.7e-05,1.4e-05,1.5e-05,1.1e-05,1.3e-05,1.1e-05,1.1e-05,1e-05,...,1.1e-05,1.3e-05,1.2e-05,1.1e-05,1.1e-05,1.3e-05,1.4e-05,1.9e-05,1.1e-05,1.4e-05
4,1.9e-05,1.1e-05,7e-06,1.1e-05,1.1e-05,1.4e-05,1.1e-05,1.3e-05,1e-05,1e-05,...,1.6e-05,1.4e-05,1.3e-05,2e-05,1.2e-05,1.3e-05,1.4e-05,1.1e-05,1.5e-05,1e-05


In [42]:
prob_over_vocab_df.idxmax() # for each column, find the row number of the max

0        12
1        94
2        19
3        94
4        94
         ..
73730    23
73731    64
73732    94
73733    89
73734    91
Length: 73735, dtype: int64

In [43]:
df = pd.DataFrame(prob_over_vocab_df.idxmax())
df = df.reset_index()
df

Unnamed: 0,index,0
0,0,12
1,1,94
2,2,19
3,3,94
4,4,94
...,...,...
73730,73730,23
73731,73731,64
73732,73732,94
73733,73733,89


In [44]:
# construct a dictionary with arg ids as keys and topic ids as values
argid2topic_dict = df[0].to_dict()

In [45]:
# construct a dictionary with topic ids as keys and arg ids as values
# where only one topic is assigned to each argument
topic2argid_dict = df.groupby(0)['index'].apply(list).to_dict()

In [46]:
argid2topic_dict

{0: 12,
 1: 94,
 2: 19,
 3: 94,
 4: 94,
 5: 36,
 6: 49,
 7: 56,
 8: 1,
 9: 75,
 10: 79,
 11: 5,
 12: 61,
 13: 55,
 14: 60,
 15: 58,
 16: 24,
 17: 2,
 18: 8,
 19: 55,
 20: 96,
 21: 7,
 22: 37,
 23: 47,
 24: 58,
 25: 87,
 26: 94,
 27: 34,
 28: 9,
 29: 64,
 30: 87,
 31: 38,
 32: 2,
 33: 49,
 34: 55,
 35: 48,
 36: 20,
 37: 85,
 38: 53,
 39: 59,
 40: 64,
 41: 55,
 42: 53,
 43: 53,
 44: 44,
 45: 44,
 46: 4,
 47: 18,
 48: 58,
 49: 88,
 50: 5,
 51: 31,
 52: 9,
 53: 12,
 54: 38,
 55: 94,
 56: 4,
 57: 24,
 58: 69,
 59: 28,
 60: 94,
 61: 5,
 62: 9,
 63: 83,
 64: 55,
 65: 7,
 66: 61,
 67: 22,
 68: 88,
 69: 61,
 70: 43,
 71: 37,
 72: 35,
 73: 3,
 74: 89,
 75: 44,
 76: 44,
 77: 32,
 78: 90,
 79: 56,
 80: 65,
 81: 22,
 82: 11,
 83: 52,
 84: 38,
 85: 5,
 86: 44,
 87: 0,
 88: 35,
 89: 1,
 90: 1,
 91: 21,
 92: 90,
 93: 94,
 94: 22,
 95: 56,
 96: 94,
 97: 50,
 98: 9,
 99: 85,
 100: 35,
 101: 52,
 102: 14,
 103: 4,
 104: 9,
 105: 35,
 106: 58,
 107: 58,
 108: 23,
 109: 83,
 110: 79,
 111: 17,
 112: 79,
 1

In [47]:
topic2argid_dict[0]

[87,
 176,
 186,
 888,
 1031,
 1199,
 1440,
 1508,
 1636,
 1722,
 1905,
 1972,
 2012,
 2161,
 2432,
 2813,
 3136,
 3362,
 3416,
 3609,
 3781,
 3827,
 4116,
 4663,
 4790,
 4946,
 5092,
 5677,
 5810,
 5909,
 6187,
 6383,
 6713,
 6880,
 7139,
 7178,
 7926,
 8009,
 8081,
 8133,
 8191,
 8482,
 8606,
 8777,
 8877,
 9017,
 9070,
 9173,
 9302,
 9940,
 9969,
 10159,
 10493,
 10564,
 10713,
 10725,
 10892,
 10968,
 10976,
 11016,
 11058,
 11089,
 11346,
 11879,
 12182,
 12265,
 12627,
 12638,
 13082,
 13191,
 13272,
 13478,
 13585,
 13759,
 14512,
 14607,
 14873,
 15177,
 15365,
 15368,
 15955,
 16146,
 16201,
 16595,
 16642,
 16791,
 17072,
 17225,
 17442,
 17542,
 17546,
 17629,
 17674,
 17687,
 17921,
 18131,
 18393,
 18595,
 18734,
 19072,
 19133,
 19231,
 19582,
 19722,
 19861,
 20102,
 20246,
 20271,
 20345,
 20641,
 20690,
 21067,
 21386,
 21630,
 22671,
 22719,
 22735,
 22944,
 23756,
 24223,
 24367,
 24858,
 25377,
 25388,
 25438,
 25551,
 25554,
 25566,
 25613,
 25879,
 26282,
 26378,


Construct a new dataframe with columns: sentence, arg0, arg1, arg0id, arg1id, arg0topicid, arg1topicid

In [48]:
headline_topic_df = headline.copy()
headline_topic_df.head(5)

Unnamed: 0,id,index,outlet,political_leaning,date_publish,text,args,year,arg0,arg1
0,55295569,polusa_55295569_2_0,NBC News,CENTER,2019-01-01 01:01:00,/ Updated / Source: Reuters SEOUL - North Kore...,('the United States continues to demand unilat...,2019,the United States continues to demand unilater...,"but he may have to seek a ""new path"""
1,55295569,polusa_55295569_3_0,NBC News,CENTER,2019-01-01 01:01:00,"In his New Year address, Kim said there would ...",('the United States takes corresponding action...,2019,the United States takes corresponding action.,there would be faster progress on denucleariza...
2,55295569,polusa_55295569_4_0,NBC News,CENTER,2019-01-01 01:01:00,He added that he is willing to meet U.S. Presi...,('to produce results that the international co...,2019,to produce results that the international comm...,to meet U.S. President Donald Trump at any time
3,55295569,polusa_55295569_5_0,NBC News,CENTER,2019-01-01 01:01:00,"North Korea however would have ""no option but ...","('States', 'North Korea')",2019,States,North Korea
4,55295569,polusa_55295569_5_3,NBC News,CENTER,2019-01-01 01:01:00,"North Korea however would have ""no option but ...","('States', 'our sovereignty""')",2019,States,"our sovereignty"""


In [49]:
# create arg0_id and arg1_id columns
headline_arg0id_list = []
for arg0 in headline_topic_df['arg0']:
    headline_arg0id_list.append(headline_word2id[arg0])
headline_topic_df['arg0_id'] = headline_arg0id_list

headline_arg1id_list = []
for arg1 in headline_topic_df['arg1']:
    headline_arg1id_list.append(headline_word2id[arg1])
headline_topic_df['arg1_id'] = headline_arg1id_list

In [50]:
# create columns for arg0_topicid and arg1_topicid
headline_arg0topic_list = []
for arg0 in headline_arg0id_list:
    headline_arg0topic_list.append(argid2topic_dict[arg0])
headline_topic_df['arg0_topicid'] = headline_arg0topic_list

headline_arg1topic_list = []
for arg1 in headline_arg1id_list:
    headline_arg1topic_list.append(argid2topic_dict[arg1])
headline_topic_df['arg1_topicid'] = headline_arg1topic_list

In [51]:
headline_topic_df

Unnamed: 0,id,index,outlet,political_leaning,date_publish,text,args,year,arg0,arg1,arg0_id,arg1_id,arg0_topicid,arg1_topicid
0,55295569,polusa_55295569_2_0,NBC News,CENTER,2019-01-01 01:01:00,/ Updated / Source: Reuters SEOUL - North Kore...,('the United States continues to demand unilat...,2019,the United States continues to demand unilater...,"but he may have to seek a ""new path""",47682,72886,93,61
1,55295569,polusa_55295569_3_0,NBC News,CENTER,2019-01-01 01:01:00,"In his New Year address, Kim said there would ...",('the United States takes corresponding action...,2019,the United States takes corresponding action.,there would be faster progress on denucleariza...,61368,1843,19,35
2,55295569,polusa_55295569_4_0,NBC News,CENTER,2019-01-01 01:01:00,He added that he is willing to meet U.S. Presi...,('to produce results that the international co...,2019,to produce results that the international comm...,to meet U.S. President Donald Trump at any time,36454,39049,38,19
3,55295569,polusa_55295569_5_0,NBC News,CENTER,2019-01-01 01:01:00,"North Korea however would have ""no option but ...","('States', 'North Korea')",2019,States,North Korea,28763,6826,95,53
4,55295569,polusa_55295569_5_3,NBC News,CENTER,2019-01-01 01:01:00,"North Korea however would have ""no option but ...","('States', 'our sovereignty""')",2019,States,"our sovereignty""",28763,30707,95,32
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61680,55209149,polusa_55209149_0_4,NBC News,CENTER,2019-08-30 21:59:00,Ex-Canadian PM apologizes for 'rooting' for Hu...,"(""to hit Trump's Mar-a-Lago"", 'Campbell, the f...",2019,"""to hit Trump's Mar-a-Lago"", 'Campbell, the fi...",,37473,0,94,12
61681,55361022,polusa_55361022_0_0,NBC News,CENTER,2019-08-31 08:24:00,Taliban forces attack Afghan city amid peace t...,('good negotiations going on with the Islamist...,2019,good negotiations going on with the Islamist g...,troop withdrawal from Afghanistan.,16711,26274,43,98
61682,55284781,polusa_55284781_0_0,NBC News,CENTER,2019-08-31 12:39:00,Trump faces more 2020 danger if Democrat score...,('Democrat scores upset in N. Carolina special...,2019,Democrat scores upset in N. Carolina special e...,Trump faces more 2020 danger,32258,7594,72,6
61683,55284781,polusa_55284781_0_4,NBC News,CENTER,2019-08-31 12:39:00,Trump faces more 2020 danger if Democrat score...,"('were', 'Trump faces more 2020 danger')",2019,were,Trump faces more 2020 danger,9895,7594,35,6


In [47]:
# save the dataframe as a csv file
topic_model_data_path = "/content/drive/MyDrive/Capstone - Causal Narratives Extraction/Phrase BERT code/"
np.save(os.path.join(topic_model_data_path, 'ones_NBC_2019_100_topic_df'), headline_topic_df)
headline_topic_df.to_csv('/content/drive/MyDrive/Capstone - Causal Narratives Extraction/Phrase BERT code/ones_NBC_2019_100_topic_df.csv')

In [48]:
with open( os.path.join(topic_model_data_path, 'ones_NBC_2019_100_topic_model.pt'), "wb") as f:
    torch.save(headline_net, f)
    print(f"Saved model at { os.path.join(topic_model_data_path) }")

Saved model at /content/drive/MyDrive/Capstone - Causal Narratives Extraction/Phrase BERT code/


construct a table to keep track of how many times each topic of cause has caused each topic of effect

In [52]:
headline_topic_count_df = headline_topic_df.copy()
headline_topic_count_df.tail(5)

Unnamed: 0,id,index,outlet,political_leaning,date_publish,text,args,year,arg0,arg1,arg0_id,arg1_id,arg0_topicid,arg1_topicid
61680,55209149,polusa_55209149_0_4,NBC News,CENTER,2019-08-30 21:59:00,Ex-Canadian PM apologizes for 'rooting' for Hu...,"(""to hit Trump's Mar-a-Lago"", 'Campbell, the f...",2019,"""to hit Trump's Mar-a-Lago"", 'Campbell, the fi...",,37473,0,94,12
61681,55361022,polusa_55361022_0_0,NBC News,CENTER,2019-08-31 08:24:00,Taliban forces attack Afghan city amid peace t...,('good negotiations going on with the Islamist...,2019,good negotiations going on with the Islamist g...,troop withdrawal from Afghanistan.,16711,26274,43,98
61682,55284781,polusa_55284781_0_0,NBC News,CENTER,2019-08-31 12:39:00,Trump faces more 2020 danger if Democrat score...,('Democrat scores upset in N. Carolina special...,2019,Democrat scores upset in N. Carolina special e...,Trump faces more 2020 danger,32258,7594,72,6
61683,55284781,polusa_55284781_0_4,NBC News,CENTER,2019-08-31 12:39:00,Trump faces more 2020 danger if Democrat score...,"('were', 'Trump faces more 2020 danger')",2019,were,Trump faces more 2020 danger,9895,7594,35,6
61684,55242664,polusa_55242664_0_0,NBC News,CENTER,2019-08-31 21:43:00,"5 dead, 21 injured after motorist opens fire i...",('suspect continued shooting at innocent civil...,2019,suspect continued shooting at innocent civilia...,"5 dead, 21 injured",65671,30361,26,94


In [53]:
headline_topic_count_df = headline_topic_count_df[['arg0_topicid', 'arg1_topicid']]
headline_topic_count_df

Unnamed: 0,arg0_topicid,arg1_topicid
0,93,61
1,19,35
2,38,19
3,95,53
4,95,32
...,...,...
61680,94,12
61681,43,98
61682,72,6
61683,35,6


In [54]:
headline_topic_count_df = headline_topic_count_df.explode('arg0_topicid')
headline_topic_count_df = headline_topic_count_df.explode('arg1_topicid')
headline_topic_count_df['index'] = headline_topic_count_df.index
headline_topic_count_df

Unnamed: 0,arg0_topicid,arg1_topicid,index
0,93,61,0
1,19,35,1
2,38,19,2
3,95,53,3
4,95,32,4
...,...,...,...
61680,94,12,61680
61681,43,98,61681
61682,72,6,61682
61683,35,6,61683


In [55]:
headline_topic_count_df = headline_topic_count_df.groupby(['arg0_topicid', 'arg1_topicid'])['index'].count().unstack(fill_value=0)
headline_topic_count_df

arg1_topicid,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
arg0_topicid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,2,2,5,1,5,6,5,4,4,7,...,3,6,0,4,15,1,0,0,1,4
1,11,19,11,13,14,9,9,15,16,17,...,9,9,0,9,71,1,7,0,10,9
2,9,11,10,4,8,5,4,3,9,12,...,1,5,0,5,40,1,2,0,6,2
3,2,11,4,7,10,14,6,5,15,12,...,1,10,0,4,47,2,1,0,8,5
4,8,12,2,6,9,11,8,22,18,19,...,5,11,0,13,51,1,5,0,13,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,1,3,4,0,2,3,6,1,1,3,...,1,1,0,1,14,1,2,0,1,1
96,1,2,2,3,4,4,1,5,8,10,...,0,1,0,2,27,3,3,0,5,1
97,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
98,5,6,7,5,15,12,10,4,6,22,...,4,4,0,3,37,0,6,0,5,3


In [56]:
headline_topic_count_df.sum().sum()

61685

In [57]:
np.save(os.path.join(topic_model_data_path, 'ones_NBC_2019_100_causal'), headline_topic_count_df)
headline_topic_count_df.to_csv('/content/drive/MyDrive/Capstone - Causal Narratives Extraction/Phrase BERT code/ones_NBC_2019_100_causal.csv', index = False)

In [62]:
headline_topic_causal_df = pd.read_csv('ones_NBC_2019_100_causal.csv')
headline_topic_non_causal_df = headline_topic_causal_df.copy()
headline_topic_non_causal_df = [index = 'arg0_topicid']

TypeError: ignored