In [1]:
# Imports: standard library first, then third-party packages.
import os

import nltk
import numpy as np
import pandas as pd
import yaml

# Request the NLTK packages used by this notebook.
nltk.download("omw-1.4")
nltk.download("wordnet")
nltk.download("punkt")

# Parse the project configuration (path is relative to the notebook's working
# directory). The file handle has its own name so it is not confused with the
# parsed mapping, and the encoding is explicit so reading the file does not
# depend on the platform's default locale.
with open("../../src/configs.yml", "r", encoding="utf-8") as config_file:
    configs = yaml.safe_load(config_file)

# Project-local import; kept after the NLTK downloads and the config load,
# matching the original execution order.
from src.utils.data_loader import Headlines

# Path settings read from the parsed configuration.
DATAPATH = configs["DATAPATH"]
ROOTPATH = configs["ROOTPATH"]

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/yijingch/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/yijingch/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/yijingch/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/yijingch/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /Users/yijingch/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [8]:
from src.utils.dict_configuration import dictionary2016, dictionary2020

# Election year to analyze; edit this value to switch between supported runs.
year = 2020

# Per-year settings: (first candidate, second candidate, topic dictionary).
YEAR_SETTINGS = {
    2016: ("trump", "clinton", dictionary2016),
    2020: ("biden", "trump", dictionary2020),
}

# Fail fast on an unsupported year instead of silently falling back to the
# 2020 settings while `year` itself still holds the unsupported value.
if year not in YEAR_SETTINGS:
    raise ValueError(
        f"Unsupported year {year!r}; expected one of {sorted(YEAR_SETTINGS)}"
    )

cand1, cand2, dictionary = YEAR_SETTINGS[year]

dictionary.construct_overlap_matrix()

## Load data

In [9]:
# Build the Headlines object for the selected year, pointing it at the "data/"
# folder under ROOTPATH. os.path.join inserts the separator when ROOTPATH lacks
# a trailing one and gives the same string as plain concatenation when
# ROOTPATH already ends in "/".
print("Loading headlines...")
headlines = Headlines(os.path.join(ROOTPATH, "data/"), year=year)

# Clean the headlines with stemming switched on and lemmatization switched off.
print("Cleaning headlines...")
headlines.clean(lemmatize=False, stemming=True)

Loading headlines...
Cleaning headlines...
cleaning...


In [4]:
from src.utils.dict_based_topics import DictBasedTopicModel

# Options passed to the wordvec/topvec builders in the analysis cell.
save_output = True
drop_no_topic = True

# Cache folders for the topic-vector and word-vector outputs. os.path.join
# gives the same string as concatenation when ROOTPATH ends in "/", and still
# produces a valid path when it does not.
topvec_out_cache_fpath = os.path.join(ROOTPATH, "output/cache-topvec-min2")
wordvec_out_cache_fpath = os.path.join(ROOTPATH, "output/cache-wordvec-min2")

# Create each missing cache folder. os.makedirs also creates a missing parent
# "output/" folder (os.mkdir would raise FileNotFoundError there), and
# exist_ok=True tolerates the folder appearing between the check and the call.
for cache_dir, label in (
    (topvec_out_cache_fpath, "topvecs"),
    (wordvec_out_cache_fpath, "wordvecs"),
):
    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir, exist_ok=True)
        print(f"New output folder created for {label}!")

In [5]:
# Recorded wall-clock cost of this cell:
#   2020 -> 111 mins
#   2016 -> 115 mins

print("Analyzing headlines...")
headlinetopics = DictBasedTopicModel(
    dictionary=dictionary,
    text_input=headlines,
    text_type="headline",
)

# Step 1: build the word-vector frame, using the wordvec cache folder.
print("- Building wordvec...")
headlinetopics.build_wordvec_df(
    drop_no_topic=drop_no_topic,
    save_output=save_output,
    output_cache_fpath=wordvec_out_cache_fpath,
)

# Step 2: build the topic-vector frame, using the topvec cache folder.
print("- Building topvec...")
headlinetopics.build_topvec_df(
    save_output=save_output,
    output_cache_fpath=topvec_out_cache_fpath,
)

Analyzing headlines...
- Building wordvec...
Finished counting topic keywords: trump2016
Finished counting topic keywords: clinton2016
Rate of coverage for trump2016: 0.8463640954424946
Rate of coverage for clinton2016: 0.8728246235551778
Now we can save wordvecs as well! [new!]...
- Building topvec...
Finished computing topic vector: trump2016
Finished computing topic vector: clinton2016


In [None]:
# Console log of the 2016 run, kept for reference:
# Analyzing headlines...
# - Building wordvec...
# Finished counting topic keywords: trump2016
# Finished counting topic keywords: clinton2016
# Rate of coverage for trump2016: 0.8463640954424946
# Rate of coverage for clinton2016: 0.8728246235551778
# Now we can save wordvecs as well! [new!]...
# - Building topvec...
# Finished computing topic vector: trump2016
# Finished computing topic vector: clinton2016


# Console log of the 2020 run, kept for reference:
# Analyzing headlines...
# - Building wordvec...
# Finished counting topic keywords: biden2020
# Finished counting topic keywords: trump2020
# Rate of coverage for biden2020: 0.876492214225894
# Rate of coverage for trump2020: 0.8975569546768105
# Now we can save wordvecs as well! [new!]...
# - Building topvec...
# Finished computing topic vector: biden2020
# Finished computing topic vector: trump2020