## CorEx Topic Modeling

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Python support modules
import re
import string
import datetime
import pickle 
from collections import Counter

# Spacy
import spacy
from spacy.lemmatizer import Lemmatizer
from spacy.lang.en.stop_words import STOP_WORDS
#import en_core_web_smf
from spacy.pipeline import SentenceSegmenter

# CorEx
from corextopic import corextopic as ct
from corextopic import vis_topic as vt

# NLTK
import nltk
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.corpus import words, stopwords, wordnet

# Sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/Tara8082/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [32]:
# Show up to 100 characters per text cell when a frame is displayed.
# This is global pandas state, so it only needs to be set once.
pd.set_option('max_colwidth', 100)

# Directory that holds the per-year preprocessed headline CSVs.
# NOTE(review): absolute, machine-specific path -- point this at your own copy of the data.
DATA_DIR = '/Users/Tara8082/GIT/ProjectGIT/Project_4/miscellaneous_support'


def load_headlines(year):
    """Read ``final_processed_<year>.csv`` from DATA_DIR.

    Parameters
    ----------
    year : int or str
        Year embedded in the CSV file name.

    Returns
    -------
    pandas.DataFrame
        The file contents, with the first CSV column used as the index.
    """
    return pd.read_csv('{}/final_processed_{}.csv'.format(DATA_DIR, year), index_col=0)


headlines_2016 = load_headlines(2016)
headlines_2017 = load_headlines(2017)
headlines_2018 = load_headlines(2018)
headlines_2019 = load_headlines(2019)
headlines_2020 = load_headlines(2020)

In [33]:
# Preview the first 10 rows of the 2017 frame to sanity-check the loaded columns.
headlines_2017.head(10)

Unnamed: 0,content,clean_content,processed
97353,"Judge rules PricewaterhouseCoopers was negligent in Colonial Bank failure, opening auditor to mi...",judge rules pricewaterhousecoopers was negligent in colonial bank failure opening auditor to mil...,judge rule pricewaterhousecoopers negligent colonial bank failure auditor million potential damage
97354,"While other assets have soared in 2017, yields on longer-term U.S. Treasury bonds have remained ...",while other assets have soared in yields on longer term u s treasury bonds have remained stubbor...,asset soar yield term treasury bond remain stubbornly low unlikely change
97355,There are signs that the Crimson Tide’s reign of dominance may be loosening—and Monday’s College...,there are signs that the crimson tide s reign of dominance may be loosening and monday s college...,sign crimson tide reign dominance loosen college football playoff semifinal ultimate test
97356,"How did your school do in our college rankings? Search, compare, and re-rank based on what matte...",how did your school do in our college rankings search compare and re rank based on what matters ...,school college ranking search compare rank base matter
97357,Try our GOP tax plan calculator. Calculate your taxes and discover the possible effect of the ne...,try our gop tax plan calculator calculate your taxes and discover the possible effect of the new...,try gop tax plan calculator calculate taxis discover effect law
97358,"Have your cake and eat it too? This weight-loss strategy allows you to eat whatever you want, so...",have your cake and eat it too this weight loss strategy allows you to eat whatever you want so l...,cake eat weight loss strategy allow eat whatever hour window
97359,A Trump administration plan would put safety regulators in the role of promoting oil development...,a trump administration plan would put safety regulators in the role of promoting oil development,trump administration plan safety regulator role promote oil development
97360,A weather phenomenon that has wreaked havoc on commodity prices in the past could return in 2018...,a weather phenomenon that has wreaked havoc on commodity prices in the past could return in,weather phenomenon wreak havoc commodity price past return
97361,"A surge in generosity, an eye on taxes: Charities benefit as donors try to squeeze in gifts befo...",a surge in generosity an eye on taxes charities benefit as donors try to squeeze in gifts before...,surge generosity eye taxis charity benefit donor try squeeze gift tax law take effect
97362,"Meet Ripple, the currency that’s running laps around bitcoin https://t.co/sIUU0ZHjWm",meet ripple the currency that s running laps around bitcoin,meet ripple currency run lap bitcoin


In [34]:
# Extract the 'processed' text column of each year as a Series of plain strings.
# Missing values are replaced with '' first: .astype(str) alone would turn NaN into the
# literal string "nan", which the vectorizer token pattern used below ([a-z][a-z]+)
# would count as a real vocabulary word.
processed_headlines_2016 = headlines_2016['processed'].fillna('').astype(str)
processed_headlines_2017 = headlines_2017['processed'].fillna('').astype(str)
processed_headlines_2018 = headlines_2018['processed'].fillna('').astype(str)
processed_headlines_2019 = headlines_2019['processed'].fillna('').astype(str)
processed_headlines_2020 = headlines_2020['processed'].fillna('').astype(str)


In [36]:
# Confirm the extracted column is a pandas Series before handing it to the vectorizer.
type(processed_headlines_2020)

pandas.core.series.Series

## CorEx Topic Modeling (2020)

In [37]:
# Binary bag-of-words for the 2020 headlines: each entry records whether a word occurs in a
# headline (binary=True), restricted to lowercase alphabetic tokens of at least two letters
# and capped at 40000 vocabulary terms.
vectorizer = CountVectorizer(max_features=40000,
                             stop_words='english', token_pattern="\\b[a-z][a-z]+\\b",
                             binary=True)

doc_word = vectorizer.fit_transform(processed_headlines_2020)

# Column index -> vocabulary term. get_feature_names() was removed in scikit-learn 1.2,
# so prefer get_feature_names_out() and fall back only on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    words = list(np.asarray(vectorizer.get_feature_names_out()))
else:
    words = list(np.asarray(vectorizer.get_feature_names()))


In [38]:
%%time 

# n_hidden is the number of topics
# Words is your list of words used in your corpus

topic_model = ct.Corex(n_hidden=15, words=words, seed=1)
topic_model.fit(doc_word, words=words, docs=processed_headlines_2020)

# Print all topics from the CorEx topic model
topics = topic_model.get_topics()
for n,topic in enumerate(topics):
    topic_words,_ = zip(*topic)
    print('{}: '.format(n) + ', '.join(topic_words))

0: trump, president, biden, joe, presidential, democratic, election, donald, vote, campaign
1: coronavirus, covid, test, outbreak, spread, pandemic, vaccine, positive, hong, kong
2: police, kill, protest, george, shoot, floyd, officer, protester, arrest, black
3: star, game, player, nfl, nba, fan, film, league, football, movie
4: market, stock, oil, investor, price, company, economy, sale, quarter, cut
5: court, supreme, justice, charge, attorney, judge, prosecutor, prison, bryant, sentence
6: restaurant, los, angeles, food, zoo, water, san, park, wildfire, animal
7: south, minister, prime, korea, north, carolina, ship, boris, johnson, cruise
8: law, security, federal, administration, government, rule, secretary, facebook, department, leader
9: health, live, gov, dr, wear, mask, cuomo, update, care, fauci
10: eu, app, apple, bank, google, deal, trade, tiktok, brexit, amazon
11: billion, tesla, boeing, electric, car, musk, vehicle, elon, loan, uber
12: nasa, hurricane, astronaut, space,

In [21]:
# Inspect individual topics: ask the model for 5 top documents of topic index 1.
# Change `topic` to look at a different topic.
topic_model.get_top_docs(topic = 1, n_docs=5)

[('talk daniel goldman lawyer democrat house intelligence committee impeachment president trump john bolton criticize democrat commit impeachment malpractice',
  0.0),
 ('joke convey wrong message afford receive terrible misinformation michigan ag dana nessel react pres trump refusal wear mask front camera tour ford factory',
  0.0),
 ('attorney bill barr tell pierretabc potential pardon roger stone commutation sentence pres trump prerogative feel appropriate prosecution sentence fair',
  0.0),
 ('sen chuck schumer pres trump blue comment outcome election determine live count dement person american live count',
  0.0),
 ('republican senator include mitch mcconnell mitt romney john cornyn susan collins blast presidential debate',
  0.0)]

In [24]:
# Bar chart of topic_model.tcs, one bar per topic; look for an elbow to decide how many
# topics the model should have.
fig, ax = plt.subplots(figsize=(10, 5))
topic_positions = range(topic_model.tcs.shape[0])
ax.bar(topic_positions, topic_model.tcs, color='#4e79a7', width=0.5)
ax.set_xlabel('Topic', fontsize=16)
ax.set_ylabel('Total Correlation (nats)', fontsize=16);


In [51]:
# Semi-supervised CorEx: seven anchor lists, each holding a single seed word, are passed to
# fit() together with anchor_strength=4.
# NOTE(review): the saved output under this cell lists 2016-era vocabulary (zika, rio,
# clinton), so it appears to come from an out-of-order run against another year's doc_word.
# Re-run after the 2020 vectorizer cell to refresh it.
anchor_words = [['coronavirus'],
                ['police'],
                ['wildfire'],
                ['korea'],
                ['facebook'],
                ['cuomo'],
                ['supreme']]

topic_model = ct.Corex(n_hidden=15, words=words,
                       max_iter=200, verbose=False, seed=1)

topic_model.fit(doc_word, words=words, docs=processed_headlines_2020,
                anchors=anchor_words, anchor_strength=4)

# Print all topics from the CorEx topic model.
# Reading entry[0] rather than unpacking pairs keeps this working when get_topics()
# returns tuples longer than two elements or an empty topic.
topics = topic_model.get_topics()
for n, topic in enumerate(topics):
    topic_words = [entry[0] for entry in topic]
    print('{}: '.format(n) + ','.join(topic_words))

0: police,shoot,kill,officer,suspect,attack,arrest,injure,bomb,dead
1: wildfire,hurricane,zika,matthew,rescue,earthquake,water,coast,virus,storm
2: korea,north,south,syria,syrian,carolina,minister,aleppo,turkey,rebel
3: facebook,askforbes,apple,iphone,user,google,news,valley,microsoft,mon
4: penny,mike,lee,ap,tim,moran,kaine,james,chris,gov
5: trump,donald,clinton,hillary,supreme,obama,presidential,campaign,debate,president
6: fall,america,japan,set,chinese,tell,india,record,john,black
7: rio,olympic,car,olympics,drive,self,medal,athlete,electric,tesla
8: wall,street,journal,st,look,louis,chasevoice,jpmorgan,ganesh,janan
9: star,school,student,prince,college,actor,ali,film,prize,award
10: pope,die,castro,francis,mom,martin,fidel,birthday,celebrate,photo
11: court,law,judge,sentence,assault,sexual,prison,federal,right,justice
12: billion,fund,million,finance,company,tax,personal,pay,buy,ceo
13: fastft,stock,market,bank,oil,rate,brexit,low,uk,price
14: city,york,near,change,strike,park,c

In [31]:
# Trying to anchor on coronavirus with a few topics: the same seed word is supplied as
# four separate anchor lists, with anchor_strength=5.

topic_model = ct.Corex(n_hidden=15, words=words,
                       max_iter=200, verbose=False, seed=1)

# docs must be the 2020 Series: the bare name `processed_headlines` is not defined anywhere
# in this notebook and raises NameError on a fresh Restart & Run All.
topic_model.fit(doc_word, words=words, docs=processed_headlines_2020,
                anchors=[['coronavirus'], ['coronavirus'], ['coronavirus'], ['coronavirus']], anchor_strength=5)

# Print all topics from the CorEx topic model.
# Reading entry[0] rather than unpacking pairs keeps this working when get_topics()
# returns tuples longer than two elements or an empty topic.
topics = topic_model.get_topics()
for n, topic in enumerate(topics):
    topic_words = [entry[0] for entry in topic]
    print('{}: '.format(n) + ','.join(topic_words))

0: coronavirus,spread,report,confirm,country,quarantine,italy,surpass,update,rise
1: coronavirus,pandemic,death,toll,infect,lockdown,daily,surge,wuhan,fear
2: coronavirus,outbreak,positive,infection,amid,restriction,measure,travel,slow,impact
3: coronavirus,test,novel,response,crisis,close,worldwide,continue,epicenter,tally
4: storm,climate,coast,water,flood,wind,sea,rain,oil,weather
5: hong,kong,sexual,china,hurricane,abuse,beijing,epstein,nasa,jeffrey
6: court,supreme,minister,prime,brexit,johnson,boris,rule,eu,judge
7: police,kill,officer,shoot,north,korea,south,military,iran,protest
8: company,billion,market,stock,investor,bank,business,million,fund,pay
9: health,covid,patient,vaccine,care,student,school,medical,hospital,disease
10: biden,joe,presidential,democratic,election,sen,voter,vote,candidate,sander
11: charge,city,wear,mask,york,mayor,murder,prosecutor,guilty,blasio
12: los,angeles,california,park,baby,san,home,wildfire,child,family
13: star,game,player,film,fan,music,movie

## 2019 Topics

In [42]:
%% time

vectorizer = CountVectorizer(max_features=40000,
                             stop_words='english', token_pattern="\\b[a-z][a-z]+\\b",
                             binary=True)

doc_word = vectorizer.fit_transform(processed_headlines_2019)
words = list(np.asarray(vectorizer.get_feature_names()))

# n_hidden is the number of topics
# Words is your list of words used in your corpus

topic_model = ct.Corex(n_hidden=15, words=words, seed=1)
topic_model.fit(doc_word, words=words, docs=processed_headlines_2019)

# Print all topics from the CorEx topic model
topics = topic_model.get_topics()
for n,topic in enumerate(topics):
    topic_words,_ = zip(*topic)
    print('{}: '.format(n) + ', '.join(topic_words))


0: house, impeachment, democratic, presidential, rep, democrat, biden, mueller, candidate, sen
1: police, hong, kong, kill, shoot, arrest, charge, officer, suspect, authority
2: china, trade, market, company, billion, bank, stock, investor, economy, rate
3: scientist, water, rescue, los, angeles, food, nbcnewsmach, animal, plastic, eat
4: trump, president, donald, pres, administration, ukraine, senate, congress, vice, border
5: social, medium, movie, film, facebook, james, netflix, meghan, book, harry
6: school, student, college, game, star, university, san, football, nfl, player
7: federal, court, health, supreme, law, judge, rule, care, abortion, lawsuit
8: minister, prime, johnson, boris, election, pm, party, parliament, israel, netanyahu
9: north, south, saudi, korea, iran, carolina, arabia, syria, nuclear, korean
10: news, abc, military, security, troop, force, fox, sri, maduro, afghanistan
11: brexit, deal, eu, wall, uk, street, fund, britain, european, journal
12: yankees, elect

## 2018 Topics

In [47]:
# Binary bag-of-words for the 2018 headlines (lowercase alphabetic tokens of 2+ letters,
# vocabulary capped at 40000 terms).
vectorizer = CountVectorizer(max_features=40000,
                             stop_words='english', token_pattern="\\b[a-z][a-z]+\\b",
                             binary=True)

doc_word = vectorizer.fit_transform(processed_headlines_2018)

# get_feature_names() was removed in scikit-learn 1.2; prefer get_feature_names_out().
if hasattr(vectorizer, 'get_feature_names_out'):
    words = list(np.asarray(vectorizer.get_feature_names_out()))
else:
    words = list(np.asarray(vectorizer.get_feature_names()))

# n_hidden is the number of topics
# words is the vocabulary list matching doc_word's columns

topic_model = ct.Corex(n_hidden=15, words=words, seed=1)
topic_model.fit(doc_word, words=words, docs=processed_headlines_2018)

# Print all topics from the CorEx topic model.
# Reading entry[0] rather than unpacking pairs keeps this working when get_topics()
# returns tuples longer than two elements or an empty topic.
topics = topic_model.get_topics()
for n, topic in enumerate(topics):
    topic_words = [entry[0] for entry in topic]
    print('{}: '.format(n) + ', '.join(topic_words))

0: police, kill, hurricane, storm, crash, officer, suspect, authority, california, rescue
1: election, senate, sen, kavanaugh, democrat, vote, republican, brett, democratic, midterm
2: trump, president, pres, donald, administration, obama, bush, analysis, mccain, john
3: court, sexual, supreme, charge, judge, justice, allegation, assault, abuse, attorney
4: north, korea, south, kim, korean, jong, carolina, summit, olympics, pope
5: white, house, mueller, cohen, special, counsel, michael, tell, robert, cnn
6: saudi, khashoggi, arabia, nbcnewsmach, scientist, jamal, prince, cancer, nasa, study
7: china, trade, tariff, brexit, eu, deal, billion, chinese, street, wall
8: star, film, movie, music, award, song, stephen, actor, actress, williams
9: shoot, school, gun, florida, parkland, mass, shooter, musk, elon, pittsburgh
10: student, college, player, university, football, nfl, nbcout, game, soccer, team
11: die, mother, girl, boy, home, daughter, dog, family, son, father
12: facebook, comp

## 2017 Topics

In [48]:
# Binary bag-of-words for the 2017 headlines (lowercase alphabetic tokens of 2+ letters).
# NOTE(review): max_features is 30000 here but 40000 in the 2018-2020 cells -- confirm the
# difference is intentional before comparing topics across years.
vectorizer = CountVectorizer(max_features=30000,
                             stop_words='english', token_pattern="\\b[a-z][a-z]+\\b",
                             binary=True)

doc_word = vectorizer.fit_transform(processed_headlines_2017)

# get_feature_names() was removed in scikit-learn 1.2; prefer get_feature_names_out().
if hasattr(vectorizer, 'get_feature_names_out'):
    words = list(np.asarray(vectorizer.get_feature_names_out()))
else:
    words = list(np.asarray(vectorizer.get_feature_names()))

# n_hidden is the number of topics
# words is the vocabulary list matching doc_word's columns

topic_model = ct.Corex(n_hidden=15, words=words, seed=1)
topic_model.fit(doc_word, words=words, docs=processed_headlines_2017)

# Print all topics from the CorEx topic model.
# Reading entry[0] rather than unpacking pairs keeps this working when get_topics()
# returns tuples longer than two elements or an empty topic.
topics = topic_model.get_topics()
for n, topic in enumerate(topics):
    topic_words = [entry[0] for entry in topic]
    print('{}: '.format(n) + ', '.join(topic_words))

0: police, hurricane, shoot, kill, irma, puerto, vega, rico, officer, florida
1: trump, president, house, pres, white, donald, senate, gop, care, administration
2: stock, bank, market, investor, sale, rate, profit, growth, uk, price
3: sexual, star, harassment, nfl, player, misconduct, allegation, bowl, prince, assault
4: north, korea, south, missile, korean, kim, carolina, jong, kong, hong
5: attack, syria, flight, isis, san, islamic, terror, syrian, airline, force
6: election, moore, roy, alabama, clinton, penny, hillary, candidate, presidential, joe
7: court, judge, charge, supreme, ban, travel, federal, prison, sentence, murder
8: cancer, drug, study, opioid, patient, brain, disease, treatment, valley, doctor
9: minister, saudi, prime, arabia, pope, myanmar, rohingya, francis, catalonia, independence
10: wall, street, apple, amazon, tech, company, journal, iphone, google, store
11: news, paul, fox, fbi, guilty, ryan, social, medium, abc, director
12: statue, die, confederate, jimmy

## 2016 Topics

In [49]:
# Binary bag-of-words for the 2016 headlines (lowercase alphabetic tokens of 2+ letters).
# NOTE(review): max_features is 30000 here but 40000 in the 2018-2020 cells -- confirm the
# difference is intentional before comparing topics across years.
vectorizer = CountVectorizer(max_features=30000,
                             stop_words='english', token_pattern="\\b[a-z][a-z]+\\b",
                             binary=True)

doc_word = vectorizer.fit_transform(processed_headlines_2016)

# get_feature_names() was removed in scikit-learn 1.2; prefer get_feature_names_out().
if hasattr(vectorizer, 'get_feature_names_out'):
    words = list(np.asarray(vectorizer.get_feature_names_out()))
else:
    words = list(np.asarray(vectorizer.get_feature_names()))

# n_hidden is the number of topics
# words is the vocabulary list matching doc_word's columns

topic_model = ct.Corex(n_hidden=15, words=words, seed=1)
topic_model.fit(doc_word, words=words, docs=processed_headlines_2016)

# Print all topics from the CorEx topic model.
# Reading entry[0] rather than unpacking pairs keeps this working when get_topics()
# returns tuples longer than two elements or an empty topic.
topics = topic_model.get_topics()
for n, topic in enumerate(topics):
    topic_words = [entry[0] for entry in topic]
    print('{}: '.format(n) + ', '.join(topic_words))

0: trump, donald, clinton, hillary, obama, presidential, campaign, president, debate, sander
1: police, shoot, kill, attack, suspect, officer, death, arrest, charge, injure
2: fastft, stock, market, oil, bank, rate, profit, price, low, bond
3: north, south, syria, court, korea, syrian, carolina, aleppo, hurricane, supreme
4: star, die, lee, moran, david, george, actor, martin, age, james
5: rio, plane, flight, olympic, olympics, airline, air, passenger, egyptair, medal
6: billion, apple, finance, million, personal, fund, company, askforbes, iphone, tax
7: zika, health, drug, cancer, study, virus, patient, care, drive, doctor
8: warren, prince, valley, music, college, harry, taylor, award, elizabeth, silicon
9: live, watch, news, paul, facebook, social, ryan, medium, abc, coverage
10: player, game, team, nfl, football, soccer, sport, chris, christie, nba
11: san, water, park, francisco, flint, rescue, zoo, galaxy, note, ice
12: eu, minister, prime, britain, opinion, foreign, referendum,