# Topic Modeling on Headline Data

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Python support modules
import re
import string
import datetime
import pickle 
from collections import Counter

# Spacy
import spacy
from spacy.lemmatizer import Lemmatizer
from spacy.lang.en.stop_words import STOP_WORDS
#import en_core_web_smf
from spacy.pipeline import SentenceSegmenter

# CorEx
#from corextopic import corextopic as ct
#from corextopic import vis_topic as vt

# NLTK
import nltk
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.corpus import words, stopwords, wordnet

# Sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline

# Helper functions in py file
#from preprocessing_tweets import cleaned_tweet

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/Tara8082/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
# Load the previously pickled headlines object into `headlines`.
# Later cells call `.head()` on it and index it with ['processed'], so it is
# presumably a pandas DataFrame — TODO confirm against the notebook that wrote the pickle.
# NOTE(review): the path is absolute and machine-specific; this cell will not run
# on another machine without editing it.
# NOTE(review): pickle.load can execute arbitrary code from the file — only load
# pickles produced by a trusted source.
with open('/Users/Tara8082/GIT/ProjectGIT/Project_4/processed_headlines.pkl', 'rb') as read_file:
    headlines = pickle.load(read_file)

In [23]:
# Preview the first ten rows of the loaded headlines.
headlines.head(10)

Unnamed: 0,content,clean_content,processed
0,"Biden leads Trump among Hispanic voters, 62% t...",biden leads trump among hispanic voters to a w...,biden lead trump hispanic voter telemundo poll
1,“We can’t keep up with the laundry.” Covid-19 ...,we can t keep up with the laundry covid has...,laundry covid tourism industry upside create h...
2,A large English study showed the number of peo...,a large english study showed the number of peo...,english study covid antibody decline significa...
3,"The leaders of Microsoft, Coca-Cola, American ...",the leaders of microsoft coca cola american ai...,leader microsoft coca colon american airline c...
4,"After seven months of isolation, the pull of g...",after seven months of isolation the pull of ge...,month isolation pull get strong covid hospital...
5,Doctors have begun to unlock the mystery behin...,doctors have begun to unlock the mystery behin...,doctor begin unlock mystery covid hauler badly...
6,Is eating in covered outdoor setups less risky...,is eating in covered outdoor setups less risky...,eat cover outdoor setup risky inside
7,Investors are relying on polls showing former ...,investors are relying on polls showing former ...,investor rely poll former vice president joe b...
8,More than 91 million ballots have been cast ah...,more than million ballots have been cast ahead...,million ballot cast ahead election party fight...
9,Pricing an item at $1.99 instead of $2.00 is a...,pricing an item at instead of is a common mark...,price item instead common market strategy cons...


## Setting Up Word Vectorizers

In [5]:
# Bag-of-words term counts for every processed headline.
cv = CountVectorizer()
# fit_transform learns the vocabulary and returns a sparse document-term matrix
# (one row per headline, one column per vocabulary term).
doc_word = cv.fit_transform(headlines['processed'])

# Wrap the sparse matrix directly instead of calling .toarray(): a dense array of
# (n_headlines x vocabulary_size) cells is almost entirely zeros and can exhaust
# memory on a corpus this size. The resulting DataFrame has the same shape,
# default integer index and column labels as the dense version.
# NOTE: scikit-learn >= 1.0 deprecates get_feature_names() in favour of
# get_feature_names_out(); switch when the environment is upgraded.
vect = pd.DataFrame.sparse.from_spmatrix(doc_word, columns=cv.get_feature_names())
vect

Unnamed: 0,10,100,1000000,11,12,14,15,16,17,20,...,zumpee,zumper,zumwalt,zuniga,zuoling,zupevc,zura,zurabichvili,zurcher,zurich
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
524995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
524996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
524997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
524998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# TF-IDF weights for every processed headline.
tfidf = TfidfVectorizer()
# NOTE: this rebinds `doc_word` (previously the raw-count matrix) to the TF-IDF
# sparse matrix; the name is kept so any later cell that reads it still works.
doc_word = tfidf.fit_transform(headlines['processed'])

# Column labels must come from the vectorizer that produced this matrix (`tfidf`),
# not from the CountVectorizer `cv`: relying on `cv` only works while both
# vectorizers happen to learn an identical vocabulary, and it makes this cell
# fail on a fresh kernel if the CountVectorizer cell was not run first.
# The matrix is wrapped as a sparse-backed DataFrame rather than densified with
# .toarray(), because the dense (n_headlines x vocabulary_size) array is almost
# entirely zeros and can exhaust memory.
# NOTE: scikit-learn >= 1.0 deprecates get_feature_names() in favour of
# get_feature_names_out(); switch when the environment is upgraded.
tfidf_matrix = pd.DataFrame.sparse.from_spmatrix(doc_word, columns=tfidf.get_feature_names())
tfidf_matrix

Unnamed: 0,10,100,1000000,11,12,14,15,16,17,20,...,zumpee,zumper,zumwalt,zuniga,zuoling,zupevc,zura,zurabichvili,zurcher,zurich
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
524995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
524996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
524997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
524998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Topic Modeling

## LSA (TF-IDF + Truncated SVD)

In [16]:
# Quick look at the first two rows before modelling.
headlines.head(2)

Unnamed: 0,content,clean_content,processed
0,"Biden leads Trump among Hispanic voters, 62% t...",biden leads trump among hispanic voters to a w...,biden lead trump hispanic voter telemundo poll
1,“We can’t keep up with the laundry.” Covid-19 ...,we can t keep up with the laundry covid has...,laundry covid tourism industry upside create h...


In [9]:
# Select the 'processed' column; this is the text passed to the TF-IDF + SVD pipeline below.
processed_headlines = headlines['processed']

In [18]:
%%time

vectorizer = TfidfVectorizer(use_idf=True, smooth_idf=True, ngram_range=(1, 3))

svd_model = TruncatedSVD(n_components=10, 
                        algorithm='randomized',
                        n_iter=50,
                        random_state=42)

# pipeline of tf-idf + SVD, fit and applied to docs: 

svd_transformer = Pipeline([('tfidf', vectorizer),
                           ('svd', svd_model)])

svd_matrix = svd_transformer.fit_transform(processed_headlines)

CPU times: user 6min 3s, sys: 1min 43s, total: 7min 47s
Wall time: 7min 18s


In [19]:
%%time
terms = vectorizer.get_feature_names()

for ix, comp in enumerate(svd_model.components_):
    terms_comp = zip(terms, comp)
    sorted_terms = sorted(terms_comp, key= lambda x:x[1], reverse=True)[:10]
    print("Topic "+str(ix)+": ")
    for t in sorted_terms:
        print(t[0])
    print("")

Topic 0: 
front
front wall
front wall street
look front wall
look front
street journal
wall street journal
journal
wall street
wall

Topic 1: 
front financial
front financial time
financial time
publish front
publish front financial
publish
financial
time
financial time international
time international

Topic 2: 
coronavirus
trump
president
president trump
pandemic
covid
biden
coronavirus pandemic
test
house

Topic 3: 
quote
quote rocketmortgage
rocketmortgage
trump
president
birthday
honor
biden
honor birthday
quote powerwomen

Topic 4: 
coronavirus
pandemic
coronavirus pandemic
death
test
report
outbreak
spread
covid
coronavirus death

Topic 5: 
start
europe start
europe
read
sleep read
sleep
hong
kong
hong kong
police

Topic 6: 
read
sleep read
sleep
hong
hong kong
kong
york sleep read
york sleep
york
hong kong sleep

Topic 7: 
coronavirus
trump
read
president
sleep read
sleep
president trump
coronavirus pandemic
biden
york sleep read

Topic 8: 
biden
joe
joe biden
presidential
demo