In [2]:
import re
from collections import Counter
from time import time

import gensim.corpora as corpora
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import scipy
import seaborn as sns
import sklearn
import spacy
from gensim.corpora import Dictionary
from gensim.models import CoherenceModel
from gensim.models import TfidfModel, LdaModel
from gensim.parsing import preprocess_string, remove_stopwords, strip_tags, strip_short, strip_numeric, stem_text, strip_punctuation
from gensim.parsing.preprocessing import STOPWORDS
from gensim.utils import simple_preprocess
from gensim.utils import tokenize
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split

%matplotlib inline

In [2]:
# Load the articles DataFrame from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code; only load pickles from a trusted source.
df_leans = pd.read_pickle('data/df_leans.pkl')

In [10]:
# Article body text (the `content` column), one entry per article.
corpus = df_leans['content']

In [11]:
len(corpus)

142570

In [None]:
# Leaning label of every article; used below as the stratification target.
y = df_leans['leaning'].values

In [12]:
y.shape

(142570,)

### Create a stratified 10% sample of the corpus (stratified by leaning) so that Gensim LDA topic modeling runs faster


In [14]:
# Split 10% / 90% while preserving the proportion of each leaning (stratify=y).
# Only the 10% side is kept: `stratified_sample` is the sampled text and
# `stratified_out` is the third return value, i.e. (despite its name) the labels
# of those same sampled rows. The two 90% pieces are discarded.
stratified_sample, _, stratified_out, _ = train_test_split(
    corpus,
    y,
    test_size=.90,
    stratify=y,
    random_state=5,
)

In [15]:
stratified_sample.index

Int64Index([ 34988,  57737, 107158,  80565, 120456, 116926,  98133, 110952,
             15640,  80117,
            ...
            124330,  77337,  85230,  17552, 123227, 133580, 129550,  71448,
             38090, 132399],
           dtype='int64', length=14257)

In [16]:
stratified_out.shape

(14257,)

In [17]:
# Work on an independent deep copy so the full frame `df_leans` stays untouched.
df_sample = df_leans.copy(deep=True)

In [18]:
# Keep only the rows whose index label was drawn into the stratified sample.
df_sample = df_sample.loc[df_sample.index.isin(stratified_sample.index)]

In [19]:
df_sample

Unnamed: 0,id,title,publication,author,date,year,month,url,content,leaning,factual
10,17293,Weak Federal Powers Could Limit Trump’s Climat...,New York Times,Justin Gillis,2017-01-03,2017.0,1.0,,With Donald J. Trump about to take control of ...,Left-Center,High
30,17318,Tips for Your Post-Holiday Clutter Purge - The...,New York Times,Michelle Higgins,2017-04-11,2017.0,4.0,,With the year winding down and New Year’s reso...,Left-Center,High
32,17321,"France Lets Workers Turn Off, Tune Out and Liv...",New York Times,Alissa J. Rubin,2017-01-03,2017.0,1.0,,PARIS — If the world does not envy the Fren...,Left-Center,High
59,17352,"In Republicans’ Ethics Office Gambit, a Specta...",New York Times,Carl Hulse,2017-01-05,2017.0,1.0,,WASHINGTON — Majorities in Congress often o...,Left-Center,High
66,17361,How We Put Together Our 52 Places to Go List -...,New York Times,,2017-01-15,2017.0,1.0,,"For the 12th straight year, the Travel section...",Left-Center,High
...,...,...,...,...,...,...,...,...,...,...,...
142514,218016,"With new monuments in Nevada, Utah, Obama adds...",Washington Post,Juliet Eilperin,2016-12-28,2016.0,12.0,https://web.archive.org/web/20161229004018/htt...,President Obama on Wednesday created new...,Left-Center,High
142515,218017,Memo to Trump: There can be only one president...,Washington Post,Ruth Marcus,2016-12-28,2016.0,12.0,https://web.archive.org/web/20161229004018/htt...,One of the hallmarks of our democratic system...,Left-Center,High
142544,218052,Rare coast-to-coast cold snap to engulf Lower ...,Washington Post,Jason Samenow,2016-12-29,2016.0,12.0,https://web.archive.org/web/20161230000842/htt...,Frigid air will grip an unusually large p...,Left-Center,High
142547,218055,A CIA calendar the CIA gift shop refuses to se...,Washington Post,Ian Shapira,2016-12-29,2016.0,12.0,https://web.archive.org/web/20161230000842/htt...,"Far Side cartoons, Ansel Adams landscapes, ...",Left-Center,High


In [4]:
def token_preprocess_newstops(string):
    """Turn one raw article string into a list of cleaned, stemmed tokens.

    The text is tokenized (lower-cased, accents removed), tokens found in
    ``STOPWORDS_U`` are dropped, and the remaining words are re-joined and run
    through gensim's ``preprocess_string`` with the filters listed below.

    NOTE(review): ``STOPWORDS_U`` is not defined anywhere in the visible
    notebook; presumably a customised stop-word set built in another cell —
    confirm it is defined before this function is called.
    """
    kept_words = (
        token
        for token in tokenize(string, lower=True, deacc=True)
        if token not in STOPWORDS_U
    )
    # Filters are applied in this order by preprocess_string.
    filters = [lambda x: x.lower(), strip_tags, strip_short, strip_numeric, strip_punctuation, stem_text]
    return preprocess_string(' '.join(kept_words), filters)

In [23]:
# Tokenize and clean every sampled article; the elapsed wall-clock seconds are printed.
t0 = time()

cleaned_newstop = [token_preprocess_newstops(article) for article in df_sample.content]

print(time() - t0)

44.49448251724243


In [24]:
# Build the token <-> id vocabulary, then encode each article as a bag-of-words.
dictionary_new = Dictionary(cleaned_newstop)
gen_corpus_new = list(map(dictionary_new.doc2bow, cleaned_newstop))

In [28]:
# Train a 10-topic LDA model on the sampled corpus; the elapsed wall-clock seconds are printed.
t0 = time()

lda_model_new = LdaModel(
    corpus=gen_corpus_new,
    id2word=dictionary_new,
    num_topics=10,
    random_state=5,        # fixed seed for repeatable topics
    update_every=1,
    chunksize=100,
    passes=10,
    alpha='auto',
    per_word_topics=True,  # format_topics_sentences branches on this flag
)

print(time() - t0)

334.08730840682983


In [29]:
# lda_model_new.save('data/model/lda.modelnew')
# lda_model_new = LdaModel.load('data/model/lda.modelnew')

In [30]:
lda_model_new.show_topics()

[(0,
  '0.010*"citi" + 0.010*"water" + 0.010*"area" + 0.006*"year" + 0.005*"park" + 0.005*"nation" + 0.005*"air" + 0.005*"north" + 0.005*"plant" + 0.004*"south"'),
 (1,
  '0.024*"percent" + 0.022*"compani" + 0.013*"year" + 0.011*"market" + 0.010*"million" + 0.010*"billion" + 0.010*"busi" + 0.009*"bank" + 0.008*"price" + 0.007*"rate"'),
 (2,
  '0.017*"state" + 0.013*"unit" + 0.011*"countri" + 0.010*"govern" + 0.009*"war" + 0.008*"syria" + 0.008*"china" + 0.008*"forc" + 0.008*"presid" + 0.008*"russia"'),
 (3,
  '0.011*"peopl" + 0.008*"sai" + 0.007*"wai" + 0.006*"year" + 0.006*"women" + 0.004*"life" + 0.004*"need" + 0.004*"differ" + 0.004*"thing" + 0.003*"help"'),
 (4,
  '0.021*"polic" + 0.015*"offic" + 0.012*"kill" + 0.011*"peopl" + 0.010*"citi" + 0.009*"attack" + 0.008*"gun" + 0.008*"shoot" + 0.008*"told" + 0.007*"protest"'),
 (5,
  '0.008*"post" + 0.008*"twitter" + 0.007*"film" + 0.007*"olymp" + 0.006*"night" + 0.006*"movi" + 0.006*"video" + 0.006*"star" + 0.006*"stori" + 0.006*"perfor

In [None]:
# Prepare the pyLDAvis data structure for the trained model.
visnew = pyLDAvis.gensim.prepare(
    topic_model=lda_model_new,
    corpus=gen_corpus_new,
    dictionary=dictionary_new,
)

In [None]:
# Switch pyLDAvis to notebook mode, then show the prepared visualization as the cell output.
pyLDAvis.enable_notebook()
visnew

In [34]:
# Save to html
# pyLDAvis.save_html(visnew, 'ldanew.html')

### Assigning topics based on keywords

In [292]:
def format_topics_sentences(ldamodel, corpus):
    """Return the dominant topic of every document in ``corpus``.

    Parameters
    ----------
    ldamodel : trained topic model
        Must support ``ldamodel[corpus]``, expose a ``per_word_topics`` flag and
        a ``show_topic(topic_id)`` method returning ``(word, weight)`` pairs
        (the notebook passes a gensim ``LdaModel``).
    corpus : iterable
        Bag-of-words documents to score.

    Returns
    -------
    pandas.DataFrame
        Columns ``Dominant_Topic`` (int topic id), ``Perc_Contribution``
        (weight of that topic, rounded to 4 decimals) and ``Topic_Keywords``
        (comma-separated words from ``show_topic``). Rows follow corpus order.
        A document whose topic list is empty contributes no row, so the result
        can be shorter than ``corpus``. An empty corpus yields an empty frame
        that still carries the three columns.
    """
    columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    keywords_by_topic = {}  # topic id -> keyword string, computed once per topic
    records = []

    for doc_topics in ldamodel[corpus]:
        # With per_word_topics the model returns a tuple whose first element
        # is the (topic_id, weight) list for the document.
        row = doc_topics[0] if ldamodel.per_word_topics else doc_topics
        if len(row) == 0:
            continue

        # max() returns the first maximal pair, which matches taking the head
        # of a stable descending sort.
        topic_num, prop_topic = max(row, key=lambda pair: pair[1])
        topic_num = int(topic_num)

        if topic_num not in keywords_by_topic:
            keywords_by_topic[topic_num] = ", ".join(
                word for word, _ in ldamodel.show_topic(topic_num)
            )

        records.append((topic_num, round(float(prop_topic), 4), keywords_by_topic[topic_num]))

    # Build the frame once instead of appending row by row (quadratic, and
    # DataFrame.append no longer exists in pandas >= 2.0).
    return pd.DataFrame(records, columns=columns)


In [288]:
# Print the top (word, weight) pairs of every topic, separated by blank lines.
# Iterating over the model's own topic count keeps this cell correct if
# num_topics is changed when the model is trained.
for topic_id in range(lda_model_new.num_topics):
    print(lda_model_new.show_topic(topic_id))
    print()

[('citi', 0.010200432), ('water', 0.009685914), ('area', 0.00952618), ('year', 0.005861368), ('park', 0.0054237465), ('nation', 0.004815536), ('air', 0.004758652), ('north', 0.0046952623), ('plant', 0.004673799), ('south', 0.0044658086)]

[('percent', 0.023814801), ('compani', 0.021627992), ('year', 0.012897828), ('market', 0.010664429), ('million', 0.010331284), ('billion', 0.009840626), ('busi', 0.009542874), ('bank', 0.008880805), ('price', 0.0075456225), ('rate', 0.007393971)]

[('state', 0.017151624), ('unit', 0.012591133), ('countri', 0.011211348), ('govern', 0.010462813), ('war', 0.008522371), ('syria', 0.008167333), ('china', 0.008159017), ('forc', 0.008079694), ('presid', 0.007967894), ('russia', 0.007942189)]

[('peopl', 0.011263334), ('sai', 0.0077411523), ('wai', 0.007368372), ('year', 0.0060954383), ('women', 0.0059584347), ('life', 0.0037984333), ('need', 0.0036750655), ('differ', 0.00357594), ('thing', 0.0035441879), ('help', 0.0034585765)]

[('polic', 0.021122452), ('of

In [300]:
# LDA topic id (0-9, as stored in `Dominant_Topic`) -> `vis_num`.
# NOTE(review): presumably the topic number shown in the pyLDAvis plot — confirm.
topic_nums = dict(zip(range(10), (9, 7, 6, 1, 4, 8, 3, 5, 10, 2)))

# `vis_num` (1-10) -> hand-assigned topic label, listed in vis_num order.
topic_map = dict(enumerate(
    [
        'lifestyle-general',
        'domestic-politics',
        'government',
        'police-shootings',
        'fbi-investigation',
        'foreign-politics',
        'economy',
        'arts-culture',
        'zika-virus',
        'college-sports',
    ],
    start=1,
))

### Adding new column to articles dataframe - Dominant topic

In [298]:
t0 = time()

# Dominant topic, its weight and its keywords for every sampled article.
df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model_new, corpus=gen_corpus_new)

# Promote the positional index to an explicit document-number column.
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords']

# The former mid-cell `df_dominant_topic.head(10)` was removed: it was not the
# last expression of the cell, so its value was discarded and nothing was shown.
print(time() - t0)

135.80958199501038


In [301]:
# Translate each LDA topic id into its `vis_num` via `topic_nums` (KeyError on an unknown id).
df_dominant_topic['vis_num'] = df_dominant_topic['Dominant_Topic'].map(
    lambda lda_id: topic_nums[lda_id]
)

In [304]:
# Attach the human-readable label for each `vis_num` via `topic_map` (KeyError on an unknown number).
df_dominant_topic['topic'] = df_dominant_topic['vis_num'].map(
    lambda vis_id: topic_map[vis_id]
)

In [306]:
df_dominant_topic[['vis_num', 'Topic_Perc_Contrib', 'topic']]

Unnamed: 0,vis_num,Topic_Perc_Contrib,topic
0,3,0.2552,government
1,1,0.7852,lifestyle-general
2,1,0.4822,lifestyle-general
3,2,0.4290,domestic-politics
4,1,0.4490,lifestyle-general
...,...,...,...
14252,3,0.3861,government
14253,2,0.4822,domestic-politics
14254,9,0.6373,zika-virus
14255,1,0.2394,lifestyle-general


In [264]:
# Renumber the sampled articles 0..n-1 so they line up positionally with
# `df_dominant_topic`, which is also indexed 0..n-1.
# The original cell read `df_sample1`, a name never defined in this notebook;
# `df_sample` is the sampled frame built above and matches the recorded output.
df_sample_inds = df_sample.reset_index(drop=True)

In [307]:
df_sample_inds

Unnamed: 0,id,title,publication,author,date,year,month,url,content,leaning,factual
0,17293,Weak Federal Powers Could Limit Trump’s Climat...,New York Times,Justin Gillis,2017-01-03,2017.0,1.0,,With Donald J. Trump about to take control of ...,Left-Center,High
1,17318,Tips for Your Post-Holiday Clutter Purge - The...,New York Times,Michelle Higgins,2017-04-11,2017.0,4.0,,With the year winding down and New Year’s reso...,Left-Center,High
2,17321,"France Lets Workers Turn Off, Tune Out and Liv...",New York Times,Alissa J. Rubin,2017-01-03,2017.0,1.0,,PARIS — If the world does not envy the Fren...,Left-Center,High
3,17352,"In Republicans’ Ethics Office Gambit, a Specta...",New York Times,Carl Hulse,2017-01-05,2017.0,1.0,,WASHINGTON — Majorities in Congress often o...,Left-Center,High
4,17361,How We Put Together Our 52 Places to Go List -...,New York Times,,2017-01-15,2017.0,1.0,,"For the 12th straight year, the Travel section...",Left-Center,High
...,...,...,...,...,...,...,...,...,...,...,...
14252,218016,"With new monuments in Nevada, Utah, Obama adds...",Washington Post,Juliet Eilperin,2016-12-28,2016.0,12.0,https://web.archive.org/web/20161229004018/htt...,President Obama on Wednesday created new...,Left-Center,High
14253,218017,Memo to Trump: There can be only one president...,Washington Post,Ruth Marcus,2016-12-28,2016.0,12.0,https://web.archive.org/web/20161229004018/htt...,One of the hallmarks of our democratic system...,Left-Center,High
14254,218052,Rare coast-to-coast cold snap to engulf Lower ...,Washington Post,Jason Samenow,2016-12-29,2016.0,12.0,https://web.archive.org/web/20161230000842/htt...,Frigid air will grip an unusually large p...,Left-Center,High
14255,218055,A CIA calendar the CIA gift shop refuses to se...,Washington Post,Ian Shapira,2016-12-29,2016.0,12.0,https://web.archive.org/web/20161230000842/htt...,"Far Side cartoons, Ansel Adams landscapes, ...",Left-Center,High


In [311]:
# Attach the topic columns to the articles side by side; rows are matched on the index.
df_gentops = pd.concat(
    [
        df_sample_inds,
        df_dominant_topic.loc[:, ['vis_num', 'Topic_Perc_Contrib', 'topic']],
    ],
    axis=1,
)

In [312]:
df_gentops

Unnamed: 0,id,title,publication,author,date,year,month,url,content,leaning,factual,vis_num,Topic_Perc_Contrib,topic
0,17293,Weak Federal Powers Could Limit Trump’s Climat...,New York Times,Justin Gillis,2017-01-03,2017.0,1.0,,With Donald J. Trump about to take control of ...,Left-Center,High,3,0.2552,government
1,17318,Tips for Your Post-Holiday Clutter Purge - The...,New York Times,Michelle Higgins,2017-04-11,2017.0,4.0,,With the year winding down and New Year’s reso...,Left-Center,High,1,0.7852,lifestyle-general
2,17321,"France Lets Workers Turn Off, Tune Out and Liv...",New York Times,Alissa J. Rubin,2017-01-03,2017.0,1.0,,PARIS — If the world does not envy the Fren...,Left-Center,High,1,0.4822,lifestyle-general
3,17352,"In Republicans’ Ethics Office Gambit, a Specta...",New York Times,Carl Hulse,2017-01-05,2017.0,1.0,,WASHINGTON — Majorities in Congress often o...,Left-Center,High,2,0.4290,domestic-politics
4,17361,How We Put Together Our 52 Places to Go List -...,New York Times,,2017-01-15,2017.0,1.0,,"For the 12th straight year, the Travel section...",Left-Center,High,1,0.4490,lifestyle-general
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14252,218016,"With new monuments in Nevada, Utah, Obama adds...",Washington Post,Juliet Eilperin,2016-12-28,2016.0,12.0,https://web.archive.org/web/20161229004018/htt...,President Obama on Wednesday created new...,Left-Center,High,3,0.3861,government
14253,218017,Memo to Trump: There can be only one president...,Washington Post,Ruth Marcus,2016-12-28,2016.0,12.0,https://web.archive.org/web/20161229004018/htt...,One of the hallmarks of our democratic system...,Left-Center,High,2,0.4822,domestic-politics
14254,218052,Rare coast-to-coast cold snap to engulf Lower ...,Washington Post,Jason Samenow,2016-12-29,2016.0,12.0,https://web.archive.org/web/20161230000842/htt...,Frigid air will grip an unusually large p...,Left-Center,High,9,0.6373,zika-virus
14255,218055,A CIA calendar the CIA gift shop refuses to se...,Washington Post,Ian Shapira,2016-12-29,2016.0,12.0,https://web.archive.org/web/20161230000842/htt...,"Far Side cartoons, Ansel Adams landscapes, ...",Left-Center,High,1,0.2394,lifestyle-general


In [321]:
# Number of articles per (leaning, topic) pair.
grouped = df_gentops.groupby(['leaning', 'topic'])['topic'].count()
grouped

leaning        topic            
Least Biased   arts-culture           15
               college-sports          8
               domestic-politics     101
               economy               425
               fbi-investigation      97
               foreign-politics      211
               government             98
               lifestyle-general      18
               police-shootings       72
               zika-virus             26
Left           arts-culture           92
               college-sports         34
               domestic-politics     558
               economy                31
               fbi-investigation     213
               foreign-politics      187
               government            234
               lifestyle-general     466
               police-shootings      234
               zika-virus            116
Left-Center    arts-culture          192
               college-sports        153
               domestic-politics     972
               economy  

In [351]:
# For every leaning, print its three most frequent topics as a percentage of
# that leaning's articles.
for lean in df_gentops.leaning.unique():
    lean_counts = grouped[lean]
    top_three = lean_counts.sort_values(ascending=False).head(3)
    print(lean)
    print(top_three / lean_counts.sum() * 100)
    print()

Left-Center
topic
lifestyle-general    33.932854
domestic-politics    16.649538
government            8.684481
Name: topic, dtype: float64

Right-Extreme
topic
domestic-politics    37.132044
police-shootings     14.802355
lifestyle-general    13.961312
Name: topic, dtype: float64

Left
topic
domestic-politics    25.773672
lifestyle-general    21.524249
police-shootings     10.808314
Name: topic, dtype: float64

Right
topic
domestic-politics    30.965909
lifestyle-general    26.136364
government           11.458333
Name: topic, dtype: float64

Right-Center
topic
lifestyle-general    32.818754
police-shootings     13.379074
economy              12.121212
Name: topic, dtype: float64

Least Biased
topic
economy              39.682540
foreign-politics     19.701214
domestic-politics     9.430439
Name: topic, dtype: float64

