In [1]:
# Part 1 : gensim LDA based on NLTK & SpaCy

# Run in python console
import nltk; nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ipekcinar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [3]:
# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

# For Shakespeare : We need to change this: kill thou, thy and shall & keep subject, re, edu and use
stop_words.extend(['may', 'make', 'would', 'shall', 'must', 'could', 'applause'])  # make, come, go also very common


In [4]:
# Import Dataset -- original source
'''
df = pd.read_json('https://raw.githubusercontent.com/selva86/datasets/master/newsgroups.json')
print(df.target_names.unique())
df.head()

# Convert to list
data = df.content.values.tolist()

# Remove Emails
data = [re.sub('\S*@\S*\s?', '', sent) for sent in data]

# Remove new line characters
data = [re.sub('\s+', ' ', sent) for sent in data]

# Remove distracting single quotes
data = [re.sub("\'", "", sent) for sent in data]

pprint(data[:1])
'''

'\ndf = pd.read_json(\'https://raw.githubusercontent.com/selva86/datasets/master/newsgroups.json\')\nprint(df.target_names.unique())\ndf.head()\n\n# Convert to list\ndata = df.content.values.tolist()\n\n# Remove Emails\ndata = [re.sub(\'\\S*@\\S*\\s?\', \'\', sent) for sent in data]\n\n# Remove new line characters\ndata = [re.sub(\'\\s+\', \' \', sent) for sent in data]\n\n# Remove distracting single quotes\ndata = [re.sub("\'", "", sent) for sent in data]\n\npprint(data[:1])\n'

In [5]:
# Import Dataset -- Shakespeare

# Pull all into "data" = all Shakespeare raw text

# directory containing all source texts for training the model 
data_dir="/Users/ipekcinar/Desktop/populism-hackathon/corpus_populism/United States/2016/Bernie Sanders"
import glob, os
os.chdir(data_dir)

#documents = list()
data = list()   # reset data to 0

for filename in glob.glob("*.txt"):
    filedata = open(filename, 'r').read()
    print(filename + " = " + str(len(filedata)) + " chars")
    #documents = documents + filedata.split(".")
    data.append(filedata)


Clinton102416.txt = 23434 chars
Sanders111915.txt = 39871 chars
Clinton110416.txt = 21670 chars
Trump100516.txt = 12214 chars
Trump102116.txt = 9767 chars
Sanders012616.txt = 26810 chars
Trump110116.txt = 13623 chars
Trump090716.txt = 14165 chars
Clinton100216.txt = 10528 chars
Trump081816.txt = 18203 chars
Trump102916_2.txt = 4771 chars
Clinton091916.txt = 20921 chars
Trump102316.txt = 10637 chars
Trump061316.txt = 17787 chars
Sanders041516.txt = 10767 chars
Sanders020116.txt = 7692 chars
Clinton102616.txt = 16458 chars
Sanders032116.txt = 21519 chars
Trump080516.txt = 50632 chars
Trump100316.txt = 6179 chars
Sanders091915.txt = 9123 chars
Clinton102216.txt = 23948 chars
Trump110716.txt = 34499 chars
Trump090116.txt = 7175 chars
Sanders030116.txt = 5662 chars
Trump102716.txt = 22285 chars
Clinton012616.txt = 26281 chars
Clinton100316_2.txt = 22390 chars
Trump082316.txt = 13582 chars
Trump101816.txt = 27662 chars
Trump102116_2.txt = 9795 chars
Trump090316.txt = 8834 chars
Sanders102415

In [6]:
%%time

def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(data))

print(data_words[:1])
print('\n')

[[u'thank', u'you', u'wow', u'well', u'don', u'know', u'about', u'you', u'but', u'could', u'listen', u'to', u'elizabeth', u'warren', u'go', u'on', u'all', u'day', u'it', u'is', u'so', u'great', u'to', u'be', u'back', u'here', u'in', u'new', u'hampshire', u'have', u'significant', u'unruly', u'group', u'of', u'women', u'went', u'to', u'wellesley', u'with', u'back', u'here', u'oh', u'it', u'is', u'so', u'wonderful', u'to', u'be', u'here', u'on', u'this', u'college', u'campus', u'and', u'to', u'see', u'so', u'many', u'young', u'people', u'here', u'as', u'maggie', u'and', u'elizabeth', u'and', u'were', u'walking', u'up', u'to', u'the', u'stage', u'and', u'lot', u'of', u'people', u'were', u'hanging', u'out', u'of', u'the', u'windows', u'and', u'we', u're', u'glad', u'that', u'you', u've', u'got', u'the', u'best', u'view', u'of', u'what', u'we', u're', u'doing', u'here', u'it', u'also', u'exciting', u'to', u'be', u'here', u'with', u'two', u'weeks', u'left', u'because', u'this', u'is', u'the',

In [7]:
#%%time

# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[data_words[0]]])
print('\n')

[u'thank', u'you', u'wow', u'well', u'don', u'know', u'about', u'you', u'but', u'could', u'listen', u'to', u'elizabeth', u'warren', u'go', u'on', u'all', u'day', u'it', u'is', u'so', u'great', u'to', u'be', u'back', u'here', u'in', u'new_hampshire', u'have', u'significant', u'unruly', u'group', u'of', u'women', u'went', u'to', u'wellesley', u'with', u'back', u'here', u'oh', u'it', u'is', u'so', u'wonderful', u'to', u'be', u'here', u'on', u'this', u'college', u'campus', u'and', u'to', u'see', u'so', u'many', u'young', u'people', u'here', u'as', u'maggie', u'and', u'elizabeth', u'and', u'were', u'walking', u'up', u'to', u'the', u'stage', u'and', u'lot', u'of', u'people', u'were', u'hanging', u'out', u'of', u'the', u'windows', u'and', u'we', u're', u'glad', u'that', u'you', u've', u'got', u'the', u'best', u'view', u'of', u'what', u'we', u're', u'doing', u'here', u'it', u'also', u'exciting', u'to', u'be', u'here', u'with', u'two', u'weeks', u'left', u'because', u'this', u'is', u'the', u'mo

In [8]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [9]:
%%time
# This cell takes 2-3 minutes to run on my machine.  -j

# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
nlp = spacy.load('en', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:1])
print('\n')

[[u'thank', u'know', u'listen', u'elizabeth', u'go', u'day', u'great', u'back', u'new_hampshire', u'significant', u'unruly', u'group', u'woman', u'go', u'wellesley', u'back', u'wonderful', u'college', u'campus', u'see', u'many', u'young', u'people', u'maggie', u'elizabeth', u'walking', u'stage', u'lot', u'people', u'hang', u'window', u'glad', u'get', u'good', u'view', u'also', u'excite', u'week', u'leave', u'consequential', u'election', u'lifetime', u'see', u'energy', u'enthusiasm', u'crowd', u'display', u'see', u'yesterday', u'north_carolina', u'see', u'day', u'ohio', u'really', u'demonstrate', u'american', u'really', u'look', u'stake', u'come', u'conclusion', u'involve', u'remain', u'day', u'campaign', u'everyone', u'need', u'turn', u'vote', u'new_hampshire', u'lot', u'reason', u'vote', u'get', u'great', u'candidate', u'congress', u'annie', u'kuster', u'carol', u'shea', u'porter', u'deserve', u'support', u'get', u'great', u'candidate', u'governor', u'colin', u'van', u'ostern', u'than

In [10]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])

[[(0, 2), (1, 1), (2, 1), (3, 2), (4, 1), (5, 6), (6, 1), (7, 1), (8, 2), (9, 1), (10, 2), (11, 2), (12, 3), (13, 1), (14, 1), (15, 3), (16, 2), (17, 2), (18, 3), (19, 14), (20, 9), (21, 3), (22, 1), (23, 4), (24, 1), (25, 1), (26, 2), (27, 1), (28, 3), (29, 2), (30, 1), (31, 2), (32, 3), (33, 10), (34, 2), (35, 1), (36, 1), (37, 1), (38, 4), (39, 1), (40, 1), (41, 1), (42, 1), (43, 1), (44, 1), (45, 6), (46, 1), (47, 1), (48, 8), (49, 1), (50, 1), (51, 3), (52, 1), (53, 1), (54, 1), (55, 1), (56, 2), (57, 2), (58, 1), (59, 2), (60, 2), (61, 1), (62, 5), (63, 8), (64, 1), (65, 2), (66, 2), (67, 2), (68, 3), (69, 1), (70, 1), (71, 2), (72, 1), (73, 1), (74, 1), (75, 1), (76, 2), (77, 1), (78, 1), (79, 1), (80, 4), (81, 3), (82, 1), (83, 1), (84, 1), (85, 2), (86, 1), (87, 5), (88, 6), (89, 14), (90, 1), (91, 1), (92, 1), (93, 2), (94, 1), (95, 5), (96, 1), (97, 1), (98, 1), (99, 1), (100, 1), (101, 1), (102, 3), (103, 1), (104, 1), (105, 1), (106, 1), (107, 2), (108, 1), (109, 3), (110,

In [11]:
id2word[0]

u'able'

In [12]:
# Human readable format of corpus (term-frequency)
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

[[(u'able', 2),
  (u'absolutely', 1),
  (u'abuse', 1),
  (u'achieve', 2),
  (u'action', 1),
  (u'actually', 6),
  (u'ad', 1),
  (u'add', 1),
  (u'addiction', 2),
  (u'addition', 1),
  (u'advanced_manufactur', 2),
  (u'afford', 2),
  (u'affordable', 3),
  (u'afraid', 1),
  (u'agency', 1),
  (u'ahead', 3),
  (u'ally', 2),
  (u'already', 2),
  (u'also', 3),
  (u'america', 14),
  (u'american', 9),
  (u'annie', 3),
  (u'anti', 1),
  (u'anybody', 4),
  (u'anymore', 1),
  (u'anyone', 1),
  (u'anything', 2),
  (u'anywhere', 1),
  (u'ask', 3),
  (u'attack', 2),
  (u'attention', 1),
  (u'away', 2),
  (u'baby', 3),
  (u'back', 10),
  (u'bad', 2),
  (u'balance', 1),
  (u'ball', 1),
  (u'ballot', 1),
  (u'bank', 4),
  (u'barrier', 1),
  (u'basically', 1),
  (u'battle', 1),
  (u'beautiful', 1),
  (u'become', 1),
  (u'behalf', 1),
  (u'believe', 6),
  (u'bernie', 1),
  (u'bernie_sander', 1),
  (u'big', 8),
  (u'biggest_investment', 1),
  (u'billion', 1),
  (u'billion_dollar', 3),
  (u'billionair', 1)

In [13]:
%%time

### Much as we saw with Word2vec, these settings are the key to tuning your LDA Topic Model. ###

# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=3, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

CPU times: user 9.79 s, sys: 68.5 ms, total: 9.86 s
Wall time: 5.27 s


In [14]:
# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  u'0.022*"american" + 0.014*"country" + 0.014*"go" + 0.012*"job" + 0.011*"people" + 0.009*"america" + 0.006*"new" + 0.006*"year" + 0.006*"time" + 0.005*"want"'),
 (1,
  u'0.016*"go" + 0.015*"people" + 0.015*"get" + 0.012*"work" + 0.011*"know" + 0.011*"want" + 0.011*"say" + 0.009*"country" + 0.008*"america" + 0.008*"family"'),
 (2,
  u'0.027*"go" + 0.017*"people" + 0.015*"say" + 0.013*"country" + 0.012*"know" + 0.012*"get" + 0.009*"great" + 0.008*"want" + 0.007*"take" + 0.006*"happen"')]


In [15]:
%%time

# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)
print('\n')

('\nPerplexity: ', -7.247581712417654)
('\nCoherence Score: ', 0.2843947702077843)


CPU times: user 1.82 s, sys: 64.5 ms, total: 1.89 s
Wall time: 6.74 s


In [16]:
%%time

# Visualize the topics
#
# If you get an error like this: "pyLDAvis/_prepare.py:257: FutureWarning: Sorting because non-concatenation axis is not aligned. 
# A future version of pandas will change to not sort by default."
#
# then from the command line do: "pip install pandas==0.21.0"

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)  # sort=False ? sort=True
vis
print('\n')



CPU times: user 1.03 s, sys: 113 ms, total: 1.15 s
Wall time: 8.59 s


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


#import pandas as pd
#pd.__version__

vis

In [17]:
vis