In [1]:
# Part 1 : gensim LDA based on NLTK & SpaCy

# Run in python console
import nltk; nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ipekcinar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [3]:
# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

# For Shakespeare : We need to change this: kill thou, thy and shall & keep subject, re, edu and use
stop_words.extend(['may', 'make', 'would', 'shall', 'must', 'could', 'applause'])  # make, come, go also very common


In [4]:
# Import Dataset -- original source
'''
df = pd.read_json('https://raw.githubusercontent.com/selva86/datasets/master/newsgroups.json')
print(df.target_names.unique())
df.head()

# Convert to list
data = df.content.values.tolist()

# Remove Emails
data = [re.sub('\S*@\S*\s?', '', sent) for sent in data]

# Remove new line characters
data = [re.sub('\s+', ' ', sent) for sent in data]

# Remove distracting single quotes
data = [re.sub("\'", "", sent) for sent in data]

pprint(data[:1])
'''

'\ndf = pd.read_json(\'https://raw.githubusercontent.com/selva86/datasets/master/newsgroups.json\')\nprint(df.target_names.unique())\ndf.head()\n\n# Convert to list\ndata = df.content.values.tolist()\n\n# Remove Emails\ndata = [re.sub(\'\\S*@\\S*\\s?\', \'\', sent) for sent in data]\n\n# Remove new line characters\ndata = [re.sub(\'\\s+\', \' \', sent) for sent in data]\n\n# Remove distracting single quotes\ndata = [re.sub("\'", "", sent) for sent in data]\n\npprint(data[:1])\n'

In [5]:
# Import Dataset -- Shakespeare

# Pull all into "data" = all Shakespeare raw text

# directory containing all source texts for training the model 
data_dir="/Users/ipekcinar/Desktop/populism-hackathon/corpus_populism/United States/2016/Donald Trump"
import glob, os
os.chdir(data_dir)

#documents = list()
data = list()   # reset data to 0

for filename in glob.glob("*.txt"):
    filedata = open(filename, 'r').read()
    print(filename + " = " + str(len(filedata)) + " chars")
    #documents = documents + filedata.split(".")
    data.append(filedata)


Trump100516.txt = 12214 chars
Trump102116.txt = 9767 chars
Trump110116.txt = 13623 chars
Trump090716.txt = 14165 chars
Trump081816.txt = 18203 chars
Trump102916_2.txt = 4771 chars
Trump102316.txt = 10637 chars
Trump061316.txt = 17787 chars
Trump080516.txt = 50632 chars
Trump100316.txt = 6179 chars
Trump110716.txt = 34499 chars
Trump090116.txt = 7175 chars
Trump102716.txt = 22285 chars
Trump082316.txt = 13582 chars
Trump101816.txt = 27662 chars
Trump102116_2.txt = 9795 chars
Trump090316.txt = 8834 chars
Trump092816_2.txt = 11832 chars
Trump071116.txt = 14768 chars
Trump082416.txt = 21589 chars
Trump102216.txt = 26170 chars
Trump081916.txt = 24081 chars
Trump110216.txt = 28016 chars
Trump071616.txt = 22940 chars
Trump092016.txt = 10721 chars
Trump101516_2.txt = 9801 chars
Trump100516_2.txt = 15140 chars
Trump100416.txt = 15650 chars
Trump092216.txt = 12980 chars
Trump090616.txt = 12221 chars
Trump102016.txt = 11811 chars
Trump032116.txt = 13331 chars
Trump061615.txt = 35008 chars
Trump09

In [6]:
%%time

def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(data))

print(data_words[:1])
print('\n')

[[u'thank', u'you', u'am', u'so', u'thrilled', u'to', u'be', u'here', u'in', u'reno', u'with', u'your', u'help', u'in', u'days', u'we', u'are', u'going', u'to', u'win', u'this', u'state', u'we', u'are', u'going', u'to', u'win', u'the', u'white', u'house', u'and', u'we', u'are', u'going', u'to', u'bring', u'back', u'our', u'jobs', u'we', u'are', u'going', u'to', u'make', u'america', u'rich', u'again', u'am', u'going', u'to', u'end', u'illegal', u'immigration', u'stop', u'the', u'massive', u'inflow', u'of', u'refugees', u'keep', u'jobs', u'from', u'pouring', u'out', u'of', u'our', u'country', u'renegotiate', u'our', u'disastrous', u'trade', u'deals', u'and', u'massively', u'reduce', u'taxes', u'and', u'regulations', u'on', u'our', u'workers', u'and', u'our', u'small', u'businesses', u'hillary', u'clinton', u'has', u'been', u'there', u'for', u'years', u'and', u'hasn', u'fixed', u'anything', u'in', u'fact', u'she', u'just', u'made', u'things', u'worse', u'it', u'been', u'years', u'of', u'f

In [7]:
#%%time

# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[data_words[0]]])
print('\n')

[u'thank', u'you', u'am', u'so', u'thrilled', u'to', u'be', u'here', u'in', u'reno', u'with', u'your', u'help', u'in', u'days', u'we', u'are', u'going', u'to', u'win', u'this', u'state', u'we', u'are', u'going', u'to', u'win', u'the', u'white_house', u'and', u'we', u'are', u'going', u'to', u'bring', u'back', u'our', u'jobs', u'we', u'are', u'going', u'to', u'make', u'america', u'rich', u'again', u'am', u'going', u'to', u'end', u'illegal_immigration', u'stop', u'the', u'massive_inflow', u'of', u'refugees', u'keep', u'jobs', u'from', u'pouring', u'out', u'of', u'our', u'country', u'renegotiate', u'our', u'disastrous_trade_deals', u'and', u'massively', u'reduce', u'taxes', u'and', u'regulations', u'on', u'our', u'workers', u'and', u'our', u'small_businesses', u'hillary', u'clinton', u'has', u'been', u'there', u'for', u'years', u'and', u'hasn', u'fixed', u'anything', u'in', u'fact', u'she', u'just', u'made', u'things', u'worse', u'it', u'been', u'years', u'of', u'failure', u'the', u'proble

In [8]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [9]:
%%time
# This cell takes 2-3 minutes to run on my machine.  -j

# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
nlp = spacy.load('en', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:1])
print('\n')

[[u'thank', u'thrilled', u'reno', u'help', u'day', u'go', u'win', u'state', u'go', u'win', u'white_house', u'go', u'bring', u'back', u'job', u'go', u'america', u'rich', u'go', u'end', u'illegal', u'immigration', u'stop', u'massive_inflow', u'refugee', u'keep', u'job', u'pour', u'country', u'renegotiate', u'disastrous', u'trade_deal', u'massively', u'reduce', u'tax', u'regulation', u'worker', u'small_business', u'year', u'fix', u'anything', u'fact', u'make', u'thing', u'bad', u'year', u'failure', u'problem', u'face', u'country', u'immense', u'go', u'take', u'bold', u'action', u'turn', u'thing', u'right', u'owe_trillion', u'debt', u'double', u'president', u'obama', u'infrastructure', u'third', u'world', u'country', u'homicide', u'rate', u'last', u'year', u'experience', u'big', u'single', u'year', u'increase', u'year', u'heroin', u'overdos', u'surge', u'meth', u'overdose', u'nevada', u'percent', u'thousand', u'refugee', u'admit', u'way', u'screen', u'instantly', u'make', u'eligible', u'we

In [10]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 2), (8, 1), (9, 4), (10, 1), (11, 1), (12, 1), (13, 1), (14, 9), (15, 1), (16, 13), (17, 12), (18, 4), (19, 1), (20, 1), (21, 1), (22, 1), (23, 2), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 9), (31, 1), (32, 3), (33, 1), (34, 3), (35, 1), (36, 3), (37, 1), (38, 3), (39, 7), (40, 1), (41, 1), (42, 1), (43, 1), (44, 3), (45, 1), (46, 4), (47, 1), (48, 1), (49, 3), (50, 1), (51, 1), (52, 1), (53, 1), (54, 1), (55, 1), (56, 1), (57, 1), (58, 1), (59, 1), (60, 1), (61, 4), (62, 1), (63, 1), (64, 3), (65, 1), (66, 1), (67, 1), (68, 4), (69, 1), (70, 3), (71, 1), (72, 4), (73, 1), (74, 1), (75, 1), (76, 1), (77, 2), (78, 1), (79, 1), (80, 1), (81, 1), (82, 1), (83, 1), (84, 1), (85, 1), (86, 1), (87, 1), (88, 24), (89, 1), (90, 1), (91, 1), (92, 2), (93, 2), (94, 1), (95, 1), (96, 1), (97, 1), (98, 1), (99, 7), (100, 2), (101, 7), (102, 1), (103, 1), (104, 1), (105, 1), (106, 1), (107, 1), (108, 2), (109, 1), (110,

In [11]:
id2word[0]

u'abide'

In [12]:
# Human readable format of corpus (term-frequency)
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

[[(u'abide', 1),
  (u'abolish', 1),
  (u'accomplish', 1),
  (u'accomplishment', 1),
  (u'action', 1),
  (u'add', 1),
  (u'admission', 1),
  (u'admit', 2),
  (u'afghanistan', 1),
  (u'african', 4),
  (u'age', 1),
  (u'ago', 1),
  (u'aid', 1),
  (u'allow', 1),
  (u'also', 9),
  (u'amazing', 1),
  (u'america', 13),
  (u'american', 12),
  (u'amnesty', 4),
  (u'angler', 1),
  (u'announce', 1),
  (u'annual_trade', 1),
  (u'anymore', 1),
  (u'anyone', 2),
  (u'anything', 1),
  (u'appoint_justice', 1),
  (u'arrogance', 1),
  (u'asylum', 1),
  (u'attempt', 1),
  (u'average', 1),
  (u'back', 9),
  (u'background', 1),
  (u'bad', 3),
  (u'basic', 1),
  (u'begin', 3),
  (u'beij', 1),
  (u'believe', 3),
  (u'better', 1),
  (u'big', 3),
  (u'bill', 7),
  (u'bit', 1),
  (u'bleach', 1),
  (u'bold', 1),
  (u'bomb', 1),
  (u'border', 3),
  (u'boston_bomber', 1),
  (u'bring', 4),
  (u'budget', 1),
  (u'build', 1),
  (u'business', 3),
  (u'bust', 1),
  (u'call', 1),
  (u'called_recovery', 1),
  (u'cancel',

In [13]:
%%time

### Much as we saw with Word2vec, these settings are the key to tuning your LDA Topic Model. ###

# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=3, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

CPU times: user 9.82 s, sys: 66.9 ms, total: 9.89 s
Wall time: 5.38 s


In [14]:
# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  u'0.020*"american" + 0.017*"go" + 0.015*"country" + 0.011*"job" + 0.011*"people" + 0.010*"america" + 0.006*"new" + 0.005*"also" + 0.005*"want" + 0.005*"time"'),
 (1,
  u'0.006*"cyber" + 0.005*"military" + 0.005*"defense" + 0.004*"also" + 0.004*"new" + 0.004*"state" + 0.004*"government" + 0.003*"security" + 0.003*"american" + 0.003*"america"'),
 (2,
  u'0.032*"go" + 0.017*"say" + 0.017*"people" + 0.014*"get" + 0.013*"know" + 0.012*"country" + 0.010*"great" + 0.010*"want" + 0.008*"job" + 0.007*"right"')]


In [15]:
%%time

# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)
print('\n')

('\nPerplexity: ', -7.131126554389999)
('\nCoherence Score: ', 0.2950742939252106)


CPU times: user 1.25 s, sys: 54.2 ms, total: 1.31 s
Wall time: 5.14 s


In [16]:
%%time

# Visualize the topics
#
# If you get an error like this: "pyLDAvis/_prepare.py:257: FutureWarning: Sorting because non-concatenation axis is not aligned. 
# A future version of pandas will change to not sort by default."
#
# then from the command line do: "pip install pandas==0.21.0"

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)  # sort=False ? sort=True
vis
print('\n')



CPU times: user 737 ms, sys: 112 ms, total: 849 ms
Wall time: 8.48 s


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


#import pandas as pd
#pd.__version__

vis

In [17]:
vis