In [1]:
# Part 1 : gensim LDA based on NLTK & SpaCy

# Run in python console
import nltk; nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ipekcinar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [3]:
# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('spanish')

# For Shakespeare : We need to change this: kill thou, thy and shall & keep subject, re, edu and use
#stop_words.extend(['may', 'make', 'would', 'shall', 'must', 'could', 'applause'])  # make, come, go also very common


In [4]:
# Import Dataset -- Shakespeare

# Pull all into "data" = all Shakespeare raw text

# directory containing all source texts for training the model 
data_dir="/Users/ipekcinar/Desktop/populism-hackathon/corpus_populism/Venezuela"
import glob, os
os.chdir(data_dir)

#documents = list()
data = list()   # reset data to 0

for filename in glob.glob("*.txt"):
    filedata = open(filename, 'r').read()
    print(filename + " = " + str(len(filedata)) + " chars")
    #documents = documents + filedata.split(".")
    data.append(filedata)


1053.txt = 1346 chars
1735.txt = 1767 chars
1721.txt = 327958 chars
1047.txt = 43680 chars
1709.txt = 11 chars
289.txt = 27695 chars
504.txt = 47863 chars
1090.txt = 1151 chars
262.txt = 12 chars
276.txt = 43305 chars
1084.txt = 908 chars
510.txt = 13597 chars
538.txt = 11587 chars
1912.txt = 1402 chars
1906.txt = 67000 chars
1537.txt = 6503 chars
1251.txt = 195376 chars
909.txt = 16879 chars
1245.txt = 212600 chars
1523.txt = 7546 chars
921.txt = 67359 chars
935.txt = 40774 chars
1279.txt = 236050 chars
1292.txt = 61020 chars
706.txt = 7050 chars
712.txt = 162 chars
1286.txt = 168196 chars
1443.txt = 257404 chars
29.txt = 33229 chars
1325.txt = 198152 chars
869.txt = 26326 chars
1331.txt = 209988 chars
1457.txt = 213705 chars
15.txt = 1533 chars
1319.txt = 175525 chars
855.txt = 44206 chars
699.txt = 36688 chars
841.txt = 75244 chars
1480.txt = 7612 chars
114.txt = 5363 chars
672.txt = 5466 chars
666.txt = 34564 chars
100.txt = 723 chars
1494.txt = 25761 chars
128.txt = 14772 chars
89

110.txt = 27132 chars
1484.txt = 229179 chars
138.txt = 49564 chars
886.txt = 44141 chars
892.txt = 44087 chars
1453.txt = 199511 chars
879.txt = 7960 chars
39.txt = 55052 chars
1335.txt = 159148 chars
1321.txt = 98634 chars
1447.txt = 36174 chars
845.txt = 62655 chars
1309.txt = 84036 chars
851.txt = 783 chars
11.txt = 73174 chars
689.txt = 12 chars
1282.txt = 85911 chars
716.txt = 5353 chars
702.txt = 76875 chars
1296.txt = 208733 chars
1527.txt = 5789 chars
1241.txt = 248024 chars
1255.txt = 70845 chars
919.txt = 33784 chars
1533.txt = 36770 chars
931.txt = 464 chars
1269.txt = 106014 chars
925.txt = 183844 chars
514.txt = 169 chars
1080.txt = 39726 chars
272.txt = 658 chars
266.txt = 10375 chars
1094.txt = 2732 chars
500.txt = 25710 chars
528.txt = 735 chars
1902.txt = 19475 chars
1916.txt = 32913 chars
1043.txt = 173 chars
1725.txt = 765 chars
1731.txt = 12 chars
1057.txt = 12154 chars
1719.txt = 326248 chars
299.txt = 28218 chars
1756.txt = 1926 chars
1030.txt = 9840 chars
1024.t

560.txt = 4626 chars
574.txt = 37519 chars
212.txt = 22257 chars
1786.txt = 881 chars
548.txt = 2609 chars
1751.txt = 2770 chars
1037.txt = 59653 chars
1023.txt = 24911 chars
1745.txt = 12 chars
1779.txt = 426 chars
1009.txt = 29984 chars
589.txt = 89951 chars
1035.txt = 40943 chars
1753.txt = 2220 chars
1747.txt = 2967 chars
1021.txt = 48061 chars
238.txt = 357 chars
562.txt = 25 chars
204.txt = 674 chars
1790.txt = 431 chars
1784.txt = 254 chars
210.txt = 79715 chars
576.txt = 27372 chars
947.txt = 4373 chars
953.txt = 32892 chars
1579.txt = 250619 chars
1551.txt = 25693 chars
1237.txt = 153037 chars
1223.txt = 13129 chars
1545.txt = 146084 chars
984.txt = 14257 chars
748.txt = 1314 chars
990.txt = 77511 chars
1592.txt = 200016 chars
760.txt = 370 chars
774.txt = 1155 chars
1586.txt = 207536 chars
1419.txt = 58361 chars
73.txt = 35350 chars
833.txt = 60977 chars
67.txt = 76085 chars
827.txt = 36848 chars
199.txt = 38233 chars
1425.txt = 145895 chars
1343.txt = 71118 chars
1357.txt = 

420.txt = 30902 chars
346.txt = 24386 chars
352.txt = 29268 chars
434.txt = 17629 chars
353.txt = 33430 chars
435.txt = 76653 chars
421.txt = 42776 chars
347.txt = 32610 chars
1823.txt = 13501 chars
409.txt = 6579 chars
1189.txt = 9009 chars
1837.txt = 8578 chars
390.txt = 1087 chars
1604.txt = 142025 chars
1162.txt = 5153 chars
1176.txt = 20606 chars
1610.txt = 9669 chars
384.txt = 15529 chars
1638.txt = 1816 chars
637.txt = 11795 chars
151.txt = 41676 chars
145.txt = 26599 chars
623.txt = 9026 chars
93.txt = 3653 chars
179.txt = 70603 chars
87.txt = 10586 chars
1360.txt = 25064 chars
1406.txt = 52565 chars
192.txt = 45545 chars
186.txt = 41410 chars
1412.txt = 11002 chars
838.txt = 35579 chars
78.txt = 21735 chars
1374.txt = 51758 chars
810.txt = 29776 chars
50.txt = 41267 chars
2.txt = 584 chars
804.txt = 66259 chars
44.txt = 41608 chars
1348.txt = 52065 chars
743.txt = 99 chars
757.txt = 2667 chars
1599.txt = 280510 chars
1214.txt = 19988 chars
780.txt = 13882 chars
958.txt = 1538 

1460.txt = 190009 chars
1474.txt = 13315 chars
686.txt = 913 chars
1312.txt = 176594 chars
719.txt = 1502 chars
1299.txt = 195825 chars
725.txt = 2755 chars
731.txt = 93207 chars
902.txt = 11178 chars
1528.txt = 203774 chars
916.txt = 28968 chars
1272.txt = 28504 chars
1514.txt = 12 chars
1500.txt = 16465 chars
1266.txt = 35007 chars
269.txt = 11 chars
241.txt = 1264 chars
527.txt = 3658 chars
533.txt = 10891 chars
255.txt = 45120 chars
1058.txt = 28433 chars
282.txt = 15984 chars
1716.txt = 157798 chars
1070.txt = 21572 chars
1064.txt = 1451 chars
1702.txt = 4456 chars
296.txt = 37424 chars
1048.txt = 4895 chars
292.txt = 4695 chars
1706.txt = 21323 chars
1060.txt = 85684 chars
1074.txt = 110246 chars
1712.txt = 8501 chars
286.txt = 22143 chars
279.txt = 42771 chars
251.txt = 33467 chars
537.txt = 24614 chars
523.txt = 12282 chars
245.txt = 331 chars
1909.txt = 54768 chars
912.txt = 30261 chars
1538.txt = 187484 chars
906.txt = 1640 chars
1262.txt = 244116 chars
1504.txt = 4304 chars


In [5]:
%%time

def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(data))

print(data_words[:1])
print('\n')

[[u'presidente', u'de', u'la', u'republica', u'bolivariana', u'de', u'venezuela', u'hugo', u'chavez', u'muy', u'buenas', u'tardes', u'senor', u'presidente', u'excelencias', u'amigas', u'amigos', u'todos', u'yo', u'traia', u'un', u'discurso', u'pero', u'como', u'esta', u'manana', u'exprese', u'buena', u'parte', u'de', u'las', u'ideas', u'que', u'estan', u'aqui', u'escritas', u'en', u'funcion', u'de', u'que', u'la', u'lista', u'de', u'oradores', u'es', u'larga', u'yo', u'solo', u'quiero', u'someter', u'consideracion', u'de', u'la', u'asamblea', u'de', u'nuestro', u'movimiento', u'una', u'propuesta', u'hemos', u'oido', u'cuantas', u'reflexiones', u'de', u'todas', u'ellas', u'creo', u'que', u'se', u'recoge', u'un', u'factor', u'comun', u'es', u'impostergable', u'la', u'necesidad', u'suprema', u'necesidad', u'de', u'relanzar', u'nuestro', u'movimiento', u'senor', u'presidente', u'ahora', u'insisto', u'cuando', u'cuba', u'de', u'la', u'mano', u'hermana', u'de', u'malasia', u'recibe', u'la', 

In [6]:
#%%time

# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[data_words[0]]])
print('\n')

[u'presidente', u'de', u'la', u'republica', u'bolivariana', u'de', u'venezuela', u'hugo', u'chavez', u'muy', u'buenas_tardes', u'senor', u'presidente', u'excelencias', u'amigas_amigos', u'todos', u'yo', u'traia', u'un', u'discurso', u'pero', u'como', u'esta', u'manana', u'exprese', u'buena', u'parte', u'de', u'las', u'ideas', u'que', u'estan', u'aqui', u'escritas', u'en', u'funcion', u'de', u'que', u'la', u'lista', u'de', u'oradores', u'es', u'larga', u'yo', u'solo', u'quiero', u'someter', u'consideracion', u'de', u'la', u'asamblea', u'de', u'nuestro', u'movimiento', u'una', u'propuesta', u'hemos', u'oido', u'cuantas', u'reflexiones', u'de', u'todas', u'ellas', u'creo', u'que', u'se', u'recoge', u'un', u'factor_comun', u'es', u'impostergable', u'la', u'necesidad', u'suprema', u'necesidad', u'de', u'relanzar', u'nuestro', u'movimiento', u'senor', u'presidente', u'ahora', u'insisto', u'cuando', u'cuba', u'de', u'la', u'mano', u'hermana', u'de', u'malasia', u'recibe', u'la', u'antorcha', 

In [7]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [8]:
%%time
# This cell takes 2-3 minutes to run on my machine.  -j

# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
nlp = spacy.load('es', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:1])
print('\n')

TypeError: Argument 'string' has incorrect type (expected unicode, got str)

In [None]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])

In [None]:
id2word[0]

In [None]:
# Human readable format of corpus (term-frequency)
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

In [None]:
%%time

### Much as we saw with Word2vec, these settings are the key to tuning your LDA Topic Model. ###

# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=3, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [None]:
# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

In [None]:
%%time

# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)
print('\n')

In [None]:
%%time

# Visualize the topics
#
# If you get an error like this: "pyLDAvis/_prepare.py:257: FutureWarning: Sorting because non-concatenation axis is not aligned. 
# A future version of pandas will change to not sort by default."
#
# then from the command line do: "pip install pandas==0.21.0"

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)  # sort=False ? sort=True
vis
print('\n')

#import pandas as pd
#pd.__version__

vis

In [None]:
vis