# Commands to install the following

# conda install -c spacy spacy 
# conda install -c conda-forge pyldavis 
# python -m spacy download en

In LDA, each document may be viewed as a mixture of various topics where each document is considered to have a set of topics that are assigned to it via LDA. This is identical to probabilistic latent semantic analysis (pLSA), except that in LDA the topic distribution is assumed to have a sparse Dirichlet prior. The sparse Dirichlet priors encode the intuition that documents cover only a small set of topics and that topics use only a small set of words frequently. In practice, this results in a better disambiguation of words and a more precise assignment of documents to topics. LDA is a generalization of the pLSA model, which is equivalent to LDA under a uniform Dirichlet prior distribution.

We use the following function to clean our texts and return a list of tokens:

In [1]:

import spacy
# NOTE(review): the object returned by this call is never bound to a name, so
# nothing below uses it. Presumably kept as a check that the 'en' model is
# installed - confirm, or drop the call.
spacy.load('en')
from spacy.lang.en import English
# English() instance that tokenize() calls on raw text to split it into tokens.
parser = English()
def tokenize(text):
    """Return the tokens of *text* as a list of normalised strings.

    Whitespace-only tokens are dropped. A token that looks like a URL is
    replaced by the placeholder 'URL', a token starting with '@' by
    'SCREEN_NAME'; every other token is kept in its lower-cased form.
    """
    def _normalise(tok):
        # Same precedence as the checks were originally written: URL first,
        # then @-handle, otherwise the lower-cased text.
        if tok.like_url:
            return 'URL'
        if tok.orth_.startswith('@'):
            return 'SCREEN_NAME'
        return tok.lower_

    return [_normalise(tok) for tok in parser(text) if not tok.orth_.isspace()]

We use NLTK’s WordNet to find the meanings of words, synonyms, antonyms, and more. In addition, we use WordNetLemmatizer to get the root word.



In [2]:
import nltk
# Fetch the WordNet corpus that is accessed through `wn` below; when the
# package is already present NLTK only reports that it is up to date.
nltk.download('wordnet')
from nltk.corpus import wordnet as wn
def get_lemma(word):
    """Return the WordNet base form of *word*, or *word* itself if none is found.

    `wn.morphy` yields None when it has no base form for the input, in which
    case the word is passed through unchanged.
    """
    base_form = wn.morphy(word)
    return word if base_form is None else base_form

[nltk_data] Downloading package wordnet to /Users/varsha/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    """Lemmatize *word* with a freshly created WordNetLemmatizer and return the result."""
    lemmatizer = WordNetLemmatizer()
    return lemmatizer.lemmatize(word)

Filtering out stop words:

In [4]:
# Make sure the NLTK stop-word lists are available locally.
nltk.download('stopwords')
# English stop words as a set; prepare_text_for_lda() tests tokens against it.
en_stop = set(nltk.corpus.stopwords.words('english'))


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/varsha/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


 
Now we can define a function to prepare the text for topic modelling:

In [5]:
def prepare_text_for_lda(text):
    """Turn raw *text* into the list of lemmas used as one LDA document.

    The text is tokenised, tokens of four characters or fewer and English
    stop words are discarded, and each remaining token is mapped through
    get_lemma().
    """
    return [
        get_lemma(tok)
        for tok in tokenize(text)
        # Length filter first, then stop-word filter, as in the step-by-step form.
        if len(tok) > 4 and tok not in en_stop
    ]

Open up the data, read line by line, for each line, prepare text for LDA, then add to a list.

Data converted to csv

In [6]:
import random
# Tokenised documents that feed the dictionary / bag-of-words steps below.
# NOTE(review): only the randomly sampled lines (about 1%) are appended, so the
# model is trained on that sample. If the whole file should be modelled, move
# the append out of the sampling branch - confirm intent.
# NOTE(review): each line is a raw CSV row, so non-text columns (timestamps,
# user names, ids) end up inside the tokens, as the printed samples show.
# Consider csv.reader and selecting the tweet-text column; the column layout is
# not visible here.
text_data = []
with open('/Users/varsha/.spyder-py3/NLP_proj_twitter.csv') as f:
    for line in f:
        # Draw the sample first so the tokenisation work is only done for the
        # lines that are actually kept.
        if random.random() <= .99:
            continue
        tokens = prepare_text_for_lda(line)
        print(tokens)
        text_data.append(tokens)

['chris', 'twitter', 'quot;@nelsonforsenate', 'partial', 'birth', 'killing', 'scotus', 'uphold', 'allow', 'revoke', 'SCREEN_NAME', 'SCREEN_NAME', 'SCREEN_NAME', 'SCREEN_NAME', 'redwave2018', '15:31:03,chris,.0,nelson,1054817497860370000.0']
['invest', 'elect', 'illinois', 'twitter', 'quot;yep', 'health', 'medicare', 'SCREEN_NAME', 'SCREEN_NAME', 'SCREEN_NAME', 'SCREEN_NAME', 'SCREEN_NAME', 'SCREEN_NAME', 'SCREEN_NAME', 'SCREEN_NAME', 'SCREEN_NAME', 'SCREEN_NAME', '14:27:53,#ilvote', '11/6,1.0,nelson,1054076825079550000.0']
['light', 'twitter', 'quot;your', 'right', 'don&#39;t', 'think', 'election', 'could', 'problem', '11:43:28,bud', 'light', ',.0,nelson,1054035446458710000.0']
['misogyny', 'twitter', 'quot;did', 'fraud', 'quot;,2018', '15:53:02,joni', 'skibo', 'lacriox,2.0,scott,1054823029056910000.0']
['twitter', 'quot;there', 'poll', 'backup', 'claim', 'eather', 'kavaugh', 'fading', 'pollster', 'likely', 'voter', 'model', 'overestimate', '15:19:59,rip', '2019,1.0,rosen,1054452326138

['SCREEN_NAME', '",2018', '12:05:01,hmarmas,3908.0,nelson,1054040870930940000.0']
['SCREEN_NAME', 'SCREEN_NAME', 'enthusiastic', 'votes.",2018', '11:54:35,conspiracy', 'collusion,58.0,nelson,1054038246370340000.0']
[]
['",2018', '11:34:54,#carol', 'trump', 'follow', 'back,1.0,nelson,1054033294256430000.0']
[]
['SCREEN_NAME']
['SCREEN_NAME', 'gorgeous', 'tampa', 'knock', 'door', 'florida', 'SCREEN_NAME', 'volunteer', 'sense', 'candidate', 'SCREEN_NAME', 'SCREEN_NAME', '",2018', '11:24:15,gwen', 'ruda,76.0,nelson,1054030612259060000.0']
['SCREEN_NAME', '",2018', '11:13:39,jrohrich,3908.0,nelson,1054027945558240000.0']
['SCREEN_NAME']
['SCREEN_NAME']
['support']
['SCREEN_NAME']
['SCREEN_NAME']
['SCREEN_NAME', 'democratic', 'senator', 'upcoming', 'midterm', 'please', 'follow']
[]
['SCREEN_NAME', '",2018', '10:24:41,randy', 'ferrell', 'vetsvote/@votevets,3908.0,nelson,1054015621028030000.0']
['SCREEN_NAME']
['SCREEN_NAME', '",2018', '10:17:48,gayle', 'cartwright,3908.0,nelson,10540138872494

[]
['SCREEN_NAME', 'SCREEN_NAME', 'SCREEN_NAME', 'SCREEN_NAME', 'SCREEN_NAME', 'SCREEN_NAME', 'SCREEN_NAME', 'source', ',2018', '22:03:57,doc', 'holliday,3.0,scott,1054191598823710000.0']
['SCREEN_NAME', 'SCREEN_NAME', 'great', 'governor', 'continue', 'legacy', 'growing', 'cutting', ',2018', '22:00:23,jayme', 'wyatt,929.0,scott,1054190699384570000.0']
[]
['SCREEN_NAME', 'SCREEN_NAME', 'SCREEN_NAME', 'thank', 'work,2018', '21:54:31,mike', 'mechanic,.0,scott,1054189223316800000.0']
['SCREEN_NAME', 'SCREEN_NAME', 'SCREEN_NAME', 'place', 'stay?,2018', '21:51:57,blackcatsoda,.0,scott,1054188579809910000.0']
['close', '",2018', '16:00:26,martha', 'grierson,1816.0,rosen,1054462505660100000.0']
['close', '",2018', '15:58:00,doris', 'ewing,1816.0,rosen,1054461893312720000.0']
['SCREEN_NAME', 'senator', 'heller', 'guilty', 'big', 'break', 'promise', 'modern', 'nevada', 'history', 'vote', 'health', ',2018', '15:54:56,paula', 'di,488.0,rosen,1054461120482710000.0']
['SCREEN_NAME', 'excite', 'rally

['SCREEN_NAME', 'debate', 'tonight', 'missouri', 'SCREEN_NAME', 'SCREEN_NAME', 'SCREEN_NAME', 'want', 'remind', 'everyone', 'mccaskill', 'support', ',2018', '18:20:05,rawklion', ',256.0,mccaskill,1053048095766830000.0']
['SCREEN_NAME', '.@clairecmc', 'travel', 'thousand', 'mile', 'answer', 'hundred', 'question', 'listen', 'missourian', 'missourian', '",2018', '17:33:47,maryelizabeth', 'dorsey,43.0,mccaskill,1053036444896370000.0']
['SCREEN_NAME', '.@clairecmc', 'honor', 'missionary', 'baptist', 'state', 'convention', 'missouri', 'earth', 'morning', ',2018', '17:30:10,jacqueline,30.0,mccaskill,1053035532660430000.0']
['SCREEN_NAME', 'SCREEN_NAME', 'claire', 'impeachtrumpandpencenow', 'impeachkavanaugh,2018', '17:27:35,soccer', 'thoughts,.0,mccaskill,1053034882715180000.0']
['SCREEN_NAME', 'SCREEN_NAME', 'photo', 'actually', 'proposal', 'would', 'state', 'it,2018', '12:36:49,jd,.0,mccaskill,1052961710875780000.0']
['SCREEN_NAME', 'casualty', 'trade', 'rising', 'SCREEN_NAME', 'right', ',2

First, we create a dictionary from the data, then convert the data to a bag-of-words corpus and save the dictionary and corpus for future use.

In [7]:
from gensim import corpora
# Build the gensim Dictionary from the tokenised documents; it is used below
# for doc2bow() and as the id2word mapping of the LDA models.
dictionary = corpora.Dictionary(text_data)


In [8]:
# Bag-of-words form of every document: doc2bow is applied to each token list,
# in the order the documents appear in text_data.
corpus = list(map(dictionary.doc2bow, text_data))


In [9]:
import pickle
# Persist the bag-of-words corpus. The context manager guarantees the file is
# flushed and closed, instead of leaving an anonymous open handle behind.
with open('corpus.pkl', 'wb') as corpus_file:
    pickle.dump(corpus, corpus_file)
# The dictionary is stored with gensim's own save() so it can be reloaded with
# Dictionary.load().
dictionary.save('dictionary.gensim')

We are asking LDA to find 10 topics in the data. We then repeat the same step a couple of times with different numbers of topics.

In [10]:
import gensim
# Number of topics requested for the first model.
NUM_TOPICS = 10
# Train LDA on the bag-of-words corpus with 15 passes over the data.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=NUM_TOPICS,
    passes=15,
)
# Keep the trained model on disk.
ldamodel.save('model10.gensim')

In [11]:
# Ask the model for its topics, described by their 9 highest-weighted words.
topics = ldamodel.print_topics(num_words=9)
for topic in topics:
    # Each entry prints as (topic id, 'weight*"word" + weight*"word" + ...').
    print(topic)
    

(0, '0.098*"senate" + 0.085*"SCREEN_NAME" + 0.054*"focus" + 0.050*"terror" + 0.050*"mcconnell" + 0.050*"reign" + 0.039*"please" + 0.039*"turnout" + 0.033*"",2018"')
(1, '0.061*"close" + 0.058*"breaking" + 0.051*"trump" + 0.050*"nevada" + 0.049*"SCREEN_NAME" + 0.048*",2018" + 0.048*"back" + 0.048*"country" + 0.048*"entire"')
(2, '0.083*"SCREEN_NAME" + 0.023*"flipthesenate" + 0.017*"early" + 0.017*"nevada" + 0.017*"voting" + 0.017*"strong" + 0.017*"worth" + 0.015*"",2018" + 0.012*"place"')
(3, '0.204*"SCREEN_NAME" + 0.034*",2018" + 0.021*"great" + 0.018*"",2018" + 0.013*"governor" + 0.013*"missouri" + 0.011*"continue" + 0.011*"cutting" + 0.011*"growing"')
(4, '0.447*"SCREEN_NAME" + 0.030*"",2018" + 0.024*"today" + 0.010*"twitter" + 0.008*",2018" + 0.006*"elect" + 0.006*"proposal" + 0.006*"medicare" + 0.004*"future"')
(5, '0.092*"",2018" + 0.085*"SCREEN_NAME" + 0.031*"support" + 0.025*"want" + 0.023*"voter" + 0.023*"catch" + 0.023*"deceive" + 0.023*"believe" + 0.023*".@mccaskill4mo"')
(6,

In [12]:
    
# Score an unseen document against the trained model: clean it, convert it to
# bag-of-words with the existing dictionary, then print the bag-of-words vector
# followed by the topic distribution the model assigns to it.
new_doc = prepare_text_for_lda(
    'Practical Bayesian Optimization of Machine Learning Algorithms'
)
new_doc_bow = dictionary.doc2bow(new_doc)
print(new_doc_bow, ldamodel.get_document_topics(new_doc_bow), sep='\n')
    


[(101, 1)]
[(0, 0.05), (1, 0.05), (2, 0.05), (3, 0.05), (4, 0.05), (5, 0.05), (6, 0.05), (7, 0.05), (8, 0.05), (9, 0.55)]


In [13]:

# Retrain with 15 topics and print the reported topics with their 14
# highest-weighted words.
# NOTE(review): this 15-topic model is written to 'model5.gensim'; the file
# name does not match the topic count. The file is loaded by this name further
# down, so rename both places together if it is changed.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=15,
    passes=15,
)
ldamodel.save('model5.gensim')
topics = ldamodel.print_topics(num_words=14)
for topic in topics:
    print(topic)
    

(0, '0.056*"SCREEN_NAME" + 0.021*"twitter" + 0.014*",2018" + 0.014*"light" + 0.014*"count" + 0.014*"",2018" + 0.014*"campaign" + 0.014*"think" + 0.007*"quot;spare" + 0.007*"07:38:43,julie" + 0.007*"flynn,3908.0,nelson,1053973854786330000.0" + 0.007*"spinning" + 0.007*"09:39:08,rac" + 0.007*"november"')
(1, '0.050*"SCREEN_NAME" + 0.049*"",2018" + 0.017*"twitter" + 0.017*"louis" + 0.017*"scheve" + 0.009*"mccaskill" + 0.009*"knock" + 0.009*"09:05:57,tanha,3908.0,nelson,1053995808058700000.0" + 0.009*"november" + 0.009*"election" + 0.009*"door" + 0.009*"course" + 0.009*"07:49:47,oldestdaughterof5,807.0,nelson,1053976640886400000.0" + 0.009*"loose"')
(2, '0.059*"",2018" + 0.046*"SCREEN_NAME" + 0.016*"close" + 0.015*"legion" + 0.015*"05:05:03,ivotedbluenotorange,3908.0,nelson,1053935183856990000.0" + 0.015*"5,000" + 0.015*"00:52:26,max,3908.0,nelson,1053871608438320000.0" + 0.015*"01:48:09,kim,3908.0,nelson,1053885632370690000.0" + 0.015*"roman" + 0.015*"12:33:23,francie,1816.0,rosen,1054410

In [14]:
# Retrain with 20 topics, save the model, and print the reported topics with
# their 7 highest-weighted words.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=20,
    passes=15,
)
ldamodel.save('model20.gensim')
topics = ldamodel.print_topics(num_words=7)
for topic in topics:
    print(topic)

(0, '0.045*"SCREEN_NAME" + 0.035*"veteran" + 0.026*"flipthesenate" + 0.026*"strong" + 0.026*"nevada" + 0.026*"early" + 0.026*"voting"')
(1, '0.070*"breaking" + 0.070*"close" + 0.060*"back" + 0.060*"entire" + 0.060*"country" + 0.060*"trump" + 0.060*"nevada"')
(2, '0.055*"SCREEN_NAME" + 0.052*"",2018" + 0.018*"trump" + 0.018*"follow" + 0.018*"12:32:15,laura" + 0.018*"byrd,765.0,rosen,1054410114290110000.0" + 0.018*"03:17:27,onanotherplanet,3908.0,nelson,1053908104151210000.0"')
(3, '0.035*"SCREEN_NAME" + 0.019*"missourian" + 0.019*"",2018" + 0.019*"twitter" + 0.019*"local" + 0.019*"today" + 0.010*"travel"')
(4, '0.063*"SCREEN_NAME" + 0.025*"everything" + 0.013*",2018" + 0.013*"illegals" + 0.013*"springfield" + 0.013*"risenvote" + 0.013*"deport"')
(5, '0.044*"show" + 0.044*"almost" + 0.044*"politico" + 0.044*"senat" + 0.044*"breaking" + 0.043*"nevada" + 0.038*",2018"')
(6, '0.115*"SCREEN_NAME" + 0.031*"missouri" + 0.024*"defend" + 0.024*"owner" + 0.024*"abide" + 0.024*"fight" + 0.024*"mis

In [15]:
# Reload the artefacts saved earlier so the visualisation works from the files
# on disk.
dictionary = gensim.corpora.Dictionary.load('dictionary.gensim')
# Context manager closes the file handle once the corpus has been read.
# pickle is only appropriate here because 'corpus.pkl' is written by this same
# notebook; do not unpickle files from untrusted sources.
with open('corpus.pkl', 'rb') as corpus_file:
    corpus = pickle.load(corpus_file)
# NOTE(review): 'model5.gensim' is the file the 15-topic model was saved to.
lda = gensim.models.ldamodel.LdaModel.load('model5.gensim')


Visualize:
pyLDAvis is designed to help users interpret the topics in a topic model that has been fit to a corpus of text data. The package extracts information from a fitted LDA topic model to inform an interactive web-based visualization.

In [16]:

import pyLDAvis.gensim
# Build the visualisation data from the reloaded model, corpus and dictionary.
# NOTE(review): sort_topics=False presumably keeps the topic numbering in the
# plot aligned with the model's own topic order - confirm against the pyLDAvis
# documentation.
lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False)
# Render the interactive view inline in the notebook.
pyLDAvis.display(lda_display)
    

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
