# LDA in Gensim

In [1]:
# Goal: clean raw text and return it as a list of tokens.
# Load spaCy's `en_core_web_sm` pipeline once; tokenize() below calls it on every text.
import spacy
parser = spacy.load("en_core_web_sm")

In [2]:
def tokenize(text):
    """Split ``text`` with the module-level spaCy ``parser`` and normalise each token.

    Whitespace-only tokens are dropped. Tokens that spaCy flags as URL-like
    become the placeholder ``'URL'``, tokens whose text starts with ``'@'``
    become ``'SCREEN_NAME'``, and every other token is replaced by its
    lower-cased text.

    Returns:
        list of str: the normalised tokens, in their original order.
    """
    def _normalise(tok):
        # URL check takes precedence over the '@' check, as in the branch order below.
        if tok.like_url:
            return 'URL'
        if tok.orth_.startswith('@'):
            return 'SCREEN_NAME'
        return tok.lower_

    return [_normalise(tok) for tok in parser(text) if not tok.orth_.isspace()]

In [3]:
# Smoke-test tokenize() on a tweet-like string containing a mention, hashtags and a URL.
sent = '@bob said the #chicken was at the #junkyard. see http://ww.jonathanmugan.com.'
out_tokens = tokenize(sent)
print(out_tokens)

['SCREEN_NAME', 'said', 'the', '#', 'chicken', 'was', 'at', 'the', '#', 'junkyard', '.', 'see', 'URL', '.']


In [4]:
# Lemmatize so that "dogs" becomes "dog" and "ran" becomes "run".
# Lemmatization means getting the "dictionary entry" (base form) of a word.

from nltk.corpus import wordnet as wn
def get_lemma(word):
    """Return the base form of ``word`` as given by ``wn.morphy``.

    When ``wn.morphy`` returns ``None`` the word is handed back unchanged.
    """
    base_form = wn.morphy(word)
    return word if base_form is None else base_form
    
# Alternatively, use NLTK's WordNetLemmatizer (its results can differ from get_lemma, e.g. for 'ran' below)
from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    """Lemmatize ``word`` with a freshly constructed ``WordNetLemmatizer``.

    ``lemmatize`` is called with the word only, so the lemmatizer's default
    settings apply.
    """
    lemmatizer = WordNetLemmatizer()
    return lemmatizer.lemmatize(word)

In [6]:
# Compare both lemmatizers side by side; each row prints: word, get_lemma result, get_lemma2 result.
for w in ['dogs', 'ran', 'discouraged']:
    print(w,get_lemma(w),get_lemma2(w))

dogs dog dog
ran run ran
discouraged discourage discouraged


In [7]:
import nltk
# English stop words held as a set; prepare_text_for_lda() tests every token against it.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be downloaded beforehand — confirm.
en_stop = set(nltk.corpus.stopwords.words('english'))


In [9]:
def prepare_text_for_lda(text, min_length=5):
    """Turn raw text into a list of lemmatized tokens suitable for LDA.

    The text is tokenized with ``tokenize``. Tokens with fewer than
    ``min_length`` characters, or that appear in ``en_stop``, are discarded;
    each remaining token is mapped through ``get_lemma``.

    Args:
        text: Raw document string.
        min_length: Minimum number of characters a token needs in order to be
            kept. The default of 5 reproduces the previously hard-coded
            ``len(token) > 4`` filter.

    Returns:
        list of str: lemmatized tokens in document order.
    """
    # Single pass: the length check runs before the stop-word lookup, and only
    # surviving tokens are lemmatized (same order of filters as before).
    return [
        get_lemma(token)
        for token in tokenize(text)
        if len(token) >= min_length and token not in en_stop
    ]

In [11]:
sent = 'I enjoy going to restaurants to eat hamburgers.'
print(prepare_text_for_lda(sent))

['enjoy', 'going', 'restaurant', 'hamburger']


In [15]:
# get data

import random

# Probability of echoing a document's tokens while loading (visual spot check only).
PRINT_SAMPLE_PROB = 0.05
# Dedicated, seeded generator: the printed sample is identical on every run and
# the global `random` state is left untouched.
sample_rng = random.Random(42)

# One list of prepared tokens per line of the input file, in file order.
text_data = []
with open ('jonathan_mugan_tweets.txt','r', encoding='utf-8') as f:
    for line in f:
        tokens = prepare_text_for_lda(line)
        if sample_rng.random() < PRINT_SAMPLE_PROB:
            print(tokens)
        text_data.append(tokens)

['buy', 'extend', 'warranty', 'laptop', 'walmart', 'computer', 'break', 'fix', 'hassle']
['recently', 'watch', 'robot', 'frank', 'robot', 'older']
['become', 'increasingly', 'ambivalent', 'toward', 'malcolm', 'gladwell', 'book', 'abandon', 'david', 'goliath', 'longer', 'trust', 'story']
['realization', 'begin', 'thought', 'would', 'month', 'cannotgettherefromhere']
['sometimes', 'could', 'wikipedia']
['recently', 'watch', 'transcendence', 'appear', 'johnny', 'assume', 'scientist', 'soulless', 'automaton', 'play']
['minecraft', 'really', 'store', 'model', 'ecosystem']
['single', 'container', 'pollute', 'million', 'truck', 'right']
['sneeze', 'still', 'hurt', 'middleage']
['interest', 'impression', 'world', 'movie', 'still', 'picture']
['printer', 'business', 'asking', 'software', 'update']
['spend', 'try', 'little', 'notification', 'icon', 'skinner']
['think', 'watch', 'could', 'remote']
['excite', 'could', 'would', 'improve', 'improve']
['move', 'cheap', 'laptop', 'cloud', 'suddenly', 

In [16]:
# Create a dictionary from the data: built from the token lists in text_data and
# used below (doc2bow) to turn tokens into integer ids.
from gensim import corpora
dictionary = corpora.Dictionary(text_data)


  "class": algorithms.Blowfish,


In [17]:
# Bag-of-words corpus: one doc2bow result per tokenized document, in text_data order.
corpus = list(map(dictionary.doc2bow, text_data))

In [18]:
# save the corpus and dictionary
import pickle
# Context manager guarantees the file is flushed and closed right after the dump
# (the bare open(...) passed to pickle.dump was never closed).
with open("corpus.pkl", "wb") as corpus_file:
    pickle.dump(corpus, corpus_file)
dictionary.save('dictionary.gensim')

In [20]:
import gensim
NUM_TOPICS = 5
# Train LDA on the bag-of-words corpus. random_state fixes the seed so the learned
# topics (and their numbering) are the same on every run; without it each
# training run yields different topics.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=15,
    random_state=42,
)
ldamodel.save('model5.gensim')

In [21]:
# Print each topic as a (topic_id, '"weight*word" + ...') tuple, limited to four words per topic.
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.010*"could" + 0.010*"going" + 0.010*"think" + 0.008*"things"')
(1, '0.012*"reading" + 0.010*"coffee" + 0.008*"drink" + 0.008*"going"')
(2, '0.015*"people" + 0.012*"remember" + 0.011*"memory" + 0.009*"seem"')
(3, '0.010*"computer" + 0.009*"funny" + 0.009*"people" + 0.007*"anymore"')
(4, '0.019*"would" + 0.010*"robot" + 0.009*"funny" + 0.007*"older"')


In [22]:
# try a new document: preprocess it, convert it to bag-of-words ids, and infer its topic mixture
# NOTE(review): this comment used to say "topic 3", but the recorded output below puts most
# of the weight on topic 0 — topic ids presumably change between training runs; confirm.
new_doc = 'I watch movies.'
new_doc = prepare_text_for_lda(new_doc)  # new_doc is rebound from a str to a list of tokens
new_doc_bow = dictionary.doc2bow(new_doc)
print(new_doc_bow)
print(ldamodel.get_document_topics(new_doc_bow))

[(8, 1), (190, 1)]
[(0, 0.72899616), (1, 0.06797663), (2, 0.06674644), (3, 0.06722262), (4, 0.06905815)]


In [23]:
# Retrain with three topics (this rebinds `ldamodel`). random_state fixes the seed
# so the topics printed below are reproducible across runs.
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=3,
                                           id2word= dictionary, passes = 15,
                                           random_state=42)
ldamodel.save('model3.gensim')
# Show the four listed words of every topic.
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.011*"people" + 0.008*"coffee" + 0.008*"things" + 0.007*"always"')
(1, '0.013*"could" + 0.009*"want" + 0.006*"people" + 0.006*"would"')
(2, '0.015*"funny" + 0.011*"would" + 0.010*"dream" + 0.009*"seem"')


In [24]:
 # try ten topics
# Retrain with ten topics (this rebinds `ldamodel`). random_state fixes the seed
# so the topics printed below are reproducible across runs.
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=10,
                                           id2word= dictionary, passes = 15,
                                           random_state=42)
ldamodel.save('model10.gensim')
# Show the four listed words of every topic.
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.017*"people" + 0.015*"email" + 0.012*"funny" + 0.011*"picture"')
(1, '0.024*"people" + 0.017*"movie" + 0.015*"computer" + 0.013*"always"')
(2, '0.014*"people" + 0.014*"machine" + 0.012*"start" + 0.012*"anyone"')
(3, '0.016*"think" + 0.016*"funny" + 0.015*"would" + 0.013*"change"')
(4, '0.016*"reading" + 0.016*"would" + 0.013*"right" + 0.013*"going"')
(5, '0.016*"dream" + 0.013*"watch" + 0.012*"night" + 0.011*"sense"')
(6, '0.014*"place" + 0.013*"soccer" + 0.012*"coffee" + 0.009*"lunch"')
(7, '0.011*"problem" + 0.010*"stupid" + 0.009*"hungry" + 0.009*"throw"')
(8, '0.013*"check" + 0.010*"people" + 0.009*"somehow" + 0.009*"look"')
(9, '0.018*"remember" + 0.017*"would" + 0.016*"memory" + 0.014*"thing"')


In [25]:
 # Exercise: Run LDA on Newsgroup Data
# The Newsgroup Data
# http://scikit-learn.org/stable/datasets/twenty_newsgroups.html#newsgroups
from sklearn.datasets import fetch_20newsgroups
# Load the training split only.
texts = fetch_20newsgroups(subset='train')
# Attributes available on the returned object.
print(dir(texts))
# 11,314 posts
print(len(texts.target))
# One integer label per post — presumably an index into target_names; confirm.
print(texts.target)
# Names of the newsgroups.
print(texts.target_names)
# Raw text of the first post, header lines included.
print(texts.data[0])

['DESCR', 'data', 'filenames', 'target', 'target_names']
11314
[7 4 4 ... 3 1 8]
['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']
From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.umd.edu
Organization: University of Maryland, College Park
Lines: 15

 I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, 