# Gensim LDA/LSI

In [30]:
# prepare the dataset we’ll be working with.

import os
import csv

import gensim
from gensim.parsing.preprocessing import preprocess_string

# Collect the first column of every data row of every CSV file found in
# `dirname`; each value becomes one document in `data`.
data = []
dirname = '../../../out'
if os.path.isdir(dirname):
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # document indices identical from run to run.
    for entry in sorted(os.listdir(dirname)):
        path = os.path.join(dirname, entry)
        if not os.path.isfile(path):
            continue  # open() would raise on a sub-directory
        # newline='' is what the csv module asks for, so that quoted fields
        # containing line breaks are parsed correctly.
        with open(path, 'r', newline='') as csv_file:
            reader = csv.reader(csv_file)
            next(reader, None)  # skip the header row (if any)
            # Blank lines are returned as empty lists; row[0] would raise
            # IndexError on them, so they are skipped.
            data.extend(row[0] for row in reader if row)
else:
    # Make the "nothing loaded" case visible instead of continuing silently.
    print('warning:', dirname, 'is not a directory; no documents loaded')

NUM_DOCUMENTS = len(data)
print('data:', NUM_DOCUMENTS, 'documens')
print('(top 5)')
print(' \n'.join(data[:5]))

data: 13956 documens
(top 5)
Newborn Baby Girl Found Dead Inside Home 
Police Commission President John W. Mack's Comments on Tragic Use of Force Incident 
Neighbor and Mother Detain Molester 
Harbor Area Officers Seriously Injured by Drunk Driver 
Prolific Car Thieves Nabbed in Koreatown


In [33]:
# Gensim doesn’t have an implementation for NMF so we’re only going to play with  
# LDA and LSI (Latent Semantic Indexing AKA Latent Semantic Analysis) models.

import re
from gensim import models, corpora
from nltk import word_tokenize
from nltk.corpus import stopwords

# Number of latent topics each model is asked to extract.
NUM_TOPICS = 30
# Stored as a frozenset: clean_text() tests every token for membership, and
# with a list each of those tests would be a linear scan.
STOPWORDS = frozenset(stopwords.words('english'))

from nltk.stem.porter import PorterStemmer
# Shared Porter stemmer instance used by stem().
stemmer = PorterStemmer()

def stem(t):
    """Return the stem of token ``t`` as computed by the module-level ``stemmer``."""
    stemmed = stemmer.stem(t)
    return stemmed

def tokenize(text):
    """Split ``text`` into tokens with NLTK's ``word_tokenize``."""
    tokens = word_tokenize(text)
    return tokens

# Raw string: '\-' inside a plain string literal is an invalid escape sequence
# (DeprecationWarning since Python 3.6, SyntaxWarning from 3.12). The pattern
# is compiled once instead of being looked up for every token.
_WORD_PREFIX_RE = re.compile(r'[a-zA-Z\-][a-zA-Z\-]{2,}')


def clean_text(text):
    """Lower-case, tokenize, filter and stem ``text``.

    A token is kept when it is not in ``STOPWORDS``, is longer than three
    characters, and begins with at least three letters/hyphens. The pattern
    is only anchored at the start of the token (``match``), so characters
    after that prefix are not checked.

    Returns the list of stems of the kept tokens, in their original order.
    """
    kept_stems = []
    for token in tokenize(text.lower()):
        if token in STOPWORDS or len(token) <= 3:
            continue
        if _WORD_PREFIX_RE.match(token):
            kept_stems.append(stem(token))
    return kept_stems
 
# For gensim we need to tokenize the data and filter out stopwords
tokenized_data = [clean_text(text) for text in data]

# Build a Dictionary - association word to numeric id
dictionary = corpora.Dictionary(tokenized_data)

# Transform the collection of texts to a numerical form
corpus = [dictionary.doc2bow(tokens) for tokens in tokenized_data]

# Have a look at one document (index 10, i.e. the 11th): first its tokens,
# then its bag-of-words form [(word_id, count), ...]
# e.g: [(12, 3), (14, 1), (21, 1), (25, 5), (30, 2), (31, 5), (33, 1), (42, 1), (43, 2),  ...
doc_id = 10
if doc_id < len(corpus):  # guard against corpora with fewer than 11 documents
    print(tokenized_data[doc_id])
    print(corpus[doc_id])

# Build the LDA model (Latent Dirichlet Allocation).
# LDA training is randomized; a fixed random_state makes the inferred topics
# reproducible across kernel restarts.
lda_model = models.LdaModel(corpus=corpus, num_topics=NUM_TOPICS,
                            id2word=dictionary, random_state=42)

# Build the LSI model (Latent Semantic Analysis or Latent Semantic Indexing)
lsi_model = models.LsiModel(corpus=corpus, num_topics=NUM_TOPICS, id2word=dictionary)

['shot', 'kill', 'detect', 'investig']
[(35, 1), (41, 1), (45, 1), (47, 1)]


In [38]:
# Display the topics the two gensim models have inferred.

def _print_gensim_topics(title, model, num_words=5):
    """Print ``title`` followed by the ``num_words`` top-weighted words of every topic of ``model``."""
    print(title)
    print("=" * 20)
    # Ask the model how many topics it has instead of reading the global
    # NUM_TOPICS, which a later cell rebinds to a different value.
    for idx in range(model.num_topics):
        # Print the most representative words of this topic
        print("Topic #%s:" % idx, model.print_topic(idx, num_words))


_print_gensim_topics("LDA Model:", lda_model)

print()

_print_gensim_topics("LSI Model:", lsi_model)


LDA Model:
Topic #0: 0.088*"sobrieti" + 0.085*"checkpoint" + 0.050*"bureau" + 0.046*"oper" + 0.041*"south"
Topic #1: 0.056*"harbor" + 0.046*"press" + 0.044*"confer" + 0.043*"local" + 0.031*"wife"
Topic #2: 0.126*"shot" + 0.085*"found" + 0.084*"kill" + 0.067*"dead" + 0.033*"male"
Topic #3: 0.051*"mother" + 0.045*"arrest" + 0.037*"suspect" + 0.032*"offic" + 0.028*"report"
Topic #4: 0.157*"woman" + 0.137*"miss" + 0.075*"critic" + 0.052*"stab" + 0.025*"death"
Topic #5: 0.093*"patrol" + 0.061*"satur" + 0.045*"announc" + 0.041*"elderli" + 0.035*"hollenbeck"
Topic #6: 0.097*"caught" + 0.097*"robberi" + 0.053*"suspect" + 0.050*"video" + 0.046*"hit-and-run"
Topic #7: 0.056*"program" + 0.039*"serial" + 0.037*"camera" + 0.033*"prevent" + 0.032*"lapd"
Topic #8: 0.184*"investig" + 0.141*"detect" + 0.098*"homicid" + 0.040*"murder" + 0.038*"shoot"
Topic #9: 0.086*"die" + 0.056*"victim" + 0.047*"kill" + 0.044*"shoot" + 0.043*"relat"
Topic #10: 0.161*"murder" + 0.105*"suspect" + 0.101*"reward" + 0.100*

In [39]:
# Put both models to work on documents they were not trained on and show,
# for each model, the three topics with the largest weight.

texts = [
    "A men found killed in the park.", 
    "A woman was raped in the park."
]


def _weight(topic_and_weight):
    """Sort key: the weight of a (topic_id, weight) pair."""
    return topic_and_weight[1]


for text in texts:
    print()
    print(text)

    bow = dictionary.doc2bow(clean_text(text))

    # lda_model[bow] and lsi_model[bow] are lists of (topic_id, weight) pairs,
    # e.g. [(0, 0.02), (1, 0.02), ..., (6, 0.82), ...]; LSI weights may be negative.
    # Rank them by weight, largest first, and keep the first three.
    lda_ranked = sorted(lda_model[bow], key=_weight, reverse=True)
    print('lda:', lda_ranked[:3])

    lsi_ranked = sorted(lsi_model[bow], key=_weight, reverse=True)
    print('lsi:', lsi_ranked[:3])


A men found killed in the park.
lda: [(2, 0.7583333)]
lsi: [(15, 0.38046298887855734), (5, 0.31777994571191337), (17, 0.23738056506502744)]

A woman was raped in the park.
lda: [(13, 0.26677173), (20, 0.25758633), (4, 0.2506419)]
lsi: [(6, 0.28744708051610984), (25, 0.2152412723071597), (21, 0.17212109131107314)]


# Using Scikit-Learn for Topic Modeling

In [51]:
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer

import nltk.stem
# Kept under its own name: an earlier cell binds `stemmer` to the PorterStemmer
# that stem()/clean_text() rely on, and rebinding it here would silently change
# their results whenever those cells are run again.
snowball_stemmer = nltk.stem.SnowballStemmer('english')


class StemmedCountVectorizer(CountVectorizer):
    """CountVectorizer whose analyzer stems every token it produces."""

    def build_analyzer(self):
        """Return a callable mapping a document to its list of stemmed tokens.

        The analyzer built by ``CountVectorizer`` is applied first; each token
        it yields is then passed through the Snowball stemmer.
        """
        analyzer = super().build_analyzer()

        def stemmed_analyzer(doc):
            return [snowball_stemmer.stem(token) for token in analyzer(doc)]

        return stemmed_analyzer

# Bag-of-words features: accent-stripped, lower-cased unigrams and bigrams
# built from purely alphabetic tokens of three or more letters, with English
# stop words removed and a minimum document frequency of 20.
#
# Alternatives tried earlier (not active): StemmedCountVectorizer or a plain
# CountVectorizer, both with min_df=5, max_df=0.9 and a token pattern that
# also allowed hyphens.
vectorizer_options = {
    'strip_accents': 'unicode',
    'stop_words': 'english',
    'lowercase': True,
    'token_pattern': r'\b[a-zA-Z]{3,}\b',
    'ngram_range': (1, 2),
    'min_df': 20,
    'max_df': 1.0,
}
vectorizer = CountVectorizer(**vectorizer_options)

# Learn the vocabulary from `data` and build the document-term matrix.
data_vectorized = vectorizer.fit_transform(data)

# The estimators below rely on random initialisation and/or randomized
# solvers; one fixed seed keeps the printed results reproducible.
SKLEARN_RANDOM_STATE = 42

# NOTE: this cell rebinds lda_model and lsi_model, which held the gensim
# models built earlier in the notebook.

# Build a Latent Dirichlet Allocation Model
lda_model = LatentDirichletAllocation(n_components=NUM_TOPICS, max_iter=10,
                                      learning_method='online',
                                      random_state=SKLEARN_RANDOM_STATE)
lda_Z = lda_model.fit_transform(data_vectorized)
print(lda_Z.shape)  # (NUM_DOCUMENTS, NUM_TOPICS)

# Build a Non-Negative Matrix Factorization Model
nmf_model = NMF(n_components=NUM_TOPICS, random_state=SKLEARN_RANDOM_STATE)
nmf_Z = nmf_model.fit_transform(data_vectorized)
print(nmf_Z.shape)  # (NUM_DOCUMENTS, NUM_TOPICS)

# Build a Latent Semantic Indexing Model
lsi_model = TruncatedSVD(n_components=NUM_TOPICS, random_state=SKLEARN_RANDOM_STATE)
lsi_Z = lsi_model.fit_transform(data_vectorized)
print(lsi_Z.shape)  # (NUM_DOCUMENTS, NUM_TOPICS)


# Let's see how the first document in the corpus looks like in different topic spaces
print(lda_Z[0])
print(nmf_Z[0])
print(lsi_Z[0])

(13956, 30)
(13956, 30)
(13956, 30)
[0.14761905 0.0047619  0.0047619  0.14761905 0.0047619  0.0047619
 0.0047619  0.0047619  0.14761905 0.0047619  0.0047619  0.0047619
 0.0047619  0.0047619  0.0047619  0.0047619  0.0047619  0.29047619
 0.0047619  0.0047619  0.0047619  0.0047619  0.14761905 0.0047619
 0.0047619  0.0047619  0.0047619  0.0047619  0.0047619  0.0047619 ]
[0.         0.         0.00194942 0.         0.         0.00777623
 0.         0.         0.         0.00119838 0.         0.
 0.         0.         0.         0.00136653 0.00163599 0.
 0.16513451 0.00192834 0.         0.02439027 0.         0.00057302
 0.00755575 0.00068999 0.00753287 0.01316481 0.         0.01013887]
[ 0.09474609 -0.08559438  0.07404194  0.04239119 -0.09411083 -0.01827411
 -0.08830723  0.04487348 -0.06077954 -0.03794249 -0.22667989  0.05592047
  0.17858123  0.04191457 -0.00721927  0.05888994  0.34755246 -0.10536038
  0.70835467  0.05427133 -0.07139276 -0.22070783  0.08243613 -0.04529521
  0.00834758  0.181

In [52]:
def print_topics(model, vectorizer, top_n=6):
    """Print the ``top_n`` highest-weighted features of every topic in ``model``.

    Parameters
    ----------
    model
        Fitted decomposition exposing ``components_``; each row is iterated
        as one topic and indexed by feature position.
    vectorizer
        Fitted vectorizer providing the feature names, either through
        ``get_feature_names_out()`` or the older ``get_feature_names()``.
    top_n : int
        Number of (feature, weight) pairs printed per topic, largest first.
    """
    # get_feature_names() was removed in scikit-learn 1.2 in favour of
    # get_feature_names_out(); support whichever one the vectorizer has.
    get_names = getattr(vectorizer, 'get_feature_names_out', None)
    if get_names is None:
        get_names = vectorizer.get_feature_names
    # Fetch the names once: the call rebuilds the whole vocabulary list, and
    # it used to be repeated for every single printed feature.
    feature_names = get_names()

    for idx, topic in enumerate(model.components_):
        print()
        print("Topic %d:" % (idx))
        # argsort is ascending; the reversed tail holds the top_n largest weights.
        top_indices = topic.argsort()[:-top_n - 1:-1]
        # Plain str/float keep the printed form independent of NumPy's scalar repr.
        print([(str(feature_names[i]), float(topic[i])) for i in top_indices])

# Show the top features per topic for each of the three fitted models.
fitted_models = [
    ("LDA Model:", lda_model),
    ("NMF Model:", nmf_model),
    ("LSI Model:", lsi_model),
]
for position, (title, fitted) in enumerate(fitted_models):
    if position:
        print()  # blank separator before every section but the first
    print(title)
    print("=" * 20)
    print_topics(fitted, vectorizer)

LDA Model:

Topic 0:
[('kill', 650.6813708333013), ('year-old', 230.54045775368309), ('shot', 183.2226141403016), ('-year-old', 160.76607855775205), ('boy', 102.82056147386932), ('woman', 87.18773681433618)]

Topic 1:
[('help', 330.42673458310765), ('public', 319.48069699134), ('identifi', 157.71931291703342), ('need', 147.1641633012137), ('program', 144.01514550020886), ('detect', 139.0296630812229)]

Topic 2:
[('investig', 650.4278227181013), ('checkpoint', 589.2847539909579), ('detect', 503.0099833962266), ('divis', 351.70901601002987), ('sobrieti', 307.49428201639654), ('death', 305.36521671265933)]

Topic 3:
[('murder', 941.2137059968395), ('reward', 344.77872932755815), ('offer', 274.15063716553055), ('girl', 142.1507519468208), ('rampart', 133.68697560100912), ('assist', 125.70833606758637)]

Topic 4:
[('medic', 49.92898431765296), ('bandit', 48.678088557323015), ('forum', 45.11835708013457), ('violent', 42.240379893129926), ('drug', 38.3412710874964), ('marijuana', 38.133737533

[('area', 0.659194333398899), ('sobrieti', 0.20571005739593357), ('hit', 0.14955727908385166), ('run', 0.1460664713148668), ('public', 0.1426484550632119), ('valley', 0.1423884200556476)]

Topic 18:
[('dead', 0.6145611212006065), ('robberi', 0.300751974773425), ('leav', 0.2892782710407523), ('woman', 0.13911252942506194), ('offic', 0.13529234863645045), ('kill', 0.13009125184911788)]

Topic 19:
[('shot', 0.42202028818141557), ('death', 0.40751663550865463), ('woman', 0.2607019409107353), ('public', 0.1677614575338257), ('help', 0.16548926100442773), ('arrest', 0.13329123955008088)]

Topic 20:
[('chief', 0.46156066378870114), ('sobrieti', 0.3667126800450457), ('checkpoint', 0.27793706743381963), ('beck', 0.1697048619502827), ('media', 0.15394592424267986), ('divis', 0.145135794017124)]

Topic 21:
[('robberi', 0.6754036674602929), ('victim', 0.22316307582196585), ('caught', 0.19970863836749603), ('arrest', 0.13287887392048353), ('home', 0.10101813105977979), ('die', 0.09965631945173352)]

In [53]:
# Project documents the models have not seen into the NMF topic space.
texts = [
    "A men found killed in the park.", 
    "A woman was raped in the park."
]

for text in texts:
    print()
    print(text)
    # The vectorizer is given a one-element list; the single row of the
    # transformed result holds this document's topic weights.
    bag_of_words = vectorizer.transform([text])
    topic_weights = nmf_model.transform(bag_of_words)
    x = topic_weights[0]
    print(x)


A men found killed in the park.
[0.         0.         0.00138141 0.         0.         0.
 0.         0.         0.         0.00023715 0.         0.00179709
 0.16245042 0.         0.         0.         0.         0.00073541
 0.00084145 0.00485255 0.         0.         0.00808862 0.
 0.00027024 0.         0.00436239 0.         0.00260959 0.        ]

A woman was raped in the park.
[0.00052316 0.         0.00076593 0.         0.00021708 0.
 0.         0.         0.         0.00020934 0.         0.00069343
 0.00020102 0.         0.         0.         0.         0.00074508
 0.00104073 0.00146113 0.         0.         0.00795391 0.
 0.16319799 0.         0.00405763 0.         0.00290318 0.        ]


# Plotting words and documents in 2D with SVD

In [54]:
import pandas as pd
from bokeh.io import push_notebook, show, output_notebook
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, LabelSet
# Have Bokeh render the plots produced by show() inline in the notebook.
output_notebook()

In [55]:
# plot documents in 2D
# TruncatedSVD's default solver is randomized; fixing the seed keeps the
# layout of the plot the same from run to run.
svd = TruncatedSVD(n_components=2, random_state=42)
documents_2d = svd.fit_transform(data_vectorized)

# One row per document: its 2-D coordinates and its index in `data`.
df = pd.DataFrame({
    'x': documents_2d[:, 0],
    'y': documents_2d[:, 1],
    'document': list(range(len(data))),
})

source = ColumnDataSource(ColumnDataSource.from_df(df))
# Text labels showing each point's document index. They are prepared but not
# attached to the plot (see the commented-out add_layout call below) --
# presumably because labelling every document would be unreadable.
labels = LabelSet(x="x", y="y", text="document", y_offset=8,
                  text_font_size="8pt", text_color="#555555",
                  source=source, text_align='center')

plot = figure(plot_width=600, plot_height=600)
plot.circle("x", "y", size=12, source=source, line_color="black", fill_alpha=0.8)
# plot.add_layout(labels)
show(plot, notebook_handle=True)

In [56]:
# To display words (instead of documents) in 2D we just need to transpose the
# vectorized data, so that each row describes one term.
# TruncatedSVD's default solver is randomized; fixing the seed keeps the
# layout of the plot the same from run to run.
svd = TruncatedSVD(n_components=2, random_state=42)
words_2d = svd.fit_transform(data_vectorized.T)

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever one the vectorizer provides.
_get_feature_names = getattr(vectorizer, 'get_feature_names_out', None)
if _get_feature_names is None:
    _get_feature_names = vectorizer.get_feature_names

# One row per vocabulary term: its 2-D coordinates and the term itself.
df = pd.DataFrame({
    'x': words_2d[:, 0],
    'y': words_2d[:, 1],
    'word': list(_get_feature_names()),
})

source = ColumnDataSource(ColumnDataSource.from_df(df))
# Text labels showing the term next to each point.
labels = LabelSet(x="x", y="y", text="word", y_offset=8,
                  text_font_size="8pt", text_color="#555555",
                  source=source, text_align='center')

plot = figure(plot_width=600, plot_height=600)
plot.circle("x", "y", size=12, source=source, line_color="black", fill_alpha=0.8)
plot.add_layout(labels)
show(plot, notebook_handle=True)

In [96]:
# lda

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
 
NUM_TOPICS = 40

# Named USE_STEMMING rather than `stem`: `stem` is the token-stemming function
# called by clean_text(), and assigning a bool to that name breaks the
# function as soon as the earlier cells are run again.
USE_STEMMING = False

if USE_STEMMING:
    vectorizer = StemmedCountVectorizer(
        analyzer="word",
        min_df=5, max_df=0.9,
        stop_words='english', lowercase=True,
        # Raw string: '\-' in a plain literal is an invalid escape sequence.
        token_pattern=r'[a-zA-Z\-][a-zA-Z\-]{2,}')
else:
    # TF-IDF weighted unigrams and bigrams. Alternatives tried earlier (not
    # active): a CountVectorizer with these same options, and a
    # TfidfVectorizer with min_df=5, max_df=0.9 and a hyphen-friendly pattern.
    vectorizer = TfidfVectorizer(strip_accents='unicode',
                                 stop_words='english',
                                 lowercase=True,
                                 token_pattern=r'\b[a-zA-Z]{3,}\b',
                                 ngram_range=(1, 2),
                                 min_df=20,
                                 max_df=1.0)

data_vectorized = vectorizer.fit_transform(data)

# Build a Latent Dirichlet Allocation Model
# NOTE(review): in the default branch the model is fit on TF-IDF weights
# rather than raw term counts -- confirm this is intended.
# A fixed random_state makes the fitted topics reproducible.
lda_model = LatentDirichletAllocation(n_components=NUM_TOPICS, max_iter=10,
                                      learning_method='online', random_state=42)
lda_Z = lda_model.fit_transform(data_vectorized)

# Infer topic weights for documents that were not part of the training data.
texts = [
    "A men found killed in the park.", 
    "A woman was raped in the park."
]

for text in texts:
    print()
    print(text)
    # One-element list in, one-row matrix out; row 0 is this document.
    features = vectorizer.transform([text])
    topic_weights = lda_model.transform(features)
    x = topic_weights[0]
    # The sum is printed next to the weights as a sanity check (expected ~1).
    print(x, x.sum())


A men found killed in the park.
[0.00927723 0.00927723 0.00927723 0.00927723 0.00927723 0.00927723
 0.00927723 0.00927723 0.00927723 0.00927723 0.00927723 0.00927723
 0.00927723 0.00927723 0.00927723 0.00927723 0.00927723 0.00927723
 0.00927723 0.00927723 0.00927723 0.00927723 0.00927723 0.00927723
 0.00927723 0.00927723 0.00927723 0.00927723 0.00927723 0.00927723
 0.00927723 0.00927723 0.00927723 0.00927723 0.00927723 0.00927723
 0.00927723 0.63818786 0.00927723 0.00927723] 0.9999999999999999

A woman was raped in the park.
[0.0104475  0.0104475  0.0104475  0.0104475  0.0104475  0.0104475
 0.0104475  0.0104475  0.0104475  0.0104475  0.0104475  0.0104475
 0.0104475  0.0104475  0.0104475  0.0104475  0.0104475  0.0104475
 0.0104475  0.0104475  0.0104475  0.0104475  0.25040731 0.0104475
 0.0104475  0.0104475  0.0104475  0.0104475  0.0104475  0.0104475
 0.0104475  0.0104475  0.0104475  0.0104475  0.0104475  0.0104475
 0.0104475  0.35258756 0.0104475  0.0104475 ] 1.0


In [97]:
import pyLDAvis.sklearn
# Enable pyLDAvis's inline rendering inside the notebook.
pyLDAvis.enable_notebook()
# Prepare the interactive topic visualisation from the fitted LDA model, the
# document-term matrix it was fit on and the vectorizer that produced it;
# mds='tsne' selects t-SNE for laying the topics out in 2-D.
panel = pyLDAvis.sklearn.prepare(lda_model, data_vectorized, vectorizer, mds='tsne')
# Bare expression as the cell's last line so the notebook displays the panel.
panel

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
