# Gensim LDA/LSI

## Load the dataset
Prepare the dataset we’ll be working with:
<br>a list of news-title strings, one per document.


In [1]:
import os
import csv

def load_titles(dirname):
    """Return the first column of every data row of the CSV files in ``dirname``.

    Each regular file directly inside ``dirname`` is parsed as CSV. The first
    row of each file is treated as a header and skipped, and rows without any
    column (e.g. blank lines) are ignored.

    Args:
        dirname: path of the directory that holds the CSV files.

    Returns:
        A list of strings in a stable order (files are visited in sorted name
        order). The list is empty when ``dirname`` is not a directory.
    """
    titles = []
    if not os.path.isdir(dirname):
        return titles

    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # document ids stable from one run (and one machine) to the next.
    for entry in sorted(os.listdir(dirname)):
        path = os.path.join(dirname, entry)
        if not os.path.isfile(path):
            # Sub-directories cannot be opened as CSV files.
            continue
        # newline='' lets the csv module handle line endings itself, which is
        # required for quoted fields that contain line breaks.
        # NOTE(review): the platform default encoding is still used — pass an
        # explicit encoding once the encoding of the files is confirmed.
        with open(path, 'r', newline='') as csv_file:
            reader = csv.reader(csv_file)
            next(reader, None)  # skip the header row
            titles.extend(row[0] for row in reader if row)
    return titles


dirname = '../../../out'
if not os.path.isdir(dirname):
    # Make a wrong path visible instead of silently continuing with no data.
    print('warning: data directory not found:', dirname)
data = load_titles(dirname)

# print first 5 titles
NUM_DOCUMENTS = len(data)
print('data:', NUM_DOCUMENTS, 'documents')
print('(top 5)')
print(' \n'.join(data[:5]))

data: 13956 documens
(top 5)
Two Suspects Arrested in Home Invasion Robbery    NR17032ma 
Suspect Arrested for Chinatown Murders    NR17033ml 
Hit and Run Collision Leaves Pedestrian Dead    NR17033ne 
Fatal Stabbing of a 27-year-old Man   NR17035im 
Press Conference   NA17014ma


## Preprocessing

In [None]:
# Preprocess the data for LDA/LSI (tokenize and clean the text: stopwords, short tokens, stemming).

import re
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer

# Both normalizers are created once and shared by every call below instead of
# being rebuilt for each token.
stemmer = SnowballStemmer('english')
lemmatizer = WordNetLemmatizer()


def lemmatize_stemming(text):
    """Lemmatize ``text`` as a verb (``pos='v'``), then stem the lemma.

    Args:
        text: a single token.

    Returns:
        The output of the English Snowball stemmer applied to the lemma.
    """
    return stemmer.stem(lemmatizer.lemmatize(text, pos='v'))

def preprocess(text):
    """Tokenize ``text`` and return its normalized, filtered tokens.

    A token is kept when it is not in ``STOPWORDS``, is longer than 3
    characters, and starts with at least three letters/hyphens. Every kept
    token is passed through ``lemmatize_stemming``.

    Args:
        text: the raw document (here: a news title).

    Returns:
        A list of normalized tokens, in their original order.
    """
    # simple_preprocess lowercases and tokenizes; deacc=True additionally
    # strips accent marks from the tokens.
    tokens = simple_preprocess(text, deacc=True, min_len=2, max_len=15)
    return [
        lemmatize_stemming(token)
        for token in tokens
        if token not in STOPWORDS
        and len(token) > 3
        # Raw string: '\-' inside a normal string literal is an invalid escape
        # sequence and triggers a warning on current Python versions.
        and re.match(r'[a-zA-Z\-][a-zA-Z\-]{2,}', token)
    ]

# Run every raw title through the preprocessing pipeline.
processed_docs = [preprocess(title) for title in data]

# Show the first 5 processed titles.
processed_docs[:5]

## Running LDA/LSI using Bag of Words

In [None]:
# Create Gensim LDA and LSI models from the bag-of-words (BOW) corpus.

from gensim import models, corpora

NUM_TOPICS = 30
# Fixed seed for the LDA random number generator so that re-running the
# notebook gives comparable topics (multi-worker training may still cause
# small run-to-run differences).
RANDOM_SEED = 42

# Build a Dictionary: the mapping between each token and its integer id.
dictionary = corpora.Dictionary(processed_docs)

# Optional vocabulary pruning (currently disabled):
# dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

# Convert every tokenized document to its BOW form: [(word_id, count), ...]
bow_corpus = [dictionary.doc2bow(text) for text in processed_docs]

# Look at the document at index 10, before and after the conversion.
# e.g: [(12, 3), (14, 1), (21, 1), (25, 5), (30, 2), (31, 5), (33, 1), (42, 1), (43, 2),  ...
doc_id = 10
print(processed_docs[doc_id])
print(bow_corpus[doc_id])

# Build the LDA model (Latent Dirichlet Allocation)
lda_model_bow = models.LdaMulticore(
    corpus=bow_corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=2,
    workers=4,
    random_state=RANDOM_SEED,
)

# Build the LSI model (Latent Semantic Analysis or Latent Semantic Indexing)
lsi_model_bow = models.LsiModel(corpus=bow_corpus, num_topics=NUM_TOPICS, id2word=dictionary)

In [None]:
# For each topic of each BOW model, print its 5 highest-weighted terms.

bow_models = (
    ("LDA Model (bow):", lda_model_bow),
    ("LSI Model (bow):", lsi_model_bow),
)

for position, (title, topic_model) in enumerate(bow_models):
    if position:
        # Blank line between the listings of two models.
        print()
    print(title)
    print("=" * 20)
    for topic_id in range(NUM_TOPICS):
        print("Topic %s:" % topic_id, topic_model.print_topic(topic_id, 5))


In [None]:
# Put the BOW models to work: map unseen documents to their topic distribution.

texts = [
    "A men found killed in the park.",
    "A woman was raped in the park."
]


def strongest_topics(topic_weights, limit=3):
    """Return the ``limit`` (topic_id, weight) pairs with the largest weight."""
    ranked = list(topic_weights)
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:limit]


for text in texts:
    print()
    print(text)

    # Same preprocessing and dictionary as the training corpus.
    bow = dictionary.doc2bow(preprocess(text))

    # model[bow] yields (topic_id, weight) pairs; only the top 3 are shown.
    print('lda:', strongest_topics(lda_model_bow[bow]))
    print('lsi:', strongest_topics(lsi_model_bow[bow]))

## Running LDA/LSI using TF-IDF

In [None]:
# Fixed seed for the LDA random number generator so that re-running the
# notebook gives comparable topics (multi-worker training may still cause
# small run-to-run differences).
RANDOM_SEED = 42

# Re-weight the bag-of-words corpus with TF-IDF.
tfidf = models.TfidfModel(bow_corpus)
tfidf_corpus = tfidf[bow_corpus]

# Build the LDA model (Latent Dirichlet Allocation)
lda_model_tfidf = models.LdaMulticore(
    corpus=tfidf_corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=2,
    workers=4,
    random_state=RANDOM_SEED,
)

# Build the LSI model (Latent Semantic Analysis or Latent Semantic Indexing)
lsi_model_tfidf = models.LsiModel(corpus=tfidf_corpus, num_topics=NUM_TOPICS, id2word=dictionary)

In [None]:
# For each topic of each TF-IDF model, print its 5 highest-weighted terms.

tfidf_models = (
    ("LDA Model (tfidf):", lda_model_tfidf),
    ("LSI Model (tfidf):", lsi_model_tfidf),
)

for position, (title, topic_model) in enumerate(tfidf_models):
    if position:
        # Blank line between the listings of two models.
        print()
    print(title)
    print("=" * 20)
    for topic_id in range(NUM_TOPICS):
        print("Topic %s:" % topic_id, topic_model.print_topic(topic_id, 5))


# Using Scikit-Learn for Topic Modeling

In [None]:
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer

import nltk.stem
# English Snowball stemmer used by StemmedCountVectorizer below.
stemmer = nltk.stem.SnowballStemmer('english')


class StemmedCountVectorizer(CountVectorizer):
    """CountVectorizer variant that stems every token produced by its analyzer."""

    def build_analyzer(self):
        """Return the inherited analyzer wrapped so that each token is stemmed."""
        base_analyzer = super().build_analyzer()

        def stemmed_analyzer(doc):
            return [stemmer.stem(token) for token in base_analyzer(doc)]

        return stemmed_analyzer

# Fixed seed so that the fitted topic models are reproducible across runs.
RANDOM_SEED = 42

# Bag-of-words features: unigrams and bigrams built from purely alphabetic
# tokens of 3+ letters, kept only when they occur in at least 20 documents.
vectorizer = CountVectorizer(strip_accents='unicode',
                             stop_words='english',
                             lowercase=True,
                             token_pattern=r'\b[a-zA-Z]{3,}\b',
                             ngram_range=(1, 2),
                             min_df=20,
                             max_df=1.0)

data_vectorized = vectorizer.fit_transform(data)

# Build a Latent Dirichlet Allocation Model
lda_model = LatentDirichletAllocation(n_components=NUM_TOPICS, max_iter=10,
                                      learning_method='online',
                                      random_state=RANDOM_SEED)
lda_Z = lda_model.fit_transform(data_vectorized)
print(lda_Z.shape)  # (NUM_DOCUMENTS, NUM_TOPICS)

# Build a Non-Negative Matrix Factorization Model
nmf_model = NMF(n_components=NUM_TOPICS, random_state=RANDOM_SEED)
nmf_Z = nmf_model.fit_transform(data_vectorized)
print(nmf_Z.shape)  # (NUM_DOCUMENTS, NUM_TOPICS)

# Build a Latent Semantic Indexing Model
lsi_model = TruncatedSVD(n_components=NUM_TOPICS, random_state=RANDOM_SEED)
lsi_Z = lsi_model.fit_transform(data_vectorized)
print(lsi_Z.shape)  # (NUM_DOCUMENTS, NUM_TOPICS)


# See what the first document of the corpus looks like in each topic space.
print(lda_Z[0])
print(nmf_Z[0])
print(lsi_Z[0])

In [None]:
def print_topics(model, vectorizer, top_n=6):
    """Print the ``top_n`` highest-weighted features of every topic of ``model``.

    Args:
        model: fitted model whose ``components_`` holds one weight vector per
            topic (each vector must support ``argsort()`` and indexing).
        vectorizer: fitted vectorizer that provides the feature names through
            ``get_feature_names_out()`` or the legacy ``get_feature_names()``.
        top_n: number of (feature, weight) pairs printed per topic.
    """
    # get_feature_names() was removed in scikit-learn 1.2 in favour of
    # get_feature_names_out(); support both so old and new versions work.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    for idx, topic in enumerate(model.components_):
        print()
        print("Topic %d:" % (idx))
        # argsort() is ascending: walk it backwards to get the largest weights.
        top_indices = topic.argsort()[:-top_n - 1:-1]
        print([(feature_names[i], topic[i]) for i in top_indices])

# Print the top features of every topic for each scikit-learn model.
sklearn_models = (
    ("LDA Model:", lda_model),
    ("NMF Model:", nmf_model),
    ("LSI Model:", lsi_model),
)

for position, (title, fitted_model) in enumerate(sklearn_models):
    if position:
        # Blank line between the listings of two models.
        print()
    print(title)
    print("=" * 20)
    print_topics(fitted_model, vectorizer)

In [None]:
# Map unseen documents into the NMF topic space.
texts = [
    "A men found killed in the park.",
    "A woman was raped in the park."
]

for text in texts:
    print()
    print(text)
    # Vectorize with the already fitted vocabulary, then project onto the topics.
    doc_features = vectorizer.transform([text])
    x = nmf_model.transform(doc_features)[0]
    print(x)

# Plotting words and documents in 2D with SVD

In [None]:
# Plotting dependencies for the 2D projections in the cells below.
import pandas as pd
from bokeh.io import push_notebook, show, output_notebook
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, LabelSet
# Route Bokeh output to the notebook so that show() renders inline.
output_notebook()

In [None]:
# Plot the documents in 2D: project each document onto 2 SVD components.

# Fixed seed so that the randomized SVD gives the same projection on every run.
RANDOM_SEED = 42

svd = TruncatedSVD(n_components=2, random_state=RANDOM_SEED)
documents_2d = svd.fit_transform(data_vectorized)

# One row per document: its 2D coordinates and its index in `data`.
df = pd.DataFrame({
    'x': documents_2d[:, 0],
    'y': documents_2d[:, 1],
    'document': list(range(len(data))),
})

source = ColumnDataSource(ColumnDataSource.from_df(df))
labels = LabelSet(x="x", y="y", text="document", y_offset=8,
                  text_font_size="8pt", text_color="#555555",
                  source=source, text_align='center')

# `width`/`height` are used because Bokeh 3 no longer accepts
# `plot_width`/`plot_height`.
plot = figure(width=600, height=600)
plot.circle("x", "y", size=12, source=source, line_color="black", fill_alpha=0.8)
# The labels are built but not attached (presumably one label per document
# would clutter the plot); uncomment to show them.
# plot.add_layout(labels)
show(plot, notebook_handle=True)

In [None]:
# Plot the words in 2D: the same projection as for the documents, applied to
# the transposed matrix so that every row is a term instead of a document.

# Fixed seed so that the randomized SVD gives the same projection on every run.
RANDOM_SEED = 42

svd = TruncatedSVD(n_components=2, random_state=RANDOM_SEED)
words_2d = svd.fit_transform(data_vectorized.T)

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); support both so old and new versions work.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# One row per vocabulary term: its 2D coordinates and the term itself.
df = pd.DataFrame({
    'x': words_2d[:, 0],
    'y': words_2d[:, 1],
    'word': list(feature_names),
})

source = ColumnDataSource(ColumnDataSource.from_df(df))
labels = LabelSet(x="x", y="y", text="word", y_offset=8,
                  text_font_size="8pt", text_color="#555555",
                  source=source, text_align='center')

# `width`/`height` are used because Bokeh 3 no longer accepts
# `plot_width`/`plot_height`.
plot = figure(width=600, height=600)
plot.circle("x", "y", size=12, source=source, line_color="black", fill_alpha=0.8)
plot.add_layout(labels)
show(plot, notebook_handle=True)

In [None]:
# Scikit-learn LDA fitted on TF-IDF (or stemmed count) features, then applied
# to unseen documents.

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# NOTE: this rebinds the notebook-wide NUM_TOPICS (30 in the gensim section),
# so earlier cells will use 40 if they are re-run after this one.
NUM_TOPICS = 40
# Fixed seed so that the fitted topic model is reproducible across runs.
RANDOM_SEED = 42

# Set to True to use stemmed bag-of-words counts instead of TF-IDF weights.
stem = False

if stem:
    vectorizer = StemmedCountVectorizer(
        analyzer="word",
        min_df=5, max_df=0.9,
        stop_words='english', lowercase=True,
        # Raw string: '\-' inside a normal string literal is an invalid escape
        # sequence and triggers a warning on current Python versions.
        token_pattern=r'[a-zA-Z\-][a-zA-Z\-]{2,}')
else:
    # NOTE(review): LDA is usually fitted on raw term counts; TF-IDF weights
    # are used here — confirm that this is intended.
    vectorizer = TfidfVectorizer(strip_accents='unicode',
                                 stop_words='english',
                                 lowercase=True,
                                 token_pattern=r'\b[a-zA-Z]{3,}\b',
                                 ngram_range=(1, 2),
                                 min_df=20,
                                 max_df=1.0)

data_vectorized = vectorizer.fit_transform(data)

# Build a Latent Dirichlet Allocation Model
lda_model = LatentDirichletAllocation(n_components=NUM_TOPICS, max_iter=10,
                                      learning_method='online',
                                      random_state=RANDOM_SEED)
lda_Z = lda_model.fit_transform(data_vectorized)

# Transforming an unseen document
texts = [
    "A men found killed in the park.",
    "A woman was raped in the park."
]

for text in texts:
    print()
    print(text)
    x = lda_model.transform(vectorizer.transform([text]))[0]
    # Print the topic weights of the document together with their sum.
    print(x, x.sum())

In [None]:
# Interactive pyLDAvis view of the scikit-learn LDA model fitted above.
import pyLDAvis
try:
    # pyLDAvis >= 3.4 ships the scikit-learn adapter as `pyLDAvis.lda_model`.
    import pyLDAvis.lda_model as pyldavis_sklearn
except ImportError:
    # Older releases expose the same adapter as `pyLDAvis.sklearn`.
    import pyLDAvis.sklearn as pyldavis_sklearn

pyLDAvis.enable_notebook()
panel = pyldavis_sklearn.prepare(lda_model, data_vectorized, vectorizer, mds='tsne')
panel