original link https://nlpforhackers.io/topic-modeling/

In [1]:
from nltk.corpus import brown

data = []

for fileid in brown.fileids():
    document = ' '.join(brown.words(fileid))
    data.append(document)

NO_DOCUMENTS = len(data)
print(NO_DOCUMENTS)
print(data[:5])

500


Using Gensim for Topic Modeling

In [2]:
import re
from gensim import models, corpora
from nltk import word_tokenize
from nltk.corpus import stopwords

NUM_TOPICS = 10
STOPWORDS = stopwords.words('english')

def clean_text(text):
    tokenized_text = word_tokenize(text.lower())
    cleaned_text = [t for t in tokenized_text if t not in STOPWORDS and re.match('[a-zA-Z\-][a-zA-Z\-]{2,}', t)]
    return cleaned_text

tokenized_data = []
for text in data:
    tokenized_data.append(clean_text(text))

# Build a Dictionary - association word to numeric id
dictionary = corpora.Dictionary(tokenized_data)

# Transform the collection of texts to a numerical form
corpus = [dictionary.doc2bow(text) for text in tokenized_data]    



In [3]:
# Have a look at how the 20th document looks like: [(word_id, count), ...]
print(corpus[20])
# [(12, 3), (14, 1), (21, 1), (25, 5), (30, 2), (31, 5), (33, 1), (42, 1), (43, 2),  ...

[(12, 3), (14, 1), (21, 1), (25, 5), (30, 2), (31, 5), (33, 1), (42, 1), (43, 2), (44, 2), (45, 2), (46, 2), (47, 2), (49, 1), (50, 1), (53, 1), (56, 1), (59, 1), (60, 1), (66, 1), (75, 1), (80, 1), (98, 1), (101, 1), (106, 1), (117, 1), (129, 1), (130, 2), (132, 2), (135, 2), (140, 1), (141, 2), (143, 4), (144, 2), (145, 2), (166, 1), (195, 1), (198, 3), (219, 1), (220, 4), (221, 3), (223, 1), (229, 4), (230, 4), (231, 2), (235, 1), (236, 1), (242, 2), (246, 2), (255, 1), (263, 1), (269, 1), (270, 5), (271, 2), (275, 5), (276, 1), (278, 4), (280, 2), (281, 1), (307, 2), (310, 1), (311, 3), (313, 1), (314, 5), (318, 4), (322, 1), (336, 1), (338, 3), (339, 1), (340, 1), (341, 1), (345, 1), (346, 1), (351, 1), (354, 1), (355, 1), (366, 3), (368, 13), (370, 1), (372, 1), (374, 3), (377, 3), (381, 3), (386, 1), (392, 6), (396, 1), (401, 1), (412, 2), (426, 2), (428, 2), (431, 2), (434, 2), (439, 2), (444, 1), (450, 1), (452, 1), (462, 1), (465, 1), (467, 1), (470, 1), (478, 1), (483, 1), (

In [4]:
# Build the LDA model
lda_model = models.LdaModel(corpus=corpus, num_topics=NUM_TOPICS, id2word=dictionary)
# Build the LSI model
lsi_model = models.LsiModel(corpus=corpus, num_topics=NUM_TOPICS, id2word=dictionary)


In [5]:
print("LDA Model:")
 
for idx in range(NUM_TOPICS):
    # Print the first 10 most representative topics
    print("Topic #%s:" % idx, lda_model.print_topic(idx, 10))
 
print("=" * 20)
 
print("LSI Model:")
 
for idx in range(NUM_TOPICS):
    # Print the first 10 most representative topics
    print("Topic #%s:" % idx, lsi_model.print_topic(idx, 10))
 
print("=" * 20)

LDA Model:
Topic #0: 0.007*"one" + 0.005*"said" + 0.004*"would" + 0.004*"new" + 0.003*"could" + 0.003*"time" + 0.003*"may" + 0.003*"two" + 0.003*"first" + 0.003*"like"
Topic #1: 0.007*"one" + 0.004*"would" + 0.004*"said" + 0.004*"could" + 0.003*"time" + 0.003*"may" + 0.003*"new" + 0.002*"like" + 0.002*"state" + 0.002*"also"
Topic #2: 0.006*"one" + 0.004*"would" + 0.004*"said" + 0.004*"may" + 0.003*"could" + 0.003*"time" + 0.003*"made" + 0.002*"new" + 0.002*"back" + 0.002*"like"
Topic #3: 0.006*"would" + 0.005*"one" + 0.003*"two" + 0.003*"time" + 0.003*"man" + 0.003*"could" + 0.003*"new" + 0.003*"said" + 0.003*"like" + 0.002*"first"
Topic #4: 0.007*"would" + 0.006*"one" + 0.004*"could" + 0.003*"two" + 0.003*"said" + 0.003*"new" + 0.003*"first" + 0.003*"like" + 0.002*"way" + 0.002*"even"
Topic #5: 0.007*"one" + 0.004*"would" + 0.003*"man" + 0.003*"said" + 0.003*"could" + 0.003*"first" + 0.003*"time" + 0.003*"also" + 0.003*"new" + 0.002*"even"
Topic #6: 0.006*"would" + 0.005*"one" + 0.004

In [6]:
text = "The economy is working better than ever"
bow = dictionary.doc2bow(clean_text(text))
print(lsi_model[bow])
# [(0, 0.091615426138426506), (1, -0.0085557463300508351), (2, 0.016744863677828108), (3, 0.040508186718598529), (4, 0.014201267714185898), (5, -0.012208538275305329), (6, 0.031254053085582149), (7, 0.017529584659403553), (8, 0.056957633371540077),(9, 0.025989149894888153)]
print(lda_model[bow])
# [(0, 0.020005183), (1, 0.020005869), (2, 0.02000626), (3, 0.020005472), (4, 0.020009108), (5, 0.020005926), (6, 0.81994385), (7, 0.020006068), (8, 0.020006327), (9, 0.020005994)]

[(0, 0.091615116147191061), (1, 0.0087935454679275247), (2, 0.015809979208675013), (3, 0.041295775125740244), (4, -0.016205658552133427), (5, -0.012724787054400413), (6, -0.029694660033554428), (7, 0.018519613047427202), (8, -0.056583418720696534), (9, -0.023832167874994092)]
[(0, 0.02000837), (1, 0.020005628), (2, 0.02000582), (3, 0.020006133), (4, 0.020006608), (5, 0.020006863), (6, 0.020005871), (7, 0.020005912), (8, 0.81994325), (9, 0.020005524)]


In [7]:
from gensim import similarities
 
lda_index = similarities.MatrixSimilarity(lda_model[corpus])
 
# Let's perform some queries
similarities = lda_index[lda_model[bow]]
# Sort the similarities
similarities = sorted(enumerate(similarities), key=lambda item: -item[1])
 

In [8]:
# Top most similar documents:
print(similarities[:10])

[(191, 0.99762505), (175, 0.99761617), (358, 0.99761367), (62, 0.99760276), (93, 0.99752998), (378, 0.99735695), (84, 0.99733168), (91, 0.99733168), (95, 0.99733168), (298, 0.99733168)]


In [9]:
# Let's see what's the most similar document
document_id, similarity = similarities[0]
print(data[document_id][:1000])

Can thermonuclear war be set off by accident ? ? What steps have been taken to guard against the one sort of mishap that could trigger the destruction of continents ? ? Are we as safe as we should be from such a disaster ? ? Is anything being done to increase our margin of safety ? ? Will the danger increase or decrease ? ? I have just asked these questions in the Pentagon , in the White House , in offices of key scientists across the country and aboard the submarines that prowl for months underwater , with neat rows of green launch tubes which contain Polaris missiles and which are affectionately known as `` Sherwood Forest '' . I asked the same questions inside the launch-control rooms of an Atlas missile base in Wyoming , where officers who wear sidearms are manning the `` commit buttons '' that could start a war -- accidentally or by design -- and in the command centers where other pistol-packing men could give orders to push such buttons . To the men in the instrument-jammed bombe

# Using Scikit-Learn for Topic Modeling

In [10]:
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer

NUM_TOPICS = 10

vectorizer = CountVectorizer(min_df=5, max_df = 0.9,
                            stop_words='english', lowercase=True,
                            token_pattern='[a-zA-Z\-][a-zA-Z\-]{2,}')
data_vectorized = vectorizer.fit_transform(data)

In [11]:
# Build a Latent Dirichlet Allocation Model
lda_model = LatentDirichletAllocation(n_components=NUM_TOPICS, max_iter=10, learning_method='online')
lda_Z = lda_model.fit_transform(data_vectorized)
print(lda_Z.shape) #(NO_DOCUMENTS. NO_TOPICS)

(500, 10)


In [12]:
# Build a Non-Negative Matrix Factorization Model
nmf_model = NMF(n_components=NUM_TOPICS)
nmf_Z = nmf_model.fit_transform(data_vectorized)
print(nmf_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)

(500, 10)


In [13]:
# Build a Latent Semantic Indexing Model
lsi_model = TruncatedSVD(n_components=NUM_TOPICS)
lsi_Z = lsi_model.fit_transform(data_vectorized)
print(lsi_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)

(500, 10)


In [14]:
# Let's see how the first document in the corpus looks like in different topic spaces
print(lda_Z[0])
print(nmf_Z[0])
print(lsi_Z[0])

[  1.05616099e-04   1.05596740e-04   1.05596736e-04   3.98167008e-01
   1.05597270e-04   5.95320618e-01   1.05608052e-04   1.05607910e-04
   5.77313942e-03   1.05612195e-04]
[ 0.          0.          2.10658192  0.07687653  0.          0.54517391
  1.059224    0.          0.          0.24607793]
[  2.33068432e+01   1.59511169e+00   2.17887637e+01   1.28693601e-02
   8.36543427e-01   1.15972785e+01   3.94402651e+00  -2.13538089e+00
   1.44820021e+00  -1.45556582e+01]


In [15]:
def print_topics(model, vectorizer, top_n=10):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])
                        for i in topic.argsort()[:-top_n - 1:-1]])
 
print("LDA Model:")
print_topics(lda_model, vectorizer)
print("=" * 20)
 
print("NMF Model:")
print_topics(nmf_model, vectorizer)
print("=" * 20)
 
print("LSI Model:")
print_topics(lsi_model, vectorizer)
print("=" * 20)

LDA Model:
Topic 0:
[('time', 286.41126656345403), ('used', 282.68043843547133), ('life', 250.03268693671757), ('work', 229.24952373282346), ('man', 224.40769631833365), ('form', 220.75673339927778), ('number', 219.48128529140476), ('new', 208.86342000254629), ('world', 190.07799213808522), ('does', 183.08490579830297)]
Topic 1:
[('vernon', 11.895721970929028), ('santa', 1.6936959582558491), ('primacy', 0.87168755967065403), ('said', 0.65105422015646819), ('coaches', 0.650615955111848), ('palace', 0.55849262772021091), ('patronage', 0.5115796417611278), ('man', 0.49909165269167927), ('new', 0.48401829594338852), ('chinese', 0.4646838272764417)]
Topic 2:
[('new', 0.5524391658375174), ('said', 0.44360946937244367), ('clay', 0.4197871743525951), ('state', 0.40121098960849061), ('years', 0.39622771806361223), ('year', 0.39384857540075174), ('like', 0.3490593162701694), ('people', 0.3411784871007934), ('time', 0.33158554490973963), ('make', 0.3264667504427235)]
Topic 3:
[('mrs', 150.8898743

[('state', 0.43239036090455696), ('mrs', 0.27396542424824438), ('form', 0.20246237007168918), ('dictionary', 0.17122421314802608), ('information', 0.15087001802374686), ('text', 0.12815248378370936), ('federal', 0.12250804385153628), ('forms', 0.1162777507929094), ('cell', 0.1139679227593545), ('man', 0.10881932710227105)]
Topic 7:
[('united', 0.27861146937519138), ('states', 0.2321789526836783), ('mrs', 0.19746301802789101), ('shall', 0.18892813240213183), ('government', 0.1793445851770846), ('school', 0.14482499794992915), ('section', 0.12021306384041053), ('act', 0.11518687861929384), ('agreement', 0.11342567119361489), ('information', 0.10532704994572824)]
Topic 8:
[('form', 0.31829278697342084), ('dictionary', 0.2943732894883988), ('information', 0.28462823606772153), ('text', 0.22237706668216412), ('year', 0.19344315277642599), ('cell', 0.18573950414941204), ('forms', 0.1832847625782115), ('tax', 0.16904504558013553), ('fiscal', 0.14013889994686604), ('list', 0.1328363433116985)]

In [16]:
text = "The economy is working better than ever"
x = nmf_model.transform(vectorizer.transform([text]))[0]
print(x)

[ 0.00289617  0.          0.          0.          0.          0.00440627
  0.          0.          0.          0.00466457]


In [17]:
from sklearn.metrics.pairwise import euclidean_distances
 
def most_similar(x, Z, top_n=5):
    dists = euclidean_distances(x.reshape(1, -1), Z)
    pairs = enumerate(dists[0])
    most_similar = sorted(pairs, key=lambda item: item[1])[:top_n]
    return most_similar
 
similarities = most_similar(x, nmf_Z)
document_id, similarity = similarities[0]
print(data[document_id][:1000])

Livery stable -- J. Vernon , prop. '' . Coaching had declined considerably by 1905 , but the sign was still there , near the old Wells Fargo building in San Francisco , creaking in the fog as it had for thirty years . John Vernon had had all the patronage he cared for -- he had prospered , but he could not retire from horsedom . Coaching was in his blood . He had two interests in life : the pleasures of the table and driving . Twice a week he drove his tallyho over the Santa Cruz road , upland and through the redwood forest , with orchards below him at one hand , and glimpses of the Pacific at the other . The journey back he made along the coast road , traveling hell-for-leather , every lantern of the tallyho ablaze . The southward route was the classic run in California , and the most fashionable . His patronage on this stretch was made up largely of San Franciscans -- regulars , most of them , and trenchermen like himself . They did not complain at the inhuman hour of starting ( seve

# Plotting words and documents in 2D with SVD

In [18]:
import pandas as pd
from bokeh.io import push_notebook, show, output_notebook
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, LabelSet
output_notebook()

In [19]:
svd = TruncatedSVD(n_components=2)
documents_2d = svd.fit_transform(data_vectorized)

df = pd.DataFrame(columns=['x', 'y', 'document'])
df['x'], df['y'], df['document'] = documents_2d[:,0], documents_2d[:,1], range(len(data))

source = ColumnDataSource(ColumnDataSource.from_df(df))
labels = LabelSet(x = "x", y = "y", text="document", y_offset = 8,
                 text_font_size="8pt", text_color="#555555",
                 source=source, text_align='center')

plot = figure(plot_width=600, plot_height=600)
plot.circle("x", "y", size=12, source=source, line_color='black', fill_alpha=0.8)
plot.add_layout(labels)
show(plot,notebook_handle=True)

In [20]:
svd = TruncatedSVD(n_components=2)
words_2d = svd.fit_transform(data_vectorized.T)

df = pd.DataFrame(columns=['x', 'y', 'word'])
df['x'], df['y'], df['word'] = words_2d[:,0], words_2d[:,1], vectorizer.get_feature_names()

source = ColumnDataSource(ColumnDataSource.from_df(df))
labels = LabelSet(x = "x", y = "y", text="word", y_offset = 8,
                 text_font_size="8pt", text_color="#555555",
                 source=source, text_align='center')

plot = figure(plot_width=600, plot_height=600)
plot.circle("x", "y", size=12, source=source, line_color='black', fill_alpha=0.8)
plot.add_layout(labels)
show(plot,notebook_handle=True)

More about Latent Dirichlet Allocation

In [21]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
 
NUM_TOPICS = 10
 
vectorizer = CountVectorizer(min_df=5, max_df=0.9, 
                             stop_words='english', lowercase=True, 
                             token_pattern='[a-zA-Z\-][a-zA-Z\-]{2,}')
data_vectorized = vectorizer.fit_transform(data)
 
# Build a Latent Dirichlet Allocation Model
lda_model = LatentDirichletAllocation(n_components=NUM_TOPICS, max_iter=10, learning_method='online')
lda_Z = lda_model.fit_transform(data_vectorized)
 
text = "The economy is working better than ever"
x = lda_model.transform(vectorizer.transform([text]))[0]
print(x, x.sum())

[ 0.02500028  0.02501065  0.02500814  0.02500217  0.02501222  0.77495931
  0.02500637  0.02500004  0.02500076  0.02500006] 1.0


In [22]:
import pyLDAvis.sklearn

pyLDAvis.enable_notebook()
panel = pyLDAvis.sklearn.prepare(lda_model, data_vectorized, vectorizer, mds='tsne')
panel

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  topic_term_dists = topic_term_dists.ix[topic_order]
