original link https://nlpforhackers.io/topic-modeling/

In [20]:
from nltk.corpus import brown

data = []

for fileid in brown.fileids():
    document = ' '.join(brown.words(fileid))
    data.append(document)

NO_DOCUMENTS = len(data)
print(NO_DOCUMENTS)
print(data[:5])

500


Using Gensim for Topic Modeling

In [21]:
import re
from gensim import models, corpora
from nltk import word_tokenize
from nltk.corpus import stopwords

NUM_TOPICS = 10
STOPWORDS = stopwords.words('english')

def clean_text(text):
    tokenized_text = word_tokenize(text.lower())
    cleaned_text = [t for t in tokenized_text if t not in STOPWORDS and re.match('[a-zA-Z\-][a-zA-Z\-]{2,}', t)]
    return cleaned_text

tokenized_data = []
for text in data:
    tokenized_data.append(clean_text(text))

# Build a Dictionary - association word to numeric id
dictionary = corpora.Dictionary(tokenized_data)

# Transform the collection of texts to a numerical form
corpus = [dictionary.doc2bow(text) for text in tokenized_data]    

In [22]:
# Have a look at how the 20th document looks like: [(word_id, count), ...]
print(corpus[20])
# [(12, 3), (14, 1), (21, 1), (25, 5), (30, 2), (31, 5), (33, 1), (42, 1), (43, 2),  ...

[(12, 3), (14, 1), (21, 1), (25, 5), (30, 2), (31, 5), (33, 1), (42, 1), (43, 2), (44, 2), (45, 2), (46, 2), (47, 2), (49, 1), (50, 1), (53, 1), (56, 1), (59, 1), (60, 1), (66, 1), (75, 1), (80, 1), (98, 1), (101, 1), (106, 1), (117, 1), (129, 1), (130, 2), (132, 2), (135, 2), (140, 1), (141, 2), (143, 4), (144, 2), (145, 2), (166, 1), (195, 1), (198, 3), (219, 1), (220, 4), (221, 3), (223, 1), (229, 4), (230, 4), (231, 2), (235, 1), (236, 1), (242, 2), (246, 2), (255, 1), (263, 1), (269, 1), (270, 5), (271, 2), (275, 5), (276, 1), (278, 4), (280, 2), (281, 1), (307, 2), (310, 1), (311, 3), (313, 1), (314, 5), (318, 4), (322, 1), (336, 1), (338, 3), (339, 1), (340, 1), (341, 1), (345, 1), (346, 1), (351, 1), (354, 1), (355, 1), (366, 3), (368, 13), (370, 1), (372, 1), (374, 3), (377, 3), (381, 3), (386, 1), (392, 6), (396, 1), (401, 1), (412, 2), (426, 2), (428, 2), (431, 2), (434, 2), (439, 2), (444, 1), (450, 1), (452, 1), (462, 1), (465, 1), (467, 1), (470, 1), (478, 1), (483, 1), (

In [23]:
# Build the LDA model
lda_model = models.LdaModel(corpus=corpus, num_topics=NUM_TOPICS, id2word=dictionary)
# Build the LSI model
lsi_model = models.LsiModel(corpus=corpus, num_topics=NUM_TOPICS, id2word=dictionary)


In [24]:
print("LDA Model:")
 
for idx in range(NUM_TOPICS):
    # Print the first 10 most representative topics
    print("Topic #%s:" % idx, lda_model.print_topic(idx, 10))
 
print("=" * 20)
 
print("LSI Model:")
 
for idx in range(NUM_TOPICS):
    # Print the first 10 most representative topics
    print("Topic #%s:" % idx, lsi_model.print_topic(idx, 10))
 
print("=" * 20)

LDA Model:
Topic #0: 0.006*"would" + 0.005*"one" + 0.004*"time" + 0.004*"said" + 0.003*"two" + 0.003*"could" + 0.003*"new" + 0.002*"made" + 0.002*"man" + 0.002*"may"
Topic #1: 0.007*"one" + 0.006*"would" + 0.003*"time" + 0.003*"could" + 0.003*"new" + 0.003*"man" + 0.003*"first" + 0.003*"said" + 0.003*"may" + 0.003*"even"
Topic #2: 0.006*"one" + 0.005*"said" + 0.005*"would" + 0.004*"could" + 0.003*"two" + 0.003*"years" + 0.003*"new" + 0.002*"man" + 0.002*"like" + 0.002*"must"
Topic #3: 0.006*"one" + 0.004*"would" + 0.003*"could" + 0.003*"time" + 0.003*"first" + 0.003*"two" + 0.003*"also" + 0.002*"new" + 0.002*"many" + 0.002*"much"
Topic #4: 0.006*"would" + 0.005*"one" + 0.003*"could" + 0.003*"said" + 0.003*"new" + 0.003*"two" + 0.002*"even" + 0.002*"must" + 0.002*"may" + 0.002*"time"
Topic #5: 0.006*"one" + 0.005*"would" + 0.003*"time" + 0.003*"could" + 0.003*"new" + 0.003*"man" + 0.002*"two" + 0.002*"said" + 0.002*"made" + 0.002*"may"
Topic #6: 0.006*"one" + 0.003*"new" + 0.003*"first"

In [25]:
text = "The economy is working better than ever"
bow = dictionary.doc2bow(clean_text(text))
print(lsi_model[bow])
# [(0, 0.091615426138426506), (1, -0.0085557463300508351), (2, 0.016744863677828108), (3, 0.040508186718598529), (4, 0.014201267714185898), (5, -0.012208538275305329), (6, 0.031254053085582149), (7, 0.017529584659403553), (8, 0.056957633371540077),(9, 0.025989149894888153)]
print(lda_model[bow])
# [(0, 0.020005183), (1, 0.020005869), (2, 0.02000626), (3, 0.020005472), (4, 0.020009108), (5, 0.020005926), (6, 0.81994385), (7, 0.020006068), (8, 0.020006327), (9, 0.020005994)]

[(0, 0.091611845729335797), (1, -0.008764791975045632), (2, 0.016471583213729712), (3, -0.041247740198070829), (4, -0.016351799429639449), (5, 0.01271814371828706), (6, -0.027504395825017333), (7, 0.01697367517749512), (8, 0.055921187928742372), (9, -0.028171931317197849)]
[(0, 0.020006532), (1, 0.020005846), (2, 0.02000629), (3, 0.02000542), (4, 0.81994599), (5, 0.020006754), (6, 0.020005751), (7, 0.020006241), (8, 0.020005753), (9, 0.020005364)]


In [26]:
from gensim import similarities
 
lda_index = similarities.MatrixSimilarity(lda_model[corpus])
 
# Let's perform some queries
similarities = lda_index[lda_model[bow]]
# Sort the similarities
similarities = sorted(enumerate(similarities), key=lambda item: -item[1])
 

In [27]:
# Top most similar documents:
print(similarities[:10])

[(133, 0.99805188), (23, 0.99786484), (69, 0.99782526), (189, 0.99762797), (321, 0.99751246), (249, 0.99745518), (2, 0.99733162), (42, 0.99733162), (171, 0.99733162), (292, 0.99733162)]


In [28]:
# Let's see what's the most similar document
document_id, similarity = similarities[0]
print(data[document_id][:1000])

The controversy of the last few years over whether architects or interior designers should plan the interiors of modern buildings has brought clearly into focus one important difference of opinion . The architects do not believe that the education of the interior designer is sufficiently good or sufficiently extended to compare with that of the architect and that , therefore , the interior designer is incapable of understanding the architectural principles involved in planning the interior of a building . Ordinary politeness may have militated against this opinion being stated so badly but anyone with a wide acquaintance in both groups and who has sat through the many round tables , workshops or panel discussions -- whatever they are called -- on this subject will recognize that the final , boiled down crux of the matter is education . It is true that most architectural schools have five year courses , some even have six or more . The element of public danger which enters so largely in

# Using Scikit-Learn for Topic Modeling

In [30]:
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer

NUM_TOPICS = 10

vectorizer = CountVectorizer(min_df=5, max_df = 0.9,
                            stop_words='english', lowercase=True,
                            token_pattern='[a-zA-Z\-][a-zA-Z\-]{2,}')
data_vectorized = vectorizer.fit_transform(data)

In [33]:
# Build a Latent Dirichlet Allocation Model
lda_model = LatentDirichletAllocation(n_components=NUM_TOPICS, max_iter=10, learning_method='online')
lda_Z = lda_model.fit_transform(data_vectorized)
print(lda_Z.shape) #(NO_DOCUMENTS. NO_TOPICS)

(500, 10)


In [34]:
# Build a Non-Negative Matrix Factorization Model
nmf_model = NMF(n_components=NUM_TOPICS)
nmf_Z = nmf_model.fit_transform(data_vectorized)
print(nmf_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)

(500, 10)


In [35]:
# Build a Latent Semantic Indexing Model
lsi_model = TruncatedSVD(n_components=NUM_TOPICS)
lsi_Z = lsi_model.fit_transform(data_vectorized)
print(lsi_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)

(500, 10)


In [36]:
# Let's see how the first document in the corpus looks like in different topic spaces
print(lda_Z[0])
print(nmf_Z[0])
print(lsi_Z[0])

[  1.05625651e-04   8.57800795e-02   4.67968037e-01   1.05611023e-04
   1.05630530e-04   1.05596703e-04   1.64763823e-02   1.05608926e-04
   1.85389642e-02   4.10708464e-01]
[ 0.          0.          2.12150873  0.07694447  0.          0.54339973
  1.07027775  0.          0.          0.246864  ]
[ 23.30684423   1.59538793  21.77535564  -0.09344056   0.94191426
  11.74161701   4.52265829  -2.55593585   1.27616361 -11.30993   ]


In [37]:
def print_topics(model, vectorizer, top_n=10):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])
                        for i in topic.argsort()[:-top_n - 1:-1]])
 
print("LDA Model:")
print_topics(lda_model, vectorizer)
print("=" * 20)
 
print("NMF Model:")
print_topics(nmf_model, vectorizer)
print("=" * 20)
 
print("LSI Model:")
print_topics(lsi_model, vectorizer)
print("=" * 20)

LDA Model:
Topic 0:
[('palmer', 49.694203140924913), ('said', 41.124708916057692), ('player', 37.547922589765861), ('new', 32.406198178597585), ('state', 26.209079723414394), ('year', 25.848420662270577), ('dallas', 25.235187897847212), ('republican', 23.392031165612607), ('city', 23.219436705091184), ('hughes', 22.241592415093049)]
Topic 1:
[('mrs', 432.80640157106677), ('said', 273.28632032142912), ('god', 210.72991140258418), ('john', 169.5920433302004), ('miss', 148.92570898447397), ('new', 146.61504546725595), ('man', 129.86933932021662), ('church', 116.99290629144522), ('home', 109.60395700003225), ('death', 99.485424136162266)]
Topic 2:
[('jury', 15.93959657282919), ('robinson', 15.837387878030942), ('said', 12.991435541153205), ('election', 11.429672724679801), ('county', 11.361010018049505), ('city', 10.185883715339536), ('secrets', 10.075841281673602), ('league', 7.6444667245888658), ('atlanta', 7.2489750994313189), ('resolution', 7.0979211017481489)]
Topic 3:
[('corporation'

[('united', 0.28323483965160329), ('states', 0.24019036974392236), ('mrs', 0.19964737512886047), ('shall', 0.19579715721080226), ('government', 0.17531086178351343), ('school', 0.16441100924477453), ('section', 0.12648486576260004), ('act', 0.11637936691521179), ('agreement', 0.1151962240137082), ('india', 0.10002744176424297)]
Topic 8:
[('form', 0.32079758566172012), ('dictionary', 0.30019481696848493), ('information', 0.29510717662310532), ('text', 0.22679903181036093), ('cell', 0.19076017900558151), ('forms', 0.18806465738944411), ('year', 0.18679173736963489), ('tax', 0.15875751112275494), ('fiscal', 0.13726374246001863), ('list', 0.13509383735770658)]
Topic 9:
[('year', 0.24231431098182943), ('fiscal', 0.19994994286300063), ('school', 0.19035843140125636), ('time', 0.13342160695863051), ('tax', 0.12935970636594948), ('college', 0.12330085221953847), ('states', 0.10385556522388145), ('years', 0.10026629766583442), ('good', 0.09160232144134095), ('like', 0.086004277660738843)]


In [38]:
text = "The economy is working better than ever"
x = nmf_model.transform(vectorizer.transform([text]))[0]
print(x)

[ 0.00290031  0.          0.          0.          0.          0.00439198
  0.          0.          0.          0.00467926]


In [39]:
from sklearn.metrics.pairwise import euclidean_distances
 
def most_similar(x, Z, top_n=5):
    dists = euclidean_distances(x.reshape(1, -1), Z)
    pairs = enumerate(dists[0])
    most_similar = sorted(pairs, key=lambda item: item[1])[:top_n]
    return most_similar
 
similarities = most_similar(x, nmf_Z)
document_id, similarity = similarities[0]
print(data[document_id][:1000])

Livery stable -- J. Vernon , prop. '' . Coaching had declined considerably by 1905 , but the sign was still there , near the old Wells Fargo building in San Francisco , creaking in the fog as it had for thirty years . John Vernon had had all the patronage he cared for -- he had prospered , but he could not retire from horsedom . Coaching was in his blood . He had two interests in life : the pleasures of the table and driving . Twice a week he drove his tallyho over the Santa Cruz road , upland and through the redwood forest , with orchards below him at one hand , and glimpses of the Pacific at the other . The journey back he made along the coast road , traveling hell-for-leather , every lantern of the tallyho ablaze . The southward route was the classic run in California , and the most fashionable . His patronage on this stretch was made up largely of San Franciscans -- regulars , most of them , and trenchermen like himself . They did not complain at the inhuman hour of starting ( seve

#Plotting words and documents in 2D with SVD

In [45]:
import pandas as pd
from bokeh.io import push_notebook, show, output_notebook
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, LabelSet
output_notebook()

In [46]:
svd = TruncatedSVD(n_components=2)
documents_2d = svd.fit_transform(data_vectorized)

df = pd.DataFrame(columns=['x', 'y', 'document'])
df['x'], df['y'], df['document'] = documents_2d[:,0], documents_2d[:,1], range(len(data))

source = ColumnDataSource(ColumnDataSource.from_df(df))
labels = LabelSet(x = "x", y = "y", text="document", y_offset = 8,
                 text_font_size="8pt", text_color="#555555",
                 source=source, text_align='center')

plot = figure(plot_width=600, plot_height=600)
plot.circle("x", "y", size=12, source=source, line_color='black', fill_alpha=0.8)
plot.add_layout(labels)
show(plot,notebook_handle=True)

In [48]:
svd = TruncatedSVD(n_components=2)
words_2d = svd.fit_transform(data_vectorized.T)

df = pd.DataFrame(columns=['x', 'y', 'word'])
df['x'], df['y'], df['word'] = words_2d[:,0], words_2d[:,1], vectorizer.get_feature_names()

source = ColumnDataSource(ColumnDataSource.from_df(df))
labels = LabelSet(x = "x", y = "y", text="word", y_offset = 8,
                 text_font_size="8pt", text_color="#555555",
                 source=source, text_align='center')

plot = figure(plot_width=600, plot_height=600)
plot.circle("x", "y", size=12, source=source, line_color='black', fill_alpha=0.8)
plot.add_layout(labels)
show(plot,notebook_handle=True)

More about Latent Dirichlet Allocation

In [50]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
 
NUM_TOPICS = 10
 
vectorizer = CountVectorizer(min_df=5, max_df=0.9, 
                             stop_words='english', lowercase=True, 
                             token_pattern='[a-zA-Z\-][a-zA-Z\-]{2,}')
data_vectorized = vectorizer.fit_transform(data)
 
# Build a Latent Dirichlet Allocation Model
lda_model = LatentDirichletAllocation(n_components=NUM_TOPICS, max_iter=10, learning_method='online')
lda_Z = lda_model.fit_transform(data_vectorized)
 
text = "The economy is working better than ever"
x = lda_model.transform(vectorizer.transform([text]))[0]
print(x, x.sum())

[ 0.02500341  0.02500583  0.02500073  0.7749724   0.02500077  0.02500114
  0.02500001  0.02501058  0.02500003  0.02500511] 1.0


In [51]:
import pyLDAvis.sklearn

pyLDAvis.enable_notebook()
panel = pyLDAvis.sklearn.prepare(lda_model, data_vectorized, vectorizer, mds='tsne')
panel

  """
