In [1]:
# Author: Olivier Grisel <olivier.grisel@ensta.org>
#         Lars Buitinck
#         Chyi-Kwei Yau <chyikwei.yau@gmail.com>
# License: BSD 3 clause

from __future__ import print_function
from time import time

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.datasets import fetch_20newsgroups

n_samples = 2000
n_features = 1000
n_components = 10
n_top_words = 20


def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted terms of every topic in a fitted model.

    model         -- object exposing ``components_``; each row is iterated as
                     one topic and must support ``argsort()``
    feature_names -- sequence mapping a column index to its term
    n_top_words   -- how many terms to show per topic

    One line is printed per topic, followed by a single blank line.
    """
    for topic_idx, topic in enumerate(model.components_):
        # argsort() is ascending, so walk the tail backwards to get the
        # n_top_words largest weights in descending order.
        top_indices = topic.argsort()[:-n_top_words - 1:-1]
        top_terms = [feature_names[i] for i in top_indices]
        print("Topic #%d: " % topic_idx + " ".join(top_terms))
    print()


# Load the 20 newsgroups dataset and vectorize it. We use a few heuristics
# to filter out useless terms early on: the posts are stripped of headers,
# footers and quoted replies, and common English words, words occurring in
# only one document or in at least 95% of the documents are removed.

print("Loading dataset...")
t0 = time()
# Fetch the shuffled corpus with headers, footers and quoted replies removed,
# then keep only the first n_samples posts.
dataset = fetch_20newsgroups(
    remove=('headers', 'footers', 'quotes'),
    shuffle=True,
    random_state=1,
)
data_samples = dataset.data[:n_samples]
print("done in %0.3fs." % (time() - t0))

# tf-idf weighted features feed the NMF models.
print("Extracting tf-idf features for NMF...")
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=n_features,
    min_df=2,
    max_df=0.95,
)
t0 = time()
tfidf = tfidf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))

# Raw term counts (tf) feed the LDA model; the vectorizer settings are the
# same as for the tf-idf features above.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(
    stop_words='english',
    max_features=n_features,
    min_df=2,
    max_df=0.95,
)
t0 = time()
tf = tf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))
print()

# NMF with the default (Frobenius norm) objective on the tf-idf matrix.
print("Fitting the NMF model (Frobenius norm) with tf-idf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
t0 = time()
nmf = NMF(
    n_components=n_components,
    random_state=1,
    alpha=.1,
    l1_ratio=.5,
)
nmf.fit(tfidf)  # fit() returns the estimator itself, so `nmf` is the fitted model
print("done in %0.3fs." % (time() - t0))

print("\nTopics in NMF model (Frobenius norm):")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)

# NMF again, this time minimising the generalized Kullback-Leibler divergence
# with the multiplicative-update ('mu') solver.
print("Fitting the NMF model (generalized Kullback-Leibler divergence) with "
      "tf-idf features, n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
t0 = time()
nmf = NMF(
    n_components=n_components,
    random_state=1,
    beta_loss='kullback-leibler',
    solver='mu',
    max_iter=1000,
    alpha=.1,
    l1_ratio=.5,
)
nmf.fit(tfidf)  # fit() returns the estimator itself, so `nmf` is the fitted model
print("done in %0.3fs." % (time() - t0))

print("\nTopics in NMF model (generalized Kullback-Leibler divergence):")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)

# LDA on the raw term-count matrix, using the online learning method.
print("Fitting LDA models with tf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
lda = LatentDirichletAllocation(
    n_components=n_components,
    learning_method='online',
    learning_offset=50.,
    max_iter=5,
    random_state=0,
)
# Only the fit itself is timed; the estimator is constructed beforehand.
t0 = time()
lda.fit(tf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in LDA model:")
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)

Loading dataset...
done in 3.624s.
Extracting tf-idf features for NMF...
done in 0.804s.
Extracting tf features for LDA...
done in 1.009s.

Fitting the NMF model (Frobenius norm) with tf-idf features, n_samples=2000 and n_features=1000...
done in 0.518s.

Topics in NMF model (Frobenius norm):
Topic #0: just people don think like know time good make way really say right ve want did ll new use years
Topic #1: windows use dos using window program os drivers application help software pc running ms screen files version card code work
Topic #2: god jesus bible faith christian christ christians does heaven sin believe lord life church mary atheism belief human love religion
Topic #3: thanks know does mail advance hi info interested email anybody looking card help like appreciated information send list video need
Topic #4: car cars tires miles 00 new engine insurance price condition oil power speed good 000 brake year models used bought
Topic #5: edu soon com send university internet mit ftp m

In [2]:
# Toy corpus for the hand-rolled topic model below: each inner list is one
# document, and each string (which may contain spaces) is treated as a single
# "word" of that document.
documents = [
    ["Hadoop", "Big Data", "HBase", "Java", "Spark", "Storm", "Cassandra"],
    ["NoSQL", "MongoDB", "Cassandra", "HBase", "Postgres"],
    ["Python", "scikit-learn", "scipy", "numpy", "statsmodels", "pandas"],
    ["R", "Python", "statistics", "regression", "probability"],
    ["machine learning", "regression", "decision trees", "libsvm"],
    ["Python", "R", "Java", "C++", "Haskell", "programming languages"],
    ["statistics", "probability", "mathematics", "theory"],
    ["machine learning", "scikit-learn", "Mahout", "neural networks"],
    ["neural networks", "deep learning", "Big Data", "artificial intelligence"],
    ["Hadoop", "Java", "MapReduce", "Big Data"],
    ["statistics", "R", "statsmodels"],
    ["C++", "deep learning", "artificial intelligence", "probability"],
    ["pandas", "R", "Python"],
    ["databases", "HBase", "Postgres", "MySQL", "MongoDB"],
    ["libsvm", "regression", "support vector machines"]
]

In [3]:
from collections import Counter

print(len(documents))
# document_topic_counts[d][topic] will hold how many words of document d are
# assigned to that topic; start with one empty Counter per document.
document_topic_counts = [Counter() for _ in range(len(documents))]
print(document_topic_counts)

15
[Counter(), Counter(), Counter(), Counter(), Counter(), Counter(), Counter(), Counter(), Counter(), Counter(), Counter(), Counter(), Counter(), Counter(), Counter()]


In [4]:
# Number of topics the model is asked to find.
K = 3
# topic_word_counts[topic][word] will hold how often a word is assigned to
# that topic; start with one empty Counter per topic.
topic_word_counts = [Counter() for _topic in range(K)]
print(topic_word_counts)

[Counter(), Counter(), Counter()]


In [5]:
# topic_counts[topic] will hold the total number of words assigned to that
# topic; every topic starts at zero.
topic_counts = [0] * K
print(topic_counts)

[0, 0, 0]


In [6]:
# document_lengths[d] is the number of words in document d.
document_lengths = list(map(len, documents))
print(document_lengths)

[7, 5, 6, 5, 4, 6, 4, 4, 4, 4, 3, 4, 3, 5, 3]


In [7]:
# Vocabulary: every distinct word that appears in any document, and its size W.
distinct_words = {word for document in documents for word in document}
W = len(distinct_words)

print(distinct_words)
print(W)

{'statistics', 'R', 'artificial intelligence', 'MongoDB', 'Spark', 'neural networks', 'machine learning', 'libsvm', 'Mahout', 'deep learning', 'Hadoop', 'Haskell', 'Postgres', 'Python', 'pandas', 'regression', 'mathematics', 'MapReduce', 'NoSQL', 'Big Data', 'statsmodels', 'MySQL', 'Java', 'Cassandra', 'probability', 'decision trees', 'C++', 'support vector machines', 'scikit-learn', 'databases', 'theory', 'scipy', 'programming languages', 'HBase', 'numpy', 'Storm'}
36


In [8]:
# D: the number of documents in the corpus.
D = len(documents)
print(D)

15


In [9]:
def p_topic_given_document(topic, d, alpha=0.1):
    """Smoothed share of document _d_'s words currently assigned to _topic_.

    Reads the notebook-level ``document_topic_counts``, ``document_lengths``
    and ``K``. ``alpha`` is added to the topic's count (and ``K * alpha`` to
    the document length) so that a topic with no words in the document still
    gets a non-zero probability.
    """
    smoothed_count = document_topic_counts[d][topic] + alpha
    smoothed_length = document_lengths[d] + K * alpha
    return smoothed_count / smoothed_length

def p_word_given_topic(word, topic, beta=0.1):
    """Smoothed share of the words assigned to _topic_ that are _word_.

    Reads the notebook-level ``topic_word_counts``, ``topic_counts`` and
    ``W``. ``beta`` is added to the word's count (and ``W * beta`` to the
    topic total) so that a word never seen in the topic still gets a
    non-zero probability.
    """
    smoothed_count = topic_word_counts[topic][word] + beta
    smoothed_total = topic_counts[topic] + W * beta
    return smoothed_count / smoothed_total

In [10]:
def topic_weight(d, word, k):
    """Unnormalised weight of topic _k_ for _word_ in document _d_.

    The weight is the product of how likely the word is under the topic and
    how likely the topic is within the document.
    """
    word_given_topic = p_word_given_topic(word, k)
    topic_given_document = p_topic_given_document(k, d)
    return word_given_topic * topic_given_document

def choose_new_topic(d, word):
    """Draw a topic for _word_ in document _d_.

    Builds one weight per topic (0 .. K-1) with topic_weight() and hands the
    list to sample_from(), returning whatever index it picks.
    """
    weights = []
    for k in range(K):
        weights.append(topic_weight(d, word, k))
    return sample_from(weights)

In [11]:
import random

# Seed the generator so the random starting assignment is reproducible.
random.seed(0)
# One random topic per word, drawn document by document in reading order.
document_topics = [[random.randrange(K) for _ in document]
                   for document in documents]
print(document_topics)

# Populate the count tables from the initial assignment.
for d in range(D):
    # Per-document topic totals can be tallied in one call.
    document_topic_counts[d].update(document_topics[d])
    for word, topic in zip(documents[d], document_topics[d]):
        topic_word_counts[topic][word] += 1
        topic_counts[topic] += 1

# Each table is printed after a blank separator line.
print("", document_topic_counts, sep="\n")
print("", topic_word_counts, sep="\n")
print("", topic_counts, sep="\n")

[[1, 1, 0, 1, 2, 1, 1], [1, 1, 1, 2, 0], [2, 0, 1, 0, 0, 2], [1, 2, 2, 2, 0], [1, 0, 2, 0], [2, 1, 1, 2, 0, 1], [1, 1, 2, 2], [0, 2, 1, 1], [2, 1, 0, 2], [0, 0, 2, 1], [2, 2, 2], [0, 2, 1, 1], [0, 2, 1], [2, 0, 0, 2, 0], [0, 0, 2]]

[Counter({1: 5, 0: 1, 2: 1}), Counter({1: 3, 2: 1, 0: 1}), Counter({0: 3, 2: 2, 1: 1}), Counter({2: 3, 1: 1, 0: 1}), Counter({0: 2, 1: 1, 2: 1}), Counter({1: 3, 2: 2, 0: 1}), Counter({1: 2, 2: 2}), Counter({1: 2, 0: 1, 2: 1}), Counter({2: 2, 1: 1, 0: 1}), Counter({0: 2, 2: 1, 1: 1}), Counter({2: 3}), Counter({1: 2, 0: 1, 2: 1}), Counter({0: 1, 2: 1, 1: 1}), Counter({0: 3, 2: 2}), Counter({0: 2, 2: 1})]

[Counter({'HBase': 2, 'Postgres': 2, 'regression': 2, 'libsvm': 2, 'scikit-learn': 1, 'numpy': 1, 'statsmodels': 1, 'probability': 1, 'Haskell': 1, 'machine learning': 1, 'Big Data': 1, 'Hadoop': 1, 'Java': 1, 'C++': 1, 'pandas': 1, 'MongoDB': 1}), Counter({'Big Data': 2, 'Java': 2, 'Cassandra': 2, 'R': 2, 'probability': 2, 'Hadoop': 1, 'Storm': 1, 'NoSQL': 

In [12]:
def sample_from(weights):
    """Return index i with probability weights[i] / sum(weights).

    weights -- non-empty iterable of non-negative numbers (a generator is
               accepted; it is materialised once).

    Raises ValueError if no weights are given.
    """
    weights = list(weights)
    if len(weights) == 0:
        raise ValueError("sample_from() needs at least one weight")

    total = sum(weights)
    rnd = total * random.random()      # uniform between 0 and total
    fallback = None                    # last index seen with a positive weight
    for i, w in enumerate(weights):
        rnd -= w                       # return the smallest i such that
        if rnd <= 0:                   # weights[0] + ... + weights[i] >= rnd
            return i
        if w > 0:
            fallback = i

    # Floating-point rounding can leave a tiny positive remainder after the
    # last subtraction. Without this fallback the function would return None
    # and a None "topic" would corrupt the count tables, so return the last
    # index that had a positive weight instead.
    return fallback if fallback is not None else len(weights) - 1

In [13]:
# Resample the topic of every word, 1000 sweeps over the whole corpus.
# The loop variable is named `iteration` rather than `iter` so the builtin
# iter() is not shadowed in the notebook's global namespace.
for iteration in range(1000):
    for d in range(D):
        for i, (word, topic) in enumerate(zip(documents[d],
                                              document_topics[d])):

            # remove this word / topic from the counts
            # so that it doesn't influence the weights
            document_topic_counts[d][topic] -= 1
            topic_word_counts[topic][word] -= 1
            topic_counts[topic] -= 1
            document_lengths[d] -= 1

            # choose a new topic based on the weights
            new_topic = choose_new_topic(d, word)
            document_topics[d][i] = new_topic

            # and now add it back to the counts
            document_topic_counts[d][new_topic] += 1
            topic_word_counts[new_topic][word] += 1
            topic_counts[new_topic] += 1
            document_lengths[d] += 1

In [26]:
# For every word position, look up the smoothed probabilities implied by its
# current topic assignment and print the topic-given-document value.
for d in range(D):
    for i, (word, topic) in enumerate(zip(documents[d], document_topics[d])):
        # pwz: p_word_given_topic for this word under its assigned topic.
        # It is computed but only shown by the disabled debug print below.
        pwz = p_word_given_topic(word, topic)
#         print('Word {} from topic {} given {}'.format(word, topic, pwz))

        # pzd: p_topic_given_document for the assigned topic in document d.
        pzd = p_topic_given_document(topic, d)
        print('Topic {} from document {} given {}'.format(topic, d, pzd))

Topic 1 from document 0 given 0.9726027397260274
Topic 1 from document 0 given 0.9726027397260274
Topic 1 from document 0 given 0.9726027397260274
Topic 1 from document 0 given 0.9726027397260274
Topic 1 from document 0 given 0.9726027397260274
Topic 1 from document 0 given 0.9726027397260274
Topic 1 from document 0 given 0.9726027397260274
Topic 1 from document 1 given 0.9622641509433962
Topic 1 from document 1 given 0.9622641509433962
Topic 1 from document 1 given 0.9622641509433962
Topic 1 from document 1 given 0.9622641509433962
Topic 1 from document 1 given 0.9622641509433962
Topic 0 from document 2 given 0.4920634920634921
Topic 0 from document 2 given 0.4920634920634921
Topic 2 from document 2 given 0.4920634920634921
Topic 0 from document 2 given 0.4920634920634921
Topic 2 from document 2 given 0.4920634920634921
Topic 2 from document 2 given 0.4920634920634921
Topic 2 from document 3 given 0.5849056603773585
Topic 0 from document 3 given 0.39622641509433965
Topic 2 from docume

In [15]:
# List each topic's words from most to least frequent.
for k, word_counts in enumerate(topic_word_counts):
    for word, count in word_counts.most_common():
        # Entries whose count is no longer positive are skipped.
        if count <= 0:
            continue
        print(k, word, count)

0 regression 3
0 scikit-learn 2
0 libsvm 2
0 machine learning 2
0 C++ 2
0 neural networks 2
0 artificial intelligence 2
0 deep learning 2
0 Python 2
0 numpy 1
0 Haskell 1
0 decision trees 1
0 support vector machines 1
0 Mahout 1
1 Big Data 3
1 Java 3
1 HBase 3
1 Hadoop 2
1 Cassandra 2
1 MongoDB 2
1 Postgres 2
1 Storm 1
1 NoSQL 1
1 Spark 1
1 MapReduce 1
1 databases 1
1 MySQL 1
2 R 4
2 statistics 3
2 probability 3
2 Python 2
2 pandas 2
2 statsmodels 2
2 mathematics 1
2 theory 1
2 scipy 1
2 programming languages 1


In [16]:
# Display label for each topic index.
topic_names = ["Topik 0: ",
               "Topik 1: ",
               "Topik 2: "]
# The loop variable must not be called `topic_counts`: that name is the global
# per-topic totals list read by p_word_given_topic(), and rebinding it here
# would silently break any later re-run of the sampling cells.
for document, doc_topic_counts in zip(documents, document_topic_counts):
    print(document)
    # Topics of this document, most frequent first; skip topics with no words.
    for topic, count in doc_topic_counts.most_common():
        if count > 0:
            print(topic_names[topic], count)

['Hadoop', 'Big Data', 'HBase', 'Java', 'Spark', 'Storm', 'Cassandra']
Topik 1:  7
['NoSQL', 'MongoDB', 'Cassandra', 'HBase', 'Postgres']
Topik 1:  5
['Python', 'scikit-learn', 'scipy', 'numpy', 'statsmodels', 'pandas']
Topik 2:  3
Topik 0:  3
['R', 'Python', 'statistics', 'regression', 'probability']
Topik 2:  3
Topik 0:  2
['machine learning', 'regression', 'decision trees', 'libsvm']
Topik 0:  4
['Python', 'R', 'Java', 'C++', 'Haskell', 'programming languages']
Topik 2:  3
Topik 0:  2
Topik 1:  1
['statistics', 'probability', 'mathematics', 'theory']
Topik 2:  4
['machine learning', 'scikit-learn', 'Mahout', 'neural networks']
Topik 0:  4
['neural networks', 'deep learning', 'Big Data', 'artificial intelligence']
Topik 0:  3
Topik 1:  1
['Hadoop', 'Java', 'MapReduce', 'Big Data']
Topik 1:  4
['statistics', 'R', 'statsmodels']
Topik 2:  3
['C++', 'deep learning', 'artificial intelligence', 'probability']
Topik 0:  3
Topik 2:  1
['pandas', 'R', 'Python']
Topik 2:  3
['databases', 'HBa