LDA

In [1]:
# -*- coding: utf-8 -*-
# Author: Olivier Grisel <olivier.grisel@ensta.org>
#         Lars Buitinck <L.J.Buitinck@uva.nl>
#         Chyi-Kwei Yau <chyikwei.yau@gmail.com>
# License: BSD 3 clause

from __future__ import print_function
from time import time

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.datasets import fetch_20newsgroups

n_samples = 2000
n_features = 1000
n_topics = 10
n_top_words = 20


def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()


# Load the 20 newsgroups dataset and vectorize it. We use a few heuristics
# to filter out useless terms early on: the posts are stripped of headers,
# footers and quoted replies, and common English words, words occurring in
# only one document or in at least 95% of the documents are removed.

print("Loading dataset...")
t0 = time()
dataset = fetch_20newsgroups(shuffle=True, random_state=1,
                             remove=('headers', 'footers', 'quotes'))
data_samples = dataset.data
print("done in %0.3fs." % (time() - t0))

# Use tf-idf features for NMF.
print("Extracting tf-idf features for NMF...")
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, #max_features=n_features,
                                   stop_words='english')
t0 = time()
tfidf = tfidf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))

# Use tf (raw term count) features for LDA.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=n_features,
                                stop_words='english')
t0 = time()
tf = tf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))

# Fit the NMF model
print("Fitting the NMF model with tf-idf features,"
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
t0 = time()
nmf = NMF(n_components=n_topics, random_state=1, alpha=.1, l1_ratio=.5).fit(tfidf)
#exit()
print("done in %0.3fs." % (time() - t0))

print("\nTopics in NMF model:")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)

print("Fitting LDA models with tf features, n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
                                learning_method='online', learning_offset=50.,
                                random_state=0)
t0 = time()
lda.fit(tf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in LDA model:")
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)


Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


Loading dataset...
done in 13.102s.
Extracting tf-idf features for NMF...
done in 2.101s.
Extracting tf features for LDA...
done in 2.048s.
Fitting the NMF model with tf-idf features,n_samples=2000 and n_features=1000...
done in 2.236s.

Topics in NMF model:
Topic #0:
don people just like think know good time ve right make say want really did way new use going ll
Topic #1:
windows file dos files window program use running version ms using problem server pc screen ftp run os application software
Topic #2:
god jesus bible christ faith believe christians christian heaven sin hell life church truth lord say belief does existence man
Topic #3:
geb dsl chastity n3jxp cadre shameful pitt intellect skepticism surrender gordon banks soon edu lyme blood weight patients medical probably
Topic #4:
key chip encryption clipper keys escrow government algorithm security secure encrypted public nsa des enforcement law bit privacy secret use
Topic #5:
drive scsi ide drives disk hard controller floppy hd



done in 17.686s.

Topics in LDA model:
Topic #0:
people gun armenian armenians war turkish states israel said children jews 000 state new guns israeli vs military years american
Topic #1:
government people law mr use president don think right public make state going privacy private security know new rights want
Topic #2:
space program output entry data nasa use science research build section center launch time high earth year rules long satellite
Topic #3:
key car chip used keys bike use bit clipper number phone like cars just engine ground des algorithm good secret
Topic #4:
edu file com available mail ftp files information image send list use version server email pub software cs code window
Topic #5:
god people does jesus say think believe don know just way like true question life time christian did point bible
Topic #6:
windows use drive thanks does problem know card like using db scsi dos disk bit need pc memory mac work
Topic #7:
ax max b8f g9v a86 pl 145 1d9 0t 34u 1t 3t giz bhj 