In [24]:
from __future__ import print_function
from json import JSONDecoder
from time import time
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.datasets import fetch_20newsgroups

# Topic extraction: build TF-IDF features from a JSON corpus, factorize them
# with NMF, and print the highest-weighted terms of every topic.

n_samples = 1000      # upper bound on the number of documents vectorized
n_features = 1000     # passed to TfidfVectorizer as max_features
n_topics = 10         # number of NMF components
n_top_words = 50      # terms printed per topic

t0 = time()
print("Loading dataset and extracting TF-IDF features...")

# Context managers guarantee the files are closed even when reading or
# JSON decoding raises.
with open("documents.txt", "r") as file_documents:
    documents = JSONDecoder().decode(file_documents.read())

# Undecodable bytes in the stop-word list are replaced instead of raising.
with open("stopwords.txt", "r", errors="replace") as file_stopwords:
    stopwords = file_stopwords.read().split()

# Only the values of the decoded JSON object are used as the corpus; the
# keys are ignored.
dataset = list(documents.values())

vectorizer = TfidfVectorizer(
    max_df=0.95, min_df=2, max_features=n_features, stop_words=stopwords)

tfidf = vectorizer.fit_transform(dataset[:n_samples])

print("done in %0.3fs." % (time() - t0))
print()

print("Fitting the NMF model with n_samples=%d and n_features=%d..." %
      (n_samples, n_features))

# Restart the clock so the next timing covers only the NMF fit rather than
# the loading and vectorization steps as well.
t0 = time()
nmf = NMF(n_components=n_topics, random_state=1).fit(tfidf)

print("done in %0.3fs." % (time() - t0))

# get_feature_names() was removed in scikit-learn 1.2; prefer its
# replacement and fall back to the old method on releases that lack it.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

print()

for topic_idx, topic in enumerate(nmf.components_):
    print("Topic #%d:" % topic_idx)
    # argsort() is ascending, so the reversed tail slice yields the indices
    # of the n_top_words largest weights, largest first.
    top_indices = topic.argsort()[:-n_top_words - 1:-1]
    print(" ".join([feature_names[i] for i in top_indices]))
    print()


Loading dataset and extracting TF-IDF features...
done in 0.114s.

Fitting the NMF model with n_samples=1000 and n_features=1000...
done in 0.290s.

Topic #0:
000 cts shr revs qtr oper 4th mths avg shrs mln 1st gain 2nd excludes extraordinary share sales gains ct preferred discontinued tonnes 107 110 resources operations credits 151 700 0000 sale 130 101 accounting health industries 04 communications adjusted mail gold ford credit data 140 investment change presidential 120

Topic #1:
mln dlrs sales shr shrs avg dlr profit share tax 4th rose stg extraordinary revenues operating qtr loans earnings gain reserves assets cents borrowings revs loan fiscal months reported income 03 150 federal commercial period funds 101 data weeks 06 sale comalco losses 425 cash pre gas restructuring deficit discount

Topic #2:
cts march div record qtly pay prior dividend payout sets franklin quarterly fund mthly income payable tax insured regular raises annual initial declared split share realty trust cali