#### Data Preprocessing

In [1]:
from sklearn.datasets import fetch_20newsgroups

dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))
documents = dataset.data

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
# from nltk.corpus import stopwords

no_features = 1000
# stop_words = stopwords.words('english')

# NMF is able to use tf-idf
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(documents)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

# LDA can only use raw term counts for LDA because it is a probabilistic graphical model
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tf = tf_vectorizer.fit_transform(documents)
tf_feature_names = tf_vectorizer.get_feature_names()

#### NMF and LDA with SKlearn

In [5]:
from sklearn.decomposition import NMF, LatentDirichletAllocation

no_topics = 20

# run NMF
nmf = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)

# run LDA
lda = LatentDirichletAllocation(n_components=no_topics, max_iter=5, learning_method='online', learning_offset=50., random_state=0).fit(tf)

#### Displaying and Evaluating Topics

In [12]:
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print(f'Topic {topic_idx}:')
        print(' '.join([feature_names[i] for i in topic.argsort()[:no_top_words -1:1]]))

In [13]:
no_top_words = 10

print('NMF')
display_topics(nmf, tfidf_feature_names, no_top_words)

NMF
Topic 0:
00 serial condition season controller scsi screen sale cx
Topic 1:
00 ma mail making man mark market mass material
Topic 2:
00 military million mit mode model modem money monitor
Topic 3:
00 mind mit mode model modem monitor month months
Topic 4:
instead define defense decided death person personal physical held
Topic 5:
00 members memory men mention mike military million mind
Topic 6:
00 matter max mb mean meaning medical members men
Topic 7:
00 moral motif mouse ms multiple national nature necessary
Topic 8:
00 man manager manual mark mass matter max mb
Topic 9:
00 money month months moral motif mr multiple nasa
Topic 10:
00 needs net network news nhl nice night non
Topic 11:
00 multiple nasa national nature near necessary need needed
Topic 12:
00 money month moral motif mr ms multiple nasa
Topic 13:
00 night non normal north note nsa number numbers
Topic 14:
00 model modem monitor month months moral motif mouse
Topic 15:
00 million mind mit mode modem monitor month mora

In [14]:
print('LDA')
display_topics(lda, tf_feature_names, no_top_words)

LDA
Topic 0:
145 orbit giz gk a86 bxn lk encryption bhj
Topic 1:
g9v 6ei 1d9 2tm gk giz w7 1t 3t
Topic 2:
75u soviet escrow 2tm russian 2di christians keyboard protect
Topic 3:
d9 red church bxn christ b8f 7u bh atheism
Topic 4:
security peace congress printer talking father giving act bxn
Topic 5:
insurance b8f fbi w7 turkish 7ey 55 75u gk
Topic 6:
crime giz turkish 3t israeli 0d published 75u enforcement
Topic 7:
ide clipper bus unit orbit 1d9 g9v drives 34u
Topic 8:
3d cx bhj pts clipper x11 giz 1t 3t
Topic 9:
d9 0d turkish firearms turkey turks bh 2tm wm
Topic 10:
0t 7ey armenians oh religion truth chz jews gm
Topic 11:
voice clipper key wire chip installed guns shuttle turkey
Topic 12:
3t chz 7ey g9v pts gk la x11 2di
Topic 13:
shall gm nature pp gets bike christians escrow enforcement
Topic 14:
75u turkish turks father bxn cx 6ei gm 34u
Topic 15:
7u escrow 3t armenians citizens stephanopoulos clipper 0t board
Topic 16:
atheism authority wire belief ax a86 7u christians scsi
Topic