In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import NMF, LatentDirichletAllocation

def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted terms of each topic in ``model``.

    For every row of ``model.components_`` a ``Topic <index>:`` header is
    printed, followed by one line holding the selected terms separated by
    single spaces, strongest first.

    Args:
        model: object exposing ``components_``; each row yielded by iterating
            it must provide ``argsort()``.
        feature_names: sequence indexed with the positions returned by
            ``argsort()``; its items must be strings for the join to work.
        no_top_words: number of terms to print per topic.
    """
    for topic_no, weights in enumerate(model.components_):
        print("Topic %d:" % topic_no)
        # argsort() is ascending: reverse it, then keep the leading entries.
        strongest_first = weights.argsort()[::-1][:no_top_words]
        top_terms = [feature_names[pos] for pos in strongest_first]
        print(" ".join(top_terms))

# Load the 20 Newsgroups posts in a fixed shuffled order (random_state=1),
# with headers, footers and quoted replies removed from each post.
dataset = fetch_20newsgroups(
    shuffle=True,
    random_state=1,
    remove=('headers', 'footers', 'quotes'),
)
# Keep only the text of the posts for the vectorizers used later on.
documents = dataset.data

In [11]:
# Preview the first three documents, each one preceded by its index.
for i in range(3):
    print(i, documents[i], sep="\n")

0
Well i'm not sure about the story nad it did seem biased. What
I disagree with is your statement that the U.S. Media is out to
ruin Israels reputation. That is rediculous. The U.S. media is
the most pro-israeli media in the world. Having lived in Europe
I realize that incidences such as the one described in the
letter have occured. The U.S. media as a whole seem to try to
ignore them. The U.S. is subsidizing Israels existance and the
Europeans are not (at least not to the same degree). So I think
that might be a reason they report more clearly on the
atrocities.
	What is a shame is that in Austria, daily reports of
the inhuman acts commited by Israeli soldiers and the blessing
received from the Government makes some of the Holocaust guilt
go away. After all, look how the Jews are treating other races
when they got power. It is unfortunate.

1







Yeah, do you expect people to read the FAQ, etc. and actually accept hard
atheism?  No, you need a little leap of faith, Jimmy.  Your lo

In [15]:
no_features = 1000


def _feature_names_of(vectorizer):
    """Return the fitted vocabulary of ``vectorizer`` as a list of terms.

    ``get_feature_names()`` was deprecated in scikit-learn 1.0 and removed in
    1.2 in favour of ``get_feature_names_out()``; use whichever the installed
    version provides so the cell runs on both old and new releases.
    """
    if hasattr(vectorizer, "get_feature_names_out"):
        return list(vectorizer.get_feature_names_out())
    return vectorizer.get_feature_names()


# NMF is able to use tf-idf
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(documents)
tfidf_feature_names = _feature_names_of(tfidf_vectorizer)

# LDA needs raw term counts because it is a probabilistic graphical model
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tf = tf_vectorizer.fit_transform(documents)
tf_feature_names = _feature_names_of(tf_vectorizer)

no_topics = 20

# Run NMF
nmf_alpha = .1
try:
    # Releases that still accept the single `alpha` regularisation knob.
    nmf_model = NMF(n_components=no_topics, random_state=1, alpha=nmf_alpha, l1_ratio=.5, init='nndsvd')
except TypeError:
    # Newer releases dropped `alpha` in favour of `alpha_W` / `alpha_H`.
    # NOTE(review): the documented objective of those releases multiplies the
    # W penalty by n_features and the H penalty by n_samples, so the values
    # are divided here to approximate the old unscaled penalty -- confirm
    # against the installed scikit-learn version.
    n_docs, n_terms = tfidf.shape
    nmf_model = NMF(n_components=no_topics, random_state=1,
                    alpha_W=nmf_alpha / n_terms, alpha_H=nmf_alpha / n_docs,
                    l1_ratio=.5, init='nndsvd')
nmf = nmf_model.fit(tfidf)

# Run LDA
lda = LatentDirichletAllocation(n_components=no_topics, max_iter=5, learning_method='online', learning_offset=50.,random_state=0).fit(tf)

# Print the top words per topic: all NMF topics first, then all LDA topics.
no_top_words = 10
display_topics(nmf, tfidf_feature_names, no_top_words)
display_topics(lda, tf_feature_names, no_top_words)

Topic 0:
people time right did good said say make way government
Topic 1:
window problem using server application screen display motif manager running
Topic 2:
god jesus bible christ faith believe christian christians sin church
Topic 3:
game team year games season players play hockey win league
Topic 4:
new 00 sale 10 price offer shipping condition 20 15
Topic 5:
thanks mail advance hi looking info help information address appreciated
Topic 6:
windows file files dos program version ftp ms directory running
Topic 7:
edu soon cs university ftp internet article email pub david
Topic 8:
key chip clipper encryption keys escrow government public algorithm nsa
Topic 9:
drive scsi drives hard disk ide floppy controller cd mac
Topic 10:
just ll thought tell oh little fine work wanted mean
Topic 11:
does know anybody mean work say doesn help exist program
Topic 12:
card video monitor cards drivers bus vga driver color memory
Topic 13:
like sounds looks look bike sound lot things really thing
To

In [16]:
# Print, for every document, the index of its highest-scoring LDA topic.
doc_topic = lda.transform(tf)
# One argmax along the topic axis replaces the per-row lookup.
for n, topic_most_pr in enumerate(doc_topic.argmax(axis=1)):
    print("doc: {} topic: {}\n".format(n, topic_most_pr))

doc: 0 topic: 7

doc: 1 topic: 8

doc: 2 topic: 1

doc: 3 topic: 3

doc: 4 topic: 2

doc: 5 topic: 8

doc: 6 topic: 1

doc: 7 topic: 8

doc: 8 topic: 8

doc: 9 topic: 9

doc: 10 topic: 10

doc: 11 topic: 9

doc: 12 topic: 4

doc: 13 topic: 15

doc: 14 topic: 8

doc: 15 topic: 19

doc: 16 topic: 0

doc: 17 topic: 16

doc: 18 topic: 3

doc: 19 topic: 15

doc: 20 topic: 8

doc: 21 topic: 8

doc: 22 topic: 0

doc: 23 topic: 0

doc: 24 topic: 19

doc: 25 topic: 15

doc: 26 topic: 0

doc: 27 topic: 3

doc: 28 topic: 4

doc: 29 topic: 8

doc: 30 topic: 12

doc: 31 topic: 1

doc: 32 topic: 6

doc: 33 topic: 15

doc: 34 topic: 18

doc: 35 topic: 8

doc: 36 topic: 8

doc: 37 topic: 15

doc: 38 topic: 14

doc: 39 topic: 6

doc: 40 topic: 14

doc: 41 topic: 19

doc: 42 topic: 1

doc: 43 topic: 6

doc: 44 topic: 0

doc: 45 topic: 10

doc: 46 topic: 6

doc: 47 topic: 9

doc: 48 topic: 8

doc: 49 topic: 13

doc: 50 topic: 8

doc: 51 topic: 0

doc: 52 topic: 2

doc: 53 topic: 8

doc: 54 topic: 1

doc: