In [1]:
import numpy
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation


In [2]:
# Load the 20 Newsgroups corpus through scikit-learn's dataset fetcher.
news20 = fetch_20newsgroups()

In [3]:
# Work on a small subset: the first 1000 documents and their labels.
first_docs = slice(1000)
X = news20.data[first_docs]
y = news20.target[first_docs]

In [4]:
X[0][:100]  # first 100 characters of document 0; X holds raw text here, so the (D, V) shape presumably refers to the vectorized X_bow

"From: lerxst@wam.umd.edu (where's my thing)\nSubject: WHAT car is this!?\nNntp-Posting-Host: rac3.wam."

In [5]:
# min_df=0.04 keeps only terms that occur in at least 4% of the documents;
# stop_words="english" drops scikit-learn's built-in English stop-word list.
cv = CountVectorizer(min_df=0.04, stop_words="english")
# Vectorization is unsupervised: fit_transform ignores labels, so none are passed.
X_bow = cv.fit_transform(X)


In [6]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(cv, "get_feature_names_out"):
    vocab_terms = cv.get_feature_names_out()
else:
    vocab_terms = cv.get_feature_names()
# dtype=str keeps a unicode array regardless of which accessor was used.
feature_names = numpy.array(vocab_terms, dtype=str)
feature_names[30:30+10]




array(['address', 'ago', 'agree', 'answer', 'anybody', 'apr', 'area',
       'article', 'ask', 'asked'], dtype='<U12')

In [7]:
K = 50  # number of topics
# Topic-word prior set to the reciprocal of the vocabulary size.
beta = 1/len(feature_names)
# Alternative considered: beta = 1/K
print("beta:", beta)
# random_state pins the initialisation so a fresh Restart & Run All
# reproduces the same topics instead of a different set on every run.
lda = LatentDirichletAllocation(
    n_components=K,
    max_iter=50,
    n_jobs=-1,
    topic_word_prior=beta,
    random_state=0,
)
lda.fit(X_bow)


beta: 0.0029154518950437317


LatentDirichletAllocation(max_iter=50, n_components=50, n_jobs=-1,
                          topic_word_prior=0.0029154518950437317)

In [8]:
# V: number of vocabulary terms in feature_names
len(feature_names)


343

In [9]:
lda.components_.shape       # (K, V): K topics by V vocabulary terms

(50, 343)

In [10]:
def print_components(components, vocab, n_topics=10, n_top_words=5):
    """Print the highest-weighted words of the leading topics.

    Parameters
    ----------
    components : 2-D array-like of shape (K, V)
        One row of word weights per topic; each row must support
        ``argsort`` (e.g. a numpy array).
    vocab : indexable
        Maps an integer word index to its display form (``vocab[idx]``).
    n_topics : int or None, default 10
        How many leading rows of ``components`` to print; ``None`` prints all.
    n_top_words : int or None, default 5
        How many top-weighted words to print per topic; ``None`` prints all.

    Returns
    -------
    None
        Output is written to stdout only.
    """
    for topic_id, weights in enumerate(components[:n_topics]):
        print("component:", topic_id)
        # argsort is ascending, so reverse it to rank by descending weight
        ranked = weights.argsort()[::-1]
        for word_idx in ranked[:n_top_words]:
            print("\t", vocab[word_idx], weights[word_idx])


In [11]:
# Show the top words of the scikit-learn topics.
print_components(components=lda.components_, vocab=feature_names)


component: 0
	 data 90.58271280560483
	 mike 67.02351781433362
	 michael 53.839830786258084
	 group 42.882898710817564
	 just 17.848019783174482
component: 1
	 don 59.067301540644856
	 edu 52.855871856895654
	 know 39.71916780106561
	 like 39.015342643370595
	 opinions 30.60366324730273
component: 2
	 phone 46.75460234823403
	 com 23.38333336325196
	 instead 21.152256340876455
	 point 20.428235867239355
	 number 16.84996981203739
component: 3
	 com 643.297523958372
	 article 126.89494574642613
	 writes 126.64508368170604
	 organization 105.54413028058326
	 lines 102.75139942323652
component: 4
	 good 163.74453027048477
	 50 75.386295284577
	 00 30.961892079759355
	 25 13.595781360727972
	 usually 10.016500366767179
component: 5
	 people 97.46663282973311
	 list 65.57119483295544
	 today 56.97635477224958
	 government 48.389126352939975
	 world 46.62722991403743
component: 6
	 use 65.41738110126722
	 bit 53.93097590426823
	 used 49.51694979498767
	 memory 46.88401985168998
	 possible 46

In [12]:
# First 100 characters of the first raw document.
X[0][:100]

"From: lerxst@wam.umd.edu (where's my thing)\nSubject: WHAT car is this!?\nNntp-Posting-Host: rac3.wam."

In [22]:
from gensim.corpora.dictionary import Dictionary
from gensim.parsing.preprocessing import remove_stopwords
# from gensim.utils import tokenize
from gensim.models import LdaModel

In [23]:
# Full raw text of the first document, headers included.
X[0]

"From: lerxst@wam.umd.edu (where's my thing)\nSubject: WHAT car is this!?\nNntp-Posting-Host: rac3.wam.umd.edu\nOrganization: University of Maryland, College Park\nLines: 15\n\n I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.\n\nThanks,\n- IL\n   ---- brought to you by your neighborhood Lerxst ----\n\n\n\n\n"

In [24]:

# Try gensim's stop-word removal on the first document. In the recorded output
# capitalized words such as "The", "It" and "This" survive, and punctuation is kept.
remove_stopwords(X[0])

"From: lerxst@wam.umd.edu (where's thing) Subject: WHAT car this!? Nntp-Posting-Host: rac3.wam.umd.edu Organization: University Maryland, College Park Lines: 15 I wondering enlighten car I saw day. It 2-door sports car, looked late 60s/ early 70s. It called Bricklin. The doors small. In addition, bumper separate rest body. This I know. If tellme model name, engine specs, years production, car made, history, info funky looking car, e-mail. Thanks, - IL ---- brought neighborhood Lerxst ----"

In [35]:
import re

# Tokens are runs of two or more word characters (the same pattern string as
# scikit-learn's CountVectorizer default token_pattern).
token_pattern = r"(?u)\b\w\w+\b"
_compiled_token_pattern = re.compile(token_pattern)
# tokenizer(text) -> list of every non-overlapping match in text
tokenizer = _compiled_token_pattern.findall


In [59]:
# Import from the public module rather than the private `_stop_words` one,
# whose path is an implementation detail that can move between releases.
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

def filter_sw(tokens: list):
    """Return the tokens whose lowercase form is not in ENGLISH_STOP_WORDS.

    The check is case-insensitive, but surviving tokens keep their
    original casing and order.
    """
    kept = []
    for token in tokens:
        if token.lower() in ENGLISH_STOP_WORDS:
            continue
        kept.append(token)
    return kept

def do_tokenize(text: str):
    """Turn one raw document into a list of filtered tokens.

    Steps, in order: gensim's ``remove_stopwords`` on the raw string,
    regex tokenization with ``tokenizer``, then ``filter_sw`` on the tokens.
    """
    stripped_text = remove_stopwords(text)
    raw_tokens = tokenizer(stripped_text)
    return filter_sw(raw_tokens)


In [60]:
# Tokenize every document, build the gensim vocabulary from the tokens,
# then encode each document as a bag-of-words.
tokenized = list(map(do_tokenize, X))
dct = Dictionary(documents=tokenized)
X_nummed = list(map(dct.doc2bow, tokenized))

In [62]:
# First five tokens of the first document after tokenization and stop-word filtering.
tokenized[0][:5]

['lerxst', 'wam', 'umd', 'edu', 'thing']

In [63]:
# First seven (token_id, count) pairs of the first document's bag-of-words.
dct.doc2bow(tokenized[0])[:7]


[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1)]

In [82]:
# NOTE: this rebinds `lda` (previously the scikit-learn model) to a gensim LdaModel.
# id2word attaches the Dictionary so the model knows the id -> word mapping
# instead of inferring an identity mapping from the corpus;
# random_state makes training repeatable across runs.
lda = LdaModel(
    X_nummed,
    id2word=dct,
    num_topics=K,
    iterations=100,
    alpha="auto",
    eta="auto",
    random_state=0,
)

In [83]:
# NOTE(review): the recorded output shows 10 rows although K = 50 is assigned
# earlier in the notebook — re-run top to bottom to confirm the intended K.
lda.expElogbeta.shape       # (K, V)

(10, 37253)

In [84]:
# Show the top words of the gensim topics; the Dictionary maps ids back to words.
print_components(components=lda.expElogbeta, vocab=dct)

component: 0
	 edu 0.005320049
	 com 0.005196726
	 Lines 0.00485873
	 Subject 0.004308453
	 Organization 0.0033927427
component: 1
	 AX 0.14302205
	 MAX 0.010282509
	 edu 0.007504135
	 Subject 0.004270342
	 Organization 0.0040714443
component: 2
	 com 0.0055519743
	 edu 0.004426515
	 Subject 0.0042640483
	 Organization 0.004191764
	 Lines 0.0038057787
component: 3
	 edu 0.006079315
	 Lines 0.0051687676
	 Subject 0.0051241443
	 Organization 0.005111527
	 com 0.0045508477
component: 4
	 edu 0.0111352
	 Lines 0.0058781747
	 Organization 0.005253405
	 Subject 0.0048217257
	 writes 0.0047382675
component: 5
	 edu 0.0049373577
	 Subject 0.003859734
	 Lines 0.003629669
	 Organization 0.003110671
	 people 0.0021959816
component: 6
	 edu 0.0062674913
	 Subject 0.005220407
	 Organization 0.005022306
	 Lines 0.0047647376
	 com 0.0045420593
component: 7
	 edu 0.005333427
	 com 0.004477421
	 Organization 0.004084646
	 Subject 0.0039406223
	 Lines 0.003889292
component: 8
	 edu 0.012822302
	 Subject