In [1]:
from tqdm import tqdm

def lda(X, n_topics, alpha, beta, n_iter=10):
    n_kw = np.zeros((n_topics, X.shape[1]))
    n_dk = np.zeros((X.shape[0], n_topics))
    n_k = np.zeros(n_topics)
    
    docs, words = X.nonzero()
    z = np.random.choice(n_topics, len(docs))
    
    for doc, word, cur_z in zip(docs, words, z):
        n_dk[doc, cur_z] += 1
        n_kw[cur_z, word] += 1
        n_k[cur_z] += 1
    
    for cur_iter in tqdm(range(n_iter)):
        for i in range(len(docs)):
            cur_word = words[i]
            cur_doc = docs[i]
            cur_topic = z[i]
            
            n_dk[cur_doc, cur_topic] -= 1
            n_kw[cur_topic, cur_word] -= 1
            n_k[cur_topic] -= 1
            
            p = (n_dk[cur_doc, :] + alpha) * (n_kw[:, cur_word] + beta[cur_word]) / \
                (n_k + beta.sum())
            z[i] = np.random.choice(np.arange(n_topics), p=p / p.sum())
            
            n_dk[cur_doc, z[i]] += 1
            n_kw[z[i], cur_word] += 1
            n_k[z[i]] += 1
    
    return z, n_kw, n_dk, n_k

In [3]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS

newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
vectorizer = CountVectorizer(lowercase=True, stop_words=ENGLISH_STOP_WORDS,analyzer='word', binary=True)
X_train = vectorizer.fit_transform(newsgroups_train.data)

X_train.toarray()
X_train.shape

n_topics = 20
z, n_kw, n_dk, n_k = lda(X_train, n_topics, 2 * np.ones(n_topics), \
                         2 * np.ones(X_train.shape[1]), 30)

100%|██████████| 30/30 [36:41<00:00, 68.83s/it]


In [4]:
top_words = np.argsort(n_kw, axis=1)[:, -10:]

for topic in range(20):
    doc = np.zeros((1, X_train.shape[1]))
    for word in top_words[topic]:
        doc[0, word] = 1
    print('Topic {}:\t{}'.format(topic, '\t'.join(vectorizer.inverse_transform(doc)[0])))

Topic 0:	1968	chemical	diagnosed	ensures	ford	market	nixon	timmons	ucs	yeah
Topic 1:	1174	amps	avail	devices	gl	involved	modems	open	pro	verify
Topic 2:	cards	deleted	generators	goals	minutes	rip	secondly	stop	trigger	ve
Topic 3:	classic	course	games	hi	hint	honda	laserjet	saw	soundblaster	think
Topic 4:	bd	bout	ca	don	monitor	oh	park	resistors	sincerely	walter
Topic 5:	685	brothers	does	graphics	internet	latest	looks	repost	runners	zip
Topic 6:	brad	cutting	dances	don	em	hr	pointer	reverse	seat	slight
Topic 7:	610	64	air	food	necessarily	powerpc	ride	shaft	sorry	wc
Topic 8:	application	deletion	edu	explosive	library	max	motorola	religious	ve	vi
Topic 9:	900	cica	dsl	looking	manuals	orientation	soon	talking	turbo	twm
Topic 10:	chapter	com	explaining	ftp	ken	mentioned	nambla	option	otto	thanks
Topic 11:	105	713	beer	books	compile	microsoft	mono	quality	san	wondering
Topic 12:	14	34	m4	mb	mc	mi	mn	mq	mw	wm
Topic 13:	appreciate	cheers	clock	compatible	gets	like	obo	pc	tom	utility
Topic 14