In [2]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS



newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
vectorizer = CountVectorizer(lowercase=True, stop_words=ENGLISH_STOP_WORDS,analyzer='word', binary=True)
X_train = vectorizer.fit_transform(newsgroups_train.data)

X_train.toarray()
# X_train.shape #(11314, 101322)

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [5]:
from tqdm import tqdm
def lda(n_dk,n_kw,n_k,z,docs,words,alpha,beta,NITER):
    for tek_iter in tqdm(range(NITER)):
        for i in range(N):
            n_dk[docs[i],z[i]]-=1
            n_kw[z[i],words[i]]-=1
            n_k[z[i]]-=1
            p = (n_dk[docs[i], :] + alpha) * (n_kw[:, words[i]] + beta[words[i]]) / (n_k + beta.sum())
            z[i] = np.random.choice(np.arange(K), p=p / p.sum())
            
        
            n_dk[docs[i],z[i]]+=1
            n_kw[z[i],words[i]]+=1
            n_k[z[i]]+=1
    return  n_dk, n_kw,n_k,z

In [3]:
import numpy as np
M=11314 #amount of texts
W=101322 #amount of different words
N=755809#total amount of words in the corpus
K=20 #amount of tags
NITER=30


alpha=2*np.ones(K)
beta=2*np.ones(N)

n_dk=np.zeros(M*K).reshape(M,K) #amount of words in document d assigned to tag k
n_kw=np.zeros(K*W).reshape(K,W)
n_k=np.zeros(K) #total amount of words assigned to tag k

X=X_train.toarray()
docs,words=X.nonzero() #print(len(docs)) #it is N=755809
z=[np.random.choice(K) for i in range(N)] #randomly assign tags to words
for doc, word, cur_z in zip(docs, words, z):
        n_dk[doc, cur_z] += 1
        n_kw[cur_z, word] += 1
        n_k[cur_z] += 1
print(alpha[0:10])

[2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]


In [6]:
n_dk,n_kw,n_k,z=lda(n_dk,n_kw,n_k,z,docs,words,alpha,beta,NITER)
#print(n_kw)

100%|██████████| 30/30 [25:05:20<00:00, 1138.43s/it]    


In [7]:
n_kw_sorted=np.argsort(n_kw, axis=1)
#print(vectorizer.vocabulary_.get('car'))
#print(type(vectorizer.vocabulary_))

my_dict={}
for item in vectorizer.vocabulary_:
    my_dict[vectorizer.vocabulary_[item]]=item

for k in range(K):
    print("tag=",k,end=" ")
    for j in range(10):
        print(my_dict[n_kw_sorted[k,W-1-j]],end=" ")
    print("\n")

tag= 0 68070 ear infrequent bt batch colours 1980 women _k asshole 

tag= 1 like just don know think does people time use good 

tag= 2 zenith geez parameters cii mercury swapped semitic ucc skates seizures 

tag= 3 r1 alexia kkeller winmarks lame confuse devout forsale doublespace appetite 

tag= 4 nickname 2l scrolling breath cutter ovg moh wtc d4 dl 

tag= 5 prelude steady funniest similarly 130mph shameful safer thanx wheel ram 

tag= 6 contradiction sails drug docs lors libtermcap pov cutting design cuts 

tag= 7 negative dialog says nai vpic rosa checker nazi jerry technically 

tag= 8 s4 er undefined whats uranium course udel timmons curiosities fj 

tag= 9 told moto response feature wa amen wether effective ysebaert 8mb 

tag= 10 wonder mpeg nz knows vf comparing reign seldom switches makes 

tag= 11 annoyed stefan csmes pl combo offers choke designs dated tonite 

tag= 12 r_ 99m j6 1o od w8 y8 9s rx b6 

tag= 13 jungle vegetarian shopping 7u xwindows notepad gld rotation eisa 

In [8]:
top_words = np.argsort(n_kw, axis=1)[:, -10:]

for topic in range(20):
    doc = np.zeros((1, X_train.shape[1]))
    for word in top_words[topic]:
        doc[0, word] = 1
    print('Topic {}:\t{}'.format(topic, '\t'.join(vectorizer.inverse_transform(doc)[0])))

Topic 0:	8v	bxn	dk	hm	immaculate	ns	rusty	sells	wing	xi
Topic 1:	1d9	3t	9f9	a7	ax	chunks	ey	qk	vc	wt
Topic 2:	030	e1	iq	mf0	mp	n6	runner	s6	student	vy
Topic 3:	0_	2i	6f	bz	cd	ether	joke	posts	processing	stealth
Topic 4:	5e	6c	dv	dy	e1	ec	f9d	mw	qq	t5
Topic 5:	6j	70	calstate	devils	ei	f0	kt	l2	tigers	wr
Topic 6:	11th	1fpl	4t	5f	a865	giz	hitters	io	m9	tnx
Topic 7:	13s	1t	ei0l	jays	k8	kp	m0	m6	movement	z4
Topic 8:	0l	0t	1u	ck	dy	g8	gk	glover	gt	m6
Topic 9:	bu	gt	gy	hartford	pd	pnei	starter	terrorists	uy	watchman
Topic 10:	1z6e1t	9d	aviv	cal	contacts	drinking	kn	netnews	processor	replies
Topic 11:	does	don	good	just	know	like	people	think	time	use
Topic 12:	0t	2di	au	cz	dial	ladies	marlins	mdi	otc	uj
Topic 13:	5g	a86	bu	ci	fu	gu	ht	nq	pg	seperately
Topic 14:	2f	6g	6v	albicans	b8	cz	gk	mm	tiger	w8
Topic 15:	3h	6l	9e	cu	i5	jg	ml	r6	shrink	steel
Topic 16:	5u	7t	e0	fictional	giz	gn	gx	khz	m1	wa
Topic 17:	0i	4b	dig	f0	g9p	hm	m2	pl	qr_	qv
Topic 18:	c4	md	mv	pointer	rb	rs232	t9	tom	tx	wa
Topic 19