In [11]:
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models
import gensim
from pprint import pprint

# --- Text-processing tools ---------------------------------------------------
# Tokenizer that splits text on the regex \w+ (runs of word characters).
tokenizer = RegexpTokenizer(r'\w+')
# English stop-word list supplied by the stop_words package.
en_stop = get_stop_words('en')
# Porter stemmer instance, used later to stem every remaining token.
p_stemmer = PorterStemmer()

# --- Sample documents --------------------------------------------------------
# Alternative example corpora, kept disabled for experimentation:
# doc_a = "Brocolli is good to eat. My brother likes to eat good brocolli, but not my mother."
# doc_b = "My mother spends a lot of time driving my brother around to baseball practice."
# doc_c = "Some health experts suggest that driving may cause increased tension and blood pressure."
# doc_d = "I often feel pressure to perform well at school, but my mother never seems to drive my brother to do better."
# doc_e = "Health professionals say that brocolli is good for your health."
# doc_i = "Fish Fish Eat Eat Vegetables"
# doc_j = "Fish Fish Milk Kitten Kitten"
doc_f, doc_g, doc_h = (
    "I eat fish and vegetables.",
    "Dog and fish are pets.",
    "My kitten eats fish.",
)

# The collection that the rest of the notebook processes.
# Disabled alternatives:
#   doc_set = [doc_a, doc_b, doc_c, doc_d, doc_e]
#   doc_set = [doc_i, doc_j]
doc_set = [doc_f, doc_g, doc_h]

# Stop words as a set: each membership test is a hash lookup instead of a scan
# of the whole list. Built locally so this cell works with en_stop as a list.
stop_set = set(en_stop)

# One list of stemmed, stop-word-free tokens per document, in doc_set order.
# Re-created on every run so re-executing the cell never appends duplicates.
texts = []

for doc in doc_set:
    # Lower-case the document, then split it into word tokens.
    tokens = tokenizer.tokenize(doc.lower())

    # Drop stop words. The comprehension uses its own variable name so it
    # cannot rebind the outer loop variable (comprehension variables leak
    # into the enclosing scope on Python 2).
    kept_tokens = [token for token in tokens if token not in stop_set]

    # Stem what is left and record it as this document's token list.
    texts.append([p_stemmer.stem(token) for token in kept_tokens])

# Build the id <-> term dictionary from the tokenized documents.
dictionary = corpora.Dictionary(texts)

# Convert each tokenized document with doc2bow, giving one entry per document.
corpus = [dictionary.doc2bow(text) for text in texts]

# The LDA model itself is generated in the next cell.


In [18]:
# Fixed seed so that re-running this cell learns the same topics. Without it
# LDA starts from a different random state on every run, and the topic
# weights (and even which topic is "0" or "1") change between executions.
LDA_SEED = 0

# Train a 2-topic LDA model on the bag-of-words corpus, making 20 passes over
# the documents. gensim is already imported in the notebook's first cell, so
# it is not re-imported here.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=2,
    id2word=dictionary,
    passes=20,
    random_state=LDA_SEED,
)
# To inspect the learned topics: pprint(ldamodel.print_topics(num_topics=2))

'running online LDA training, 2 topics, 20 passes over the supplied corpus of 3 documents, updating model once every 3 documents, evaluating perplexity every 3 documents, iterating 50x with a convergence threshold of 0.001'
0
[(0, u'0.287*"fish" + 0.177*"eat" + 0.153*"pet" + 0.152*"dog"'),
 (1, u'0.240*"fish" + 0.230*"eat" + 0.201*"veget" + 0.111*"kitten"')]
1
[(0, u'0.285*"fish" + 0.167*"eat" + 0.161*"pet" + 0.160*"dog"'),
 (1, u'0.245*"eat" + 0.242*"fish" + 0.220*"veget" + 0.101*"kitten"')]
2
[(0, u'0.283*"fish" + 0.165*"pet" + 0.165*"dog" + 0.161*"kitten"'),
 (1, u'0.254*"eat" + 0.244*"fish" + 0.228*"veget" + 0.095*"kitten"')]
3
[(0, u'0.282*"fish" + 0.168*"pet" + 0.168*"dog" + 0.163*"kitten"'),
 (1, u'0.260*"eat" + 0.245*"fish" + 0.231*"veget" + 0.092*"kitten"')]
4
[(0, u'0.282*"fish" + 0.170*"pet" + 0.170*"dog" + 0.165*"kitten"'),
 (1, u'0.265*"eat" + 0.246*"fish" + 0.231*"veget" + 0.090*"kitten"')]
5
[(0, u'0.281*"fish" + 0.172*"pet" + 0.172*"dog" + 0.166*"kitten"'),
 (1, u'0.269

In [9]:
# Display the learned topics with the library's default settings; the cell
# output is a list of (topic id, weighted-term string) pairs.
ldamodel.show_topics()

[(0,
  u'0.279*"eat" + 0.276*"fish" + 0.175*"kitten" + 0.147*"veget" + 0.062*"dog" + 0.062*"pet"'),
 (1,
  u'0.267*"pet" + 0.267*"dog" + 0.137*"veget" + 0.120*"fish" + 0.115*"eat" + 0.094*"kitten"')]

In [64]:
# Top four weighted terms of each of the two topics, as
# (topic id, description string) pairs.
result = ldamodel.print_topics(num_topics=2, num_words=4)
print('1st Topic is : {}\n2nd Topic is : {}\n'.format(result[0][1], result[1][1]))

# Report how strongly every document belongs to each topic.
for doc_number, (text, bow) in enumerate(zip(texts, corpus), 1):
    # Infer the topic mixture once per document and key it by topic id.
    # gensim leaves out topics whose probability falls below its minimum
    # threshold, so indexing the result by list position can raise IndexError
    # or attach a percentage to the wrong topic; a lookup by id with a 0.0
    # default is safe in both cases.
    topic_share = dict(ldamodel[bow])
    print('No.{} Document : {}\n 1st Topic: {:4.2f}%\n 2nd Topic: {:4.2f}%\n'.format(
        doc_number,
        text,
        topic_share.get(0, 0.0) * 100,
        topic_share.get(1, 0.0) * 100,
    ))

1st Topic is : 0.268*"fish" + 0.210*"pet" + 0.210*"dog" + 0.147*"kitten"
2nd Topic is : 0.296*"eat" + 0.265*"fish" + 0.189*"veget" + 0.121*"kitten"

No.1 Document : ['eat', 'fish', u'veget']
 1st Topic: 14.67%
 2nd Topic: 85.33%

No.2 Document : ['dog', 'fish', u'pet']
 1st Topic: 85.44%
 2nd Topic: 14.56%

No.3 Document : ['kitten', u'eat', 'fish']
 1st Topic: 19.95%
 2nd Topic: 80.05%

