In [1]:
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models
import gensim


In [2]:
# Five short sample documents forming the toy corpus. The spelling "brocolli"
# is part of the data and is kept exactly as written.
doc_a = (
    "Brocolli is good to eat. "
    "My brother likes to eat good brocolli, but not my mother."
)
doc_b = (
    "My mother spends a lot of time driving "
    "my brother around to baseball practice."
)
doc_c = (
    "Some health experts suggest that driving "
    "may cause increased tension and blood pressure."
)
doc_d = (
    "I often feel pressure to perform well at school, "
    "but my mother never seems to drive my brother to do better."
)
doc_e = "Health professionals say that brocolli is good for your health."

In [3]:
# Gather the sample documents into a single list, ordered a through e
doc_set = [
    doc_a,
    doc_b,
    doc_c,
    doc_d,
    doc_e,
]

In [4]:
# Tokenizer driven by the regex \w+ : each token is a run of word characters,
# so punctuation and whitespace never end up inside a token.
tokenizer = RegexpTokenizer(pattern=r'\w+')


In [5]:
# English ('en') stop words as returned by the stop_words package; used below
# to filter tokens before stemming.
en_stop = get_stop_words('en')

In [6]:
# Porter stemmer instance (nltk.stem.porter); its .stem() is applied to every
# token that survives stop-word removal.
p_stemmer = PorterStemmer()

In [7]:
# Preprocess every document in doc_set: lowercase -> tokenize -> drop stop
# words -> stem. `texts` ends up holding one list of stemmed tokens per
# document, in the same order as doc_set.
texts = []

# Build the stop-word lookup once. Membership in a set is O(1), whereas testing
# each token against the original collection may be a linear scan per token.
# Assumes en_stop is an iterable of strings -- the filter result is unchanged.
stop_lookup = set(en_stop)

# Distinct names for the document and the tokens: the original reused `i` for
# the outer loop and for both comprehensions, which obscured what each held.
for document in doc_set:
    raw = document.lower()
    tokens = tokenizer.tokenize(raw)

    stopped_tokens = [token for token in tokens if token not in stop_lookup]
    stemmed_tokens = [p_stemmer.stem(token) for token in stopped_tokens]

    texts.append(stemmed_tokens)

In [8]:
# Build the gensim Dictionary from the preprocessed token lists; it assigns an
# integer id to each distinct token (exposed as dictionary.token2id).
dictionary = corpora.Dictionary(documents=texts)

In [9]:
# Show the token -> integer id mapping held by the dictionary
print(dictionary.token2id)

{'often': 22, 'brother': 1, 'like': 3, 'drive': 8, 'time': 9, 'practic': 10, 'brocolli': 5, 'profession': 30, 'increas': 13, 'well': 23, 'school': 25, 'feel': 27, 'never': 26, 'suggest': 20, 'blood': 21, 'good': 0, 'caus': 19, 'better': 28, 'mother': 2, 'tension': 16, 'spend': 7, 'eat': 4, 'lot': 11, 'seem': 29, 'around': 6, 'pressur': 14, 'expert': 15, 'say': 31, 'health': 17, 'perform': 24, 'may': 18, 'basebal': 12}


In [10]:
# Convert each token list to its bag-of-words form via dictionary.doc2bow,
# keeping the document order of `texts`.
corpus = list(map(dictionary.doc2bow, texts))

In [11]:
# Train a 2-topic LDA model on the bag-of-words corpus, making 20 passes.
# LDA training is randomized; without a fixed seed every Restart & Run All
# yields different topic weights (and possibly swapped topic ids). Pinning
# random_state makes the run reproducible. The seed value itself is arbitrary,
# and previously captured outputs may differ from what this seed produces.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=2,
    id2word=dictionary,
    passes=20,
    random_state=42,
)

In [12]:
# Print both topics, limited to 4 words per topic (num_words=4)
print(ldamodel.print_topics(num_topics=2, num_words=4))

[(0, '0.086*"health" + 0.086*"brocolli" + 0.086*"good" + 0.061*"eat"'), (1, '0.068*"mother" + 0.068*"brother" + 0.068*"drive" + 0.041*"pressur"')]
