In [1]:
# Sample corpus: five short documents, collected in one list first.
doc_set = [
    "Brocolli is good to eat. My brother likes to eat good brocolli, but not my mother.",
    "My mother spends a lot of time driving my brother around to baseball practice.",
    "Some health experts suggest that driving may cause increased tension and blood pressure.",
    "I often feel pressure to perform well at school, but my mother never seems to drive my brother to do better.",
    "Health professionals say that brocolli is good for your health.",
]

# Individual names for each document, in list order.
doc_a, doc_b, doc_c, doc_d, doc_e = doc_set

In [2]:
# tokenization
from nltk.tokenize import RegexpTokenizer
# The pattern \w+ makes every run of word characters one token, so whitespace
# and punctuation only separate tokens and never show up in the result.
tokenizer = RegexpTokenizer(r'\w+')

In [3]:
# Lowercase and tokenize every document of doc_set in a single pass instead of
# repeating the same two statements once per document.
raw_docs = [doc.lower() for doc in doc_set]
doc_tokens = [tokenizer.tokenize(text) for text in raw_docs]

# Keep the per-document names for any cell that refers to a single document.
# The unpacking also fails loudly (ValueError) if doc_set no longer holds
# exactly five documents, instead of silently ignoring the extra ones.
raw, raw1, raw2, raw3, raw4 = raw_docs
tokens, tokens1, tokens2, tokens3, tokens4 = doc_tokens

print(doc_tokens)

[['brocolli', 'is', 'good', 'to', 'eat', 'my', 'brother', 'likes', 'to', 'eat', 'good', 'brocolli', 'but', 'not', 'my', 'mother'], ['my', 'mother', 'spends', 'a', 'lot', 'of', 'time', 'driving', 'my', 'brother', 'around', 'to', 'baseball', 'practice'], ['some', 'health', 'experts', 'suggest', 'that', 'driving', 'may', 'cause', 'increased', 'tension', 'and', 'blood', 'pressure'], ['i', 'often', 'feel', 'pressure', 'to', 'perform', 'well', 'at', 'school', 'but', 'my', 'mother', 'never', 'seems', 'to', 'drive', 'my', 'brother', 'to', 'do', 'better'], ['health', 'professionals', 'say', 'that', 'brocolli', 'is', 'good', 'for', 'your', 'health']]


In [4]:
# stopwording
from stop_words import get_stop_words

# create English stop words list
# 'en' selects the English list; it is used in the next cell to filter very
# common words (e.g. 'is', 'to', 'my') out of each tokenized document.
en_stop = get_stop_words('en')

In [5]:
# remove stop words from tokens
# Build the lookup set once: set membership is constant-time on average,
# whereas testing every token against the en_stop sequence rescans it each time.
stop_set = set(en_stop)

# One filter for all documents; token order and duplicates are preserved.
doc_stopped_tokens = [
    [word for word in doc if word not in stop_set]
    for doc in doc_tokens
]

# Keep the per-document names for any cell that refers to a single document.
# Unpacking raises ValueError if doc_tokens does not hold exactly five documents.
stopped_tokens, stopped_tokens1, stopped_tokens2, stopped_tokens3, stopped_tokens4 = doc_stopped_tokens

print(doc_stopped_tokens)

[['brocolli', 'good', 'eat', 'brother', 'likes', 'eat', 'good', 'brocolli', 'mother'], ['mother', 'spends', 'lot', 'time', 'driving', 'brother', 'around', 'baseball', 'practice'], ['health', 'experts', 'suggest', 'driving', 'may', 'cause', 'increased', 'tension', 'blood', 'pressure'], ['often', 'feel', 'pressure', 'perform', 'well', 'school', 'mother', 'never', 'seems', 'drive', 'brother', 'better'], ['health', 'professionals', 'say', 'brocolli', 'good', 'health']]


In [6]:
# stemming with PorterStemmer
from nltk.stem.porter import PorterStemmer

# Create p_stemmer of class PorterStemmer
# This single instance is reused for every stem() call in the stemming cell.
p_stemmer = PorterStemmer()

In [7]:
# stem token
# Reduce every stop-word-filtered token to its Porter stem
# (e.g. 'driving' -> 'drive', 'practice' -> 'practic'), one document at a time,
# with a single nested comprehension instead of one copy per document.
doc_stemmed_tokens = [
    [p_stemmer.stem(word) for word in doc]
    for doc in doc_stopped_tokens
]

# Keep the per-document names for any cell that refers to a single document.
# Unpacking raises ValueError if there are not exactly five documents.
stemmed_tokens, stemmed_tokens1, stemmed_tokens2, stemmed_tokens3, stemmed_tokens4 = doc_stemmed_tokens

print(doc_stemmed_tokens)

[['brocolli', 'good', 'eat', 'brother', 'like', 'eat', 'good', 'brocolli', 'mother'], ['mother', 'spend', 'lot', 'time', 'drive', 'brother', 'around', 'basebal', 'practic'], ['health', 'expert', 'suggest', 'drive', 'may', 'caus', 'increas', 'tension', 'blood', 'pressur'], ['often', 'feel', 'pressur', 'perform', 'well', 'school', 'mother', 'never', 'seem', 'drive', 'brother', 'better'], ['health', 'profession', 'say', 'brocolli', 'good', 'health']]


In [8]:
import gensim
from gensim import corpora, models

# Build the gensim dictionary from the stemmed documents; doc2bow() in the
# next cell uses it to turn tokens into integer ids.
dictionary = corpora.Dictionary(doc_stemmed_tokens)



In [9]:
# Convert each stemmed document to its bag-of-words form, a list of
# (token id, count) pairs; together these rows are the document-term matrix.
corpus = list(map(dictionary.doc2bow, doc_stemmed_tokens))

In [10]:
# Show the bag-of-words row of the first document as (token id, count) pairs.
first_doc_bow = corpus[0]
print(first_doc_bow)

[(0, 2), (1, 1), (2, 1), (3, 2), (4, 2), (5, 1)]


In [16]:
# Create the LDA model.
# LDA training is stochastic: without a fixed seed every run can produce
# different topics and weights. Pinning random_state makes Restart & Run All
# reproducible; topics printed from earlier, unseeded runs may differ.
LDA_RANDOM_STATE = 42
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=3,        # number of latent topics to fit
    id2word=dictionary,  # maps token ids back to words for readable topics
    passes=20,           # passes over the whole corpus during training
    random_state=LDA_RANDOM_STATE,
)

In [18]:
# Print all 3 topics, each reduced to a single word with its weight
# (num_words=1), as a list of (topic id, 'weight*"word"') pairs.
top_word_per_topic = ldamodel.print_topics(num_topics=3, num_words=1)
print(top_word_per_topic)

[(0, '0.074*"brocolli"'), (1, '0.091*"health"'), (2, '0.065*"drive"')]
