In [10]:
from core.helpers.common import compose

In [9]:
# Toy corpus: five short documents, collected in order into one list.
# NOTE: the "brocolli" spelling is used consistently across the documents, so
# the tokens still match each other; the strings are kept exactly as written.
doc_set = [
    "Brocolli is good to eat. My brother likes to eat good brocolli, but not my mother.",
    "My mother spends a lot of time driving my brother around to baseball practice.",
    "Some health experts suggest that driving may cause increased tension and blood pressure.",
    "I often feel pressure to perform well at school, but my mother never seems to drive my brother to do better.",
    "Health professionals say that brocolli is good for your health.",
]

# Individual handles on each sample document (same objects as in doc_set).
doc_a, doc_b, doc_c, doc_d, doc_e = doc_set

In [18]:
# FUNCTIONAL HELPERS
def curry2(func):
    """Turn a two-argument callable into a chain of one-argument callables.

    Args:
        func: a callable invoked as ``func(x, y)``.

    Returns:
        A callable ``g`` such that ``g(x)(y)`` evaluates ``func(x, y)``.
        ``func`` is only called once the second argument is supplied.
    """
    return lambda first: lambda second: func(first, second)

# Curried forms of the built-ins: supply the function first, the iterable later.
curried_map, curried_filter = curry2(map), curry2(filter)

# Lower-cases every string of an iterable (result is a lazy map object).
map_lower = curried_map(str.lower)

In [38]:
# TOKENIZATION
from nltk.tokenize import RegexpTokenizer
# Tokens are maximal runs of word characters, so punctuation and spaces are dropped.
tokenizer = RegexpTokenizer(r'\w+')

# STOP WORDS
from stop_words import get_stop_words

# English stop words as returned by the stop_words package
en_stop = get_stop_words('en')
# Snapshot of en_stop for constant-time membership tests; rebuild it if
# en_stop is edited after this cell has run.
_en_stop_lookup = frozenset(en_stop)


def not_stop(x):
    """Return True when the token ``x`` is not an English stop word."""
    return x not in _en_stop_lookup


# STEMMING
# PorterStemmer stays imported for anyone who wants to swap it in, but it is no
# longer instantiated: the instance was immediately overwritten by the Snowball
# stemmer below and never used.
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
# The English Snowball stemmer is the one actually used by the pipeline.
stemmer = SnowballStemmer('english')
# Stems every token of an iterable (result is a lazy map object).
map_stem = curried_map(stemmer.stem)

# Pre-processor: document string -> list of stemmed, lower-cased, non-stop tokens.
# The stages are listed last-to-first; compose is assumed to apply them
# right-to-left (TODO confirm against core.helpers.common): tokenize, lowercase,
# drop stop words, stem, and finally materialise the lazy map into a list.
tokenize_remove_stop_stem = compose(
    list,
    map_stem,
    curried_filter(not_stop),
    map_lower,
    tokenizer.tokenize,
)

In [39]:
# Run every sample document through the pre-processor: one token list per document.
texts = list(map(tokenize_remove_stop_stem, doc_set))
print(texts)

[['brocolli', 'good', 'eat', 'brother', 'like', 'eat', 'good', 'brocolli', 'mother'], ['mother', 'spend', 'lot', 'time', 'drive', 'brother', 'around', 'basebal', 'practic'], ['health', 'expert', 'suggest', 'drive', 'may', 'caus', 'increas', 'tension', 'blood', 'pressur'], ['often', 'feel', 'pressur', 'perform', 'well', 'school', 'mother', 'never', 'seem', 'drive', 'brother', 'better'], ['health', 'profession', 'say', 'brocolli', 'good', 'health']]


In [40]:
# CONSTRUCTING A DOCUMENT TERM MATRIX
from gensim import corpora, models

# Assign an integer id to every unique token found in the processed texts.
dictionary = corpora.Dictionary(documents=texts)
print(dictionary)

Dictionary(32 unique tokens: ['brocolli', 'good', 'eat', 'brother', 'like']...)


In [41]:
# CONVERT TO BAG OF WORDS
# One bag-of-words per document, built by dictionary.doc2bow from its token list.
corpus = list(map(dictionary.doc2bow, texts))
# Show the bag-of-words of the third document as a sample.
print(corpus[2])

[(9, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1)]


In [66]:
# APPLY LDA
NUM_TOPICS = 3  # number of topics to fit, and to print below
LDA_SEED = 42   # fixed seed: LDA training is randomised, so without it the
                # topics change between runs and Restart & Run All is not reproducible

ldamodel = models.ldamodel.LdaModel(
    corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=20,
    random_state=LDA_SEED,
)
# Print each topic with its two highest-weighted terms.
for x in ldamodel.print_topics(num_topics=NUM_TOPICS, num_words=2):
    print(x)

(0, '0.074*"mother" + 0.074*"brother"')
(1, '0.130*"good" + 0.130*"brocolli"')
(2, '0.065*"pressur" + 0.065*"health"')
