In [2]:
from helpers.common import compose

In [3]:
# Sample corpus: five short documents about food, family and health.
# NOTE: "brocolli" is spelled this way in the sample data on purpose-or-not;
# the recorded outputs further down use the same spelling, so it is kept as is.
doc_set = [
    "Brocolli is good to eat. My brother likes to eat good brocolli, but not my mother.",
    "My mother spends a lot of time driving my brother around to baseball practice.",
    "Some health experts suggest that driving may cause increased tension and blood pressure.",
    "I often feel pressure to perform well at school, but my mother never seems to drive my brother to do better.",
    "Health professionals say that brocolli is good for your health.",
]

# Individual handles to each document, in list order.
doc_a, doc_b, doc_c, doc_d, doc_e = doc_set

In [4]:
# FUNCTIONAL HELPERS
def curry2(func):
    """Turn a two-argument callable into a chain of one-argument callables.

    Args:
        func: a callable invoked as ``func(x, y)``.

    Returns:
        A function taking ``x`` that returns a function taking ``y``; calling
        the latter evaluates ``func(x, y)``. ``func`` is not called until both
        arguments have been supplied.
    """
    def take_first(first):
        # Remember the first argument and wait for the second one.
        return lambda second: func(first, second)
    return take_first

# Curried forms of the built-ins: supply the function first, the iterable later.
curried_map = curry2(map)
curried_filter = curry2(filter)

# Lower-cases every string of the iterable it is later applied to.
map_lower = curried_map(str.lower)

In [5]:
# TOKENIZATION, STOP-WORD REMOVAL AND STEMMING
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer  # no longer used in this cell; import kept for other cells
from nltk.stem.snowball import SnowballStemmer
from stop_words import get_stop_words

# TOKENIZATION
# Tokens are the runs of word characters matched by r'\w+'.
tokenizer = RegexpTokenizer(r'\w+')

# STOP WORDS
# English stop words, kept in the form returned by get_stop_words.
en_stop = get_stop_words('en')
# Hashed copy so each membership test does not rescan the whole collection.
_en_stop_lookup = frozenset(en_stop)


def not_stop(x):
    """Return True when token ``x`` is not in the English stop word list."""
    return x not in _en_stop_lookup


# STEMMING
# The English Snowball stemmer is the one actually applied. (The previous
# PorterStemmer() instance was overwritten on the very next line and never used.)
stemmer = SnowballStemmer('english')
# Applies stemmer.stem to every token of the iterable it is later given.
map_stem = curried_map(stemmer.stem)

# PRE-PROCESSOR
# Steps, in execution order: tokenize, lowercase, drop stop words, stem, then
# materialise the lazy map into a list.
# NOTE(review): this relies on helpers.common.compose applying its arguments
# right-to-left (last argument first) -- confirm against its definition.
tokenize_remove_stop_stem = compose(list, map_stem, curried_filter(not_stop), map_lower, tokenizer.tokenize)

In [7]:
# Run every sample document through the pre-processing pipeline;
# the result is one list of processed tokens per document.
texts = list(map(tokenize_remove_stop_stem, doc_set))
print(texts)

[['brocolli', 'good', 'eat', 'brother', 'like', 'eat', 'good', 'brocolli', 'mother'], ['mother', 'spend', 'lot', 'time', 'drive', 'brother', 'around', 'basebal', 'practic'], ['health', 'expert', 'suggest', 'drive', 'may', 'caus', 'increas', 'tension', 'blood', 'pressur'], ['often', 'feel', 'pressur', 'perform', 'well', 'school', 'mother', 'never', 'seem', 'drive', 'brother', 'better'], ['health', 'profession', 'say', 'brocolli', 'good', 'health']]


In [8]:
# CONSTRUCTING A DOCUMENT TERM MATRIX
from gensim import corpora, models

# Build the gensim Dictionary from the pre-processed token lists.
dictionary = corpora.Dictionary(texts)
# Exploratory: list the attributes and methods available on the Dictionary object.
print(dir(dictionary))
# Look up the entry stored under id 1 (the recorded run printed "good").
print(dictionary.get(1))

['__abstractmethods__', '__class__', '__contains__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__reversed__', '__setattr__', '__sizeof__', '__slots__', '__str__', '__subclasshook__', '__weakref__', '_abc_cache', '_abc_negative_cache', '_abc_negative_cache_version', '_abc_registry', '_adapt_by_suffix', '_load_specials', '_save_specials', '_smart_save', 'add_documents', 'compactify', 'dfs', 'doc2bow', 'filter_extremes', 'filter_n_most_frequent', 'filter_tokens', 'from_corpus', 'from_documents', 'get', 'id2token', 'items', 'iteritems', 'iterkeys', 'itervalues', 'keys', 'load', 'load_from_text', 'merge_with', 'num_docs', 'num_nnz', 'num_pos', 'save', 'save_as_text', 'token2id', 'values']
good


In [9]:
# CONVERT TO BAG OF WORDS
# One doc2bow result per processed document; in the recorded run each entry
# is a list of (id, count) pairs.
corpus = list(map(dictionary.doc2bow, texts))
# Show the bag-of-words form of the third document.
print(corpus[2])

[(9, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1)]


In [10]:
# APPLY LDA
# LDA training is randomised. Fixing the seed makes the topics (and their
# numbering) reproducible across kernel restarts; the outputs recorded in this
# notebook were produced before the seed was fixed and may differ after a re-run.
LDA_SEED = 42
ldamodel = models.ldamodel.LdaModel(
    corpus,
    num_topics=3,
    id2word=dictionary,
    passes=20,
    random_state=LDA_SEED,
)
# Print the two highest-weighted words of each of the three topics.
for x in ldamodel.print_topics(num_topics=3, num_words=2):
    print(x)

(0, '0.120*"good" + 0.120*"brocolli"')
(1, '0.074*"drive" + 0.074*"brother"')
(2, '0.125*"health" + 0.050*"pressur"')


In [12]:
# Exploratory: list the attributes and methods available on the trained LDA model.
print(dir(ldamodel))
# Ask the model for the topics of the first document's bag-of-words vector.
print(ldamodel.get_document_topics(corpus[0]))

['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_adapt_by_suffix', '_apply', '_load_specials', '_save_specials', '_smart_save', 'alpha', 'bound', 'callbacks', 'chunksize', 'clear', 'decay', 'diff', 'dispatcher', 'distributed', 'do_estep', 'do_mstep', 'eta', 'eval_every', 'expElogbeta', 'gamma_threshold', 'get_document_topics', 'get_term_topics', 'get_topic_terms', 'get_topics', 'id2word', 'inference', 'init_dir_prior', 'iterations', 'load', 'log_perplexity', 'minimum_phi_value', 'minimum_probability', 'num_terms', 'num_topics', 'num_updates', 'numworkers', 'offset', 'optimize_alpha', 'optimize_eta', 'passes', 'per_word_topics', 'print_topic', 'print_topics', 'random_state', 'sav

In [13]:
# Per-topic summaries, requesting up to 40 words for each topic.
topics = ldamodel.show_topics(num_words=40)
# Whatever get_topics() returns for the whole model, kept for comparison.
topics1 = ldamodel.get_topics()
# Dump both, separated by a single blank line.
print(topics, topics1, sep="\n\n")

[(0, '0.121*"brocolli" + 0.121*"good" + 0.118*"eat" + 0.068*"brother" + 0.068*"mother" + 0.067*"like" + 0.017*"health" + 0.017*"profession" + 0.017*"say" + 0.017*"drive" + 0.017*"pressur" + 0.017*"tension" + 0.017*"suggest" + 0.017*"increas" + 0.017*"expert" + 0.017*"caus" + 0.017*"around" + 0.017*"practic" + 0.017*"spend" + 0.017*"basebal" + 0.017*"time" + 0.017*"lot" + 0.017*"may" + 0.017*"blood" + 0.017*"never" + 0.017*"feel" + 0.017*"perform" + 0.017*"seem" + 0.017*"better" + 0.017*"school" + 0.017*"often" + 0.017*"well"'), (1, '0.082*"health" + 0.047*"pressur" + 0.047*"mother" + 0.047*"brother" + 0.047*"well" + 0.047*"often" + 0.047*"school" + 0.047*"seem" + 0.047*"better" + 0.047*"perform" + 0.047*"never" + 0.047*"feel" + 0.047*"say" + 0.047*"profession" + 0.047*"drive" + 0.044*"good" + 0.044*"brocolli" + 0.012*"eat" + 0.012*"like" + 0.012*"tension" + 0.012*"caus" + 0.012*"suggest" + 0.012*"increas" + 0.012*"blood" + 0.012*"may" + 0.012*"expert" + 0.012*"spend" + 0.012*"practic" 

In [15]:
# Repeat the workflow through the project's own LDAModel wrapper.
from topic_modeling.lda import LDAModel

lda = LDAModel()

# NOTE(review): the raw documents are passed in, so the wrapper presumably does
# its own pre-processing, and the second argument is presumably the number of
# topics -- confirm against topic_modeling.lda.
lda.create_model(doc_set, 3)
# In the recorded run this printed, for each topic, a list of (keyword, weight) pairs.
print(lda.get_topics_and_keywords(num_words=2))

[[('pressur', 0.06462822782686807), ('health', 0.064537432068554529)], [('brother', 0.073823833351888926), ('mother', 0.073821879207508184)], [('good', 0.12988932077895299), ('brocolli', 0.12988898877140737)]]
