In [1]:
from core.helpers.common import compose

In [1]:
# Sample corpus: five short documents, kept together in one list
doc_set = [
    "Brocolli is good to eat. My brother likes to eat good brocolli, but not my mother.",
    "My mother spends a lot of time driving my brother around to baseball practice.",
    "Some health experts suggest that driving may cause increased tension and blood pressure.",
    "I often feel pressure to perform well at school, but my mother never seems to drive my brother to do better.",
    "Health professionals say that brocolli is good for your health.",
]

# Individual handles on each document, in list order
doc_a, doc_b, doc_c, doc_d, doc_e = doc_set

In [3]:
# FUNCTIONAL HELPERS
def curry2(func):
    """Turn a two-argument callable into a chain of two one-argument calls.

    ``curry2(func)(x)(y)`` evaluates ``func(x, y)``: the first call captures
    ``x`` and returns a callable that is still waiting for ``y``.
    """
    return lambda x: lambda y: func(x, y)

# Curried builtins: pass the function first, get back a one-argument callable
# that is still waiting for the iterable.
curried_map = curry2(map)
curried_filter = curry2(filter)

# Lower-cases each string of an iterable (lazily, because it wraps `map`)
map_lower = curried_map(str.lower)

In [4]:
# TOKENIZATION
from nltk.tokenize import RegexpTokenizer
# Tokens are runs of word characters, so punctuation and whitespace are dropped
tokenizer = RegexpTokenizer(r'\w+')

# STOP WORDS
from stop_words import get_stop_words

# create English stop words list
en_stop = get_stop_words('en')
# Frozen copy used only for lookups: a hash lookup per token instead of a scan
# of the whole list. It is a snapshot, so later changes to `en_stop` are not
# reflected here.
_en_stop_lookup = frozenset(en_stop)


def not_stop(x):
    """Return True when the token ``x`` is not an English stop word.

    The comparison is exact, so tokens must already be lower-cased by the
    time they reach this predicate.
    """
    return x not in _en_stop_lookup


# STEMMING
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
# The English Snowball stemmer is the one actually used. A PorterStemmer was
# previously created here and overwritten on the very next line, so that dead
# assignment has been dropped (the import is kept for anyone who wants to swap).
stemmer = SnowballStemmer('english')
# map stem
map_stem = curried_map(stemmer.stem)

# Create a pre-processor function
# First tokenize, lowercase, then filter nonstop and then stem, i.e. the
# right-most function handed to `compose` runs first.
# The stemming step yields a lazy map, so `list` is applied last to materialise it.
tokenize_remove_stop_stem = compose(list, map_stem, curried_filter(not_stop), map_lower, tokenizer.tokenize)

In [5]:
# Run every raw document through the pre-processing pipeline;
# the result is one list of stemmed tokens per document.
texts = list(map(tokenize_remove_stop_stem, doc_set))
print(texts)

[['brocolli', 'good', 'eat', 'brother', 'like', 'eat', 'good', 'brocolli', 'mother'], ['mother', 'spend', 'lot', 'time', 'drive', 'brother', 'around', 'basebal', 'practic'], ['health', 'expert', 'suggest', 'drive', 'may', 'caus', 'increas', 'tension', 'blood', 'pressur'], ['often', 'feel', 'pressur', 'perform', 'well', 'school', 'mother', 'never', 'seem', 'drive', 'brother', 'better'], ['health', 'profession', 'say', 'brocolli', 'good', 'health']]


In [37]:
# CONSTRUCTING A DOCUMENT TERM MATRIX
from gensim import corpora, models

# Build the token <-> integer id mapping from the processed documents
dictionary = corpora.Dictionary(documents=texts)
# Inspect what the dictionary object offers, then look up the token with id 1
print(dir(dictionary))
print(dictionary.get(1))

['__abstractmethods__', '__class__', '__contains__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__reversed__', '__setattr__', '__sizeof__', '__slots__', '__str__', '__subclasshook__', '__weakref__', '_abc_cache', '_abc_negative_cache', '_abc_negative_cache_version', '_abc_registry', '_adapt_by_suffix', '_load_specials', '_save_specials', '_smart_save', 'add_documents', 'compactify', 'dfs', 'doc2bow', 'filter_extremes', 'filter_n_most_frequent', 'filter_tokens', 'from_corpus', 'from_documents', 'get', 'id2token', 'items', 'iteritems', 'iterkeys', 'itervalues', 'keys', 'load', 'load_from_text', 'merge_with', 'num_docs', 'num_nnz', 'num_pos', 'save', 'save_as_text', 'token2id', 'values']
good


In [7]:
# CONVERT TO BAG OF WORDS
# One sparse vector of (token id, count) pairs per processed document
corpus = list(map(dictionary.doc2bow, texts))
# Show the vector of the third document as a sample
print(corpus[2])

[(9, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1)]


In [8]:
# APPLY LDA
# Number of topics to fit; reused when printing so both stay in sync
NUM_TOPICS = 3
# LDA training is randomised. Pinning the seed makes Restart & Run All give the
# same topics every time; outputs recorded from an unseeded run will differ.
LDA_SEED = 42

ldamodel = models.ldamodel.LdaModel(
    corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=20,
    random_state=LDA_SEED,
)

# One line per topic: (topic id, "weight*term + weight*term") with the top 2 terms
for topic_summary in ldamodel.print_topics(num_topics=NUM_TOPICS, num_words=2):
    print(topic_summary)

(0, '0.082*"brother" + 0.082*"mother"')
(1, '0.141*"health" + 0.080*"profession"')
(2, '0.072*"drive" + 0.071*"pressur"')


In [13]:
# Exploration aid: list every attribute/method name available on the LDA model object
print(dir(ldamodel))

['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_adapt_by_suffix', '_apply', '_load_specials', '_save_specials', '_smart_save', 'alpha', 'bound', 'callbacks', 'chunksize', 'clear', 'decay', 'diff', 'dispatcher', 'distributed', 'do_estep', 'do_mstep', 'eta', 'eval_every', 'expElogbeta', 'gamma_threshold', 'get_document_topics', 'get_term_topics', 'get_topic_terms', 'get_topics', 'id2word', 'inference', 'init_dir_prior', 'iterations', 'load', 'log_perplexity', 'minimum_phi_value', 'minimum_probability', 'num_terms', 'num_topics', 'num_updates', 'numworkers', 'offset', 'optimize_alpha', 'optimize_eta', 'passes', 'per_word_topics', 'print_topic', 'print_topics', 'random_state', 'sav

In [38]:
# Formatted (topic id, "weight*term + ...") summaries, asking for up to 40 terms each
topics = ldamodel.show_topics(num_words=40)
# Result of get_topics(): presumably the raw topic/term weight matrix;
# confirm the exact shape against the gensim documentation
topics1 = ldamodel.get_topics()
# Print both representations with a single blank line between them
print(topics, "", topics1, sep="\n")

[(0, '0.082*"brother" + 0.082*"mother" + 0.081*"brocolli" + 0.081*"good" + 0.081*"eat" + 0.046*"basebal" + 0.046*"lot" + 0.046*"practic" + 0.046*"around" + 0.046*"spend" + 0.046*"time" + 0.046*"like" + 0.046*"drive" + 0.012*"health" + 0.012*"increas" + 0.012*"tension" + 0.012*"pressur" + 0.012*"expert" + 0.012*"may" + 0.012*"caus" + 0.012*"suggest" + 0.012*"blood" + 0.012*"feel" + 0.012*"perform" + 0.012*"better" + 0.012*"often" + 0.012*"well" + 0.012*"seem" + 0.012*"never" + 0.012*"school" + 0.012*"say" + 0.012*"profession"'), (1, '0.141*"health" + 0.080*"profession" + 0.080*"say" + 0.080*"brocolli" + 0.080*"good" + 0.020*"drive" + 0.020*"mother" + 0.020*"brother" + 0.020*"increas" + 0.020*"may" + 0.020*"tension" + 0.020*"expert" + 0.020*"caus" + 0.020*"suggest" + 0.020*"blood" + 0.020*"pressur" + 0.020*"like" + 0.020*"eat" + 0.020*"perform" + 0.020*"seem" + 0.020*"never" + 0.020*"school" + 0.020*"time" + 0.020*"better" + 0.020*"spend" + 0.020*"around" + 0.020*"practic" + 0.020*"lot" 

In [4]:
# Exercise the project-local LDAModel wrapper on the raw (unprocessed) documents
from core.topic_modeling.lda import LDAModel

lda = LDAModel()

# NOTE(review): the second argument (3) is presumably the number of topics;
# confirm against LDAModel.create_model
model = lda.create_model(doc_set, 3)
# The returned object exposes print_topics(); show the top 2 terms per topic
print(model.print_topics(num_words=2))
# Left as the cell's last expression so the notebook displays its return value
lda.get_topics_and_keywords(num_words=2)

[(0, '0.082*"brother" + 0.082*"mother"'), (1, '0.031*"brocolli" + 0.031*"good"'), (2, '0.125*"health" + 0.050*"pressur"')]


[[('brother', 0.081972768357628872), ('mother', 0.081972667651560513)],
 [('brocolli', 0.031347669001034882), ('good', 0.031347571204249164)],
 [('health', 0.12501218829869792), ('pressur', 0.050117218597194459)]]