# Topic Modeling 

### Preprocessing

In [11]:
from sklearn.decomposition import NMF
import lda
import numpy as np
import graphlab as gl
import pyLDAvis
import pyLDAvis.graphlab
import pandas as pd
from collections import Counter, defaultdict
from itertools import islice
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from IPython.core.display import HTML
HTML("<style>.container { width:90% !important; }</style>")
import warnings
warnings.filterwarnings('ignore')

In [3]:
pyLDAvis.enable_notebook()

In [5]:
df_labeled = pd.read_csv('data/Final_df.csv', na_values=['?'])
df_labeled.dropna(inplace = True)
df_labeled.head()

Unnamed: 0,boolean_relationship,chemical_name,disease_name,all_info
0,True,caffeine,ventricular fibrillation,describe yearold woman preexisting mitral valv...
1,False,lactulose,critically ill,laxation critically ill patients lactulose pol...
2,False,sulfasalazine/SSZ,PsA,methotrexate mtx sulfasalazine ssz cyclosporin...
3,True,Polychlorinated biphenyls/PCBs,NAFLD/non-alcoholic fatty liver disease,polychlorinated biphenyls pcbs persistent envi...
4,True,PCP,sarcoma,although pcp classified human carcinogen epide...


In [6]:
#Extend all the words in all_info into a list. 
all_words = []
for doc in df_labeled.all_info:
    all_words.extend(doc.split())

In [7]:
#Create a vocabulary
vocab = set(all_words)

In [8]:
len(vocab)

13440

In [13]:
#Create a dictionary of stemmed words and count of the true words.
stem = PorterStemmer()
vocab_dict = defaultdict(list)
for word in vocab: 
    vocab_dict[stem.stem(word)].append((word,all_words.count(word)))
#view first 20 items. 
list(islice(vocab_dict.items(), 20))

[(u'highdens', [('highdensity', 11)]),
 (u'pleuriti', [('pleuritis', 3)]),
 (u'four', [('four', 123)]),
 (u'marfan', [('marfan', 8)]),
 (u'crisisdescrib', [('crisisdescribe', 1)]),
 (u'oxygensens', [('oxygensensing', 1)]),
 (u'nanoccurr', [('nanoccurrence', 1)]),
 (u'assaysnannk', [('assaysnannk', 2)]),
 (u'gavag', [('gavage', 1), ('gavaged', 1)]),
 (u'hyperoxid', [('hyperoxidized', 1), ('hyperoxidation', 7)]),
 (u'infectionnanadenoviru', [('infectionnanadenovirus', 1)]),
 (u'deathnaningest', [('deathnaningestion', 5)]),
 (u'dithian', [('dithianes', 4), ('dithiane', 3)]),
 (u'digit', [('digit', 1)]),
 (u'leuprolid', [('leuprolide', 4)]),
 (u'lactoferrin', [('lactoferrin', 2)]),
 (u'reportnanreport', [('reportnanreport', 1)]),
 (u'highprogesteron', [('highprogesterone', 6)]),
 (u'pigment', [('pigment', 5), ('pigmentation', 2)]),
 (u'nansertindol', [('nansertindole', 1)])]

In [14]:
#key = stemmed word... value = most likely true word
final_vocab_dict = {}
for k,v in vocab_dict.items():
    final_vocab_dict[k] = max(v, key=lambda x:x[1])[0]
#view first 20 items. 
list(islice(final_vocab_dict.items(), 20))

[(u'highdens', 'highdensity'),
 (u'pleuriti', 'pleuritis'),
 (u'four', 'four'),
 (u'marfan', 'marfan'),
 (u'crisisdescrib', 'crisisdescribe'),
 (u'oxygensens', 'oxygensensing'),
 (u'nanoccurr', 'nanoccurrence'),
 (u'assaysnannk', 'assaysnannk'),
 (u'gavag', 'gavage'),
 (u'hyperoxid', 'hyperoxidation'),
 (u'infectionnanadenoviru', 'infectionnanadenovirus'),
 (u'list', 'listed'),
 (u'deathnaningest', 'deathnaningestion'),
 (u'dithian', 'dithianes'),
 (u'digit', 'digit'),
 (u'lactoferrin', 'lactoferrin'),
 (u'reportnanreport', 'reportnanreport'),
 (u'highprogesteron', 'highprogesterone'),
 (u'pigment', 'pigment'),
 (u'nansertindol', 'nansertindole')]

In [15]:
#Stemming each word in X. 
stemmed_info = []
for doc in df_labeled['all_info']:
    d = []
    for word in doc.split():
        d.append(str(stem.stem(word)))
    stemmed_info.append(" ".join(d))
df_labeled['stemmed_info'] = stemmed_info

In [16]:
df_labeled.head()

Unnamed: 0,boolean_relationship,chemical_name,disease_name,all_info,stemmed_info
0,True,caffeine,ventricular fibrillation,describe yearold woman preexisting mitral valv...,describ yearold woman preexist mitral valv pro...
1,False,lactulose,critically ill,laxation critically ill patients lactulose pol...,laxat critic ill patient lactulos polyethylen ...
2,False,sulfasalazine/SSZ,PsA,methotrexate mtx sulfasalazine ssz cyclosporin...,methotrex mtx sulfasalazin ssz cyclosporin csa...
3,True,Polychlorinated biphenyls/PCBs,NAFLD/non-alcoholic fatty liver disease,polychlorinated biphenyls pcbs persistent envi...,polychlorin biphenyl pcb persist environment t...
4,True,PCP,sarcoma,although pcp classified human carcinogen epide...,although pcp classifi human carcinogen epidemi...


### Vectorize data

In [17]:
X = df_labeled['stemmed_info']
y = df_labeled['boolean_relationship']

In [18]:
vect = CountVectorizer(min_df=1)
tfidf = TfidfVectorizer(min_df=1)

X_vect = vect.fit_transform(X)
X_tfidf = tfidf.fit_transform(X)

In [19]:
train_features_vect = vect.get_feature_names()
train_features_tfidf = tfidf.get_feature_names()
print("len_feature_vect: ", len(train_features_vect))
print("len_feature_tfidf: ", len(train_features_tfidf))

('len_feature_vect: ', 10726)
('len_feature_tfidf: ', 10726)


### NMF (Non-Negative Matrix Factorization)

#### Count Vectorizer

In [20]:
nmf = NMF(n_components=10, random_state=1, alpha=.1, l1_ratio=.5).fit(X_vect)
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
#         print("Topic stemmed #%d:" % topic_idx)
#         print(" ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))
#         print()
        print("Topic words #%d:" % topic_idx)
        topic_words = []
        for i in topic.argsort()[:-n_top_words - 1:-1]:
            try:
                #change word back to most likely true word
                topic_words.append(final_vocab_dict[feature_names[i]])
            except:
                topic_words.append(feature_names[i])
        print(" ".join(topic_words))
print_top_words(nmf, train_features_vect, 30)

Topic words #0:
fentanyl patients transdermal opioid effects pain hours may adverse decreased improved developed chronic hypoventilation opioidrelated initiation immediately used treatment oral administration management conditions antagonist administered naloxone required blood adequate medical
Topic words #1:
cells activity human increased inhibition melanoma cancer expression induced effects apoptosis results treatment adhesion inhibitor hetes growth mice tumor 12 potential lines erk fak lung protein cytotoxicity kinase showed acid
Topic words #2:
calcium carbachol eserine chloride vocalization fighting tremor convulsions mydriasis clonictonic injection produced claws paws action central cats changes cause significantly hand similarly motor defense evoked cerebral attack upon ventricles biting
Topic words #3:
receptor contractions nm ketanserin ec log ht rings artery hta agonist mediated endothelium temporal htinduced human found components antagonized partial ketanserinresistant int

** Top 3 topics: **
- opiates
- skin cell diseases
- eyes and cats

#### TF-IDF

In [21]:
nmf = NMF(n_components=10, random_state=1, alpha=.1, l1_ratio=.5).fit(X_tfidf)
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        #print(" ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))
        topic_words = []
        for i in topic.argsort()[:-n_top_words - 1:-1]:
            try:
                #change word back to most likely true word
                topic_words.append(final_vocab_dict[feature_names[i]])
            except:
                topic_words.append(feature_names[i])
        print(" ".join(topic_words))
print_top_words(nmf, train_features_tfidf, 30)

Topic #0:
patients treatment disease therapy risk study effects increased used case clinical diabetes drug dose reported associated combination developed cancer may cardiac mg group trials failure toxicity efficacy response treated receiving
Topic #1:
carbachol eserine chloride calcium fighting vocalization clonictonic mydriasis tremor convulsions claws paws injection central produced action phenomena unanaesthetized hissing rage biting snarling grouphoused dissociate autonomic ventricles evoked hand cats apparent
Topic #2:
cells cancer activity inhibition human expression melanoma growth apoptosis tumor lines curcumin cytotoxicity hetes induced inhibitor adhesion breast erk protein 12 effects mice ndma fak kinase increased ags carcinoma results
Topic #3:
chromium vi chromiumiii salts ascorbic dermatitis acid system therefore occurred toxicity chrome contact allergic advocated poisoning burns trivalent chromiumviinduced topical may bioavailable soluble readily inhaled ingestion cross f

** Top 3 topics: **
- diabetes and cancer
- cats and rage
- skin cell diseases

### LDA (Latent Dirichlet Allocation)

#### Count Vectorizer

In [22]:
model = lda.LDA(n_topics=10, n_iter=3000, random_state=1)
model.fit_transform(X_vect) 
topic_word = model.topic_word_  # model.components_ also works
n_top_words = 30
for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(vect.get_feature_names())[np.argsort(topic_dist)][:-n_top_words:-1]
    top_words = []
    for word in (' '.join(topic_words)).split():
        try:
            #change word back to most likely true word
            top_words.append(final_vocab_dict[str(word)])
        except:
            top_words.append(str(word))
    print('Topic {}: {}'.format(i, ' '.join(top_words)))

Topic 0: patients treatment study cancer associated effects used survival therapy neuropathy agents advanced combination toxicity clinical day severe response inhibitor drug trials opioid dose efficacy breast management progression assessed symptoms
Topic 1: cells activity effects inhibition cancer tumor expression human increased induced growth protein lines results drug apoptosis treatment study cytotoxicity also used levels resistance dna inhibitor investigated gene breast mechanisms
Topic 2: patients case treatment dose reported therapy developed associated clinical syndrome used drug toxicity renal months complications cause disease low receiving due symptoms treated day agents cisplatin one related acute
Topic 3: patients diabetes group calcium levels serum study insulin concentrations type chloride disease metabolism carbachol eserine control increased also changes effects iron diet vocalization fighting cholesterol syndrome action acid glucose
Topic 4: effects activity induced 

** Top 5 Topics:**
- Drug treatments at clinical trials
- Cancer cells
- Effects of chemo drugs
- Diabetes
- Chemicals that affect the brain

**Note: ** Works only with matrix with integer values, so tfidf would not work.

### GraphLab

In [24]:
#create list of dictionaries of word counts of each document.
docs = []
for doc in df_labeled.stemmed_info:
    docs.append(dict(Counter(doc.split())))

In [25]:
docs[0]

{'adequ': 1,
 'caffein': 2,
 'caffeinedescrib': 1,
 'case': 1,
 'concentr': 2,
 'consum': 3,
 'contain': 3,
 'describ': 1,
 'develop': 3,
 'drink': 3,
 'energi': 2,
 'fibril': 3,
 'guarana': 2,
 'health': 3,
 'high': 2,
 'highlight': 1,
 'intract': 2,
 'label': 1,
 'mitral': 2,
 'natur': 2,
 'need': 1,
 'preexist': 2,
 'product': 1,
 'prolaps': 2,
 'regul': 1,
 'valv': 2,
 'ventricular': 3,
 'woman': 2,
 'yearold': 2}

In [108]:
final_vocab_dict['adequ']

'adequate'

In [26]:
#change stemmed words back to human-readable words. 
new_docs = []

for doc in docs:
    real_counter = defaultdict(int)
    for k,v in doc.items():
        try:
            real_counter[final_vocab_dict[k]] = v
        except:
            real_counter[k] = v
    new_docs.append(dict(real_counter))
new_docs[0]

{'adequate': 1,
 'caffeine': 2,
 'caffeinedescribe': 1,
 'case': 1,
 'concentrations': 2,
 'consumer': 3,
 'containing': 3,
 'describe': 1,
 'developed': 3,
 'drinking': 3,
 'energy': 2,
 'fibrillation': 3,
 'guarana': 2,
 'health': 3,
 'high': 2,
 'highlights': 1,
 'intractable': 2,
 'label': 1,
 'mitral': 2,
 'nature': 2,
 'needed': 1,
 'preexisting': 2,
 'production': 1,
 'prolapse': 2,
 'regulation': 1,
 'valve': 2,
 'ventricular': 3,
 'woman': 2,
 'yearold': 2}

In [27]:
docs = gl.SArray(new_docs)

In [34]:
%%time
topic_model = gl.topic_model.create(docs, num_topics=10, num_iterations=3000)

CPU times: user 2min 51s, sys: 2.54 s, total: 2min 53s
Wall time: 1min 24s


In [37]:
pyLDAvis.graphlab.prepare(topic_model, docs)

![title](images/graphLab_image.png)

** Top 3 Topics:**
- Cell cancer
- Diabetes effects
- High blood pressure

Even though GraphLab produces a very nice looking graph, my favorite Topic Modeler is LDA, because they make more sense.