# Topic Modeling 

### Preprocessing

In [27]:
from sklearn.decomposition import NMF
import pandas as pd, numpy as np, lda, warnings
import pyLDAvis
import pyLDAvis.graphlab
from collections import Counter, defaultdict
from itertools import islice
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from IPython.core.display import HTML
HTML("<style>.container { width:95% !important; }</style>")
warnings.filterwarnings('ignore')

In [28]:
pyLDAvis.enable_notebook()

In [3]:
df_labeled = pd.read_csv('data/df_all.csv', na_values=['?'])
df_labeled.dropna(inplace = True)
df_labeled.head()

Unnamed: 0,verify_relationship,chemical_name,disease_name,all_info
0,yes_direct,5-HT,psychotic disorders,animal models considered reflect positive symp...
1,no_relation,D-penicillamine,localized scleroderma,case reports patients severe extensive localiz...
2,no_relation,yohimbine,affective disorders,method six patients either obsessive compulsiv...
3,no_relation,calcium,muscle spasms,severe hypokalemia may cause muscle weakness s...
4,no_relation,dexrazoxane,hematologic toxicity,clinical trials patients brain metastases comb...


In [4]:
#Extend all the words in all_info into a list. 
all_words = []
for doc in df_labeled.all_info:
    all_words.extend(doc.split())

In [5]:
#Create a vocabulary
vocab = set(all_words)

In [6]:
len(vocab)

18971

In [7]:
#Create a dictionary of stemmed words and count of the true words.
stem = PorterStemmer()
vocab_dict = defaultdict(list)
for word in vocab: 
    vocab_dict[stem.stem(word)].append((word,all_words.count(word)))
#view first 20 items. 
list(islice(vocab_dict.items(), 20))

[(u'andresist', [('andresistant', 1)]),
 (u'highdens', [('highdensity', 47)]),
 (u'dosageconvers', [('dosageconversion', 2)]),
 (u'telmisartan', [('telmisartan', 29)]),
 (u'polypeptid', [('polypeptide', 2)]),
 (u'four', [('four', 309)]),
 (u'marfan', [('marfan', 16)]),
 (u'oxygensens', [('oxygensensing', 8)]),
 (u'nanoccurr', [('nanoccurrence', 1)]),
 (u'assaysnannk', [('assaysnannk', 5)]),
 (u'putamen', [('putamen', 1)]),
 (u'hyperoxid', [('hyperoxidized', 2), ('hyperoxidation', 15)]),
 (u'cellsnangastr', [('cellsnangastric', 1)]),
 (u'hyperoxia', [('hyperoxia', 1)]),
 (u'nanposterior', [('nanposterior', 1)]),
 (u'infectionnanadenoviru', [('infectionnanadenovirus', 1)]),
 (u'deathnaningest', [('deathnaningestion', 5)]),
 (u'microgramsl', [('microgramsl', 4)]),
 (u'dithian', [('dithianes', 7), ('dithiane', 5)]),
 (u'digit', [('digit', 4), ('digitalization', 2)])]

In [8]:
#key = stemmed word... value = most likely true word
final_vocab_dict = {}
for k,v in vocab_dict.items():
    final_vocab_dict[k] = max(v, key=lambda x:x[1])[0]
#view first 20 items. 
list(islice(final_vocab_dict.items(), 20))

[(u'andresist', 'andresistant'),
 (u'highdens', 'highdensity'),
 (u'dosageconvers', 'dosageconversion'),
 (u'telmisartan', 'telmisartan'),
 (u'polypeptid', 'polypeptide'),
 (u'four', 'four'),
 (u'marfan', 'marfan'),
 (u'crisisdescrib', 'crisisdescribe'),
 (u'oxygensens', 'oxygensensing'),
 (u'nanoccurr', 'nanoccurrence'),
 (u'assaysnannk', 'assaysnannk'),
 (u'putamen', 'putamen'),
 (u'hyperoxid', 'hyperoxidation'),
 (u'cellsnangastr', 'cellsnangastric'),
 (u'hyperoxia', 'hyperoxia'),
 (u'nanposterior', 'nanposterior'),
 (u'arterioven', 'arteriovenous'),
 (u'infectionnanadenoviru', 'infectionnanadenovirus'),
 (u'list', 'listed'),
 (u'deathnaningest', 'deathnaningestion')]

In [9]:
#Stemming each word in X. 
stemmed_info = []
for doc in df_labeled['all_info']:
    d = []
    for word in doc.split():
        d.append(str(stem.stem(word)))
    stemmed_info.append(" ".join(d))
df_labeled['stemmed_info'] = stemmed_info

In [10]:
#boolean verify_relationship (merging yes_direct and yes_indirect to simplify our graph)
temp = {'yes_direct': True, 'yes_indirect':True, 'no_relation':False}
df_labeled['boolean_relationship'] = df_labeled['verify_relationship'].map(temp)
df_labeled = df_labeled.drop(['verify_relationship'], axis=1).copy()

In [11]:
df_labeled.head()

Unnamed: 0,chemical_name,disease_name,all_info,stemmed_info,boolean_relationship
0,5-HT,psychotic disorders,animal models considered reflect positive symp...,anim model consid reflect posit symptom schizo...,True
1,D-penicillamine,localized scleroderma,case reports patients severe extensive localiz...,case report patient sever extens local sclerod...,False
2,yohimbine,affective disorders,method six patients either obsessive compulsiv...,method six patient either obsess compuls disor...,False
3,calcium,muscle spasms,severe hypokalemia may cause muscle weakness s...,sever hypokalemia may caus muscl weak sever hy...,False
4,dexrazoxane,hematologic toxicity,clinical trials patients brain metastases comb...,clinic trial patient brain metastas combin dex...,False


### Vectorize data

In [12]:
X = df_labeled['stemmed_info']
y = df_labeled['boolean_relationship']

In [13]:
vect = CountVectorizer(min_df=1)
tfidf = TfidfVectorizer(min_df=1)

X_vect = vect.fit_transform(X)
X_tfidf = tfidf.fit_transform(X)

In [14]:
train_features_vect = vect.get_feature_names()
train_features_tfidf = tfidf.get_feature_names()
print("len_feature_vect: ", len(train_features_vect))
print("len_feature_tfidf: ", len(train_features_tfidf))

('len_feature_vect: ', 15389)
('len_feature_tfidf: ', 15389)


### NMF (Non-Negative Matrix Factorization)

#### Count Vectorizer

In [18]:
nmf = NMF(n_components=10, random_state=1, alpha=.1, l1_ratio=.5).fit(X_vect)
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
#         print("Topic stemmed #%d:" % topic_idx)
#         print(" ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))
#         print()
        print("Topic words #%d:" % topic_idx)
        topic_words = []
        for i in topic.argsort()[:-n_top_words - 1:-1]:
            try:
                #change word back to most likely true word
                topic_words.append(final_vocab_dict[feature_names[i]])
            except:
                topic_words.append(feature_names[i])
        print(" ".join(topic_words))
print_top_words(nmf, train_features_vect, 30)

Topic words #0:
patients treatment study response months therapy disease median days treated dose clinical levels years survival complete evaluated rate one trials total chronic weeks performed hepatitis receiving toxicity observed severe case
Topic words #1:
cells activity cancer inhibition human expression tumor lines increased induced apoptosis growth protein results melanoma potential effects inhibitor pathway showed kinase cytotoxicity treatment lung also erk mice signaling dna study
Topic words #2:
used increased risk study associated may drug clozapine reported children also bodyweight toxicity levobupivacaine less potential due development glucose olanzapine acid disease however risperidone adults profile clinical frequently well treatment
Topic words #3:
mg tumor te dose med ep highgrade gliomas cpt days adults zaleplon activity including system dosage demonstrated study central mgkg malignant derived nude 11 nervous current well produced resistance subjects
Topic words #4:
ef

** Top 3 topics: **
- opiates
- skin cell diseases
- eyes and cats

#### TF-IDF

In [19]:
nmf = NMF(n_components=10, random_state=1, alpha=.1, l1_ratio=.5).fit(X_tfidf)
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        #print(" ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))
        topic_words = []
        for i in topic.argsort()[:-n_top_words - 1:-1]:
            try:
                #change word back to most likely true word
                topic_words.append(final_vocab_dict[feature_names[i]])
            except:
                topic_words.append(feature_names[i])
        print(" ".join(topic_words))
print_top_words(nmf, train_features_tfidf, 30)

Topic #0:
cells cancer tumor inhibition activity human growth lines apoptosis expression induced effects protein drug combination curcumin inhibitor pathway cytotoxicity breast fu signaling kinase chloroquine autophagy receptor increased carcinoma vitro results
Topic #1:
patients treatment study therapy disease effects aspirin dose group used trials days pain clinical efficacy stroke fentanyl response toxicity drug associated combination compared may treated rate receiving years severe symptoms
Topic #2:
clozapine bodyweight olanzapine risperidone risk children gain antipsychotic glucose quetiapine ziprasidone mania adolescents adults ep increased receptor profile monitoring atypical less frequently hyperprolactinaemia sedation baseline intolerance reported used published toward
Topic #3:
mg te med cpt gliomas highgrade ep tumor 11 adults procarbazine melphalan medulloblastomas sublines ependymomas br rhabdomyosarcoma busulfan derived childhood intracranially dosage mr athymic pr cyclo

** Top 3 topics: **
- diabetes and cancer
- cats and rage
- skin cell diseases

### LDA (Latent Dirichlet Allocation)

#### Count Vectorizer

In [20]:
model = lda.LDA(n_topics=10, n_iter=3000, random_state=1)
model.fit_transform(X_vect) 
topic_word = model.topic_word_  # model.components_ also works
n_top_words = 30
for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(vect.get_feature_names())[np.argsort(topic_dist)][:-n_top_words:-1]
    top_words = []
    for word in (' '.join(topic_words)).split():
        try:
            #change word back to most likely true word
            top_words.append(final_vocab_dict[str(word)])
        except:
            top_words.append(str(word))
    print('Topic {}: {}'.format(i, ' '.join(top_words)))

Topic 0: patients diabetes study group treatment insulin effects hypertension disease inhibitor therapy type glucose combination blood levels serum metabolic mg cardiovascular control prevention risk vascular reduced drug angiotensin blocker obesity
Topic 1: patients days effects study treatment group pain symptoms score dose used clinical receiving associated infusion morphine severe mg scale rate improved trials weeks efficacy significantly compared therapy placebo daily
Topic 2: effects used may study patients drug associated receptor treatment disease toxicity clinical potential pain also fentanyl development increased evidence risk results data role dose adverse however activity transdermal improved
Topic 3: patients study used case effects treatment increased risk associated development may months years drug dose artery myocardial therapy intake cause vitamin coronary one disease acid acute treated cancer reported
Topic 4: patients treatment therapy used disease case reported ste

** Top 5 Topics:**
- Drug treatments at clinical trials
- Cancer cells
- Effects of chemo drugs
- Diabetes
- Chemicals that affect the brain

**Note: ** Works only with matrix with integer values, so tfidf would not work.

### GraphLab

In [15]:
#create list of dictionaries of word counts of each document.
docs = []
for doc in df_labeled.stemmed_info:
    docs.append(dict(Counter(doc.split())))

In [16]:
docs[0]

{'5-ht': 1,
 '5-htalter': 1,
 '5-htht': 1,
 'addit': 1,
 'alter': 2,
 'analysi': 1,
 'anim': 3,
 'antagonist': 2,
 'antipsychot': 1,
 'associ': 1,
 'clozapin': 1,
 'conduct': 1,
 'consid': 2,
 'correct': 1,
 'damphetamin': 1,
 'damphetamineinduc': 1,
 'disord': 4,
 'disrupt': 1,
 'evid': 2,
 'factor': 1,
 'gene': 1,
 'gener': 1,
 'ht': 6,
 'htr': 1,
 'hyperact': 1,
 'hypothes': 1,
 'induc': 1,
 'inhibit': 1,
 'involv': 2,
 'methamphetamin': 1,
 'methinduc': 2,
 'model': 2,
 'neural': 1,
 'olanzapin': 1,
 'paranoid': 1,
 'pathophysiolog': 2,
 'phencyclidin': 1,
 'posit': 2,
 'prepuls': 1,
 'psychosi': 2,
 'psychot': 4,
 'rat': 2,
 'receptor': 7,
 'reflect': 2,
 'relat': 1,
 'restor': 1,
 'schizophrenia': 4,
 'second': 1,
 'select': 1,
 'serotonerg': 1,
 'serotonin': 1,
 'sever': 1,
 'similar': 1,
 'studi': 1,
 'suggest': 2,
 'suscept': 1,
 'symptom': 3,
 'target': 1,
 'therapeut': 1,
 'therefor': 1,
 'transmiss': 1,
 'type': 1,
 'use': 2}

In [17]:
final_vocab_dict['adequ']

'adequate'

In [18]:
#change stemmed words back to human-readable words. 
new_docs = []

for doc in docs:
    real_counter = defaultdict(int)
    for k,v in doc.items():
        try:
            real_counter[final_vocab_dict[k]] = v
        except:
            real_counter[k] = v
    new_docs.append(dict(real_counter))
new_docs[0]

{'5-ht': 1,
 '5-htaltered': 1,
 '5-htht': 1,
 'addition': 1,
 'alterations': 2,
 'analysis': 1,
 'animals': 3,
 'antagonist': 2,
 'antipsychotic': 1,
 'associated': 1,
 'clozapine': 1,
 'conducted': 1,
 'considered': 2,
 'correction': 1,
 'damphetamine': 1,
 'damphetamineinduced': 1,
 'disorder': 4,
 'disruption': 1,
 'evidence': 2,
 'factor': 1,
 'gene': 1,
 'generally': 1,
 'ht': 6,
 'htr': 1,
 'hyperactivity': 1,
 'hypothesized': 1,
 'induced': 1,
 'inhibition': 1,
 'involved': 2,
 'methamphetamine': 1,
 'methinduced': 2,
 'models': 2,
 'neural': 1,
 'olanzapine': 1,
 'paranoid': 1,
 'pathophysiology': 2,
 'phencyclidine': 1,
 'positive': 2,
 'prepulse': 1,
 'psychosis': 2,
 'psychotic': 4,
 'rats': 2,
 'receptor': 7,
 'reflecting': 2,
 'related': 1,
 'restored': 1,
 'schizophrenia': 4,
 'second': 1,
 'selective': 1,
 'serotonergic': 1,
 'serotonin': 1,
 'severe': 1,
 'similar': 1,
 'study': 1,
 'suggest': 2,
 'susceptibility': 1,
 'symptoms': 3,
 'target': 1,
 'therapeutic': 1,
 't

In [19]:
docs = gl.SArray(new_docs)

In [31]:
%%time
topic_model = gl.topic_model.create(docs, num_topics=10, num_iterations=3000)

In [32]:
pyLDAvis.graphlab.prepare(topic_model, docs)

![title](images/graphLab_image.png)

** Top 3 Topics:**
- Cell cancer
- Diabetes effects
- High blood pressure

Even though GraphLab produces a very nice looking graph, my favorite Topic Modeler is LDA, because they make more sense.