# Thinkful Unsupervised Capstone - Part II
### Using Gensim for Topic Modeling and Clustering

Topic modeling is a form of dimensionality reduction and is comparable to clustering.  It is also a form of tagging.

Data Source: mtsamples.com

In [1]:
import pandas as pd
import numpy as np
import scipy

import spacy
from spacy import displacy
import nltk
import re

from nltk.corpus import stopwords
stopwords = stopwords.words('english')

from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import CountVectorizer
from sklearn import ensemble
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from gensim import corpora, models, similarities

np.random.seed(27)

In [2]:
%matplotlib inline

plt.rcParams['figure.figsize'] = [20.0, 7.0]
plt.rcParams.update({'font.size': 22})

sns.set_palette('bright')
sns.set_style('white')
sns.set_context('talk', font_scale=0.8)

### Data Cleaning and Pre-Processing

In [3]:
# read in the data
raw_data = pd.read_csv('mtsamples.csv')
raw_data.head()

Unnamed: 0.1,Unnamed: 0,description,medical_specialty,sample_name,transcription,keywords
0,0,A 23-year-old white female presents with comp...,Allergy / Immunology,Allergic Rhinitis,"SUBJECTIVE:, This 23-year-old white female pr...","allergy / immunology, allergic rhinitis, aller..."
1,1,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 2,"PAST MEDICAL HISTORY:, He has difficulty climb...","bariatrics, laparoscopic gastric bypass, weigh..."
2,2,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 1,"HISTORY OF PRESENT ILLNESS: , I have seen ABC ...","bariatrics, laparoscopic gastric bypass, heart..."
3,3,2-D M-Mode. Doppler.,Cardiovascular / Pulmonary,2-D Echocardiogram - 1,"2-D M-MODE: , ,1. Left atrial enlargement wit...","cardiovascular / pulmonary, 2-d m-mode, dopple..."
4,4,2-D Echocardiogram,Cardiovascular / Pulmonary,2-D Echocardiogram - 2,1. The left ventricular cavity size and wall ...,"cardiovascular / pulmonary, 2-d, doppler, echo..."


In [4]:
# checking our transcriptions for null values
raw_data.transcription.isnull().sum()

33

In [5]:
# dropping the 33 rows with no transcription text
df = raw_data.dropna(subset=['transcription'])

print('Transcriptions with null values:')
print(df.transcription.isnull().sum())

df.head()

Transcriptions with null values:
0


Unnamed: 0.1,Unnamed: 0,description,medical_specialty,sample_name,transcription,keywords
0,0,A 23-year-old white female presents with comp...,Allergy / Immunology,Allergic Rhinitis,"SUBJECTIVE:, This 23-year-old white female pr...","allergy / immunology, allergic rhinitis, aller..."
1,1,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 2,"PAST MEDICAL HISTORY:, He has difficulty climb...","bariatrics, laparoscopic gastric bypass, weigh..."
2,2,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 1,"HISTORY OF PRESENT ILLNESS: , I have seen ABC ...","bariatrics, laparoscopic gastric bypass, heart..."
3,3,2-D M-Mode. Doppler.,Cardiovascular / Pulmonary,2-D Echocardiogram - 1,"2-D M-MODE: , ,1. Left atrial enlargement wit...","cardiovascular / pulmonary, 2-d m-mode, dopple..."
4,4,2-D Echocardiogram,Cardiovascular / Pulmonary,2-D Echocardiogram - 2,1. The left ventricular cavity size and wall ...,"cardiovascular / pulmonary, 2-d, doppler, echo..."


In [6]:
# also dropping the unnnamed column as it doesnt provide any value
df = df.drop(columns=['Unnamed: 0'], axis=1)

In [7]:
# loading nltk stopwords
stop_words = nltk.corpus.stopwords.words('english')

# from visual inspection of corpus adding to our stopword list
more_stops = ['aa', 'ab', 'abc', 'abcd', 'xxx', 'xyz', 'xii', 'dr', 'x', 'mg', 'p', 'ml', 'right', 'left']

stop_words = stop_words + more_stops
print (stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [8]:
# process documents
## source: https://towardsdatascience.com/understanding-feature-engineering-part-3-traditional-methods-for-text-data-f6f7d70acd41

wpt = nltk.WordPunctTokenizer()

def normalize_document(doc):
    # visual inspection shows a lot of dashes between words
    # I don't want the words to run together so replacing with spaces before stripping special characters
    doc = doc.replace('-', ' ')
    # replacing : with whitespace to ensure words don't run together
    doc = re.sub(r'[:]', ' ', doc)
    # replacing . with whitespace to help tokenizer capture correct sentences
    doc = re.sub(r'[.]', ' ', doc)
    # lowercase and remove all nonalphanumeric characters
    doc = re.sub(r'([^\s\w]|_)+', '', doc, re.I|re.A)
    doc = doc.lower()
    doc = doc.strip()
    # tokenize document
    tokens = wpt.tokenize(doc)
    # remove non-alpha characters
    tokens = filter(lambda x: x.isalpha(), tokens)
    # filter out stopwords
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # re-create document from filtered tokens
    doc = ' '.join(filtered_tokens)
    
    return doc

normalize_corpus = np.vectorize(normalize_document)

In [9]:
norm_df = df[['medical_specialty', 'sample_name']].copy()

norm_df['transcription'] = normalize_corpus(df.transcription)
norm_df.head()

Unnamed: 0,medical_specialty,sample_name,transcription
0,Allergy / Immunology,Allergic Rhinitis,subjective year old white female presents comp...
1,Bariatrics,Laparoscopic Gastric Bypass Consult - 2,past medical history difficulty climbing stair...
2,Bariatrics,Laparoscopic Gastric Bypass Consult - 1,history present illness seen today pleasant ge...
3,Cardiovascular / Pulmonary,2-D Echocardiogram - 1,mode atrial enlargement atrial diameter cm nor...
4,Cardiovascular / Pulmonary,2-D Echocardiogram - 2,ventricular cavity size wall thickness appear ...


In [10]:
# lemmatizing documents
nlp = spacy.load('en')

# source: https://github.com/dipanjanS/practical-machine-learning-with-python/blob/master/notebooks/Ch07_Analyzing_Movie_Reviews_Sentiment/Text%20Normalization%20Demo.ipynb

def lemmatize_text(text):
    text = nlp(text)
    text = ' '.join([word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in text])
    return text

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [11]:
# clean and lemmatize texts
lemma_transcription = [lemmatize_text(item) for item in norm_df.transcription]

In [12]:
tokens = [wpt.tokenize(text) for text in norm_df.transcription]

In [15]:
# create ngrams
ngram_phraser = models.Phrases(tokens, threshold=1)
ngram = models.phrases.Phraser(ngram_phraser)
#print example
print(ngram[tokens[0]])

# apply model to corpus
texts = [ngram[token] for token in tokens]

['subjective_year', 'old_white', 'female_presents', 'complaint', 'allergies', 'used', 'allergies', 'lived', 'seattle', 'thinks', 'worse_past', 'tried', 'claritin', 'zyrtec', 'worked', 'short', 'time_seemed', 'lose', 'effectiveness', 'used', 'allegra', 'also_used', 'last', 'summer', 'began', 'using', 'two_weeks', 'ago', 'appear', 'working', 'well', 'used', 'counter', 'sprays', 'prescription', 'nasal', 'sprays', 'asthma', 'doest', 'require', 'daily', 'medication', 'think', 'flaring', 'medications_medication', 'currently', 'ortho_tri', 'cyclen', 'allegra', 'allergies_known', 'medicine_allergies', 'objective_vitals', 'weight_pounds', 'blood_pressure', 'heent', 'throat', 'mildly', 'erythematous', 'without_exudate', 'nasal_mucosa', 'erythematous', 'swollen', 'clear_drainage', 'seen', 'tms_clear', 'neck_supple', 'without_adenopathy', 'lungs_clear', 'assessment_allergic', 'rhinitis', 'plan', 'try', 'zyrtec', 'instead', 'allegra', 'another', 'option', 'use', 'loratadine', 'think', 'prescription

In [27]:
lemmas = [lemmatize_text(text) for text in norm_df.transcription]

In [40]:
tokens = [wpt.tokenize(text) for text in lemmas]
texts = ngram[tokens]
texts

<gensim.interfaces.TransformedCorpus at 0x137c799e8>

### LDA
Latent Dirichlet Allocation is a topic model that generates topics based on word frequency.

In [41]:
#create a Gensim dictionary from the texts
dictionary = corpora.Dictionary(texts)
print(dictionary)

#remove extremes (similar to the min/max df step used when creating the tf-idf matrix)
dictionary.filter_extremes(no_below=1, no_above=0.7)

#convert the dictionary to a bag of words corpus for reference
corpus = [dictionary.doc2bow(text) for text in texts]

Dictionary(34762 unique tokens: ['ago', 'allegra', 'allergy', 'also', 'another']...)


In [42]:
[[(dictionary[id], freq) for id, freq in cp] for cp in corpus[:1]]

[[('ago', 1),
  ('allegra', 3),
  ('allergy', 4),
  ('also', 1),
  ('another', 1),
  ('appear', 1),
  ('assessment_allergic', 1),
  ('asthma', 1),
  ('bad', 1),
  ('begin', 1),
  ('blood_pressure', 1),
  ('cheap', 1),
  ('claritin', 1),
  ('clear', 1),
  ('clear_drainage', 1),
  ('complaint', 1),
  ('counter', 1),
  ('coverage', 1),
  ('currently', 1),
  ('cyclen', 1),
  ('daily', 1),
  ('doest', 1),
  ('effectiveness', 1),
  ('erythematous', 2),
  ('female', 1),
  ('flare', 1),
  ('give', 1),
  ('heent', 1),
  ('instead', 1),
  ('know', 1),
  ('last', 1),
  ('live', 1),
  ('loratadine', 1),
  ('lose', 1),
  ('lung', 1),
  ('may', 1),
  ('medication', 3),
  ('medicine', 1),
  ('mildly', 1),
  ('nasal_mucosa', 1),
  ('nasal_spray', 1),
  ('nasonex', 1),
  ('neck_supple', 1),
  ('objective_vital', 1),
  ('old_white', 1),
  ('option', 1),
  ('ortho_tri', 1),
  ('past', 1),
  ('plan', 1),
  ('pound', 1),
  ('prescription', 3),
  ('present', 1),
  ('require', 1),
  ('rhinitis', 1),
  ('samp

In [43]:
%time lda = models.LdaModel(corpus, num_topics=5, id2word=dictionary, update_every=5, chunksize=10000, passes=100)

CPU times: user 10min 6s, sys: 579 ms, total: 10min 6s
Wall time: 10min 6s


In [44]:
lda.show_topics()

[(0,
  '0.011*"use" + 0.008*"artery" + 0.007*"catheter" + 0.007*"perform" + 0.007*"place" + 0.006*"remove" + 0.004*"procedure" + 0.004*"obtain" + 0.004*"eye" + 0.004*"coronary_artery"'),
 (1,
  '0.005*"normal" + 0.005*"see" + 0.004*"show" + 0.004*"reveal" + 0.004*"low" + 0.004*"also" + 0.004*"time" + 0.003*"well" + 0.003*"follow" + 0.003*"decrease"'),
 (2,
  '0.010*"place" + 0.010*"remove" + 0.010*"procedure" + 0.007*"perform" + 0.007*"use" + 0.005*"note" + 0.005*"finding" + 0.005*"bleed" + 0.005*"take" + 0.004*"obtain"'),
 (3,
  '0.016*"use" + 0.016*"place" + 0.006*"take" + 0.006*"perform" + 0.006*"incision" + 0.006*"close" + 0.006*"remove" + 0.006*"well" + 0.006*"make" + 0.005*"note"'),
 (4,
  '0.006*"history" + 0.006*"deny" + 0.005*"time" + 0.005*"well" + 0.005*"note" + 0.005*"medication" + 0.005*"pain" + 0.005*"normal" + 0.005*"also" + 0.005*"state"')]

#### Compute Model Perplexity and Coherence Score

In [45]:
# compute perplexity => lower = better
print('Perplexity: ', lda.log_perplexity(corpus))

# compute coherence score
coherence_model_lda = models.CoherenceModel(model=lda,
                                            texts=texts,
                                            dictionary=dictionary,
                                            coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score: ', coherence_lda)


Perplexity:  -8.478066472469237
Coherence Score:  0.45633465157654873


In [57]:
import pyLDAvis
from pyLDAvis import gensim

pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(lda, corpus, dictionary)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [58]:
# from https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/

def get_lda_topics(model, num_topics):
    word_dict = {};
    for i in range(num_topics):
        words = model.show_topic(i, topn = 20);
        word_dict['Topic # ' + '{:02d}'.format(i+1)] = [i[0] for i in words];
    return pd.DataFrame(word_dict);

lda_topics = get_lda_topics(lda, num_topics=5)
lda_topics

Unnamed: 0,Topic # 01,Topic # 02,Topic # 03,Topic # 04,Topic # 05
0,use,normal,place,use,history
1,artery,see,remove,place,deny
2,catheter,show,procedure,take,time
3,perform,reveal,perform,perform,well
4,place,low,use,incision,note
5,remove,also,note,close,medication
6,procedure,time,finding,remove,pain
7,obtain,well,bleed,well,normal
8,eye,follow,take,make,also
9,coronary_artery,decrease,obtain,note,state


In [61]:
def format_topics_sentences(ldamodel=lda, corpus=corpus, texts=texts):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)


df_topic_sents_keywords = format_topics_sentences(ldamodel=lda, corpus=corpus, texts=texts)

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

# Show
df_dominant_topic.head(15)

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
0,0,4.0,0.9921,"history, deny, time, well, note, medication, p...","[subjective_year, old_white, female, present, ..."
1,1,4.0,0.9956,"history, deny, time, well, note, medication, p...","[past_medical, history, difficulty, climb, sta..."
2,2,4.0,0.9976,"history, deny, time, well, note, medication, p...","[history_present, illness, see, today_pleasant..."
3,3,0.0,0.8729,"use, artery, catheter, perform, place, remove,...","[mode, atrial_enlargement, atrial, diameter_cm..."
4,4,0.0,0.9928,"use, artery, catheter, perform, place, remove,...","[ventricular_cavity, size, wall_thickness, app..."
5,5,3.0,0.8038,"use, place, take, perform, incision, close, re...","[preoperative_diagnosis, morbid_obesity, posto..."
6,6,3.0,0.9968,"use, place, take, perform, incision, close, re...","[preoperative_diagnosis, deformity_breast, rec..."
7,7,0.0,0.9855,"use, artery, catheter, perform, place, remove,...","[echocardiogrammultiple, view, heart, great, v..."
8,8,3.0,0.9914,"use, place, take, perform, incision, close, re...","[preoperative_diagnosis, lipodystrophy_abdomen..."
9,9,0.0,0.9781,"use, artery, catheter, perform, place, remove,...","[description, normal, cardiac, chamber, size_n..."
