#Topic Modeling and Clustering

Use topic modeling and clustering to categorize tweets (hopefully eliminating garbage tweets)

see: https://radimrehurek.com/gensim/tut1.html

In [1]:
import pickle
import gzip
import sys
from pprint import pprint
import collections
import os
import json

from pattern.en import parse
from pattern.en import ngrams
# import lshash
import logging

import nltk
from nltk.corpus import stopwords
from gensim import corpora, models, similarities
import gensim.matutils

import numpy as np
import matplotlib.pyplot as plt

from sklearn import metrics
from sklearn.cluster import KMeans

In [2]:
import pattern
import pattern.en

#Configure

In [3]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [4]:
NUM_TOPICS = 10

#Code

In [5]:
def buildModel(corpus, modelType='lsi', num_topics=NUM_TOPICS, verbose=False,
              iterations=500, passes=20):  # interations and passes inly used for lda
    # Apply tf-idf
    tfidf = models.TfidfModel(corpus) # build tfidf model
    corpus_tfidf = tfidf[corpus]  # Normalize corpus
    
    if modelType == 'lsi':
        model = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=NUM_TOPICS) # initialize an LSI transformation

    elif modelType == 'lda':
        model = models.LdaModel(corpus, id2word=dictionary,
                                num_topics=num_topics,
                                iterations=500, passes=20)
    elif modelType == 'hdp':
        model = models.HdpModel(corpus, id2word=dictionary)
    else:
        print 'Unknown model type: {}'.format(modelType)
        raise Exception
        
    if verbose:
        showTopics(model)
        
    transformed_corpus = model[corpus_tfidf]
    
    return transformed_corpus, model

In [6]:
def showTopics(model):
    if (isinstance(model, gensim.models.lsimodel.LsiModel)
            or isinstance(model, gensim.models.ldamodel.LdaModel)):
        pprint(model.show_topics(formatted=False))
    elif isinstance(model, gensim.models.hdpmodel.HdpModel):
        pprint(model.show_topics(topics=10, topn=5, formatted=False))
    else:
        print 'Unknown model type'

In [7]:
def clusterTweets(tweets, tweetCorpus, n_clusters=30):
    """Apply clustering to a set of tweets and pring results"""
    
    #Convert from Gensim format to numpy/scipy for use by sklearm
    tweetMatrix = gensim.matutils.corpus2csc(tweetCorpus).T
    
    estimator = KMeans(init='k-means++', n_clusters=n_clusters, n_init=10)
    cluster_idx = estimator.fit_predict(tweetMatrix) 
    
    # Now show Sample tweets for each centroid
    for i in range(n_clusters):
        print
        print
        print '***** Cluster: {} *****'.format(i)
        for j in range(len(cluster_idx)):
            if i == cluster_idx[j]:
                print '  ', j, ':', tweets[j]['text']

In [8]:
def countKeywords(keywords, parsedText):
    """Count instances of symptoms"""
    keywords = set(keywords)
    keywords_counts = collections.defaultdict(int)
    for entry in parsedText:
        filtered_entry = {filterToken(e) for e in entry}
        entry_keywords = keywords.intersection(filtered_entry)
        for k in entry_keywords:
            keywords_counts[k] += 1
    pprint(sorted(keywords_counts.items(), key=lambda x:x[1], reverse=True))

In [9]:
def analyseTweet(entry, previouslyParsed=False):
    """Get sentiment, mood, and modality of tweet"""
    line = ' '.join(entry) if previouslyParsed else entry
    polarity, subjectivity = pattern.en.sentiment(line)
    s = pattern.en.Sentence(pattern.en.parse(line, lemmata=True))
    return polarity, subjectivity, pattern.en.mood(s), pattern.en.modality(s)

In [10]:
def getSentimentPolarity(line):
    """Get sentiment of tweet"""
    polarity, subjectivity = pattern.en.sentiment(line)
    return polarity

###Code copied from CleanseMedTweets notebook

In [11]:
from pattern.en.wordlist import BASIC as BASIC_WORDS

In [12]:
def filterToken(tok):
    """Removes @ and # from token"""
    return tok[1:] if (tok.startswith('#') or tok.startswith('@')) else tok

In [13]:
def createDrugLookupTable(fname):
    """Converts drug list to set for performing token lookups
    For ngrame names, only use the first word in name"""

    drug_list = pickle.load(open(fname, "rb" ))
    idx_drugs = {name.strip().lower().split()[0] for name in drug_list}
    # Remove drug names that are also part of the basic english language
    idx_drugs = idx_drugs.difference({ w.lower() for w in BASIC_WORDS})
    print 'Total Unique Drug Name Prefixes: {0} in file {1}'.format(len(idx_drugs), fname)
    return idx_drugs

#Load data

In [14]:
deduped_tweets = pickle.load(open( "deduped_tweets.p", "rb" ))

In [15]:
stoplist = stopwords.words('english')

In [16]:
combined_symptoms = pickle.load(open("symptoms.p", "rb" ))

#Explore Sentiment, Mood, Modality 

####Sentiment
The sentiment() function returns a (polarity, subjectivity)-tuple for the given sentence, based on the adjectives it contains, where polarity is a value between -1.0 and +1.0 and subjectivity between 0.0 and 1.0.

####Mood
The mood() function returns either INDICATIVE, IMPERATIVE, CONDITIONAL or SUBJUNCTIVE for a given parsed Sentence. See the table below for an overview of moods.


Mood |	Form |Use| Example
-----|-------|---|---------
INDICATIVE|none of the below 	|fact, belief	|It rains.
IMPERATIVE|	infinitive without to	|command, warning	|Don't rain!
CONDITIONAL|	would, could, should, may, or will, can + if	|conjecture	|It might rain.
SUBJUNCTIVE|	wish, were, or it is + infinitive	|wish, opinion	|I hope it rains.


####Modality
The modality() function returns the degree of certainty as a value between -1.0 and +1.0, where values > +0.5 represent facts. For example, "I wish it would stop raining" scores -0.35, whereas "It will stop raining" scores +0.75. Accuracy is about 68% for Wikipedia texts.

###Tweets with negative sentiment

In [17]:
if True:
    for tweet in deduped_tweets[:40]:
        line = tweet['text']
        polarity, subjectivity, mood, modality = analyseTweet(line)
        if polarity < 0:
            print line
            print 'Sentiment -- Polarity: {0}, Subjectivity: {1}'.format(polarity, subjectivity)
            print 'Mood:', mood, 'Modality:', modality
            print

one week off the vasopressin; our son's anxiety is back, hitting himself, and the repetitive behavior-all back #asd http://t.co/movz6tp7kw
Sentiment -- Polarity: -0.0833333333333, Subjectivity: 0.0833333333333
Mood: indicative Modality: 0.625

promethazine is nausea medicine chill ur not that hard
Sentiment -- Polarity: -0.291666666667, Subjectivity: 0.541666666667
Mood: indicative Modality: 1.0

@yungtaxi @wolfiehan yasmin chill u hoe :/
Sentiment -- Polarity: -0.25, Subjectivity: 1.0
Mood: indicative Modality: 1.0

green tea for weight loss!!! - http://t.co/pxgywpyzup #diabetes #diabetic #diabetics #insulin
Sentiment -- Polarity: -0.390625, Subjectivity: 0.3
Mood: indicative Modality: 1.0

was planning to enter three scripts in the @nywift fellowship but got sick. now will attempt one or two through codeine haze.
Sentiment -- Polarity: -0.714285714286, Subjectivity: 0.857142857143
Mood: conditional Modality: 0.75

soma, sci-fi horror game from amnesia devs, gets trailer and sept ... 

###Tweets with positive or neutral sentiment

In [18]:
if True:
    for tweet in deduped_tweets[:40]:
        line = tweet['text']
        polarity, subjectivity, mood, modality = analyseTweet(line)
        if polarity >= 0:
            print line
            print 'Sentiment -- Polarity: {0}, Subjectivity: {1}'.format(polarity, subjectivity)
            print 'Mood:', mood, 'Modality:', modality
            print

@billygunn19 they put me on it to counteract the decrease in hunger from the wellbutrin. thankfully i could use a few pounds :)
Sentiment -- Polarity: 0.15, Subjectivity: 0.55
Mood: conditional Modality: 0.0

listen to lecture on overcoming sadness and depression - yasmin mogahed by yasmin.mogahed #np on #soundcloud
http://t.co/kfehqx4yw1
Sentiment -- Polarity: 0.0, Subjectivity: 0.0
Mood: indicative Modality: 0.75

diabetes treatment drug, metformin, could help prevent blindness according #diabetesfree, #diabetes, #stopdiabetes http://t.co/au8xfme5jm
Sentiment -- Polarity: 0.0, Subjectivity: 0.0
Mood: conditional Modality: 0.0

#ovariancyst risk factors: infertility treatment, tamoxifen, pregnancy, hypothyroidism, maternal gonadotropins, cigarettes, tubal ligation
Sentiment -- Polarity: 0.0, Subjectivity: 0.0
Mood: indicative Modality: 0.75

re: vaccine treatments for brain tumours: dexamethasone remains critical to control edema, but does this inhibit immune therapy? #asco15
Sentimen

#Build Dictionary

- Only include tweets with negative sentiment
- remove stopwords (k-means and gensim only)
- remove singular words (k-means and gensim only)

In [19]:
parsedText = [[one_gram[0] for one_gram in ngrams(tweet['text'], n=1)]
              for tweet in deduped_tweets
              if getSentimentPolarity(tweet['text']) < 0]

In [20]:
print 'Before: {}   After:{}'.format(len(deduped_tweets), len(parsedText))

Before: 721   After:165


In [21]:
dictionary = corpora.Dictionary(parsedText)
print(dictionary)

Dictionary(1319 unique tokens: [u'all', u':/', u':(', u'forget', u'bengoldacre']...)


In [22]:
# remove stop words and words that appear only once
stop_ids = [dictionary.token2id[stopword] for stopword in stoplist
            if stopword in dictionary.token2id]
once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.iteritems() if docfreq == 1]
dictionary.filter_tokens(stop_ids + once_ids) # remove stop words and words that appear only once
dictionary.compactify() # remove gaps in id sequence after words that were removed
print(dictionary)

Dictionary(258 unique tokens: [u':/', u':(', u'forget', u'prometh', u'resistance']...)


In [23]:
corpus = [dictionary.doc2bow(line) for line in parsedText]
print len(corpus)

165


#Apply K-Means directly to tweets

In [24]:
# Before Gensim
clusterTweets(deduped_tweets, corpus, n_clusters=10)



***** Cluster: 0 *****
   1 : @billygunn19 they put me on it to counteract the decrease in hunger from the wellbutrin. thankfully i could use a few pounds :)
   14 : free prilosec sample for heartburn (new link) http://t.co/fr70xvvxp2) http://t.co/9wn1skegde
   25 : soma, sci-fi horror game from amnesia devs, gets trailer and sept ... http://t.co/pscdeps3nr
   28 : check this deal : http://t.co/scgrmjeiem #8223 5 lb. l glycine powder pure energy muscle brain health pharmaceuti… http://t.co/zfows0asuk
   29 : heard from someone else who recently had their dosage of sertraline increased, and they reported the same side-effects (anxiety/lethargy).
   39 : phentermine average weight loss per week ebook: phentermine average weight loss per week - the 3 week diet - o... http://t.co/libhygqukg
   42 : "would you rather have adult acne (ew!) or bloody diarrhea  (yay!)?" a summary of the commercial i just heard for onexton.
   143 : @okuutheengineer shit...do you have neosporin or something t

#Apply Topic Modeling, then Cluster

In [25]:
transformedCorpus, model = buildModel(corpus, modelType='lsi')
# newCorpus, model = buildModel(corpus, modelType='lda')
# newCorpus, model = buildModel(corpus, modelType='hdp')

In [26]:
clusterTweets(deduped_tweets, transformedCorpus, n_clusters=10)



***** Cluster: 0 *****
   1 : @billygunn19 they put me on it to counteract the decrease in hunger from the wellbutrin. thankfully i could use a few pounds :)
   2 : listen to lecture on overcoming sadness and depression - yasmin mogahed by yasmin.mogahed #np on #soundcloud
http://t.co/kfehqx4yw1
   23 : green tea for weight loss!!! - http://t.co/pxgywpyzup #diabetes #diabetic #diabetics #insulin
   26 : ive never experienced more anxiety than watching san andreas i need a fuking xanax
   34 : #stanakatic phentermine diet pills for weight loss http://t.co/apk1t46zof #hollywood #celeb
   42 : "would you rather have adult acne (ew!) or bloody diarrhea  (yay!)?" a summary of the commercial i just heard for onexton.
   74 : my ears are ringing i need a xanax and it hurts my throat to swallow pills rn... i want to die
   95 : plastering neosporin all over my face....i really need this cut to heal by tuesday and this swelling to go down.
   120 : osphena (ospemifene) � new drug approved for

#Look at presence of symptoms within filtered tweets

In [27]:
#countSymptoms(combined_symptoms)
countKeywords(combined_symptoms, parsedText)

[(u'pain', 39),
 (u'sick', 23),
 (u'anxiety', 17),
 (u'insomnia', 8),
 (u'chill', 7),
 (u'throat', 6),
 (u'low', 6),
 (u'muscle', 6),
 (u'stomach', 5),
 (u'bleeding', 4),
 (u'depression', 4),
 (u'cough', 4),
 (u'bloody', 3),
 (u'sweating', 3),
 (u'painful', 3),
 (u'infection', 3),
 (u'thirst', 2),
 (u'fit', 2),
 (u'amnesia', 2),
 (u'itching', 2),
 (u'coughing', 2),
 (u'nausea', 2),
 (u'edema', 1),
 (u'swelling', 1),
 (u'dizzy', 1),
 (u'sciatica', 1),
 (u'gas', 1),
 (u'wheezing', 1),
 (u'diarrhea', 1),
 (u'rash', 1),
 (u'vagina', 1),
 (u'injury', 1),
 (u'hurting', 1),
 (u'fever', 1),
 (u'itch', 1),
 (u'vomiting', 1),
 (u'sore', 1),
 (u'fatigue', 1),
 (u'tuberculosis', 1),
 (u'blindness', 1),
 (u'bleed', 1),
 (u'sweat', 1),
 (u'vomit', 1)]


#Look at presence of RX Names within filtered tweets

In [28]:
idx_rx_drugs = createDrugLookupTable('rx_drugs.p')

Total Unique Drug Name Prefixes: 3190 in file rx_drugs.p


In [29]:
countKeywords(idx_rx_drugs, parsedText)

[(u'xanax', 19),
 (u'morphine', 16),
 (u'codeine', 11),
 (u'insulin', 7),
 (u'tramadol', 6),
 (u'adderall', 6),
 (u'soma', 5),
 (u'yaz', 4),
 (u'promethazine', 3),
 (u'naproxen', 3),
 (u'ambien', 3),
 (u'cetirizine', 2),
 (u'percocet', 2),
 (u'diclofenac', 2),
 (u'hydrocodone', 2),
 (u'nuvigil', 2),
 (u'paxil', 2),
 (u'penicillin', 2),
 (u'prometh', 2),
 (u'albuterol', 2),
 (u'prozac', 2),
 (u'doxepin', 2),
 (u'neosporin', 2),
 (u'metformin', 2),
 (u'zofran', 2),
 (u'cortisone', 2),
 (u'clonazepam', 2),
 (u'plan', 2),
 (u'zyrtec', 2),
 (u'yasmin', 2),
 (u'zoloft', 2),
 (u'ortho', 2),
 (u'norco', 2),
 (u'fentanyl', 1),
 (u'ventolin', 1),
 (u'selenium', 1),
 (u'nortriptyline', 1),
 (u'dopamine', 1),
 (u'cipro', 1),
 (u'doxycycline', 1),
 (u'sovaldi', 1),
 (u'phentermine', 1),
 (u'tranexamic', 1),
 (u'diazepam', 1),
 (u'pred', 1),
 (u'xarelto', 1),
 (u'chlorambucil', 1),
 (u'warfarin', 1),
 (u'adrenalin', 1),
 (u'viagra', 1),
 (u'ethambutol', 1),
 (u'acetaminophen', 1),
 (u'simvastatin', 

#Boneyard