#Topic Modeling and Clustering

Use topic modeling and clustering to categorize tweets (hopefully eliminating garbage tweets)

see: https://radimrehurek.com/gensim/tut1.html

In [1]:
import pickle
import gzip
import sys
from pprint import pprint
import collections
import os
import json

from pattern.en import parse
from pattern.en import ngrams
# import lshash
import logging

import nltk
from nltk.corpus import stopwords
from gensim import corpora, models, similarities
import gensim.matutils

import numpy as np
import matplotlib.pyplot as plt

from sklearn import metrics
from sklearn.cluster import KMeans

In [2]:
import pattern
import pattern.en

#Configure

In [3]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [4]:
NUM_TOPICS = 10

#Code

In [5]:
def buildModel(corpus, modelType='lsi', num_topics=NUM_TOPICS, verbose=False,
               iterations=500, passes=20, id2word=None):
    """Fit a gensim topic model on `corpus` and return the corpus in topic space.

    Parameters:
        corpus     -- bag-of-words corpus in gensim format
        modelType  -- 'lsi', 'lda' or 'hdp'
        num_topics -- number of topics for LSI and LDA (HDP is constructed
                      without a topic count)
        verbose    -- when true, print the fitted topics via showTopics()
        iterations -- forwarded to LdaModel only
        passes     -- forwarded to LdaModel only
        id2word    -- id -> token mapping handed to the model; defaults to the
                      notebook-level `dictionary`

    Returns:
        (transformed_corpus, model)

    Raises:
        ValueError -- if `modelType` is not one of the supported names
    """
    if id2word is None:
        # Fall back to the notebook-level Dictionary built in an earlier cell.
        id2word = dictionary

    if modelType == 'lsi':
        # LSI is fitted on tf-idf weights, so it is applied to them as well.
        tfidf = models.TfidfModel(corpus)
        model_input = tfidf[corpus]
        model = models.LsiModel(model_input, id2word=id2word,
                                num_topics=num_topics)
    elif modelType == 'lda':
        # LDA is fitted on the raw counts; apply it to the same representation.
        model_input = corpus
        model = models.LdaModel(corpus, id2word=id2word,
                                num_topics=num_topics,
                                iterations=iterations, passes=passes)
    elif modelType == 'hdp':
        # HDP is fitted on the raw counts too.
        model_input = corpus
        model = models.HdpModel(corpus, id2word=id2word)
    else:
        raise ValueError('Unknown model type: {}'.format(modelType))

    if verbose:
        showTopics(model)

    # Project the corpus with the representation the model was trained on.
    transformed_corpus = model[model_input]

    return transformed_corpus, model

In [6]:
def showTopics(model):
    """Pretty-print the topics of a gensim LSI, LDA or HDP model.

    HDP models are shown with topics=10 and topn=5; any other model type
    only produces an 'Unknown model type' message.
    """
    lsi_or_lda = (gensim.models.lsimodel.LsiModel,
                  gensim.models.ldamodel.LdaModel)

    if isinstance(model, lsi_or_lda):
        topics = model.show_topics(formatted=False)
    elif isinstance(model, gensim.models.hdpmodel.HdpModel):
        topics = model.show_topics(topics=10, topn=5, formatted=False)
    else:
        print('Unknown model type')
        return

    pprint(topics)

In [7]:
def clusterTweets(tweets, tweetCorpus, n_clusters=30):
    """Apply clustering to a set of tweets and pring results"""
    
    #Convert from Gensim format to numpy/scipy for use by sklearm
    tweetMatrix = gensim.matutils.corpus2csc(tweetCorpus).T
    
    estimator = KMeans(init='k-means++', n_clusters=n_clusters, n_init=10)
    cluster_idx = estimator.fit_predict(tweetMatrix) 
    
    # Now show Sample tweets for each centroid
    for i in range(n_clusters):
        print
        print
        print '***** Cluster: {} *****'.format(i)
        for j in range(len(cluster_idx)):
            if i == cluster_idx[j]:
                print '  ', j, ':', tweets[j]['text']

In [8]:
def countSymptoms(symptoms, entries=None):
    """Count how many entries mention each symptom and pretty-print the tally.

    Parameters:
        symptoms -- iterable of symptom tokens to look for
        entries  -- iterable of token lists (one per tweet); defaults to the
                    notebook-level `parsedText`

    A symptom is counted at most once per entry, however often it occurs
    there. The result is printed as (symptom, count) pairs sorted by
    descending count; nothing is returned.
    """
    if entries is None:
        # Fall back to the notebook-level token lists built in an earlier cell.
        entries = parsedText

    symptoms = set(symptoms)
    symptom_counts = collections.Counter()
    for entry in entries:
        # The intersection is a set, so repeats inside one entry count once.
        symptom_counts.update(symptoms.intersection(entry))

    pprint(symptom_counts.most_common())

#Load data

In [9]:
# Load the de-duplicated tweets; the context manager closes the file handle
# as soon as unpickling finishes.
# NOTE: pickle can execute arbitrary code -- only load files you trust.
with open("deduped_tweets.p", "rb") as tweet_file:
    deduped_tweets = pickle.load(tweet_file)

In [10]:
stoplist = stopwords.words('english')

In [11]:
# Load the symptom list; the context manager closes the file handle as soon
# as unpickling finishes.
# NOTE: pickle can execute arbitrary code -- only load files you trust.
with open("symptoms.p", "rb") as symptom_file:
    combined_symptoms = pickle.load(symptom_file)

#Build Dictionary

In [12]:
# Tokenize every tweet: keep element 0 of each unigram that
# ngrams(..., n=1) returns for the tweet's text.
parsedText = []
for tweet in deduped_tweets:
    unigrams = ngrams(tweet['text'], n=1)
    parsedText.append([one_gram[0] for one_gram in unigrams])

In [13]:
dictionary = corpora.Dictionary(parsedText)
print(dictionary)

Dictionary(2221 unique tokens: [u'adverse\u2026', u'suicidal', u'dyingtotune', u'http://t.co/pxsjttcdvm', u'appetite']...)


In [14]:
# Collect the ids of stop words that are present in the dictionary ...
stop_ids = []
for stopword in stoplist:
    if stopword in dictionary.token2id:
        stop_ids.append(dictionary.token2id[stopword])

# ... and the ids of tokens whose document frequency is exactly 1.
once_ids = []
for tokenid, docfreq in dictionary.dfs.iteritems():
    if docfreq == 1:
        once_ids.append(tokenid)

# Drop both groups, then close the gaps left in the id sequence.
dictionary.filter_tokens(stop_ids + once_ids)
dictionary.compactify()
print(dictionary)

Dictionary(489 unique tokens: [u'gt', u':(', u'forget', u'caused', u'resistance']...)


In [15]:
corpus = [dictionary.doc2bow(line) for line in parsedText]
print len(corpus)

341


#Apply K-Means

In [16]:
# Before Gensim
clusterTweets(deduped_tweets, corpus, n_clusters=10)



***** Cluster: 0 *****
   165 : taylor: she can have an anxiety attack. i dont like her
yasmin: anxiety attacks are not funny
taylor: neither is she
   240 : buy ativan 1mg - ativan 1mg, used to treat anxiety disorders, sleeping disorder and to relieve anxiety... http://t.co/hwmcfxtjwz
   304 : modafinil for anxiety attack disorder and social anxiety treatment http://t.co/aivh4opscg


***** Cluster: 1 *****
   1 : @billygunn19 they put me on it to counteract the decrease in hunger from the wellbutrin. thankfully i could use a few pounds :)
   2 : listen to lecture on overcoming sadness and depression - yasmin mogahed by yasmin.mogahed #np on #soundcloud
http://t.co/kfehqx4yw1
   3 : diabetes treatment drug, metformin, could help prevent blindness according #diabetesfree, #diabetes, #stopdiabetes http://t.co/au8xfme5jm
   4 : #ovariancyst risk factors: infertility treatment, tamoxifen, pregnancy, hypothyroidism, maternal gonadotropins, cigarettes, tubal ligation
   5 : re: vaccine tre

#Apply Topic Modeling, then Cluster

In [17]:
transformedCorpus, model = buildModel(corpus, modelType='lsi')
# newCorpus, model = buildModel(corpus, modelType='lda')
# newCorpus, model = buildModel(corpus, modelType='hdp')

In [18]:
clusterTweets(deduped_tweets, transformedCorpus, n_clusters=10)



***** Cluster: 0 *****
   6 : tramadol prescription pain medication spike emergency room visits http://t.co/cqbpznajmr
   31 : dear percocet,
you were supposed to make my pain go away, not make me sick. fuck. you. 😖
no love,
lori
   37 : more xanax and maalox, stat! :-) #gutfeld rt @greggutfeld: it's 45 minutes away and the butterflies in my stomach have diarrhea. #gutfeld
   58 : yasmin canada discount card panklav 250mg .. spironolactone teratogenic effects pain adverse… http://t.co/3q0mb5k9eh
   62 : a male pharmacist trying to convince me id be better off with ibuprofen + paracetamol than ibuprofen + codeine for period pain
   77 : gonna overdose on codeine to sleep the pain away...
   80 : u were supposed to pick me up from work and give me tramadol but now i'm off rideless &amp; in pain.. 👎
   90 : @iacoguy @interneteh @rex4711 i had iv dilaudid in er once. only drug that *ever* blocked pain being caused by my pinched nerve. wowser.
   92 : @ilovaussiesheps @interneteh @rex4711

#Look at presence of symptoms within filtered tweets

In [19]:
countSymptoms(combined_symptoms)

[(u'pain', 62),
 (u'anxiety', 27),
 (u'low', 20),
 (u'cough', 17),
 (u'chill', 16),
 (u'sick', 15),
 (u'depression', 14),
 (u'fit', 14),
 (u'muscle', 14),
 (u'insomnia', 12),
 (u'infection', 11),
 (u'bleeding', 9),
 (u'throat', 8),
 (u'gas', 7),
 (u'nausea', 7),
 (u'heartburn', 5),
 (u'stomach', 5),
 (u'appetite', 5),
 (u'coughing', 5),
 (u'injury', 5),
 (u'bloody', 4),
 (u'amnesia', 4),
 (u'wound', 3),
 (u'bleed', 3),
 (u'sweat', 3),
 (u'edema', 2),
 (u'swelling', 2),
 (u'urine', 2),
 (u'ache', 2),
 (u'diarrhea', 2),
 (u'vagina', 2),
 (u'tinnitus', 2),
 (u'drowsiness', 2),
 (u'vomiting', 2),
 (u'blindness', 2),
 (u'chills', 2),
 (u'gout', 1),
 (u'delusion', 1),
 (u'sweating', 1),
 (u'itch', 1),
 (u'hunger', 1),
 (u'thirst', 1),
 (u'impotence', 1),
 (u'stool', 1),
 (u'fainting', 1),
 (u'painful', 1),
 (u'hurting', 1),
 (u'fever', 1),
 (u'sore', 1),
 (u'fatigue', 1),
 (u'hearing', 1),
 (u'infertility', 1),
 (u'pneumonia', 1),
 (u'shaking', 1),
 (u'blackout', 1)]


#Explore Sentiment, Mood, Modality 

####Sentiment
The sentiment() function returns a (polarity, subjectivity)-tuple for the given sentence, based on the adjectives it contains, where polarity is a value between -1.0 and +1.0 and subjectivity between 0.0 and 1.0.

####Mood
The mood() function returns either INDICATIVE, IMPERATIVE, CONDITIONAL or SUBJUNCTIVE for a given parsed Sentence. See the table below for an overview of moods.


Mood |	Form |Use| Example
-----|-------|---|---------
INDICATIVE|none of the below 	|fact, belief	|It rains.
IMPERATIVE|	infinitive without to	|command, warning	|Don't rain!
CONDITIONAL|	would, could, should, may, or will, can + if	|conjecture	|It might rain.
SUBJUNCTIVE|	wish, were, or it is + infinitive	|wish, opinion	|I hope it rains.


####Modality
The modality() function returns the degree of certainty as a value between -1.0 and +1.0, where values > +0.5 represent facts. For example, "I wish it would stop raining" scores -0.35, whereas "It will stop raining" scores +0.75. Accuracy is about 68% for Wikipedia texts.

In [20]:
def analyseTweet(entry):
    """Return (polarity, subjectivity, mood, modality) for one tokenized tweet.

    `entry` is a sequence of tokens; they are joined with single spaces
    before being handed to pattern.en.
    """
    text = ' '.join(entry)

    # Sentiment is computed on the plain text.
    polarity, subjectivity = pattern.en.sentiment(text)

    # Mood and modality are computed on a parsed Sentence.
    parsed = pattern.en.parse(text, lemmata=True)
    sentence = pattern.en.Sentence(parsed)
    mood = pattern.en.mood(sentence)
    modality = pattern.en.modality(sentence)

    return polarity, subjectivity, mood, modality

In [25]:
if True:
    for entry in parsedText:
        polarity, subjectivity, mood, modality = analyseTweet(entry)
        if polarity < 0:
            line = ' '.join(entry)
            print line
            print 'Sentiment -- Polarity: {0}, Subjectivity: {1}'.format(polarity, subjectivity)
            print 'Mood:', mood, 'Modality:', modality
            print

one week off the vasopressin our son 's anxiety is back hitting himself and the repetitive behavior-all back asd http://t.co/movz6tp7kw
Sentiment -- Polarity: -0.0833333333333, Subjectivity: 0.0833333333333
Mood: indicative Modality: 0.583333333333

promethazine is nausea medicine chill ur not that hard
Sentiment -- Polarity: -0.291666666667, Subjectivity: 0.541666666667
Mood: indicative Modality: 1.0

yungtaxi wolfiehan yasmin chill u hoe :/
Sentiment -- Polarity: -0.25, Subjectivity: 1.0
Mood: indicative Modality: 1.0

green tea for weight loss http://t.co/pxgywpyzup diabetes diabetic diabetics insulin
Sentiment -- Polarity: -0.2, Subjectivity: 0.3
Mood: indicative Modality: 1.0

was planning to enter three scripts in the nywift fellowship but got sick now will attempt one or two through codeine haze
Sentiment -- Polarity: -0.714285714286, Subjectivity: 0.857142857143
Mood: conditional Modality: 0.75

soma sci-fi horror game from amnesia devs gets trailer and sept ... http://t.co/psc

#Boneyard