# Analyze medical-symptom-related tweets

Steps:
- Load list of drugs [done]
- Filter tweets by reference to drug [done]
- Add spam filter - tweets with large amount of common content
 - Consider locality sensitive hashing
- Additional filter to determine relevance
- Sentiment analysis of selected tweets

To Do:
- Fix IOError exception handling (continue with next file)

### Tweet Format

```python
[u'contributors',
 u'truncated',
 u'text',
 u'in_reply_to_status_id',
 u'id',
 u'favorite_count',
 u'source',
 u'retweeted',
 u'coordinates',
 u'timestamp_ms',
 u'entities',
 u'in_reply_to_screen_name',
 u'id_str',
 u'retweet_count',
 u'in_reply_to_user_id',
 u'favorited',
 u'retweeted_status',
 u'user',
 u'geo',
 u'in_reply_to_user_id_str',
 u'possibly_sensitive',
 u'lang',
 u'created_at',
 u'filter_level',
 u'in_reply_to_status_id_str',
 u'place',
 u'extended_entities']
```

In [1]:
import pickle
import gzip
import sys
from pprint import pprint
import collections
import os
import json

from pattern.en import parse
from pattern.en import ngrams

In [2]:
# Directory scanned for '.json.gz' tweet files; default for load_tweet and test_load_tweet.
DIR_PATH = './Corpus/'
# Largest number of words a multi-word drug name may have to be kept by createDrugLookupTable.
MAX_DRUG_NGRAM = 3

# Code

In [3]:
def all_tweets(directory, extract=lambda x:json.loads(x)):
    """Yield every record stored in the gzipped JSON files of a directory.

    Only files whose name ends with '.json.gz' are read. Each line of such a
    file is passed to `extract` and the result is yielded.

    Args:
        directory: path of the directory holding the '.json.gz' files; a
            trailing path separator is optional.
        extract: callable applied to each raw line (defaults to json.loads).

    Yields:
        extract(line) for every line of every matching file.

    A file that raises IOError while being opened or read is reported on
    stdout and skipped, and iteration continues with the next file.
    """
    for fn in os.listdir(directory):
        if not fn.endswith('.json.gz'):
            continue
        try:
            # `with` releases the handle even when reading fails part-way through
            with gzip.open(os.path.join(directory, fn), 'rb') as f:
                for line in f:
                    yield extract(line)
        except IOError:
            print('IOError for file {}'.format(fn))

In [4]:
def myfilter(x):
    """Parse one raw stream line and keep only the fields used downstream.

    Args:
        x: JSON text of a single stream message.

    Returns:
        None when the message is a 'delete' notice. Otherwise a dict holding
        whichever of these keys could be extracted from the message:
        'text', 'id_str', 'screen_name', 'user_mentions', 'hashtags', 'lang'.
        Mentions and hashtags are lower-cased. The dict is empty (falsy)
        when none of the fields is present.
    """
    x = json.loads(x)
    if 'delete' in x:
        return None

    result = {}
    if 'text' in x:
        result['text'] = x['text']
    if 'id_str' in x:
        result['id_str'] = x['id_str']

    # tolerate a missing or null 'user' object
    user = x.get('user') or {}
    if 'screen_name' in user:
        result['screen_name'] = user['screen_name']

    if 'entities' in x:
        # 'entities' may be null or lack either list; treat those as empty
        entities = x['entities'] or {}
        result['user_mentions'] = [men['screen_name'].lower()
                                   for men in entities.get('user_mentions', [])]
        result['hashtags'] = [entry['text'].lower()
                              for entry in entities.get('hashtags', [])]

    if 'lang' in x:
        result['lang'] = x['lang']
    return result

In [5]:
#test routine for all_tweets
def test_load_tweet(dir_path=DIR_PATH):
    all_count = 0
    count = 0
    has_text = 0
    users = collections.defaultdict(int)
    mentions = collections.defaultdict(int)
    hashtags = collections.defaultdict(int)
    try:
        for x in all_tweets(dir_path, extract=myfilter):
            if x:
                all_count += 1
            else:
                continue
            if 'text' in x:
                has_text += 1
            if x and x['lang']=='en':
                count += 1
                users[x['screen_name'].lower()] += 1
                for tag in x['hashtags']:
                    #pprint.pprint(tag)
                    hashtags[tag] += 1
                for men in x['user_mentions']:
                    mentions[men] += 1
    except IOError:
        print 'IOError'
        print 'foobar'
        pass
    print 'total tweets:', all_count
    print 'total tweets with text:', has_text
    print 'total english tweets: ', count
    print 'unique users: ',len(users)
    print 'unique mentions: ',len(mentions)
    print 'unique hashtags: ',len(hashtags)

In [6]:
def load_tweet(dir_path=DIR_PATH, progress=10000):
    result = []

    for i, x in enumerate(all_tweets(dir_path, extract=myfilter)):
        if i%(progress*10) == 0 and i > 0:
            print i,
            sys.stdout.flush()
        elif i%progress == 0 and i > 0:
            print '*',
            sys.stdout.flush()
            
        if not x:
            continue
        if 'text' not in x:
            continue
        if x and x['lang']=='en':
            result.append(x)

    return result

In [7]:
def createDrugLookupTable(fname, max_drug_ngram=MAX_DRUG_NGRAM):
    """Build drug-name lookup sets from a pickled collection of names.

    Names are stripped and lower-cased. Names without a space go into the
    single-word set. Names containing a space are grouped by their word
    count, and those longer than `max_drug_ngram` words are dropped (and
    counted as ignored). A short summary is printed to stdout.

    Args:
        fname: path of the pickle file holding an iterable of drug names.
        max_drug_ngram: largest word count kept for multi-word names.

    Returns:
        (idx_drugs, idx_drugs_ngram) where idx_drugs is the set of
        single-word names and idx_drugs_ngram is a list of sets such that
        idx_drugs_ngram[n] holds the kept names made of exactly n words.
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(fname, "rb") as f:
        drug_list = pickle.load(f)

    names = [name.strip().lower() for name in drug_list]
    idx_drugs = {name for name in names if ' ' not in name}
    all_ngrams = [name for name in names if ' ' in name]

    idx_drugs_ngram = [set() for _ in range(max_drug_ngram+1)]
    ignored = 0
    for ngram in all_ngrams:
        size = len(ngram.split())
        if size <= max_drug_ngram:
            idx_drugs_ngram[size].add(ngram)
        else:
            ignored += 1

    print('{}:'.format(fname))
    print('    Simple drug names: {0}  NGRAM Drug names: {1}'.format(len(idx_drugs),
                                                                  len(all_ngrams)))
    print('    NGRAM drug names ignore due to length: {}'.format(ignored))
    return idx_drugs, idx_drugs_ngram

In [8]:
def filterToken(tok):
    """Return tok without its first character when that character is '#' or '@'."""
    if tok.startswith(('#', '@')):
        return tok[1:]
    return tok

In [9]:
def analyzeTweets(tweet_corpus, ignore=[], progress=10000):
    counts = collections.defaultdict(int)
    results = []
    idx_rx_drugs_subset = idx_rx_drugs.difference(set(ignore))  # filter out ignored drugs
    for tweetno, t in enumerate(tweet_corpus):
        if tweetno%(progress*10) == 0 and tweetno > 0:
            print tweetno,
            sys.stdout.flush()
        elif tweetno%progress == 0 and tweetno > 0:
            print '*',
            sys.stdout.flush()
        
        txt = t['text'].lower()
    
        if txt.startswith('rt'):  # ignore retweets
            continue
        
        # process simple drug names
        all_tokens = {filterToken(one_gram[0]) for one_gram in ngrams(txt, n=1)}
        drug_ref = all_tokens.intersection(idx_rx_drugs_subset)
        if drug_ref:
            results.append({'drugs':list(drug_ref), 'text':txt})
            # print '**', list(drug_ref), '**', txt
            # print
            for tok in drug_ref:
                counts[tok] += 1

        # now process ngram drug names  ** Currently Disabled **
        if False:
            for ngram_len in range(2, MAX_DRUG_NGRAM+1):
                for ngram in ngrams(txt, n=ngram_len):
                    for i, val in enumerate(ngram):
                        if val.startswith('#') or val.startswith('@'):
                            ngram[i] = val[1:]
                    tok = ' '.join(ngram)
                    if tok in idx_rx_drugs and tok not in ignore:
                        print '**', tok, '**', txt
                        counts[tok] += 1
    return results, counts

# Load Data

In [10]:
# Load the pickled symptom data; the context manager closes the file promptly.
# NOTE: unpickling can execute arbitrary code; only load trusted files.
with open("symptoms.p", "rb") as symptoms_file:
    combined_symptoms = pickle.load(symptoms_file)

In [11]:
# Build the (single-word set, per-length n-gram sets) lookup tables for each
# pickled drug list; every call also prints a short summary of what was loaded.
idx_all_drugs, idx_all_drugs_ngram = createDrugLookupTable('all_drugs.p')
idx_current_drugs, idx_current_drugs_ngram = createDrugLookupTable('current_drugs.p')
# idx_rx_drugs is the table analyzeTweets matches against
idx_rx_drugs, idx_rx_drugs_ngram = createDrugLookupTable('rx_drugs.p')
idx_otc_drugs, idx_otc_drugs_ngram = createDrugLookupTable('otc_drugs.p')
idx_discontinued_drugs, idx_discontinued_drugs_ngram = createDrugLookupTable('discontinued_drugs.p')

all_drugs.p:
    Simple drug names: 3972  NGRAM Drug names: 3810
    NGRAM drug names ignore due to length: 1302
current_drugs.p:
    Simple drug names: 2312  NGRAM Drug names: 2408
    NGRAM drug names ignore due to length: 828
rx_drugs.p:
    Simple drug names: 2253  NGRAM Drug names: 2215
    NGRAM drug names ignore due to length: 774
otc_drugs.p:
    Simple drug names: 78  NGRAM Drug names: 219
    NGRAM drug names ignore due to length: 56
discontinued_drugs.p:
    Simple drug names: 2311  NGRAM Drug names: 2236
    NGRAM drug names ignore due to length: 704


In [12]:
# Load every English tweet with text from the default corpus directory (DIR_PATH).
tweet_corpus = load_tweet()

* * * IOError for file 1433112631492061.json.gz
* * * * * * 100000 * * * * * * * * * 200000 * * * * * * * * * 300000 * * * * * * * * * 400000 * * * * * * * * * 500000 * * * * * * * * * 600000 * * * * * * * * * 700000 * * * * * * * * * 800000 * * * * * * * * * 900000 * * IOError for file 1433160165278168.json.gz


In [13]:
# Number of tweets kept by load_tweet.
print len(tweet_corpus)

787237


# Analysis

In [14]:
# Drug names excluded from matching when passed to analyzeTweets.
# NOTE(review): presumably names that mostly occur in tweets with a non-drug
# meaning (first names, slang, common words) - confirm.
ignore = [
    'camila',
    'nikki',
    'testosterone',
    'ella',
    'fml',
    'muse',
    'viagra',
    'talc',
    'bal',
    'nicotine',
    'oxytocin',
]

In [15]:
# Match the corpus against the single-word Rx drug names, skipping the ignored ones.
results, counts = analyzeTweets(tweet_corpus, ignore)

* * * * * * * * * 100000 * * * * * * * * * 200000 * * * * * * * * * 300000 * * * * * * * * * 400000 * * * * * * * * * 500000 * * * * * * * * * 600000 * * * * * * * * * 700000 * * * * * * * *


In [16]:
# One entry in results per tweet that matched at least one drug name.
print 'Number of tweets with possible drug references: {}'.format(len(results))

Number of tweets with possible drug references: 485


Sample tweets with possible drug references

In [17]:
# Show the first 100 matched tweets together with the drug names found in each.
pprint(results[:100])

[{'drugs': [u'vasopressin'],
  'text': u"one week off the vasopressin; our son's anxiety is back, hitting himself, and the repetitive behavior-all back #asd http://t.co/movz6tp7kw"},
 {'drugs': [u'wellbutrin'],
  'text': u'@billygunn19 they put me on it to counteract the decrease in hunger from the wellbutrin. thankfully i could use a few pounds :)'},
 {'drugs': [u'yasmin'],
  'text': u'listen to lecture on overcoming sadness and depression - yasmin mogahed by yasmin.mogahed #np on #soundcloud\nhttp://t.co/kfehqx4yw1'},
 {'drugs': [u'dexamethasone'],
  'text': u're: vaccine treatments for brain tumours: dexamethasone remains critical to control edema, but does this inhibit immune therapy? #asco15'},
 {'drugs': [u'zyrtec'],
  'text': u'@jacob_ladder @besafe71 try zyrtec, works the same w/o the drowsiness.'},
 {'drugs': [u'prilosec'],
  'text': u'free prilosec sample for heartburn (new link) http://t.co/fr70xvvxp2) http://t.co/9wn1skegde'},
 {'drugs': [u'sonata'],
  'text': u'fit 2006-20

Number of tweets in which each trade-name Rx drug is referenced (tweets containing a symptom-related keyword)

In [18]:
# Per-drug tweet counts, most frequently matched drug first.
pprint(sorted(counts.items(), key=lambda z: z[1], reverse=True))

[(u'qsymia', 138),
 (u'copper', 43),
 (u'ibuprofen', 42),
 (u'xanax', 34),
 (u'ssd', 27),
 (u'heather', 19),
 (u'soma', 12),
 (u'valium', 8),
 (u'sonata', 7),
 (u'ambien', 7),
 (u'betadine', 6),
 (u'yasmin', 6),
 (u'reglan', 6),
 (u'prozac', 6),
 (u'glycine', 5),
 (u'percocet', 5),
 (u'seroquel', 4),
 (u'acetaminophen', 4),
 (u'neosporin', 4),
 (u'tobi', 3),
 (u'zofran', 3),
 (u'cipro', 3),
 (u'paxil', 3),
 (u'adrenalin', 3),
 (u'yaz', 3),
 (u'naproxen', 3),
 (u'selsun', 3),
 (u'norco', 3),
 (u'cialis', 3),
 (u'xifaxan', 2),
 (u'doral', 2),
 (u'progesterone', 2),
 (u'zoladex', 2),
 (u'zoloft', 2),
 (u'lasix', 2),
 (u'mirena', 2),
 (u'diclofenac', 2),
 (u'cardura', 2),
 (u'capex', 2),
 (u'clonidine', 2),
 (u'abilify', 2),
 (u'prilosec', 2),
 (u'rivaroxaban', 2),
 (u'azithromycin', 2),
 (u'sps', 2),
 (u'zantac', 2),
 (u'nuvigil', 2),
 (u'ativan', 2),
 (u'lyrica', 1),
 (u'modafinil', 1),
 (u'xarelto', 1),
 (u'zithromax', 1),
 (u'pce', 1),
 (u'lamictal', 1),
 (u'albendazole', 1),
 (u'wellb

In [19]:
# Persist the matched tweets and the per-drug counts; `with` guarantees each
# output file is flushed and closed once it has been written.
with open("filtered_tweets.p", "wb") as out_file:
    pickle.dump(results, out_file)
with open("filtered_tweet_counts.p", "wb") as out_file:
    pickle.dump(counts, out_file)