# Analyze Medical Symptom-Related Tweets

Steps:
- Load list of drugs [done]
- Filter tweets by reference to drug [done]
- Additional filter to determine relevance
- Sentiment analysis of selected tweets

To Do:
- Fix IOError exception handling (continue with next file)

### Tweet Format

```python
[u'contributors',
 u'truncated',
 u'text',
 u'in_reply_to_status_id',
 u'id',
 u'favorite_count',
 u'source',
 u'retweeted',
 u'coordinates',
 u'timestamp_ms',
 u'entities',
 u'in_reply_to_screen_name',
 u'id_str',
 u'retweet_count',
 u'in_reply_to_user_id',
 u'favorited',
 u'retweeted_status',
 u'user',
 u'geo',
 u'in_reply_to_user_id_str',
 u'possibly_sensitive',
 u'lang',
 u'created_at',
 u'filter_level',
 u'in_reply_to_status_id_str',
 u'place',
 u'extended_entities']
```

In [1]:
import pickle
import gzip
import sys
from pprint import pprint
import collections
import os
import json

from pattern.en import parse
from pattern.en import ngrams

In [2]:
# Default directory scanned for '*.json.gz' tweet files by load_tweet() and
# test_load_tweet(). The trailing slash is kept so that plain string
# concatenation of directory and file name also yields a valid path.
DIR_PATH = './Corpus/'
# Longest multi-word drug name (in words) that is indexed by
# createDrugLookupTable() and searched for in the analysis loop.
MAX_DRUG_NGRAM = 3

# Code

In [3]:
# Generator function that yields every tweet stored in a directory of gzipped files
def all_tweets(directory, extract=lambda x:json.loads(x)):
    """Yield one extracted record per line of every ``*.json.gz`` file in ``directory``.

    Parameters
    ----------
    directory : str
        Folder to scan (non-recursive). A trailing path separator is optional.
    extract : callable, optional
        Applied to each raw line read from an archive; its return value is
        what gets yielded. Defaults to ``json.loads``.

    Yields
    ------
    object
        Whatever ``extract`` returns for each line.

    Notes
    -----
    An ``IOError`` raised while opening or reading a file is reported on
    stdout, the rest of that file is skipped, and iteration continues with
    the next file.
    """
    for fn in os.listdir(directory):
        if not fn.endswith('.json.gz'):
            continue
        f = None
        try:
            # os.path.join works whether or not `directory` ends with a separator
            f = gzip.open(os.path.join(directory, fn), 'rb')
            for line in f:
                yield extract(line)
        except IOError:
            print('IOError for file {}'.format(fn))
        finally:
            # Release the handle even if reading fails part-way through or the
            # consumer stops iterating early.
            if f is not None:
                f.close()

In [4]:
def myfilter(x):
    """Parse one raw JSON tweet line and keep only the fields used downstream.

    Parameters
    ----------
    x : str
        A single JSON-encoded line.

    Returns
    -------
    dict or None
        ``None`` when the decoded object has a ``'delete'`` key. Otherwise a
        dict holding whichever of these could be read from the tweet:
        ``'text'``, ``'id_str'``, ``'screen_name'`` (from ``user``),
        ``'user_mentions'`` and ``'hashtags'`` (lower-cased lists, set only
        when the tweet has ``entities``) and ``'lang'``. Keys whose source
        field is missing are left out, so the dict may be empty.
    """
    tweet = json.loads(x)
    if 'delete' in tweet:
        # Explicit falsy result; callers only test truthiness.
        return None

    result = {}
    if 'text' in tweet:
        result['text'] = tweet['text']
    if 'id_str' in tweet:
        result['id_str'] = tweet['id_str']

    # `or {}` tolerates an explicit JSON null for the nested objects
    user = tweet.get('user') or {}
    if 'screen_name' in user:
        result['screen_name'] = user['screen_name']

    if 'entities' in tweet:
        entities = tweet['entities'] or {}
        # .get avoids a KeyError when one of the entity lists is absent
        result['user_mentions'] = [men['screen_name'].lower()
                                   for men in entities.get('user_mentions', [])]
        result['hashtags'] = [entry['text'].lower()
                              for entry in entities.get('hashtags', [])]

    if 'lang' in tweet:
        result['lang'] = tweet['lang']
    return result

In [5]:
#test routine for all_tweets
def test_load_tweet(dir_path=DIR_PATH):
    """Scan ``dir_path`` with ``all_tweets``/``myfilter`` and print summary counts.

    Parameters
    ----------
    dir_path : str, optional
        Directory handed to ``all_tweets``; defaults to ``DIR_PATH``.

    Prints the number of non-empty filtered tweets, how many of them have
    text, how many are English (``lang == 'en'``), and the number of distinct
    screen names, mentions and hashtags seen among the English tweets.
    Returns nothing.
    """
    all_count = 0
    count = 0
    has_text = 0
    users = collections.defaultdict(int)
    mentions = collections.defaultdict(int)
    hashtags = collections.defaultdict(int)
    try:
        for x in all_tweets(dir_path, extract=myfilter):
            if not x:
                continue
            all_count += 1
            if 'text' in x:
                has_text += 1
            # myfilter only sets these keys when the tweet carries them, so
            # read them defensively instead of indexing directly.
            if x.get('lang') != 'en':
                continue
            count += 1
            if 'screen_name' in x:
                users[x['screen_name'].lower()] += 1
            for tag in x.get('hashtags', ()):
                hashtags[tag] += 1
            for men in x.get('user_mentions', ()):
                mentions[men] += 1
    except IOError:
        print('IOError')
    # Single-argument print(...) behaves the same under Python 2 and 3;
    # spacing matches the original comma-separated print statements.
    print('total tweets: {}'.format(all_count))
    print('total tweets with text: {}'.format(has_text))
    print('total english tweets:  {}'.format(count))
    print('unique users:  {}'.format(len(users)))
    print('unique mentions:  {}'.format(len(mentions)))
    print('unique hashtags:  {}'.format(len(hashtags)))

In [6]:
def load_tweet(dir_path=DIR_PATH):
    """Load every English tweet that has text from the files in ``dir_path``.

    Parameters
    ----------
    dir_path : str, optional
        Directory handed to ``all_tweets``; defaults to ``DIR_PATH``.

    Returns
    -------
    list of dict
        The ``myfilter`` results that contain a ``'text'`` key and whose
        ``'lang'`` equals ``'en'``, in the order they were read.
    """
    result = []

    for x in all_tweets(dir_path, extract=myfilter):
        # Skip delete notices / empty results and tweets without text
        if not x or 'text' not in x:
            continue
        # myfilter only sets 'lang' when the tweet has it, so avoid a KeyError
        if x.get('lang') == 'en':
            result.append(x)

    return result

In [7]:
def createDrugLookupTable(fname, max_drug_ngram=MAX_DRUG_NGRAM):
    """Build lookup sets of drug names from a pickled collection of strings.

    Parameters
    ----------
    fname : str
        Path of a pickle file holding an iterable of drug-name strings.
    max_drug_ngram : int, optional
        Longest multi-word name (counted in whitespace-separated words) that
        is indexed; longer names are only counted as ignored.

    Returns
    -------
    (idx_drugs, idx_drugs_ngram)
        ``idx_drugs`` is the set of stripped, lower-cased names that contain
        no space. ``idx_drugs_ngram`` is a list of ``max_drug_ngram + 1``
        sets where ``idx_drugs_ngram[n]`` holds the stripped, lower-cased
        names made of exactly ``n`` words (indexes 0 and 1 stay empty).

    A short summary (file name, number of single-word names, number of
    multi-word names, number ignored for length) is printed to stdout.
    """
    # NOTE(security): pickle.load can execute arbitrary code; only use it on
    # trusted files.
    with open(fname, "rb") as fh:
        drug_list = pickle.load(fh)

    # Normalize once, then split into single-word and multi-word names
    names = [name.strip().lower() for name in drug_list]
    idx_drugs = {name for name in names if ' ' not in name}
    all_ngrams = [name for name in names if ' ' in name]

    idx_drugs_ngram = [set() for _ in range(max_drug_ngram + 1)]
    ignored = 0
    for ngram in all_ngrams:
        size = len(ngram.split())
        if size <= max_drug_ngram:
            idx_drugs_ngram[size].add(ngram)
        else:
            ignored += 1

    print('{}:'.format(fname))
    print('    Simple drug names: {0}  NGRAM Drug names: {1}'.format(len(idx_drugs),
                                                                 len(all_ngrams)))
    print('    NGRAM drug names ignore due to length: {}'.format(ignored))
    return idx_drugs, idx_drugs_ngram

# Load Data

In [8]:
# Load the pickled symptom data; the `with` block closes the file handle
# as soon as loading finishes.
# NOTE(security): pickle.load can execute arbitrary code; only use it on
# trusted files.
with open("symptoms.p", "rb") as symptoms_file:
    combined_symptoms = pickle.load(symptoms_file)

In [9]:
# Build one pair of lookup tables per pickled drug list: `idx_*_drugs` is the
# set of single-word names, `idx_*_drugs_ngram` the per-length sets of
# multi-word names. Each call also prints a short summary of the file.
idx_all_drugs, idx_all_drugs_ngram = createDrugLookupTable('all_drugs.p')
idx_current_drugs, idx_current_drugs_ngram = createDrugLookupTable('current_drugs.p')
idx_rx_drugs, idx_rx_drugs_ngram = createDrugLookupTable('rx_drugs.p')
idx_otc_drugs, idx_otc_drugs_ngram = createDrugLookupTable('otc_drugs.p')
idx_discontinued_drugs, idx_discontinued_drugs_ngram = createDrugLookupTable('discontinued_drugs.p')

all_drugs.p:
    Simple drug names: 3622  NGRAM Drug names: 2691
    NGRAM drug names ignore due to length: 951
current_drugs.p:
    Simple drug names: 2076  NGRAM Drug names: 1706
    NGRAM drug names ignore due to length: 615
rx_drugs.p:
    Simple drug names: 2019  NGRAM Drug names: 1546
    NGRAM drug names ignore due to length: 576
otc_drugs.p:
    Simple drug names: 68  NGRAM Drug names: 174
    NGRAM drug names ignore due to length: 40
discontinued_drugs.p:
    Simple drug names: 2099  NGRAM Drug names: 1500
    NGRAM drug names ignore due to length: 476


# Analysis

In [10]:
# Read the whole corpus from DIR_PATH into memory (English tweets with text only)
tweet_corpus = load_tweet()

IOError for file 1433112631492061.json.gz
IOError for file 1433160165278168.json.gz


In [11]:
# Number of tweets kept by load_tweet()
print len(tweet_corpus)

787237


In [12]:
# Drug names excluded from matching in the counting loop below.
# NOTE(review): presumably these are names that also occur as ordinary words
# or personal names and produce false positives -- confirm the selection.
ignore = [
'camila',
'nikki',
'testosterone',
'ella',
'fml',
'muse', 
'viagra',
'talc',
'bal',
    ]

In [13]:
counts = collections.defaultdict(int)
for t in tweet_corpus:
    txt = t['text'].lower()
    
    # process simple drug names
    for one_gram in ngrams(txt, n=1):
        tok = one_gram[0]
        if tok.startswith('#') or tok.startswith('@'):
            tok = tok[1:]
        if tok in idx_rx_drugs and tok not in ignore:
            print '**', tok, '**', txt
            counts[tok] += 1

    # now process ngram drug names
    for ngram_len in range(2, MAX_DRUG_NGRAM+1):
        for ngram in ngrams(txt, n=ngram_len):
            for i, val in enumerate(ngram):
                if val.startswith('#') or val.startswith('@'):
                    ngram[i] = val[1:]
            tok = ' '.join(ngram)
            if tok in idx_rx_drugs and tok not in ignore:
                print '**', tok, '**', txt
                counts[tok] += 1

** wellbutrin ** @billygunn19 they put me on it to counteract the decrease in hunger from the wellbutrin. thankfully i could use a few pounds :)
** yasmin ** listen to lecture on overcoming sadness and depression - yasmin mogahed by yasmin.mogahed #np on #soundcloud
http://t.co/kfehqx4yw1
** soma ** rt @nichegamer: amnesia spiritual successor, soma, is set for a september 22 release on ps4 and pc http://t.co/bhrrcesaen http://t.co/exw9b…
** dexamethasone ** re: vaccine treatments for brain tumours: dexamethasone remains critical to control edema, but does this inhibit immune therapy? #asco15
** zyrtec ** @jacob_ladder @besafe71 try zyrtec, works the same w/o the drowsiness.
** prilosec ** free prilosec sample for heartburn (new link) http://t.co/fr70xvvxp2) http://t.co/9wn1skegde
** sonata ** fit 2006-2010 06-09 10 hyundai sonata wind deflector window visor sun guard 4pc http://t.co/ncuhhmgggj http://t.co/jluvx8d2gx
** yasmin ** @yungtaxi @wolfiehan yasmin chill u hoe :/
** abilify ** 

Number of instances where a trade-name RX drug is referenced in a tweet containing a symptom-related keyword

In [14]:
# Show the per-drug mention tallies accumulated in `counts`
pprint(counts.items())

[(u'betadine', 6),
 (u'lyrica', 1),
 (u'modafinil', 1),
 (u'doral', 2),
 (u'tobi', 5),
 (u'xarelto', 1),
 (u'zithromax', 6),
 (u'chantix', 1),
 (u'pce', 1),
 (u'zofran', 4),
 (u'belviq', 1),
 (u'sonata', 7),
 (u'ssd', 28),
 (u'yasmin', 6),
 (u'progesterone', 2),
 (u'zoladex', 2),
 (u'zoloft', 5),
 (u'lasix', 21),
 (u'lamictal', 1),
 (u'nitroglycerin', 2),
 (u'mirena', 2),
 (u'wellbutrin', 1),
 (u'gabapentin', 1),
 (u'reglan', 6),
 (u'klonopin', 1),
 (u'soma', 19),
 (u'cipro', 3),
 (u'paxil', 5),
 (u'ibuprofen', 43),
 (u'avage', 1),
 (u'propofol', 1),
 (u'xanax', 41),
 (u'uloric', 1),
 (u'omeprazole', 2),
 (u'skyla', 1),
 (u'androgel', 1),
 (u'levitra', 1),
 (u'flagyl', 11),
 (u'valium', 8),
 (u'dexamethasone', 1),
 (u'amoxicillin', 3),
 (u'dilaudid', 1),
 (u'cardura', 2),
 (u'oxandrolone', 1),
 (u'ondansetron', 1),
 (u'capex', 3),
 (u'clonidine', 2),
 (u'nuvigil', 2),
 (u'abilify', 2),
 (u'seroquel', 4),
 (u'adrenalin', 3),
 (u'ciprofloxacin', 1),
 (u'prozac', 9),
 (u'yaz', 4),
 (u'ten