#Cleanse Medical Symptom related tweets

###Updated 6/4/15

Steps:
- Load list of drugs [done]
- Filter tweets by reference to drug [done]
- Add spam filter - tweets with large amount of common content [done]
- Apply Topic Modeling and Clustering [See separate notebook]
- Additional filter to determine relevance
- Sentiment analysis of selected tweets

###Tweet Format

```python
[u'contributors',
 u'truncated',
 u'text',
 u'in_reply_to_status_id',
 u'id',
 u'favorite_count',
 u'source',
 u'retweeted',
 u'coordinates',
 u'timestamp_ms',
 u'entities',
 u'in_reply_to_screen_name',
 u'id_str',
 u'retweet_count',
 u'in_reply_to_user_id',
 u'favorited',
 u'retweeted_status',
 u'user',
 u'geo',
 u'in_reply_to_user_id_str',
 u'possibly_sensitive',
 u'lang',
 u'created_at',
 u'filter_level',
 u'in_reply_to_status_id_str',
 u'place',
 u'extended_entities']
```

In [1]:
import pickle
import gzip
import sys
from pprint import pprint
import collections
import os
import json

from pattern.en import parse
from pattern.en import ngrams
from pattern.en.wordlist import BASIC as BASIC_WORDS
# import lshash

#Configure

In [2]:
# Directory scanned for the *.json.gz tweet files; used as the default
# `dir_path` of load_tweet() and test_load_tweet().
DIR_PATH = './Corpus/'
# NOTE(review): the three settings below are not referenced by any cell visible
# in this notebook (dedupeTweets is called with literal hash_len=8 and
# hash_bits=24) -- confirm whether they are still needed.
MAX_DRUG_NGRAM = 3
HASH_LEN = 5
HASH_BITS = 28

In [3]:
# Tokens to exclude from drug matching. The list is passed as the `ignore`
# argument of filterTweetCorpus(), which removes these entries from the drug
# lookup set before tweets are scanned.
# NOTE(review): presumably these are names that are too ambiguous or too common
# in everyday tweets to indicate a drug reference -- confirm the selection
# criteria with the author.
ignore = [
    'camila',
    'nikki',
    'testosterone',
    'ella',
    'fml',
    'muse', 
    'viagra',
    'talc',
    'bal',
    'nicotine',
    'heather',
    'plan',
    'alcohol',
    'vitamin',
    'metro',
    'android',
    'calcium',
    'capital',
    'xenon',
    'lo', 
    'sodium',
    'amino',
    'caffeine',
    'sterile',
    'extended',
    'aspirin',
    'ssd',
    'potassium',
    'olive',
    'balanced',
    'magnesium',
    'flavored',
    'peg', 
    'ammonia',
    'ms',
    'soybean',
    'zinc',
    'tobi',
    'capex',
    'sulfur',
    'sps',
    'folic',
    'citric',
    'arsenic',
    'ibuprofen',
    'tylenol',
    ]

#Code

##Filter Tweets based on reference to drug

In [4]:
def all_tweets(directory, extract=lambda x:json.loads(x)):
    """Generator yielding every tweet stored in a directory of gzipped files.

    Each file whose name ends in '.json.gz' is opened with gzip and read line
    by line; every line is passed through `extract` and the result is yielded.

    directory -- path of the directory to scan (with or without a trailing
                 path separator)
    extract   -- callable applied to each raw line; defaults to json.loads

    An IOError raised while opening or reading a file is reported on stdout
    and the remainder of that file is skipped; iteration continues with the
    next file. Exceptions of any other type (for example from `extract`)
    propagate to the caller.
    """
    for fn in os.listdir(directory):
        if not fn.endswith('.json.gz'):
            continue
        try:
            # The with-block closes the file even when `extract` raises or the
            # consumer abandons the generator part-way through a file.
            with gzip.open(os.path.join(directory, fn), 'rb') as f:
                for line in f:
                    yield extract(line)
        except IOError:
            # Parenthesised single-argument form prints identically under
            # Python 2's print statement.
            print('IOError for file {}'.format(fn))

In [5]:
def myfilter(x):
    """Parse one raw JSON tweet line and keep only the fields used downstream.

    x -- JSON text of a single stream record

    Returns None for 'delete' records. Otherwise returns a dict holding
    whichever of these keys could be filled from the record:
      'text', 'id_str', 'lang'       -- copied unchanged
      'screen_name'                  -- taken from record['user']
      'user_mentions', 'hashtags'    -- lower-cased lists built from
                                        record['entities'] (both keys are set
                                        whenever 'entities' is present; a
                                        missing sub-list yields [])
    A record with none of these fields yields an empty dict, which callers
    treat as "skip" because it is falsy.
    """
    tweet = json.loads(x)
    if 'delete' in tweet:
        return None

    result = {}
    for key in ('text', 'id_str', 'lang'):
        if key in tweet:
            result[key] = tweet[key]
    if 'user' in tweet and 'screen_name' in tweet['user']:
        result['screen_name'] = tweet['user']['screen_name']
    if 'entities' in tweet:
        entities = tweet['entities']
        # .get() keeps a record that lacks one of the sub-lists from raising
        # KeyError and aborting the whole corpus load.
        result['user_mentions'] = [men['screen_name'].lower()
                                   for men in entities.get('user_mentions', [])]
        result['hashtags'] = [entry['text'].lower()
                              for entry in entities.get('hashtags', [])]
    return result

In [6]:
#test routine for all_tweets
def test_load_tweet(dir_path=DIR_PATH):
    """Debug routine, should be deletd in future version"""
    all_count = 0
    count = 0
    has_text = 0
    users = collections.defaultdict(int)
    mentions = collections.defaultdict(int)
    hashtags = collections.defaultdict(int)
    try:
        for x in all_tweets(dir_path, extract=myfilter):
            if x:
                all_count += 1
            else:
                continue
            if 'text' in x:
                has_text += 1
            if x and x['lang']=='en':
                count += 1
                users[x['screen_name'].lower()] += 1
                for tag in x['hashtags']:
                    #pprint.pprint(tag)
                    hashtags[tag] += 1
                for men in x['user_mentions']:
                    mentions[men] += 1
    except IOError:
        print 'IOError'
        print 'foobar'
        pass
    print 'total tweets:', all_count
    print 'total tweets with text:', has_text
    print 'total english tweets: ', count
    print 'unique users: ',len(users)
    print 'unique mentions: ',len(mentions)
    print 'unique hashtags: ',len(hashtags)

In [7]:
def load_tweet(dir_path=DIR_PATH, progress=10000):
    """Loads corpus of tweets, where corpus is composed of a set of
    xxx.json.gz files within a specified directory"""
    result = []

    for i, x in enumerate(all_tweets(dir_path, extract=myfilter)):
        if i%(progress*10) == 0 and i > 0:
            print i,
            sys.stdout.flush()
        elif i%progress == 0 and i > 0:
            print '*',
            sys.stdout.flush()
            
        if not x:
            continue
        if 'text' not in x:
            continue
        if x and x['lang']=='en':
            result.append(x)

    return result

In [8]:
def createDrugLookupTable(fname):
    """Build the set of drug-name tokens used for tweet lookups.

    fname -- path of a pickle file containing an iterable of drug names

    For multi-word (n-gram) names only the first word is kept. Names are
    stripped and lower-cased, blank names are skipped, and every token that is
    also in pattern's BASIC word list is removed. The size of the resulting
    set is printed.

    Returns the set of lower-cased first-word tokens.
    """
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(fname, "rb") as f:
        drug_list = pickle.load(f)

    idx_drugs = set()
    for name in drug_list:
        words = name.strip().lower().split()
        if words:  # a blank entry has no first word to index
            idx_drugs.add(words[0])

    # Remove drug names that are also part of the basic english language
    idx_drugs -= {w.lower() for w in BASIC_WORDS}
    print('Total Unique Drug Name Prefixes: {0} in file {1}'.format(len(idx_drugs), fname))
    return idx_drugs

In [9]:
def filterToken(tok):
    """Drop one leading '#' or '@' from a token; other tokens pass through."""
    if tok.startswith(('#', '@')):
        return tok[1:]
    return tok

In [10]:
def filterTweetCorpus(tweet_corpus, drugs, ignore=[], progress=10000, ignore_retweet=True):
    """Top level routine to filter tweets based on presence of drug name
    ngram drug name lookup currently disabled (low hit rate, improves speed)"""
    counts = collections.defaultdict(int)
    results = []
    drugs_subset = drugs.difference(set(ignore))  # filter out ignored drugs
    for tweetno, t in enumerate(tweet_corpus):
        if tweetno%(progress*10) == 0 and tweetno > 0:
            print tweetno,
            sys.stdout.flush()
        elif tweetno%progress == 0 and tweetno > 0:
            print '*',
            sys.stdout.flush()
        
        txt = t['text'].lower()
    
        if ignore_retweet and txt.startswith('rt'):  # ignore retweets
            continue
        
        # drug names
        all_tokens = {filterToken(one_gram[0]) for one_gram in ngrams(txt, n=1)}
        drug_ref = all_tokens.intersection(drugs_subset)
        if drug_ref:
            results.append({'drugs':list(drug_ref), 'text':txt})
            # print '**', list(drug_ref), '**', txt
            # print
            for tok in drug_ref:
                counts[tok] += 1

    return results, counts

##Dedupe

In [11]:
# Source: http://primes.utm.edu/lists/small/10000.txt
# primeTable[n] is the largest prime p with p <= 2**n.
# No prime is <= 2**0, so index 0 holds 1 as a placeholder.
# dedupeTweets() indexes this table with int(hash_bits/2), so the table
# covers hash_bits values up to 33.
primeTable = [
    1, # 0 (placeholder, see above)
    2, # 1
    3, # 2
    7, # 3
    13, # 4
    31, # 5
    61, # 6
    127, # 7
    251, # 8
    509, # 9
    1021, # 10
    2039, # 11
    4093, # 12
    8191, # 13
    16381, # 14
    32749, # 15
    65521, # 16
]

In [12]:
def adlerHashN(text, modulo=2**12):
    """Adler-style rolling checksum of the lower-cased `text`.

    Two running sums are kept, both starting at 1: `low` adds the code point
    of each character, `high` adds the current value of `low` after each
    character. The result packs them as (high % modulo) * modulo + low; note
    that `low` itself is not reduced by `modulo`.
    """
    low = 1
    high = 1
    for code in map(ord, text.lower()):
        low += code
        high += low
    packed_high = (high % modulo) * modulo
    return packed_high + low

# Disabled manual check: change the condition to True to print the hashes of a
# sentence, two close variants of it, and a short word for comparison.
if False:  # Test code
    print adlerHashN('the quick brown fox jumped over the lazy dog')
    print adlerHashN('the quick brown fox jumped over the lazy do')
    print adlerHashN('the quick brown cat jumped over the lazy dog')
    print adlerHashN('the')

In [13]:
def adler_lookup_or_insert(hashTable, tweet_text, tweetid, similarity_threshold=0.6,
                         hash_len=10,  modulo=4093, verbose=False):
    """Report whether a similar tweet was already indexed; index this one if not.

    Every `hash_len`-character window of `tweet_text` is hashed with
    adlerHashN(). A previously inserted tweet counts as a match when it shares
    at least `threshold` window hashes with this text, where
    threshold = (len(tweet_text) - hash_len) * similarity_threshold.

    hashTable            -- dict-like mapping hash key -> set of tweet ids;
                            updated in place when no match is found
    tweet_text           -- text to test
    tweetid              -- identifier stored in the buckets for this text
    similarity_threshold -- fraction of hashes that need to match to declare
                            a duplicate
    hash_len             -- window length in characters
    modulo               -- modulus forwarded to adlerHashN()
    verbose              -- print details of the matching tweet

    Returns True when a match exists (nothing is inserted), otherwise inserts
    `tweetid` into the bucket of every window hash and returns False. A text
    shorter than `hash_len` produces no windows, so it is neither matched nor
    inserted and the function returns False.
    """

    total_hashes = len(tweet_text) - hash_len
    threshold = total_hashes * similarity_threshold

    # Hash each window once; the same keys serve the lookup and the insert.
    hashKeys = [adlerHashN(tweet_text[i:i+hash_len], modulo=modulo)
                for i in range(total_hashes+1)]

    # Determine whether similar tweet already exists
    hits = 0
    candidates = collections.defaultdict(int)
    for hashKey in hashKeys:
        # .get() avoids creating an empty bucket for every key probed when
        # hashTable is a defaultdict.
        bucket = hashTable.get(hashKey)
        if bucket:
            hits += 1
            for otherTweetid in bucket:
                candidates[otherTweetid] += 1

    if hits >= threshold:
        for candidateTweetid, candidateHits in candidates.items():
            if candidateHits >= threshold:
                if verbose:
                    print('{0} Found matching tweet {1}  hits: {2} threshold:{3}'.format(tweetid,
                                                                                         candidateTweetid,
                                                                                         candidateHits, threshold))
                return True

    # Insert tweetid in appropriate hash buckets (in place, no set copies)
    for hashKey in hashKeys:
        hashTable.setdefault(hashKey, set()).add(tweetid)

    return False

In [14]:
def dedupeTweets(tweets, hash_bits=24, hash_len=10, progress=1000, similarity_threshold=0.6, verbose=False):
    """Top level routine to dedupe a set of tweets using locality sensitive hashing.
    hash_bits = number of buckets in hash table = 2**hash_bits
    hash_len = number of characters included in each hash
    """
    global primeTable
    modulo = primeTable[int(hash_bits/2)]
    print 'Modulo:{0}  Max: {1}'.format(modulo, 2**int(hash_bits/2))
    hashTable = collections.defaultdict(set)

    count = 0
    results = []
    for i, tweet in enumerate(tweets):
        if i and i%progress == 0:
            print '*',
            sys.stdout.flush()
        # normalize text by removing URL's and balancinf whitespace
        normalized_text = ' '.join([x for x in tweet['text'].split() if not (x.startswith('http://')
                                                                            or x.startswith('https://'))])            
        if adler_lookup_or_insert(hashTable, normalized_text, i,
                                  hash_len=hash_len, modulo=modulo, similarity_threshold=similarity_threshold):
            count += 1
            if verbose:
                print 'Found match:'
                pprint(tweet)
                print
        else:
            results.append(tweet)
    print
    print 'Total duplicate tweets:', count
    return results

#Load Data

In [15]:
# Load the pickled symptom data; the with-block closes the file promptly.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open("symptoms.p", "rb") as f:
    combined_symptoms = pickle.load(f)

In [16]:
# Build one first-word lookup set per pickled drug list; each call prints the
# size of the resulting set.
idx_all_drugs = createDrugLookupTable('all_drugs.p')
idx_current_drugs = createDrugLookupTable('current_drugs.p')
idx_rx_drugs = createDrugLookupTable('rx_drugs.p')
idx_otc_drugs = createDrugLookupTable('otc_drugs.p')
idx_discontinued_drugs = createDrugLookupTable('discontinued_drugs.p')

Total Unique Drug Name Prefixes: 5297 in file all_drugs.p
Total Unique Drug Name Prefixes: 3296 in file current_drugs.p
Total Unique Drug Name Prefixes: 3190 in file rx_drugs.p
Total Unique Drug Name Prefixes: 175 in file otc_drugs.p
Total Unique Drug Name Prefixes: 3245 in file discontinued_drugs.p


In [17]:
# Read the corpus under DIR_PATH, keeping only English tweets that have text.
tweet_corpus = load_tweet()

* * * IOError for file 1433112631492061.json.gz
* * * * * * 100000 * * * * * * * * * 200000 * * * * * * * * * 300000 * * * * * * * * * 400000 * * * * * * * * * 500000 * * * * * * * * * 600000 * * * * * * * * * 700000 * * * * * * * * * 800000 * * * * * * * * * 900000 * * IOError for file 1433160165278168.json.gz
* * * * * * * 1000000 * * * * * * * * * 1100000 * * * * * * * * * 1200000 * * IOError for file 1433472551470078.json.gz
* * * * * * * 1300000 * * * * * * * * * 1400000 * * * * * * * * * 1500000 * * * * * * * * * 1600000 * * * * * * * * * 1700000 * * * * * * * * * 1800000 * * * * * * * * * 1900000 * * * * * * * * * 2000000 * * * * IOError for file 1433509491447986.json.gz


In [18]:
# Number of tweets retained by load_tweet().
print len(tweet_corpus)

1737748


#Cleanse Data

In [19]:
# Keep tweets (retweets skipped by default) containing a token from
# idx_rx_drugs, after removing the names listed in `ignore` from that set.
results, counts = filterTweetCorpus(tweet_corpus, idx_rx_drugs, ignore)

* * * * * * * * * 100000 * * * * * * * * * 200000 * * * * * * * * * 300000 * * * * * * * * * 400000 * * * * * * * * * 500000 * * * * * * * * * 600000 * * * * * * * * * 700000 * * * * * * * * * 800000 * * * * * * * * * 900000 * * * * * * * * * 1000000 * * * * * * * * * 1100000 * * * * * * * * * 1200000 * * * * * * * * * 1300000 * * * * * * * * * 1400000 * * * * * * * * * 1500000 * * * * * * * * * 1600000 * * * * * * * * * 1700000 * * *


In [20]:
# Size of the filtered result list returned by filterTweetCorpus().
print 'Number of tweets with possible drug references: {}'.format(len(results))

Number of tweets with possible drug references: 1238


Sample tweets with possible drug references

In [21]:
# Show the first 20 matches (matched tokens plus lower-cased tweet text).
pprint(results[:20])

[{'drugs': [u'vasopressin'],
  'text': u"one week off the vasopressin; our son's anxiety is back, hitting himself, and the repetitive behavior-all back #asd http://t.co/movz6tp7kw"},
 {'drugs': [u'wellbutrin'],
  'text': u'@billygunn19 they put me on it to counteract the decrease in hunger from the wellbutrin. thankfully i could use a few pounds :)'},
 {'drugs': [u'yasmin'],
  'text': u'listen to lecture on overcoming sadness and depression - yasmin mogahed by yasmin.mogahed #np on #soundcloud\nhttp://t.co/kfehqx4yw1'},
 {'drugs': [u'metformin'],
  'text': u'diabetes treatment drug, metformin, could help prevent blindness according #diabetesfree, #diabetes, #stopdiabetes http://t.co/au8xfme5jm'},
 {'drugs': [u'tamoxifen'],
  'text': u'#ovariancyst risk factors: infertility treatment, tamoxifen, pregnancy, hypothyroidism, maternal gonadotropins, cigarettes, tubal ligation'},
 {'drugs': [u'dexamethasone'],
  'text': u're: vaccine treatments for brain tumours: dexamethasone remains critic

Number of tweets containing a symptom-related keyword in which each trade-name RX drug is referenced

In [22]:
# Matched drug tokens ordered by the number of tweets containing them,
# most frequent first.
pprint(sorted(counts.items(), key=lambda z: z[1], reverse=True))

[(u'qsymia', 138),
 (u'saxenda', 111),
 (u'insulin', 81),
 (u'codeine', 72),
 (u'xanax', 67),
 (u'morphine', 52),
 (u'methamphetamine', 44),
 (u'betadine', 31),
 (u'sonata', 29),
 (u'orlistat', 21),
 (u'tramadol', 20),
 (u'soma', 19),
 (u'acetaminophen', 17),
 (u'prozac', 15),
 (u'ambien', 13),
 (u'adderall', 13),
 (u'yaz', 13),
 (u'norco', 13),
 (u'valium', 11),
 (u'oxytocin', 11),
 (u'cialis', 11),
 (u'dopamine', 10),
 (u'phentermine', 10),
 (u'cortisone', 9),
 (u'prilosec', 9),
 (u'promethazine', 9),
 (u'ortho', 9),
 (u'yasmin', 8),
 (u'paxil', 8),
 (u'hydrocodone', 8),
 (u'naproxen', 8),
 (u'glycine', 7),
 (u'oxycodone', 7),
 (u'polyethylene', 7),
 (u'penicillin', 7),
 (u'metformin', 7),
 (u'percocet', 7),
 (u'neosporin', 7),
 (u'zofran', 7),
 (u'ativan', 7),
 (u'zoloft', 6),
 (u'reglan', 6),
 (u'nitric', 6),
 (u'lorazepam', 6),
 (u'amoxicillin', 6),
 (u'avc', 6),
 (u'warfarin', 6),
 (u'selenium', 5),
 (u'gabapentin', 5),
 (u'adrenalin', 5),
 (u'lithium', 5),
 (u'celecoxib', 4),
 (

In [23]:
# Persist the filter output; the with-blocks guarantee each file is flushed
# and closed as soon as its dump completes.
with open("filtered_tweets.p", "wb") as f:
    pickle.dump(results, f)
with open("filtered_tweet_counts.p", "wb") as f:
    pickle.dump(counts, f)

###Perform Dedupe

In [24]:
# Drop near-duplicate tweets from the filtered results using 8-character
# windows, then report the list sizes before and after.
deduped_tweets = dedupeTweets(results, hash_len=8, hash_bits=24, similarity_threshold=0.6)
print 'Before: {0}  After: {1}'.format(len(results),len(deduped_tweets))

Modulo:4093  Max: 4096
*
Total duplicate tweets: 517
Before: 1238  After: 721


In [25]:
# Persist the deduplicated tweets; the with-block guarantees the file is
# flushed and closed as soon as the dump completes.
with open("deduped_tweets.p", "wb") as f:
    pickle.dump(deduped_tweets, f)