In [36]:
import pickle
import pandas as pd

In [184]:
import re
from nltk.stem.snowball import SnowballStemmer

infection_words = ['get', 'got', 'recov', 'have', 'had', 'has', 'catch', 'cure', 'infect', 'rest', 'wors', 'weaken', 'weak']
possession_words = ['bird', 'the flu', 'flu', 'sick', 'epidem']
concern_words = ['afraid', 'worri', 'scare', 'fear', 'nervous', 'dread','terrifi']
vaccination_words = ['vaccin', 'shot', 'mist', 'tamiflu', 'jab', 'nasal spray']
symptom_words = ['fever', 'cough', 'sore', 'throat', 'runni', 'stuffi', 'ach', 'tire', 'fatigu']
cdc_words = ["acut", "respiratori", "ill", "ari", "adjuv", "antigen", "virus", "avian", "suspect", "investig", "contact", "close", "confirm", "care", "facil", "oseltamivir", "peramivir", "transmiss", "sever", "potenti", "probabl", "respiratori", "season", "sari", "zanamivir"]
positive_emoticons = [':)', ':D']
negative_emoticons = [':(', ':/']

stemmer = SnowballStemmer('english')

def count_infection_words(tweet_content):
    count = 0
    for word in tweet_content:
        word = stemmer.stem(word)
        if word in infection_words:
            count += 1
    return count

def count_cdc_words(tweet_content):
    count = 0
    for word in tweet_content:
        word = stemmer.stem(word)
        if word in cdc_words:
            count += 1
    return count

def count_possession_words(tweet_content):
    count = 0
    for word in tweet_content:
        word = stemmer.stem(word)
        if word in possession_words:
            count += 1
    return count

def count_concern_words(tweet_content):
    count = 0
    for word in tweet_content:
        word = stemmer.stem(word)
        if word in concern_words:
            count += 1
    return count

def count_vaccination_words(tweet_content):
    count = 0
    for word in tweet_content:
        word = stemmer.stem(word)
        if word in vaccination_words:
            count += 1
    return count

def count_symptom_words(tweet_content):
    count = 0
    for word in tweet_content:
        word = stemmer.stem(word)
        if word in symptom_words:
            count += 1
    return count

def count_positive_emoticons(tweet_content):
    count = 0
    for word in positive_emoticons:
        if word in tweet_content:
            count += 1
    return count

def count_negative_emoticons(tweet_content):
    count = 0
    for word in negative_emoticons:
        if word in tweet_content:
            count += 1
    return count

def count_mentions(tweet_content):
    return len(re.findall('^@\S+', str(tweet_content)))

def count_hashtags(tweet_content):
    return len(re.findall('^#\S+', str(tweet_content)))

def contains_url(tweet_content):
    return bool(re.search('http[s]?: // (?:[a-zA-Z] |[0-9] |[$-_ @.& +] |[! * \(\),] | (?: %[0-9a-fA-F][0-9a-fA-F]))+', str(tweet_content)))

def determine_length(tweet_content):
    return len(tweet_content)

In [164]:
with open("data/influenza/influenza.train", 'rb') as handle:
    train_x, train_y = pickle.load(handle)
    

In [165]:
all_data = list(zip(train_x, train_y)) 
df = pd.DataFrame(all_data, columns = ['Tweet Content', 'Label']) 


In [166]:
pd.set_option("display.min_rows", 100)
pd.set_option('display.max_colwidth', -1)

# Looking at some examples of "infection" tweets and "not infection" tweets

In [167]:
infection_set = df[df["Label"] == 1]
display(infection_set)

Unnamed: 0,Tweet Content,Label
5,yes virus dari gw. RT @mrcandries: eek im getting flu. *uhuk uhuk\n,1
8,I'm getting better.....Swine 'flu will NOT be the end of me.....MWAHAHAHAHAHAHAHAHAHAHA!! XP\n,1
9,@zeitgeist10 Thats good! I think I had the flu aswell this weekend. Now only have a sore throat and my teacher are terrified of me. LOL\n,1
10,"And Im not going to work anymore, not because im scared of u turning me in, getting caught. I have the swine flu!\n",1
12,@Jo5269 still got htis fcukin flu and back to work morrow nite :(. getting to post off ice morrow post that dvd xxx\n,1
13,so im starting to think ive got swine flu. cant hardly breathe. might have to go to the hospital if it keeps getting expontentially worse\n,1
16,muh. if I am getting sick and it's not swine flu I am going to be SO PISSED.\n,1
25,@MarcoCervantes baby ...eh hmm. idk what to say. ashley is sick like flu ish? im afraid\n,1
26,@GillianCohen uhh is this true? RT @dougbushbc @beckyhammer Gilly's mom has Bird Flu\n,1
29,@suga_boo good morning! I'm not feeling great. Hoping I'm not getting the flu!\n,1


In [168]:
noninfection_set = df[df["Label"] == 0]
display(noninfection_set)

Unnamed: 0,Tweet Content,Label
0,"@marathonali I'm trying to rest but always so much to do. I'm nervous about the swine flu, the high school has had a handful of kids so far.\n",0
1,I'm thinking about getting a flu shot\n,0
2,Flu experts gear up for pandemic of vaccine worry http://bit.ly/2Ujj6f\n,0
3,Is Dr. Oz Getting A Swine Flu Shot? : Sometimes it's awkward to talk about your health and your body. As you kn.. http://bit.ly/GkTUz\n,0
4,New research helps explain why bird flu has not caused a pandemic http://bit.ly/1Q6b3j\n,0
6,"You tweet bird flu, and you use oinkment for swine flu - grammar school humor!\n",0
7,A couple places to avoid if you're worried about swine flu. http://bit.ly/JIMTu\n,0
11,"Day 4: Seinfeld, Seinfeld, friends, friends, Seinfeld, Seinfeld, sex and the city, sex and the city....the flu is getting boring\n",0
14,Doctors Concerned FluMist Vaccine Could Spread Live H1N1 Virus (is the swine flu nasal spray a pandemic waiting to happen?)\n,0
15,Forum draws crowd concerned about H1N1 flu http://bit.ly/nHHFO\n,0


In [169]:
df['Infection Word Count'] = df["Tweet Content"].apply(count_infection_words)
df['Possession Word Count'] = df["Tweet Content"].apply(count_possession_words)
df['Concern Word Count'] = df["Tweet Content"].apply(count_concern_words)
df['Vaccination Word Count'] = df["Tweet Content"].apply(count_vaccination_words)
df['Symptom Word Count'] = df["Tweet Content"].apply(count_symptom_words)
df['Positive Emoticon Count'] = df["Tweet Content"].apply(count_positive_emoticons)
df['Negative Emoticon Count'] = df["Tweet Content"].apply(count_negative_emoticons)
df['CDC Word Count'] = df["Tweet Content"].apply(count_cdc_words)
df['Mention Count'] = df["Tweet Content"].apply(count_mentions)
df['Hashtag Count'] = df["Tweet Content"].apply(count_hashtags)
df['Contains URL'] = df["Tweet Content"].apply(contains_url)
df['Tweet Length'] = df["Tweet Content"].apply(determine_length)

In [170]:
df[df["Label"] == 0].sort_values("Infection Word Count")

Unnamed: 0,Tweet Content,Label,Infection Word Count,Possession Word Count,Concern Word Count,Vaccination Word Count,Symptom Word Count,Positive Emoticon Count,Negative Emoticon Count,CDC Word Count,Mention Count,Hashtag Count,Contains URL,Tweet Length
2645,"Someone shot it! @stefanpinto RT Hey, whatever happened to bird flu?\n",0,0,2,0,1,0,0,0,0,0,0,False,69
1267,Should You Get the Swine Flu Vaccine?: Worried about H1N1? The folks at the Center for Disease Control and Preve.. http://bit.ly/gAa3g\n,0,0,0,0,0,0,0,1,0,0,0,False,135
1265,"News Alert: ""White House Concerned About Spreading Flu in Churches"" Read more. http://bit.ly/13D1Fh\n",0,0,0,0,0,0,0,1,0,0,0,False,100
2503,PIG AIDS! #swineflu Half of people fear swine flu: poll - The Press Association: Washington .. http://bit.ly/NHX0L\n,0,0,1,1,0,0,0,1,0,0,0,False,115
1259,Swine Flu Spreading Widely; Worry Over Pregnant Women: With swine flu now widespread acros.. http://bit.ly/3irPrY http://bit.ly/1Y7RsT\n,0,0,1,0,0,0,0,1,0,0,0,False,135
1258,RT @jazgar Wall Street Banks Getting Swine Flu Vaccine Before Many High-Risk Groups (VIDEO) http://bit.ly/25p83g\n,0,0,0,0,0,0,0,1,0,0,0,False,113
1257,"Swine flu, bird flu, mad cows - I don't fancy Old McDonald's chances, d'you?\n",0,0,2,0,0,0,0,0,0,0,0,False,77
1256,http://bit.ly/4ngDZN Holy water in the era of swine flu: an electronic dispenser: Amid fear.. http://bit.ly/4mtQrI\n,0,0,1,1,0,0,0,1,0,0,0,False,115
1255,Getting swine flu vaccine top business concern - The Associated Press http://tinyurl.com/yeepnsn\n,0,0,1,0,1,0,0,1,0,0,0,False,97
1252,Who knew? Getting a flu shot at Safeway!\n,0,0,1,0,1,0,0,0,0,0,0,False,41


In [171]:
df[df["Label"] == 1].sort_values("Infection Word Count")

Unnamed: 0,Tweet Content,Label,Infection Word Count,Possession Word Count,Concern Word Count,Vaccination Word Count,Symptom Word Count,Positive Emoticon Count,Negative Emoticon Count,CDC Word Count,Mention Count,Hashtag Count,Contains URL,Tweet Length
1586,Happy Halloween everyone.. I'm stuck at home with the flu but dont worry its not swine flu =p\n,1,0,2,0,0,0,0,0,0,0,0,False,94
1266,Getting over the shortest bout of flu ever\n,1,0,1,0,0,0,0,0,0,0,0,False,43
2777,Just Getting over This Nasty Flu\n,1,0,0,0,0,0,0,0,0,0,0,False,33
1941,"Off To Bed, I'm Tired And Feel Like I'm Getting The Flu!!! I Hope Not Cause I Hate Feeling Sick...\n",1,0,0,0,0,0,0,0,0,0,0,False,99
1278,"In bed with flu today, meditating on God's attributes in Mary's song-Luke 1:46-55. Thought: Replace worry with Worship-creator of universe!\n",1,0,1,0,0,0,0,0,0,0,0,False,140
2789,38 degrees is possible swine flu watching the thermometer go up. at 36.9 right now im scared :/\n,1,0,1,1,0,0,0,1,0,0,0,False,96
2794,Feeling under the weather... thinking that Husband passed his flu-like symptoms on to me. Opting to go in a bit later this morning...\n,1,0,1,0,0,0,0,0,0,0,0,False,134
2797,broncos lost so i left town im grubbing bbq and hanging sum family but kinda worried sum of dem are coughing (oh no swine flu)\n,1,0,1,1,0,1,0,0,0,0,0,False,127
771,Everybody is qettin thaa flu.!!!! I'm scared.\n,1,0,1,1,0,0,0,0,0,0,0,False,46
433,"@KLBarber If it's 'flu, at the mo', it probably is A(H1N1). Not to worry, it's barely worse than seasonal at present. Get well soon.\n",1,0,1,0,0,0,0,0,0,1,0,False,133


Here, we're just seeing how many words in each set (and what percentage) have a particular feature. If there's a significant difference in the percentage especially, then we'll leave it as a feature to see what the model can do

In [172]:
print(noninfection_set[df["Infection Word Count"] < 2].count())
print(noninfection_set[df["Infection Word Count"] < 2].count() / noninfection_set.count())
print(infection_set[df["Infection Word Count"] < 2].count())
print(infection_set[df["Infection Word Count"] < 2].count() / infection_set.count())

Tweet Content    2344
Label            2344
dtype: int64
Tweet Content    0.908879
Label            0.908879
dtype: float64
Tweet Content    579
Label            579
dtype: int64
Tweet Content    0.793151
Label            0.793151
dtype: float64


  """Entry point for launching an IPython kernel.
  
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


In [173]:
print(noninfection_set[df["Possession Word Count"] < 2].count())
print(noninfection_set[df["Possession Word Count"] < 2].count() / noninfection_set.count())
print(infection_set[df["Possession Word Count"] < 2].count())
print(infection_set[df["Possession Word Count"] < 2].count() / infection_set.count())

Tweet Content    1961
Label            1961
dtype: int64
Tweet Content    0.760372
Label            0.760372
dtype: float64
Tweet Content    376
Label            376
dtype: int64
Tweet Content    0.515068
Label            0.515068
dtype: float64


  """Entry point for launching an IPython kernel.
  
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


In [174]:
print(noninfection_set[df["Concern Word Count"] < 1].count())
print(noninfection_set[df["Concern Word Count"] < 1].count() / noninfection_set.count())
print(infection_set[df["Concern Word Count"] < 1].count())
print(infection_set[df["Concern Word Count"] < 1].count() / infection_set.count())

Tweet Content    2063
Label            2063
dtype: int64
Tweet Content    0.799922
Label            0.799922
dtype: float64
Tweet Content    639
Label            639
dtype: int64
Tweet Content    0.875342
Label            0.875342
dtype: float64


  """Entry point for launching an IPython kernel.
  
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


In [175]:
print(noninfection_set[df["Vaccination Word Count"] < 1].count())
print(noninfection_set[df["Vaccination Word Count"] < 1].count() / noninfection_set.count())
print(infection_set[df["Vaccination Word Count"] < 1].count())
print(infection_set[df["Vaccination Word Count"] < 1].count() / infection_set.count())

Tweet Content    1708
Label            1708
dtype: int64
Tweet Content    0.662272
Label            0.662272
dtype: float64
Tweet Content    705
Label            705
dtype: int64
Tweet Content    0.965753
Label            0.965753
dtype: float64


  """Entry point for launching an IPython kernel.
  
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


In [176]:
print(noninfection_set[df["Symptom Word Count"] < 1].count())
print(noninfection_set[df["Symptom Word Count"] < 1].count() / noninfection_set.count())
print(infection_set[df["Symptom Word Count"] < 1].count())
print(infection_set[df["Symptom Word Count"] < 1].count() / infection_set.count())

Tweet Content    2491
Label            2491
dtype: int64
Tweet Content    0.965878
Label            0.965878
dtype: float64
Tweet Content    653
Label            653
dtype: int64
Tweet Content    0.894521
Label            0.894521
dtype: float64


  """Entry point for launching an IPython kernel.
  
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


In [177]:
print(noninfection_set[df["Positive Emoticon Count"] < 1].count())
print(noninfection_set[df["Positive Emoticon Count"] < 1].count() / noninfection_set.count())
print(infection_set[df["Positive Emoticon Count"] < 1].count())
print(infection_set[df["Positive Emoticon Count"] < 1].count() / infection_set.count())

Tweet Content    2545
Label            2545
dtype: int64
Tweet Content    0.986817
Label            0.986817
dtype: float64
Tweet Content    710
Label            710
dtype: int64
Tweet Content    0.972603
Label            0.972603
dtype: float64


  """Entry point for launching an IPython kernel.
  
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


In [178]:
print(noninfection_set[df["Negative Emoticon Count"] < 1].count())
print(noninfection_set[df["Negative Emoticon Count"] < 1].count() / noninfection_set.count())
print(infection_set[df["Negative Emoticon Count"] < 1].count())
print(infection_set[df["Negative Emoticon Count"] < 1].count() / infection_set.count())

Tweet Content    1538
Label            1538
dtype: int64
Tweet Content    0.596355
Label            0.596355
dtype: float64
Tweet Content    642
Label            642
dtype: int64
Tweet Content    0.879452
Label            0.879452
dtype: float64


  """Entry point for launching an IPython kernel.
  
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


In [179]:
print(noninfection_set[df["Mention Count"] < 1].count())
print(noninfection_set[df["Mention Count"] < 1].count() / noninfection_set.count())
print(infection_set[df["Mention Count"] < 1].count())
print(infection_set[df["Mention Count"] < 1].count() / infection_set.count())

Tweet Content    2201
Label            2201
dtype: int64
Tweet Content    0.853432
Label            0.853432
dtype: float64
Tweet Content    564
Label            564
dtype: int64
Tweet Content    0.772603
Label            0.772603
dtype: float64


  """Entry point for launching an IPython kernel.
  
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


In [180]:
print(noninfection_set[df["Hashtag Count"] < 1].count())
print(noninfection_set[df["Hashtag Count"] < 1].count() / noninfection_set.count())
print(infection_set[df["Hashtag Count"] < 1].count())
print(infection_set[df["Hashtag Count"] < 1].count() / infection_set.count())

Tweet Content    2530
Label            2530
dtype: int64
Tweet Content    0.981
Label            0.981
dtype: float64
Tweet Content    725
Label            725
dtype: int64
Tweet Content    0.993151
Label            0.993151
dtype: float64


  """Entry point for launching an IPython kernel.
  
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


In [181]:
print(noninfection_set[df["Contains URL"] == True].count())
print(noninfection_set[df["Contains URL"] == True].count() / noninfection_set.count())
print(infection_set[df["Contains URL"] == True].count())
print(infection_set[df["Contains URL"] == True].count() / infection_set.count())

Tweet Content    0
Label            0
dtype: int64
Tweet Content    0.0
Label            0.0
dtype: float64
Tweet Content    0
Label            0
dtype: int64
Tweet Content    0.0
Label            0.0
dtype: float64


  """Entry point for launching an IPython kernel.
  
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


In [182]:
print(noninfection_set[df["Tweet Length"] < 100].count())
print(noninfection_set[df["Tweet Length"] < 100].count() / noninfection_set.count())
print(infection_set[df["Tweet Length"] < 100].count())
print(infection_set[df["Tweet Length"] < 100].count() / infection_set.count())

Tweet Content    957
Label            957
dtype: int64
Tweet Content    0.371074
Label            0.371074
dtype: float64
Tweet Content    346
Label            346
dtype: int64
Tweet Content    0.473973
Label            0.473973
dtype: float64


  """Entry point for launching an IPython kernel.
  
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


Based on these results, we decided to remove positive emoticons and URL presence. 

In [183]:
print(noninfection_set[df["CDC Word Count"] < 2].count())
print(noninfection_set[df["CDC Word Count"] < 2].count() / noninfection_set.count())
print(infection_set[df["CDC Word Count"] < 2].count())
print(infection_set[df["CDC Word Count"] < 2].count() / infection_set.count())

Tweet Content    2344
Label            2344
dtype: int64
Tweet Content    0.908879
Label            0.908879
dtype: float64
Tweet Content    579
Label            579
dtype: int64
Tweet Content    0.793151
Label            0.793151
dtype: float64


  """Entry point for launching an IPython kernel.
  
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


In [195]:
infection_words = ['get', 'got', 'recov', 'have', 'had', 'has', 'catch', 'cure', 'infect', 'rest', 'wors', 'weaken', 'weak']
possession_words = ['bat', 'civet', 'pangolin', 'snake', 'epidem', 'sick', 'market', 'pandem', 'posit']
concern_words = ['afraid', 'worri', 'scare', 'fear', 'nervous', 'dread','terrifi']
vaccination_words = ['vaccin', 'drug']
symptom_words = ['fever', 'chill', 'shake', 'muscl', 'headach', 'sore', 'throat', 'tast', 'loss', 'smell', 'cough', 'short', 'breath', 'difficulti']
cdc_words = ['fatal', 'case', 'exposur', 'communiti', 'spread', 'transmiss', 'contact', 'trace', 'drive', 'test', 'droplet', 'epidem', 'flatten', 'isol', 'n95', 'quarantin', 'self', 'distanc', 'ventil']
positive_emoticons = [':)', ':D']
negative_emoticons = [':(', ':/']

In [194]:
for word in cdc_words:
    print(stemmer.stem(word))

print(stemmer.stem("pandemic"))

fatal
case
exposur
communiti
spread
transmiss
contact
trace
drive
test
droplet
epidem
flatten
isol
n95
negat
posit
quarantin
self
distanc
ventil
pandem


In [197]:
noninfection_set.count()

Tweet Content    2579
Label            2579
dtype: int64