### Train a Sentiment Classifier Using Annotated CrowdFlower Data
A Super Handy CrowdFlower Glossary of Terms can be found [here](https://success.crowdflower.com/hc/en-us/articles/202703305-Glossary-of-Terms)!

In [17]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline




# Allow up to 500 rows to be rendered when a frame is displayed.
pd.set_option("display.max_rows", 500)
# -1 is the legacy value for "do not truncate column contents" (presumably so
# full tweet text is visible).
# NOTE(review): newer pandas releases expect None here instead of -1 -- confirm
# against the installed pandas version before changing.
pd.set_option('display.max_colwidth', -1)

#### Read-In Jobs-Level Data (from CrowdFlower's *Data for Everyone* [library](https://www.crowdflower.com/data-for-everyone/))

In [18]:
# Load the non-aggregated export: one row per worker judgement of a tweet.
cf = pd.read_csv("http://cdn2.hubspot.net/hub/346378/file-2612489700-csv/DFE_CSVs/Airline-Full-Non-Ag-DFE-Sentiment.csv")

# Encode the sentiment labels numerically (positive=1, neutral=0, negative=-1).
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the selected column: an in-place call on a
# column selection may act on a temporary copy and leave `cf` unchanged.
cf["airline_sentiment"] = cf["airline_sentiment"].replace(
    ["positive", "neutral", "negative"], [1, 0, -1]
)

# Single-argument print(...) behaves the same under Python 2 and Python 3.
print(cf.columns)
print(cf.airline_sentiment.value_counts())
cf.head(2)

Index([u'_unit_id', u'_created_at', u'_golden', u'_id', u'_missed',
       u'_started_at', u'_tainted', u'_channel', u'_trust', u'_worker_id',
       u'_country', u'_region', u'_city', u'_ip', u'airline_sentiment',
       u'negativereason', u'airline', u'airline_sentiment_gold', u'name',
       u'negativereason_gold', u'retweet_count', u'text', u'tweet_coord',
       u'tweet_created', u'tweet_id', u'tweet_location', u'user_timezone'],
      dtype='object')
-1    36280
 0    11027
 1    8476 
Name: airline_sentiment, dtype: int64


Unnamed: 0,_unit_id,_created_at,_golden,_id,_missed,_started_at,_tainted,_channel,_trust,_worker_id,...,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_id,tweet_location,user_timezone
0,681448150,2/25/2015 04:52:40,False,1575073003,,2/25/2015 04:49:12,False,elite,0.8108,31110645,...,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,570306133677760513,,Eastern Time (US & Canada)
1,681448150,2/25/2015 05:22:10,False,1575093916,,2/25/2015 05:19:59,False,prodege,0.8919,1908948,...,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,570306133677760513,,Eastern Time (US & Canada)


#### Aggregate Data up to Tweet Level 

In [19]:
def aggregate(df):
    """Summarise judgements per (tweet_id, _golden, text, airline_sentiment) group.

    For every distinct combination of the four key columns found in ``df``,
    counts the non-null ``_trust`` values (``n_judgements``) and averages them
    (``avg_trust``).

    Returns a new DataFrame holding the four key columns followed by
    ``n_judgements`` and ``avg_trust``.
    """
    keys = ["tweet_id", "_golden", "text", "airline_sentiment"]
    trust_by_group = df.groupby(by=keys, as_index=False)["_trust"]

    counts = trust_by_group.count().rename(columns={"_trust": "n_judgements"})
    means = trust_by_group.mean().rename(columns={"_trust": "avg_trust"})

    # TODO (carried over from the original note): remove duplicate test tweets!
    return pd.merge(counts, means, on=keys)

In [20]:
agg = aggregate(cf)
## Look at tweets with multiple sentiment labels: after aggregation, a
## (tweet_id, _golden, text) combination that appears on more than one row
## received more than one distinct airline_sentiment value.
agg.loc[agg.duplicated(subset=["tweet_id", "_golden", "text"], keep=False)]

Unnamed: 0,tweet_id,_golden,text,airline_sentiment,n_judgements,avg_trust
10,567623209026334720,False,"Wanted to get my bag benefit, but instead get $25 pricing on all three tickets. When adding a card, MP Visa is only option. @united",-1,2,0.761100
11,567623209026334720,False,"Wanted to get my bag benefit, but instead get $25 pricing on all three tickets. When adding a card, MP Visa is only option. @united",0,1,0.880000
25,567680108002291712,False,@JetBlue No worries. Delay was minor and dealt with nicely. It was captain of flight 2324 by the way.,-1,1,0.864900
26,567680108002291712,False,@JetBlue No worries. Delay was minor and dealt with nicely. It was captain of flight 2324 by the way.,1,2,0.856650
27,567686758708817921,False,@SouthwestAir Is it a temporary site glitch or are you no longer offering flights from GRR to GEG after Feb? Can't find any online :(,-1,1,0.793100
28,567686758708817921,False,@SouthwestAir Is it a temporary site glitch or are you no longer offering flights from GRR to GEG after Feb? Can't find any online :(,0,2,0.878350
29,567686845903826947,False,@united clicked on the link and got this? #confused http://t.co/xMAQcucWZl,-1,1,0.918900
30,567686845903826947,False,@united clicked on the link and got this? #confused http://t.co/xMAQcucWZl,0,2,0.883700
31,567688325276770306,False,"@SouthwestAir Guys, we've got to do something about the inability to check in online for international flight that has... (1/2)",-1,1,0.756800
32,567688325276770306,False,"@SouthwestAir Guys, we've got to do something about the inability to check in online for international flight that has... (1/2)",0,2,0.864850


#### Old Code....

In [None]:
## NOTE(review): `pivot_tweets` is not defined in any visible cell of this
## notebook; unless it is defined elsewhere this cell raises NameError on a
## fresh kernel. A later cell builds cf_g / cf_ng by filtering on `_golden`.
cf_g, cf_ng = pivot_tweets(cf)

## Summary counts: judgement rows overall, then distinct tweet ids overall and
## within the test / non-test subsets.
print "Total Number of Judgements: ", cf.shape[0]
print "Total Number of Tweets:     ", len(cf.tweet_id.unique())
print "Number of Test Tweets:      ", len(cf_g.tweet_id.unique())
print "Number of Non-Test Tweets:  ", len(cf_ng.tweet_id.unique())

In [None]:
## List the columns available on the non-test frame.
print(cf_ng.columns)

In [None]:
    
    
def score_tweets(df):
    """Score already-pivoted tweets and split them into unambiguous / ambiguous.

    ``df`` must have one row per tweet with columns ``tweet_id``, ``text`` and
    the two trust-weight columns labelled ``-1`` (negative) and ``1``
    (positive). Missing weights are treated as 0. The score is

        prob_neg = weight(-1) / (weight(-1) + weight(1))

    A tweet with ``prob_neg == 0.0`` is labelled ``airline_sentiment = 1`` and
    one with ``prob_neg == 1.0`` is labelled ``-1``. Every other tweet
    (including those whose two weights are both 0, giving NaN) is ambiguous.

    The input frame is not modified.

    NOTE: a later cell in this notebook defines ``score_tweets`` again (with
    the pivot step included); once that cell runs it shadows this version.

    Returns:
        (unambig, ambig): ``unambig`` has columns
        ``tweet_id, text, airline_sentiment``; ``ambig`` has columns
        ``tweet_id, text, prob_neg``.
    """
    scored = df.copy()

    ## Calculate tweet scores
    scored[-1] = scored[-1].fillna(0)
    scored[1] = scored[1].fillna(0)
    scored["prob_neg"] = scored[-1] / (scored[-1] + scored[1])

    # Row-and-column assignment has to go through .loc.
    scored["airline_sentiment"] = np.nan
    scored.loc[scored.prob_neg == 0.0, "airline_sentiment"] = 1
    scored.loc[scored.prob_neg == 1.0, "airline_sentiment"] = -1

    # Several columns are selected with a list, not a bare tuple of names.
    is_labelled = scored.airline_sentiment.notnull()
    unambig = scored.loc[is_labelled, ["tweet_id", "text", "airline_sentiment"]]
    ambig = scored.loc[~is_labelled, ["tweet_id", "text", "prob_neg"]]

    return unambig, ambig

In [None]:
## After dropping repeated (_unit_id, tweet_id) pairs, a _unit_id that still
## shows up more than once would be attached to more than one tweet_id.
print(
    cf.drop_duplicates(["_unit_id", "tweet_id"])
      .duplicated("_unit_id")
      .value_counts()
)  # Author's note: No Duplicates

### Split Test (i.e. Golden Tweets) out from the non-test tweets.
(We already know the "correct" answers for the Test tweets, so we can process those separately.)

In [None]:
## Independent copies of the test ("golden") judgements and of the rest.
cf_g = cf.loc[cf["_golden"] == True].copy()
cf_ng = cf.loc[cf["_golden"] == False].copy()

## Judgement rows overall, then distinct tweet ids overall / test / non-test.
print("%s %s" % ("Total Number of Judgements: ", cf.shape[0]))
print("%s %s" % ("Total Number of Tweets:     ", len(cf.tweet_id.unique())))
print("%s %s" % ("Number of Test Tweets:      ", len(cf_g.tweet_id.unique())))
print("%s %s" % ("Number of Non-Test Tweets:  ", len(cf_ng.tweet_id.unique())))

In [None]:
## After dropping repeated (tweet_id, _golden) pairs, a tweet_id that still
## shows up more than once occurs with more than one _golden value.
print(
    cf.drop_duplicates(["tweet_id", "_golden"])
      .duplicated("tweet_id")
      .value_counts()
)  # Author's note: No Duplicates

In [None]:
## First row of every (tweet_id, _golden) pair. An explicit copy is taken so
## that the new column below is added to an independent frame rather than to a
## selection derived from `cf` (which can trigger SettingWithCopyWarning).
test = cf.drop_duplicates(["tweet_id", "_golden"]).copy()

## cf.duplicated("tweet_id") flags every row of `cf` whose tweet_id already
## appeared on an earlier row; the assignment aligns those flags on the index.
test["dups"] = cf.duplicated("tweet_id")

## Single .loc lookup instead of chained indexing.
test.loc[test.dups == True, "tweet_id"]

#### Process "Test" Tweets - Use "Correct" Sentiment & Topics

In [None]:
##Do this later :)

#### Process Non-Test Tweets: Clean Airline Sentiment  Label

In [None]:
## Distribution of sentiment labels among non-test judgements, missing values included.
cf_ng["airline_sentiment"].value_counts(dropna=False)

In [None]:
## Convert text labels into numeric and pool positive and neutral together.
## `cf` is already recoded to 1 / 0 / -1 when it is loaded, and `cf_ng` is
## copied from it, so the numeric neutral code (0) is pooled into 1 as well;
## replacing only the text labels would leave neutral judgements un-pooled.
## The result is assigned back instead of using inplace=True on a column
## selection, which may act on a temporary copy.
cf_ng["airline_sentiment"] = cf_ng["airline_sentiment"].replace(
    ["positive", "neutral", "negative", 0], [1, 1, -1, 1]
)
cf_ng.airline_sentiment.value_counts()

In [None]:
##Function to Aggregate Data to Tweet-Level & Score Tweets
def score_tweets(df):
    """Aggregate judgements to tweet level and score each tweet's sentiment.

    ``_trust`` is summed per ``(tweet_id, text)`` for each
    ``airline_sentiment`` label, and the trust-weighted share of negative
    judgements is computed as

        prob_neg = trust(-1) / (trust(-1) + trust(1))

    Interpretation of ``prob_neg``:
      * 0.0 -> absolutely positive, labelled ``airline_sentiment = 1``
      * 1.0 -> absolutely negative, labelled ``airline_sentiment = -1``
      * anything in between -> ambiguous (closer to 1 = more likely negative)
      * NaN (no trust on either label) -> also treated as ambiguous

    Only the ``-1`` and ``1`` labels enter the score; any other label present
    in ``airline_sentiment`` is ignored.

    Returns:
        (unambig, ambig): ``unambig`` has columns
        ``tweet_id, text, airline_sentiment``; ``ambig`` has columns
        ``tweet_id, text, prob_neg``.
    """
    scored = df.pivot_table(
        index=["tweet_id", "text"],
        columns="airline_sentiment",
        values="_trust",
        aggfunc="sum",
    ).reset_index()

    # If a label never occurs in the input, the pivot has no column for it;
    # add it with zero weight so the arithmetic below cannot raise KeyError.
    for label in (-1, 1):
        if label not in scored.columns:
            scored[label] = 0.0

    ##Calculate Tweet Scores
    scored[-1] = scored[-1].fillna(0)
    scored[1] = scored[1].fillna(0)
    scored["prob_neg"] = scored[-1] / (scored[-1] + scored[1])

    # Row-and-column assignment has to go through .loc.
    scored["airline_sentiment"] = np.nan
    scored.loc[scored.prob_neg == 0.0, "airline_sentiment"] = 1
    scored.loc[scored.prob_neg == 1.0, "airline_sentiment"] = -1

    # Several columns are selected with a list, not a bare tuple of names.
    is_labelled = scored.airline_sentiment.notnull()
    unambig = scored.loc[is_labelled, ["tweet_id", "text", "airline_sentiment"]]
    ambig = scored.loc[~is_labelled, ["tweet_id", "text", "prob_neg"]]

    return unambig, ambig


##Function to sample ambiguous sentiment tweets for hand-coding.
def output_hand_codes(df, outfile="ambiguous_sentiment_hand_coded.csv",
                      frac=0.15, random_state=4444):
    """Write a reproducible random sample of tweets to a CSV for hand-coding.

    Args:
        df: frame containing at least ``tweet_id`` and ``text`` columns.
        outfile: path of the CSV to write. It used to be read from an
            undefined name; the default is the file name this notebook uses
            when it reads the hand codes back.
        frac: fraction of rows to sample, without replacement.
        random_state: seed passed to ``DataFrame.sample`` so the same rows are
            drawn on every run.

    Only the ``tweet_id`` and ``text`` columns are written, without the index.
    """
    samp = df.sample(frac=frac, replace=False, random_state=random_state)[["tweet_id", "text"]]
    samp.to_csv(outfile, index=False)

    
##Function to Create Training Data
def input_hand_codes(df, infile):
    """Attach hand-coded sentiment labels to ``df`` and split on their presence.

    Reads ``tweet_id`` and ``airline_sentiment`` from the CSV at ``infile``,
    marks those rows with ``hand_coded = 1`` and left-joins them onto ``df``
    by ``tweet_id``.

    Returns:
        (coded, uncoded): the merged rows that have a hand code, and the
        merged rows that do not.
    """
    hand_codes = (
        pd.read_csv(infile)[["tweet_id", "airline_sentiment"]]
        .assign(hand_coded=1)  # flag hand coded tweets
    )

    merged = df.merge(hand_codes, on="tweet_id", how="left")

    has_code = merged.hand_coded.notnull()
    return merged[has_code], merged[~has_code]

In [None]:
## Score the tweets, then separate those with "ambiguous" sentiment (rated
## differently by different raters) from the unambiguous ones.
cf_ng_unambig, cf_ng_ambig = score_tweets(cf_ng)

print("")

In [None]:
##Output 15%
## NOTE(review): the sampling call below is commented out, and it passes the
## *unambiguous* frame even though output_hand_codes is described as sampling
## ambiguous tweets -- confirm which frame was intended.
#output_hand_codes(cf_ng_unambig, "ambiguous_sentiment_hand_coded.csv") #Output Hand Coding File

## Merge the hand codes onto the ambiguous tweets and split them into rows
## with a hand code and rows without one.
cf_ng_un_labled, cf_ng_un_unlabled = input_hand_codes(cf_ng_ambig, "ambiguous_sentiment_hand_coded.csv")




## NOTE(review): the return value of this call is discarded.
input_hand_codes(cf_ng_unambig, "ambiguous_sentiment_hand_coded.csv")
## NOTE(review): neither `train_test_split` nor `cf_ng_model` is imported or
## defined in any visible cell; as written this line raises NameError on a
## fresh kernel unless both are provided elsewhere.
cf_ng_train, cf_ng_test = train_test_split(cf_ng_model, test_size=0.2, random_state=4444)