### Clean & Preprocess CrowdFlower Data Prior to Model Training
A Super Handy CrowdFlower Glossary of Terms can be found [here](https://success.crowdflower.com/hc/en-us/articles/202703305-Glossary-of-Terms)!

In [45]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

#### Read-In Jobs-Level Data (from CrowdFlower's *Data for Everyone* [library](https://www.crowdflower.com/data-for-everyone/))

In [2]:
## Read the airline-sentiment CSV straight from its URL, list its columns, and preview two rows.
cf = pd.read_csv(
    "http://cdn2.hubspot.net/hub/346378/file-2612489700-csv/DFE_CSVs/Airline-Full-Non-Ag-DFE-Sentiment.csv"
)
print(cf.columns)
cf.head(2)

Index([u'_unit_id', u'_created_at', u'_golden', u'_id', u'_missed',
       u'_started_at', u'_tainted', u'_channel', u'_trust', u'_worker_id',
       u'_country', u'_region', u'_city', u'_ip', u'airline_sentiment',
       u'negativereason', u'airline', u'airline_sentiment_gold', u'name',
       u'negativereason_gold', u'retweet_count', u'text', u'tweet_coord',
       u'tweet_created', u'tweet_id', u'tweet_location', u'user_timezone'],
      dtype='object')


Unnamed: 0,_unit_id,_created_at,_golden,_id,_missed,_started_at,_tainted,_channel,_trust,_worker_id,...,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_id,tweet_location,user_timezone
0,681448150,2/25/2015 04:52:40,False,1575073003,,2/25/2015 04:49:12,False,elite,0.8108,31110645,...,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,570306133677760513,,Eastern Time (US & Canada)
1,681448150,2/25/2015 05:22:10,False,1575093916,,2/25/2015 05:19:59,False,prodege,0.8919,1908948,...,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,570306133677760513,,Eastern Time (US & Canada)


#### Split Test (i.e. Golden Tweets) out from the non-test tweets.
(We already know the "correct" answers for the Test tweets, so we can process those separately.)

In [3]:
## Independent copies of the golden and non-golden rows, so later edits never touch `cf`.
cf_g = cf.loc[cf["_golden"] == True].copy()
cf_ng = cf.loc[cf["_golden"] == False].copy()

## Report (rows, columns) for each subset.
print("{} {}".format(cf_g.shape, cf_ng.shape))

(11997, 27) (43786, 27)


#### Process "Test" Tweets - Use "Correct" Sentiment & Topics

In [4]:
## TODO: process the golden ("test") tweets using their known-correct labels.

#### Process Non-Test Tweets: Clean Airline Sentiment (Pos/ Neg) Label

##### Clean Airline Sentiment (Positive/Neutral or Negative) Label

In [5]:
## Distribution of the raw sentiment labels; dropna=False also counts missing labels.
cf_ng["airline_sentiment"].value_counts(dropna=False)

negative    26919
neutral      9742
positive     7125
Name: airline_sentiment, dtype: int64

In [6]:
## Convert text labels to numeric, pooling positive and neutral: positive/neutral -> 1, negative -> -1.
## Assign the result back to the column instead of calling replace(..., inplace=True) on the
## attribute accessor: an in-place edit of a selected column is not guaranteed to reach the frame.
cf_ng["airline_sentiment"] = cf_ng["airline_sentiment"].replace(
    ["positive", "neutral", "negative"], [1, 1, -1]
)
cf_ng["airline_sentiment"].value_counts()

-1    26919
 1    16867
Name: airline_sentiment, dtype: int64

In [38]:
"""
Function to Aggregate Data to Tweet-Level & Score Tweets
Score = 0 -> Absolutely Positive
Score = 1 -> Absolutely Negative
Score = 0.01 - 0.99 -> Ambiguous (closer to 1, more likely to be negative)
"""

def score_tweets(df):
    """Aggregate rows to one row per tweet and score each tweet's negativity.

    The score is the share of summed ``_trust`` that sits on the negative
    label: ``trust(-1) / (trust(-1) + trust(1))``. A score of 0 means every
    row labelled the tweet 1 (positive/neutral), 1 means every row labelled
    it -1 (negative), and values in between are ambiguous.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``tweet_id``, ``text``, ``_trust`` and an
        ``airline_sentiment`` column coded as -1 / 1.

    Returns
    -------
    pandas.DataFrame
        Columns ``tweet_id``, ``text``, ``tweet_score``; one row per
        (tweet_id, text) pair. ``tweet_score`` is NaN if a tweet's total
        trust is zero.
    """
    trust_by_label = df.pivot_table(
        index=["tweet_id", "text"],
        columns="airline_sentiment",
        values="_trust",
        aggfunc="sum",
    )

    # Make sure both label columns exist (the input may contain only one
    # label) and treat "no row chose this label" as zero trust.
    trust_by_label = trust_by_label.reindex(columns=[-1, 1]).fillna(0)

    negative_trust = trust_by_label[-1]
    total_trust = negative_trust + trust_by_label[1]
    trust_by_label["tweet_score"] = negative_trust / total_trust

    # Only columns that are actually computed are returned; the previously
    # requested "score_scaled" column was never created and raised KeyError.
    return trust_by_label.reset_index()[["tweet_id", "text", "tweet_score"]]

In [40]:
## Collapse the non-golden rows to tweet level, then hold out 20% of the tweets for testing.
cf_ng_1 = score_tweets(cf_ng)
cf_ng_train, cf_ng_test = train_test_split(
    cf_ng_1,
    test_size=0.2,
    random_state=4444,  # fixed seed keeps the split reproducible
)

airline_sentiment,tweet_id,text,tweet_score
0,567588278875213824,@JetBlue's new CEO seeks the right balance to ...,0
1,567590027375702016,@JetBlue is REALLY getting on my nerves !! 😡�...,1
2,567591480085463040,@united yes. We waited in line for almost an h...,1
3,567592368451248130,@united the we got into the gate at IAH on tim...,1
4,567594449874587648,@SouthwestAir its cool that my bags take a bit...,1


In [None]:
"""
Train a model to optimally classify tweets based on tweet score
"""

def train_models(df, pipe, grid):
    """Grid-search a scikit-learn pipeline that predicts sentiment from tweet score.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``tweet_score`` feature column and an
        ``airline_sentiment`` target column.
        NOTE(review): the frame returned by ``score_tweets`` has no
        ``airline_sentiment`` column, so the caller has to supply one --
        TODO confirm where the target is meant to come from.
    pipe : list
        Pipeline steps, passed straight to ``Pipeline``.
    grid : dict or list of dict
        Parameter grid, passed to ``GridSearchCV`` as ``param_grid``.

    Returns
    -------
    The result of ``GridSearchCV.fit`` (10-fold CV, accuracy scoring, all cores).
    """
    # scikit-learn estimators expect a 2-D feature matrix; selecting with a
    # list keeps the single feature as an (n_samples, 1) frame, not a 1-D Series.
    X = df[["tweet_score"]]
    y = df["airline_sentiment"]

    pipeline = Pipeline(pipe)
    gs = GridSearchCV(estimator=pipeline, param_grid=grid, scoring="accuracy", cv=10, n_jobs=-1)

    return gs.fit(X, y)

#### Output Sample "Ambiguous" Tweets for Handcoding

In [None]:
## NOTE(review): t_sentiment is not defined by any cell shown above this one -- confirm where it is built.
## Draw a reproducible 15% sample of rows whose tweet_id repeats an earlier row.
ng_sampl = t_sentiment[t_sentiment.duplicated("tweet_id")].sample(frac=0.15, replace=False, random_state=4444)
## Export from the frame that was just sampled; the original read an undefined name (t_sample).
ng_sampl[["tweet_id", "text"]].to_csv("ambigous_sentiment_hand_coded.csv", index=False)

In [None]:
##Pivot Table
## NOTE(review): pivot_table() is called here without index/columns/values, and t_pivoted is not
## read by any of the cells shown below -- this looks like an unfinished placeholder; confirm before running.
t_pivoted = t_sentiment.pivot_table()

In [None]:
## Reshape to one row per (tweet_id, text) with one _trust column per sentiment label
## (pivot_table's default aggregation is applied). This overwrites t_sentiment.
t_sentiment = t_sentiment.pivot_table(
    values="_trust",
    index=["tweet_id", "text"],
    columns="airline_sentiment",
)
t_sentiment.head(5)

In [None]:
## Flag tweets that have a value under both the -1 and the 1 label column.
t_sentiment["flag"] = 0
both_labels = t_sentiment[-1].notnull() & t_sentiment[1].notnull()
## .loc replaces the deprecated (and since removed) .ix indexer; with a boolean mask
## and a column label the selection is the same.
t_sentiment.loc[both_labels, "flag"] = 1

In [None]:
## Count how many tweets carry each value of the "flag" column.
t_sentiment["flag"].value_counts()