### Clean & Preprocess CrowdFlower Data Prior to Model Training
A Super Handy CrowdFlower Glossary of Terms can be found [here](https://success.crowdflower.com/hc/en-us/articles/202703305-Glossary-of-Terms)!

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

#### Read-In Jobs-Level Data (from CrowdFlower's *Data for Everyone* [library](https://www.crowdflower.com/data-for-everyone/))

In [2]:
# Read the CrowdFlower airline-sentiment CSV straight from its hosted URL.
cf = pd.read_csv(
    "http://cdn2.hubspot.net/hub/346378/file-2612489700-csv/DFE_CSVs/Airline-Full-Non-Ag-DFE-Sentiment.csv"
)
# Single-argument call form prints the same text under Python 2 and Python 3.
print(cf.columns)
# Last expression of the cell: preview the first two rows.
cf.head(n=2)

Index([u'_unit_id', u'_created_at', u'_golden', u'_id', u'_missed',
       u'_started_at', u'_tainted', u'_channel', u'_trust', u'_worker_id',
       u'_country', u'_region', u'_city', u'_ip', u'airline_sentiment',
       u'negativereason', u'airline', u'airline_sentiment_gold', u'name',
       u'negativereason_gold', u'retweet_count', u'text', u'tweet_coord',
       u'tweet_created', u'tweet_id', u'tweet_location', u'user_timezone'],
      dtype='object')


Unnamed: 0,_unit_id,_created_at,_golden,_id,_missed,_started_at,_tainted,_channel,_trust,_worker_id,...,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_id,tweet_location,user_timezone
0,681448150,2/25/2015 04:52:40,False,1575073003,,2/25/2015 04:49:12,False,elite,0.8108,31110645,...,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,570306133677760513,,Eastern Time (US & Canada)
1,681448150,2/25/2015 05:22:10,False,1575093916,,2/25/2015 05:19:59,False,prodege,0.8919,1908948,...,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,570306133677760513,,Eastern Time (US & Canada)


#### Split Test (i.e. Golden Tweets) out from the non-test tweets.
(We already know the "correct" answers for the Test tweets, so we can process those separately.)

In [3]:
# Partition the rows on the `_golden` flag. Each subset is an independent copy,
# so later edits to either one do not write through to `cf`.
cf_g = cf.loc[cf["_golden"] == True].copy()
cf_ng = cf.loc[cf["_golden"] == False].copy()

# Show both shapes on one line, separated by a single space.
print("{0} {1}".format(cf_g.shape, cf_ng.shape))

(11997, 27) (43786, 27)


#### Process "Test" Tweets - Use "Correct" Sentiment & Topics

In [4]:
## TODO: process the golden ("Test") tweets using their known-correct sentiment and topic labels.

#### Process Non-Test Tweets: Clean Airline Sentiment Label

##### Clean Airline Sentiment (Positive/Neutral or Negative) Label

In [5]:
# Tally every sentiment label among the non-golden rows, counting missing values too.
cf_ng["airline_sentiment"].value_counts(dropna=False)

negative    26919
neutral      9742
positive     7125
Name: airline_sentiment, dtype: int64

In [6]:
##Convert text labels into numeric codes, pooling positive and neutral together
##(positive/neutral -> 1, negative -> -1).
##The result is assigned back to the column rather than calling
##replace(..., inplace=True) on `cf_ng.airline_sentiment`: the in-place form only
##changes `cf_ng` when the attribute access happens to return a view of the frame.
cf_ng["airline_sentiment"] = cf_ng["airline_sentiment"].replace(
    ["positive", "neutral", "negative"], [1, 1, -1]
)
cf_ng.airline_sentiment.value_counts()

-1    26919
 1    16867
Name: airline_sentiment, dtype: int64

In [14]:
##Function to Aggregate Data to Tweet-Level & Score Tweets
##prob_neg == 0 -> Absolutely Positive (label 1), prob_neg == 1 -> Absolutely Negative (label -1)
##0 < prob_neg < 1 -> Ambiguous (closer to 1, more likely to be negative)
def score_tweets(df):
    """Aggregate judgments to one row per tweet and split them by agreement.

    For each (tweet_id, text) pair the `_trust` values are summed separately
    for judgments labelled -1 and 1, and
    ``prob_neg = trust(-1) / (trust(-1) + trust(1))`` is computed.

    Parameters
    ----------
    df : DataFrame
        Must contain the columns ``tweet_id``, ``text``, ``_trust`` and
        ``airline_sentiment`` (already recoded to -1 / 1).

    Returns
    -------
    (unambig, ambig) : tuple of DataFrame
        ``unambig`` has columns ``tweet_id``, ``text``, ``airline_sentiment``
        and holds tweets whose prob_neg is exactly 0 (label 1) or exactly 1
        (label -1).  ``ambig`` has columns ``tweet_id``, ``text``,
        ``prob_neg`` and holds every other tweet, including those whose
        total trust is zero (prob_neg is NaN).
    """
    scores = df.pivot_table(index=["tweet_id", "text"], columns="airline_sentiment",
                            values="_trust", aggfunc="sum")

    # Guarantee both label columns exist (a frame with only one class would
    # otherwise raise KeyError below) and treat "no judgments" as zero trust.
    scores = scores.reindex(columns=[-1, 1]).fillna(0).reset_index()
    scores.columns.name = None

    ##Calculate Tweet Scores
    scores["prob_neg"] = scores[-1] / (scores[-1] + scores[1])

    # .loc is required for boolean-row / column-label assignment.
    scores["airline_sentiment"] = np.nan
    scores.loc[scores.prob_neg == 0.0, "airline_sentiment"] = 1
    scores.loc[scores.prob_neg == 1.0, "airline_sentiment"] = -1

    # Column subsets need a list of labels, not a bare tuple of names.
    is_labelled = scores.airline_sentiment.notnull()
    unambig = scores.loc[is_labelled, ["tweet_id", "text", "airline_sentiment"]]
    ambig = scores.loc[~is_labelled, ["tweet_id", "text", "prob_neg"]]

    return unambig, ambig


##Function to sample ambiguous sentiment tweets for hand-coding.
def output_hand_codes(df, outfile=None):
    """Write a reproducible 15% sample of tweets to CSV for hand-coding.

    Parameters
    ----------
    df : DataFrame
        Must contain the columns ``tweet_id`` and ``text``.
    outfile : str or file-like, optional
        Destination passed to ``DataFrame.to_csv``.  Previously the body read
        an undefined ``outfile`` name; it is now an explicit argument.  When
        omitted, nothing is written and the CSV text is returned instead.

    Returns
    -------
    str or None
        The CSV text when ``outfile`` is None, otherwise None.
    """
    # Fixed random_state so the same rows are drawn on every run.
    samp = df.sample(frac=0.15, replace=False, random_state=4444)[["tweet_id", "text"]]
    return samp.to_csv(outfile, index=False)

    
##Function to Create Training Data
def input_hand_codes(df, infile):
    """Attach hand-coded sentiment labels to `df`.

    Reads `infile` as CSV, keeps only its ``tweet_id`` and
    ``airline_sentiment`` columns, and right-joins them onto `df` by
    ``tweet_id``.  Because the join is a right join, every hand-coded row is
    kept; hand-coded tweets with no match in `df` get missing values in the
    columns that come from `df`.
    """
    hand_coded = pd.read_csv(infile)
    labels = hand_coded[["tweet_id", "airline_sentiment"]]
    return df.merge(labels, how="right", on="tweet_id")


##Train Models to Optimally Classify Tweets based on Tweet Score
def train_models(df, pipe, grid, score_col=None):
    """Grid-search a pipeline that predicts sentiment from the tweet score.

    Parameters
    ----------
    df : DataFrame
        Must contain ``airline_sentiment`` (the target) and the score column.
    pipe : list
        Steps handed to ``Pipeline``.
    grid : dict or list of dict
        Parameter grid handed to ``GridSearchCV``.
    score_col : str, optional
        Name of the feature column.  When omitted, ``"tweet_score"`` is used
        if `df` has it; otherwise ``"prob_neg"``, the score column that
        ``score_tweets`` produces.

    Returns
    -------
    The value returned by ``GridSearchCV.fit`` (10-fold CV, accuracy scoring).
    """
    if score_col is None:
        score_col = "tweet_score" if "tweet_score" in df.columns else "prob_neg"

    # Double brackets keep X two-dimensional (n_samples, 1); a bare Series is
    # 1-D, which scikit-learn estimators do not accept as a feature matrix.
    X = df[[score_col]]
    y = df["airline_sentiment"]

    model = Pipeline(pipe)
    gs = GridSearchCV(estimator=model, param_grid=grid, scoring="accuracy", cv=10, n_jobs=-1)

    return gs.fit(X, y)

In [17]:
cf_ng_unambig, cf_ng_ambig = score_tweets(cf_ng) ##Score Tweets & Separate Ambig from Unambig
#output_hand_codes(cf_ng_ambig, "ambiguous_sentiment_hand_coded.csv") #Output Hand Coding File

##Merge the hand codes onto the *ambiguous* tweets: that is the frame carrying
##the prob_neg score and no airline_sentiment column of its own, so the merge
##does not produce clashing _x/_y label columns.
cf_ng_model = input_hand_codes(cf_ng_ambig, "ambiguous_sentiment_hand_coded.csv")
cf_ng_train, cf_ng_test = train_test_split(cf_ng_model, test_size=0.2, random_state=4444)

Unnamed: 0,tweet_id,text,Label
0,5.7e+17,@JetBlue hi is there a way we can check to tra...,1
1,5.69e+17,@United @Skywest do it again! I will now mis...,-1
2,5.68e+17,@united #UnitedAirlines wont wait 10min for fa...,-1
3,5.7e+17,@united Yes. To Boston. I was going to Provide...,1
4,5.7e+17,Why? _��� RT @JetBlue Our fleet's on fleek. ht...,1
5,5.69e+17,@united how long does it take for customer fee...,-1
6,5.7e+17,@USAirways Everyone is sorry. Don't have what...,-1
7,5.7e+17,@JetBlue I understand delays. We are all look...,-1
8,5.7e+17,@USAirways @AmericanAir any help regarding fli...,1
9,5.7e+17,@united The agent that met us at the gate said...,-1
