### (1) Clean & Preprocess CrowdFlower Data Prior to Model Training
A Super Handy CrowdFlower Glossary of Terms can be found [here](https://success.crowdflower.com/hc/en-us/articles/202703305-Glossary-of-Terms)!

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

#### Read-In Jobs-Level Data (from CrowdFlower's *Data for Everyone* [library](https://www.crowdflower.com/data-for-everyone/))

In [None]:
# Load the judgement-level CrowdFlower export straight from the CDN.
cf = pd.read_csv("http://cdn2.hubspot.net/hub/346378/file-2612489700-csv/DFE_CSVs/Airline-Full-Non-Ag-DFE-Sentiment.csv")
# Single-argument call form prints identically on Python 2 and Python 3.
print(cf.columns)
cf.head(2)

#### Aggregate Data up to Tweet Level 
TO DO:
1. Separate out test (golden) from non-test tweets (look out for duplicates!)
2. Aggregate data up to tweet level - assign "ambiguous" tweets labels using a trust-weighted majority rule
3. Look at ambiguous tweet scores (export to Excel) - exclude any/all (??)

In [None]:
def aggregate_sentiment():
    """Placeholder for tweet-level sentiment aggregation (not implemented yet).

    Takes no arguments, does nothing and returns ``None``.
    """
    return None

def aggregate_topic():
    """Placeholder for tweet-level topic aggregation (not implemented yet).

    Takes no arguments, does nothing and returns ``None``.
    """
    return None

#### Old Code....

In [None]:
def sum_and_split_tweet(df):
    """Aggregate judgements to tweet level and split golden from non-golden tweets.

    Sentiment labels are mapped to numeric codes (positive=1, neutral=0,
    negative=-1). The ``_trust`` values of all judgements are then summed per
    tweet and per code, giving one row per (tweet_id, text, _golden)
    combination with one trust-total column per code.

    Parameters
    ----------
    df : pandas.DataFrame
        Judgement-level data with columns ``tweet_id``, ``text``, ``_golden``,
        ``airline_sentiment`` and ``_trust``. The frame is not modified.

    Returns
    -------
    (gold, nogold) : tuple of pandas.DataFrame
        Both frames have the columns ``tweet_id``, ``text``, ``1``, ``0`` and
        ``-1`` (NaN where a tweet received no judgement with that code).
        ``gold`` holds the rows flagged ``_golden == True``. ``nogold`` holds
        the rows flagged ``_golden == False`` whose ``tweet_id`` occurs only
        once in the aggregated table.
    """
    label_codes = {"positive": 1, "neutral": 0, "negative": -1}

    # Recode on a private copy, and only in the sentiment column, so that the
    # caller's frame and the tweet text are left untouched. Values that are
    # not one of the three labels pass through unchanged.
    judgements = df.copy()
    judgements["airline_sentiment"] = judgements["airline_sentiment"].map(
        lambda label: label_codes.get(label, label))

    tweets = judgements.pivot_table(index=["tweet_id", "text", "_golden"],
                                    columns="airline_sentiment",
                                    values="_trust", aggfunc="sum")
    tweets = tweets.reset_index()

    # A code nobody used produces no pivot column; add it so the output
    # schema is always the same.
    for code in (1, 0, -1):
        if code not in tweets.columns:
            tweets[code] = np.nan

    # A tweet_id on more than one aggregated row (e.g. both golden and
    # non-golden) is flagged on every one of its rows.
    in_several_rows = tweets.duplicated("tweet_id", keep=False)

    out_cols = ["tweet_id", "text", 1, 0, -1]
    # Element-wise comparisons on purpose: `_golden` is a column, not a scalar.
    is_gold = tweets["_golden"] == True
    is_not_gold = tweets["_golden"] == False

    gold = tweets.loc[is_gold, out_cols].copy()
    nogold = tweets.loc[is_not_gold & ~in_several_rows, out_cols].copy()

    return gold, nogold

In [None]:
# Aggregate to tweet level and split golden / non-golden tweets.
# (The helper defined in this notebook is `sum_and_split_tweet`; `pivot_tweets` does not exist.)
cf_g, cf_ng = sum_and_split_tweet(cf)

# "%s %s" reproduces the Python 2 `print a, b` output and also runs on Python 3.
print("%s %s" % ("Total Number of Judgements: ", cf.shape[0]))
print("%s %s" % ("Total Number of Tweets:     ", len(cf.tweet_id.unique())))
print("%s %s" % ("Number of Test Tweets:      ", len(cf_g.tweet_id.unique())))
print("%s %s" % ("Number of Non-Test Tweets:  ", len(cf_ng.tweet_id.unique())))

In [None]:
# Show the columns of the non-test frame (call form works on Python 2 and 3).
print(cf_ng.columns)

In [None]:
    
    
def score_tweets(df):
    """Score tweet-level trust totals and split clear-cut from ambiguous tweets.

    ``prob_neg`` is the share of trust given to the negative label:
    ``trust(-1) / (trust(-1) + trust(1))``. A score of 0 means every
    judgement was positive, a score of 1 means every judgement was negative.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per tweet with columns ``tweet_id``, ``text`` and the summed
        trust per sentiment code in columns ``-1`` and ``1``. A missing total
        (NaN, or an absent column) counts as 0. The frame is not modified.

    Returns
    -------
    (unambig, ambig) : tuple of pandas.DataFrame
        ``unambig`` has columns ``tweet_id``, ``text``, ``airline_sentiment``
        (1 where ``prob_neg == 0``, -1 where ``prob_neg == 1``).
        ``ambig`` has columns ``tweet_id``, ``text``, ``prob_neg`` and holds
        every other tweet, including those with no trust on either code
        (``prob_neg`` is NaN there because of the 0/0 division).
    """
    scored = df.copy()

    ##Calculate Tweet Scores
    for code in (-1, 1):
        if code in scored.columns:
            scored[code] = scored[code].fillna(0)
        else:
            scored[code] = 0.0
    scored["prob_neg"] = scored[-1] / (scored[-1] + scored[1])

    # Only the two extreme scores get a label; .loc is required to assign
    # into the rows picked by a boolean mask.
    scored["airline_sentiment"] = np.nan
    scored.loc[scored["prob_neg"] == 0.0, "airline_sentiment"] = 1
    scored.loc[scored["prob_neg"] == 1.0, "airline_sentiment"] = -1

    has_label = scored["airline_sentiment"].notnull()
    unambig = scored.loc[has_label, ["tweet_id", "text", "airline_sentiment"]]
    ambig = scored.loc[~has_label, ["tweet_id", "text", "prob_neg"]]

    return unambig, ambig

In [None]:
# Count `_unit_id` values that occur with more than one `tweet_id` (True = duplicated).
print(cf.drop_duplicates(["_unit_id","tweet_id"]).duplicated("_unit_id").value_counts()) #No Duplicates

### Split Test (i.e. Golden Tweets) out from the non-test tweets.
(We already know the "correct" answers for the Test tweets, so we can process those separately.)

In [None]:
# Split the judgements on the `_golden` flag; copy so later edits do not touch `cf`.
cf_g = cf[cf._golden == True].copy()
cf_ng = cf[cf._golden == False].copy()

# "%s %s" reproduces the Python 2 `print a, b` output and also runs on Python 3.
print("%s %s" % ("Total Number of Judgements: ", cf.shape[0]))
print("%s %s" % ("Total Number of Tweets:     ", len(cf.tweet_id.unique())))
print("%s %s" % ("Number of Test Tweets:      ", len(cf_g.tweet_id.unique())))
print("%s %s" % ("Number of Non-Test Tweets:  ", len(cf_ng.tweet_id.unique())))

In [None]:
# Count `tweet_id` values that occur with more than one `_golden` value (True = duplicated).
print(cf.drop_duplicates(["tweet_id","_golden"]).duplicated("tweet_id").value_counts()) #No Duplicates

In [None]:
# One row per (tweet_id, _golden) pair; copy so the new column is added to an
# independent frame and not to a slice of `cf`.
test = cf.drop_duplicates(["tweet_id", "_golden"]).copy()
# Within the de-duplicated frame a repeated tweet_id can only mean that the
# tweet occurs under both `_golden` values.
test["dups"] = test.duplicated("tweet_id")

test[test.dups == True]["tweet_id"]

#### Process "Test" Tweets - Use "Correct" Sentiment & Topics

In [None]:
##Do this later :)

#### Process Non-Test Tweets: Clean Airline Sentiment  Label

In [None]:
# Frequency of each raw sentiment label among the non-test judgements, with missing values counted too.
cf_ng["airline_sentiment"].value_counts(dropna=False)

In [None]:
##Convert text labels into numeric codes and pool positive and neutral
# Assign the result back to the column: an in-place replace on the selected
# column is chained assignment and may not reach `cf_ng` itself.
cf_ng["airline_sentiment"] = cf_ng["airline_sentiment"].replace(["positive", "neutral", "negative"], [1, 1, -1])
cf_ng["airline_sentiment"].value_counts()

In [None]:
##Function to Aggregate Data to Tweet-Level & Score Tweets
##Score 0 = Absolutely Positive, Score 1 = Absolutely Negative
##Score strictly between 0 and 1 -> Ambiguous (closer to 1, more likely to be negative)
def score_tweets(df):
    """Aggregate judgements to tweet level, score them and split by ambiguity.

    The ``_trust`` of all judgements is summed per tweet and sentiment code,
    then ``prob_neg = trust(-1) / (trust(-1) + trust(1))`` is computed.

    Parameters
    ----------
    df : pandas.DataFrame
        Judgement-level data with columns ``tweet_id``, ``text``, ``_trust``
        and ``airline_sentiment`` holding the numeric codes ``-1`` and ``1``.
        The frame is not modified.

    Returns
    -------
    (unambig, ambig) : tuple of pandas.DataFrame
        ``unambig`` has columns ``tweet_id``, ``text``, ``airline_sentiment``
        (1 where ``prob_neg == 0``, -1 where ``prob_neg == 1``).
        ``ambig`` has columns ``tweet_id``, ``text``, ``prob_neg`` and holds
        every other tweet, including those whose trust totals are both 0
        (``prob_neg`` is NaN there because of the 0/0 division).
    """
    scored = df.pivot_table(index=["tweet_id", "text"], columns="airline_sentiment",
                            values="_trust", aggfunc="sum")
    scored = scored.reset_index()

    ##Calculate Tweet Scores
    # A code nobody used produces no pivot column; treat it as zero trust.
    for code in (-1, 1):
        if code in scored.columns:
            scored[code] = scored[code].fillna(0)
        else:
            scored[code] = 0.0
    scored["prob_neg"] = scored[-1] / (scored[-1] + scored[1])

    # Only the two extreme scores get a label; .loc is required to assign
    # into the rows picked by a boolean mask.
    scored["airline_sentiment"] = np.nan
    scored.loc[scored["prob_neg"] == 0.0, "airline_sentiment"] = 1
    scored.loc[scored["prob_neg"] == 1.0, "airline_sentiment"] = -1

    has_label = scored["airline_sentiment"].notnull()
    unambig = scored.loc[has_label, ["tweet_id", "text", "airline_sentiment"]]
    ambig = scored.loc[~has_label, ["tweet_id", "text", "prob_neg"]]

    return unambig, ambig


##Function to sample ambiguous sentiment tweets for hand-coding.
def output_hand_codes(df, outfile="ambiguous_sentiment_hand_coded.csv"):
    """Write a reproducible 15% sample of tweets to a CSV file for hand-coding.

    Parameters
    ----------
    df : pandas.DataFrame
        Tweets to sample from; must have columns ``tweet_id`` and ``text``.
    outfile : str, optional
        Path of the CSV file to write. Previously this name was read from an
        undefined variable, so every call raised ``NameError``.

    Returns
    -------
    None
        The sample (columns ``tweet_id`` and ``text``, no index column) is
        written to ``outfile``.
    """
    # Fixed seed so the same tweets are drawn on every run.
    samp = df.sample(frac=0.15, replace=False, random_state=4444)[["tweet_id", "text"]]
    samp.to_csv(outfile, index=False)

    
##Function to Create Training Data
def input_hand_codes(df, infile):
    """Attach hand-coded sentiment labels to ``df`` and split on whether one was found.

    Parameters
    ----------
    df : pandas.DataFrame
        Tweets with a ``tweet_id`` column.
    infile : str
        CSV file containing at least ``tweet_id`` and ``airline_sentiment``.

    Returns
    -------
    (coded, uncoded) : tuple of pandas.DataFrame
        Left-merge of ``df`` with the hand codes on ``tweet_id``. ``coded``
        holds the rows that matched a hand-coded tweet, ``uncoded`` the rest.
        Both keep the merged ``hand_coded`` flag column.
    """
    # Keep only the key and the label, and tag every hand-coded row.
    hand_codes = pd.read_csv(infile).loc[:, ["tweet_id", "airline_sentiment"]]
    hand_codes["hand_coded"] = 1

    combined = df.merge(hand_codes, how="left", on="tweet_id")

    # After a left merge the flag is NaN exactly where no hand code matched.
    was_coded = combined["hand_coded"].notnull()
    return combined[was_coded], combined[~was_coded]

In [None]:
##Score tweets & separate tweets with "ambiguous" sentiment (rated differently by different raters) from unambiguous ones
cf_ng_unambig, cf_ng_ambig = score_tweets(cf_ng)

# Prints an empty line; the call form works on Python 2 and Python 3.
print("")

In [None]:
##Output 15%
# NOTE(review): the disabled call below passes `cf_ng_unambig`, while the file
# name and the next statement point at the ambiguous tweets - confirm which
# frame was meant before re-enabling it.
#output_hand_codes(cf_ng_unambig, "ambiguous_sentiment_hand_coded.csv") #Output Hand Coding File

# Merge the hand-coded labels into the ambiguous tweets: the first result holds
# the tweets that have a hand code, the second those that do not.
cf_ng_un_labled, cf_ng_un_unlabled = input_hand_codes(cf_ng_ambig, "ambiguous_sentiment_hand_coded.csv")




# NOTE(review): the return value of this call is discarded, so it has no effect
# beyond reading the file - presumably leftover scratch work; confirm.
input_hand_codes(cf_ng_unambig, "ambiguous_sentiment_hand_coded.csv")
# NOTE(review): neither `train_test_split` (not imported in the visible cells)
# nor `cf_ng_model` (never assigned in the visible cells) is defined here, so
# this line raises NameError as written - TODO define/import both.
cf_ng_train, cf_ng_test = train_test_split(cf_ng_model, test_size=0.2, random_state=4444)