#### Train Airline Sentiment & Topic Classification Models

In [2]:
import re
import numpy as np
import pandas as pd

from nltk.tokenize.casual import TweetTokenizer
from nltk.stem import PorterStemmer

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import train_test_split, cross_val_score

from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.svm import LinearSVC

from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix

#### Define Preprocessing and Tokenizer Functions to Pass to the Vectorizer

In [3]:
def preprocess(text):
    """Strip tweet-specific noise from ``text`` and return the cleaned string.

    Steps, in order:
      1. drop http/https URLs;
      2. replace @handles (and a colon glued to the handle) with a space;
      3. replace #hashtags with a space;
      4. replace the stand-alone retweet marker ``RT`` / ``RT:`` with a space;
      5. collapse every run of characters other than ASCII letters and
         commas into a single space;
      6. strip leading and trailing spaces.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned text; commas are deliberately left in, matching the
        character class used in step 5.
    """
    # The former pattern contained "\h", which Python 3's re module rejects
    # as a bad escape; a plain raw-string pattern works on Python 2 and 3.
    text = re.sub(r"https?://\S+", "", text)
    # Only swallow a colon attached to the handle ("@user: ..."). Removing
    # every ": " in the tweet glued unrelated words together.
    text = re.sub(r"@\w+:?", " ", text)
    text = re.sub(r"#\w+", " ", text)
    # Word boundaries keep upper-case words that merely end in "RT"
    # (e.g. "AIRPORT") intact; the optional colon covers "RT:".
    text = re.sub(r"\bRT\b:?", " ", text)
    text = re.sub(r"[^a-zA-Z,]+", " ", text)

    return text.strip(" ")

def tokenize(text):
    """Clean ``text`` with ``preprocess`` and split it into tokens.

    The text is tokenized by an NLTK ``TweetTokenizer`` configured with
    ``preserve_case=False``, ``reduce_len=True`` and ``strip_handles=True``.
    Only tokens that are at least four characters long are returned, in
    their original order.
    """
    tweet_tokenizer = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True)
    cleaned = preprocess(text)

    kept = []
    for token in tweet_tokenizer.tokenize(cleaned):
        # Short tokens (fewer than four characters) are discarded.
        if len(token) >= 4:
            kept.append(token)
    return kept

#### Read In CrowdFlower Data

In [6]:
# Load the CrowdFlower airline-sentiment export into a DataFrame.
# The path is relative, so it resolves against the notebook's working directory.
cf = pd.read_csv("data/crowdflower/Airline-Sentiment-2-w-AA.csv")
# Show the first two rows as a quick look at the available columns.
cf.head(2)

Unnamed: 0,_unit_id,_golden,_unit_state,_trusted_judgments,_last_judgment_at,airline_sentiment,airline_sentiment:confidence,negativereason,negativereason:confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_id,tweet_location,user_timezone
0,681448150,False,finalized,3,2/25/15 5:24,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2/24/15 11:35,5.70306e+17,,Eastern Time (US & Canada)
1,681448153,False,finalized,3,2/25/15 1:53,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2/24/15 11:15,5.70301e+17,,Pacific Time (US & Canada)


#### CrowdFlower Data Preprocessing

In [9]:
# Count the tweets per sentiment label; dropna=False keeps missing labels in the tally.
# Written as a call so the cell parses on Python 3 as well as Python 2
# (a single parenthesised argument prints identically on both).
print(cf.airline_sentiment.value_counts(dropna=False))
# Cross-tabulate the sentiment label against the annotated negative reason.
pd.crosstab(cf.airline_sentiment, cf.negativereason, dropna=False)

negative    9178
neutral     3099
positive    2363
Name: airline_sentiment, dtype: int64


Unnamed: 0_level_0,Bad Flight,Can't Tell,Cancelled Flight,Customer Service Issue,Damaged Luggage,Flight Attendant Complaints,Flight Booking Problems,Late Flight,Lost Luggage,longlines
airline_sentiment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
negative,580,1190,847,2910,74,481,529,1665,724,178


In [None]:
# Maps each raw ``negativereason`` label to the coarser complaint topic.
_COMPLAINT_TOPICS = {
    "Cancelled Flight": "Delay or Cancellation",
    "Late Flight": "Delay or Cancellation",
    "Lost Luggage": "Lost or Damaged Luggage",
    "Damaged Luggage": "Lost or Damaged Luggage",
    "Customer Service Issue": "Customer Service",
    "Flight Attendant Complaints": "Customer Service",
    "Flight Booking Problems": "Customer Service",
    "longlines": "Customer Service",
    "Bad Flight": "Unknown",
    "Can't Tell": "Unknown",
}


def define_complaints(sentiment, complaint):
    """Collapse a (sentiment, negative reason) pair into one complaint topic.

    Parameters
    ----------
    sentiment : str
        Sentiment label of the tweet.
    complaint : str or NaN
        Annotated negative reason; ignored when the sentiment is
        "positive" or "neutral".

    Returns
    -------
    str
        "No Complaint" for positive/neutral tweets; otherwise one of
        "Delay or Cancellation", "Lost or Damaged Luggage",
        "Customer Service" or "Unknown".
    """
    if sentiment in ("positive", "neutral"):
        return "No Complaint"
    # Reasons missing from the table (including NaN) fall back to "Unknown"
    # rather than raising or returning None. The old final branch referenced
    # a misspelled name and raised NameError whenever it was reached.
    return _COMPLAINT_TOPICS.get(complaint, "Unknown")
    
# DataFrame.apply defaults to axis=0 and would pass whole columns to the
# function; iterate row by row and hand define_complaints the two fields
# it expects (sentiment, negative reason).
cf["complaint"] = cf.apply(
    lambda row: define_complaints(row["airline_sentiment"], row["negativereason"]),
    axis=1
)