#### Train Supervised Aspect-Based Sentiment Analysis Models for Use in Airline Opinion Mining Dashboard

In [2]:
import re
import numpy as np
import pandas as pd

from nltk.tokenize.casual import TweetTokenizer
from nltk.stem.porter import PorterStemmer

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix

#### Read-In & Clean Labelled Data

In [4]:
# Load the labelled CrowdFlower airline-sentiment export.
cf = pd.read_csv("data/crowdflower/Airline-Sentiment-2-w-AA.csv")
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print emitted an extra "None" line.
cf.info()
cf.head(5)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14640 entries, 0 to 14639
Data columns (total 20 columns):
_unit_id                        14640 non-null int64
_golden                         14640 non-null bool
_unit_state                     14640 non-null object
_trusted_judgments              14640 non-null int64
_last_judgment_at               14584 non-null object
airline_sentiment               14640 non-null object
airline_sentiment:confidence    14640 non-null float64
negativereason                  9178 non-null object
negativereason:confidence       10522 non-null float64
airline                         14640 non-null object
airline_sentiment_gold          40 non-null object
name                            14640 non-null object
negativereason_gold             32 non-null object
retweet_count                   14640 non-null int64
text                            14640 non-null object
tweet_coord                     1019 non-null object
tweet_created                   14640 

Unnamed: 0,_unit_id,_golden,_unit_state,_trusted_judgments,_last_judgment_at,airline_sentiment,airline_sentiment:confidence,negativereason,negativereason:confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_id,tweet_location,user_timezone
0,681448150,False,finalized,3,2/25/15 5:24,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2/24/15 11:35,5.70306e+17,,Eastern Time (US & Canada)
1,681448153,False,finalized,3,2/25/15 1:53,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2/24/15 11:15,5.70301e+17,,Pacific Time (US & Canada)
2,681448156,False,finalized,3,2/25/15 10:01,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2/24/15 11:15,5.70301e+17,Lets Play,Central Time (US & Canada)
3,681448158,False,finalized,3,2/25/15 3:05,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2/24/15 11:15,5.70301e+17,,Pacific Time (US & Canada)
4,681448159,False,finalized,3,2/25/15 5:50,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2/24/15 11:14,5.70301e+17,,Pacific Time (US & Canada)


In [6]:
# Class balance of the overall sentiment label.
# (Single-argument print(...) produces the same output under Python 2 and 3.)
print(cf["airline_sentiment"].value_counts())
# Break the sentiment label down by the annotated reason for negativity.
pd.crosstab(cf["airline_sentiment"], cf["negativereason"])

negative    9178
neutral     3099
positive    2363
Name: airline_sentiment, dtype: int64


negativereason,Bad Flight,Can't Tell,Cancelled Flight,Customer Service Issue,Damaged Luggage,Flight Attendant Complaints,Flight Booking Problems,Late Flight,Lost Luggage,longlines
airline_sentiment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
negative,580,1190,847,2910,74,481,529,1665,724,178


In [None]:
##Code ABSA Label
# Label scheme written to cf["absa"]:
#   0  -> tweet is neutral or positive
#   1  -> tweet is negative and its reason is "Cancelled Flight" or "Late Flight"
#  -1  -> default; kept by every row that matches neither rule above
cf["absa"] = -1 #default

is_not_negative = cf.airline_sentiment.isin(["neutral", "positive"])
# Each comparison must be parenthesised: `&` binds tighter than `==`, so the
# unparenthesised form evaluated `"negative" & <Series>` first and raised.
is_delay_complaint = (cf.airline_sentiment == "negative") & cf.negativereason.isin(["Cancelled Flight", "Late Flight"])

# .loc replaces .ix, which newer pandas releases no longer provide.
cf.loc[is_not_negative, "absa"] = 0 #Positive or Neutral
cf.loc[is_delay_complaint, "absa"] = 1

In [None]:
def preprocess(text):
    """Strip tweet-specific noise from ``text`` and return the cleaned string.

    Steps, in order:
      1. drop http/https URLs;
      2. replace @usernames with a space, then delete every ": " sequence
         (this also removes the colon left behind by "RT @user: ...");
      3. replace #hashtags with a space;
      4. drop standalone "RT " / "RT: " retweet markers;
      5. collapse every run of characters other than ASCII letters and
         commas into a single space;
      6. strip leading and trailing spaces.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned text; may be the empty string.
    """
    # The previous pattern began with "\h", which is not a valid regex escape
    # and is rejected outright by Python 3.7+; a plain "h" is what was meant.
    text = re.sub(r"https?://\S+", "", text) #remove urls
    text = re.sub(r"@(\w+)", " ", text).replace(": ","") #remove usernames
    text = re.sub(r"#(\w+)", " ", text) #remove hashtags
    # \b keeps this from eating the tail of ordinary words that end in "RT"
    # (a plain str.replace("RT ", "") turned "ALERT delays" into "ALEdelays").
    text = re.sub(r"\bRT:? ", "", text) #remove RT Symbols
    text = re.sub(r"[^a-zA-Z,]+", " ", text) #remove other non-alpha characters
    text = text.strip(" ") #remove leading and trailing whitespace

    return text

def tokenizer(text):
    """Clean ``text`` with ``preprocess`` and split it into lower-cased tokens.

    A TweetTokenizer (case folding on, repeated-character shortening on,
    handle stripping on) tokenizes the cleaned text; tokens shorter than
    four characters are discarded.

    Returns a list of the surviving tokens, in their original order.
    """
    tweet_tok = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True)
    kept = []
    for token in tweet_tok.tokenize(preprocess(text)):
        if len(token) >= 4:
            kept.append(token)
    return kept

In [None]:
#### Read In CrowdFlower Data

In [None]:
# Two toy documents used to smoke-test the tokenizer and the vectorizer.
X0 = ["This is the first document. HaHa!"]
X = ["Second Document Document Here. Contains words not in first."]

# Unigram count vectorizer that delegates tokenization to `tokenizer`.
vectorizer = CountVectorizer(decode_error="ignore", ngram_range=(1,1), tokenizer=tokenizer)

# Show the tokens each toy document produces (first X0's, then X's).
for doc in (X0[0], X[0]):
    print(tokenizer(doc))

In [None]:
# Learn the vocabulary from the first toy document only, then encode both
# documents against that vocabulary.
# Note: X0 and X are rebound here from lists of strings to the vectorizer's
# output, so the cell that defines them must be re-run before re-running this one.
vectorizer.fit(X0)
X0 = vectorizer.transform(X0)
X = vectorizer.transform(X)

In [None]:
# A sentence made of words the toy vocabulary has never seen, to check how
# out-of-vocabulary input is handled. Vectorizing it happens in the next cell.
sentence = ["willhuguenin is not in the vocabulary. Becuz the vocab is lame :("]
print(tokenizer(sentence[0]))

In [None]:
# Encode the unseen sentence with the already-fitted vectorizer and report
# the (rows, columns) shape of the resulting document-term matrix.
new_X = vectorizer.transform(sentence)
dense_shape = new_X.toarray().shape
print(dense_shape)

In [None]:
def fit_count_model(classifier, train, test, sws, binary):
    """Fit ``classifier`` on unigram counts and report held-out accuracy.

    A CountVectorizer is fitted on ``train["text"]`` and used to encode both
    ``train["text"]`` and ``test["text"]``; the classifier is then fitted on
    the training matrix against ``train["sentiment"]`` and scored on the test
    matrix against ``test["sentiment"]``. The accuracy is also printed.

    Parameters
    ----------
    classifier : estimator
        Must provide ``fit(X, y)`` returning an object with ``predict`` and a
        ``coef_`` attribute whose first row has one weight per feature.
    train, test : mapping-like
        Indexed with the keys "text" and "sentiment"
        (presumably pandas DataFrames -- confirm against callers).
    sws : any
        Currently unused.
        NOTE(review): the name suggests a stop-word list meant for
        CountVectorizer(stop_words=...); confirm with callers before wiring in.
    binary : bool
        Forwarded to ``CountVectorizer(binary=...)``.

    Returns
    -------
    list
        ``[accuracy, top10]`` where ``top10`` is the list of the (at most) ten
        feature names with the largest values in ``clf.coef_[0]``.
    """
    # The tokenizer callable defined in this notebook is `tokenizer`; the
    # previous reference to `tokenize` is not defined anywhere in view and
    # would raise NameError on the first call.
    vectorizer = CountVectorizer(decode_error="ignore", ngram_range=(1,1), tokenizer=tokenizer, binary=binary)

    # Vocabulary comes from the training text only, so the test split cannot
    # contribute features.
    vectorizer.fit(train["text"])

    train_X = vectorizer.transform(train["text"])
    test_X = vectorizer.transform(test["text"])

    train_y = train["sentiment"]
    test_y = test["sentiment"]

    clf = classifier.fit(train_X, train_y)

    # Predict once and reuse the result for every metric.
    predictions = clf.predict(test_X)
    accuracy = accuracy_score(test_y, predictions)

    # Pair each feature name with its weight in the first coefficient row and
    # keep the ten largest.
    features = zip(vectorizer.get_feature_names(), clf.coef_[0])
    top10_features = sorted(features, reverse=True, key=lambda x: x[1])[:10]

    print("Accuracy: "+str(accuracy))

    return [accuracy, [x[0] for x in top10_features]]