### Train NB Model to Predict Airline Tweet Sentiment using Crowdflower Twitter Data

#### TO DO: 

1. Improve Grid Search code (use example from sklearn)

2. Poke around Crowdflower website a little more to get a better sense of their process

3. Look at Crowdflower Data Disaggregated -> How does it appear the airline confidence intervals are constructed? Is it based on number of people who aggreed to the label.

4. Try limiting training tweets (can do this to testing tweets as well? How to treat a subjective label?) to only those that are 100% confident with 3 graders.

5. Try Model with main NLTK Corpus

6. Compare Model results on Collected Twitter Data to Other Sentiment Analyzers

In [11]:
import re
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from nltk.sentiment import util
from nltk.classify.scikitlearn import SklearnClassifier
from nltk.corpus import twitter_samples

from textblob import TextBlob
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.grid_search import GridSearchCV
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline

from gensim import corpora, models, similarities, matutils
from sklearn.cluster import KMeans

#### Read-In NLTK Twitter Sentiment Corpus

In [15]:
def read_demo_tweets():
    tweets = []
    for tweet in twitter_samples.strings("negative_tweets.json"):
        tweets.append({"text":tweet, "sentiment":0})
        
    for tweet in twitter_samples.strings("positive_tweets.json"):
        tweets.append({"text":tweet, "sentiment":1})
        
    return pd.DataFrame(tweets)


demo_tweets=read_demo_tweets()
print demo_tweets.sentiment.value_counts(dropna=False)
demo_tweets.head(5)

1    5000
0    5000
Name: sentiment, dtype: int64


Unnamed: 0,sentiment,text
0,0,hopeless for tmr :(
1,0,Everything in the kids section of IKEA is so c...
2,0,@Hegelbon That heart sliding into the waste ba...
3,0,"“@ketchBurning: I hate Japanese call him ""bani..."
4,0,"Dang starting next week I have ""work"" :("


#### Read-In and Clean Crowdflower Data. Split into Train/ Test

In [17]:
#Read-In and Clean Corpus
cf = pd.read_csv("data/crowdflower/Airline-Sentiment-2-w-AA.csv")

cf["sentiment"] = np.nan
cf.ix[cf.airline_sentiment=="negative", "sentiment"] = 0
cf.ix[cf.airline_sentiment.isin(["neutral", "positive"]), "sentiment"] = 1

print cf.sentiment.value_counts(dropna=False)
cf.head(5)

0    9178
1    5462
Name: sentiment, dtype: int64


Unnamed: 0,_unit_id,_golden,_unit_state,_trusted_judgments,_last_judgment_at,airline_sentiment,airline_sentiment:confidence,negativereason,negativereason:confidence,airline,...,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_id,tweet_location,user_timezone,sentiment
0,681448150,False,finalized,3,2/25/15 5:24,neutral,1.0,,,Virgin America,...,cairdin,,0,@VirginAmerica What @dhepburn said.,,2/24/15 11:35,5.70306e+17,,Eastern Time (US & Canada),1
1,681448153,False,finalized,3,2/25/15 1:53,positive,0.3486,,0.0,Virgin America,...,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2/24/15 11:15,5.70301e+17,,Pacific Time (US & Canada),1
2,681448156,False,finalized,3,2/25/15 10:01,neutral,0.6837,,,Virgin America,...,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2/24/15 11:15,5.70301e+17,Lets Play,Central Time (US & Canada),1
3,681448158,False,finalized,3,2/25/15 3:05,negative,1.0,Bad Flight,0.7033,Virgin America,...,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2/24/15 11:15,5.70301e+17,,Pacific Time (US & Canada),0
4,681448159,False,finalized,3,2/25/15 5:50,negative,1.0,Can't Tell,1.0,Virgin America,...,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2/24/15 11:14,5.70301e+17,,Pacific Time (US & Canada),0


#### Train Test Split

In [19]:
cf_train, cf_test = train_test_split(cf, test_size=0.25, random_state=4444)

print cf_train.shape, cf_test.shape

(10980, 21) (3660, 21)


#### Test TextBlob Sentiment Polarity Analyzer on Test Set

In [25]:
def text_blob_sentiment(text):
    text = text.decode("ascii", errors="ignore")
    
    return TextBlob(text).sentiment.polarity

def text_blob_grid_search(test, step):
    max_accuracy = 0
    max_param = np.nan
    
    for x in np.arange(-1, 1, step):
        pred_y = []
        
        for each in test.text.apply(text_blob_sentiment):
            if each < x:
                pred_y.append(0)
            else:
                pred_y.append(1)
            
        true_y = test["sentiment"].tolist()
        accuracy = accuracy_score(true_y, pred_y)
        print x, accuracy
            
        if accuracy > max_accuracy:
            max_accuracy = accuracy
            max_param = x
            
    return (max_param, max_accuracy)

print text_blob_grid_search(cf_test, 0.5)
#print text_blob_grid_search(cf_test, 0.1)
#print text_blob_grid_search(cf_test, 0.05)

-1.0 0.37650273224
-0.5 0.408469945355
0.0 0.566666666667
0.5 0.655464480874
(nan, 0)


In [4]:
#Grid Search for Best Text Blob Positive-Negative Cuttoff
def text_blob_grid_search(df, step):
    max_accuracy = 0
    max_param = np.nan
    
    for x in np.arange(-1, 1, step):
        pred_y = []
        
        for each in df["text_blob_sentiment"]:
            if each < x:
                pred_y.append(1)
            else:
                pred_y.append(0)
        
        true_y = df["airline_sentiment"].tolist()
        accuracy = accuracy_score(true_y, pred_y)
        #print x, accuracy
        
        if accuracy > max_accuracy:
            max_accuracy= accuracy
            max_param = x
            
    return (max_param, max_accuracy)

print text_blob_grid_search(corpus, 0.5)
print text_blob_grid_search(corpus, 0.1)

(0.5, 0.65949453551912574)
(0.19999999999999973, 0.67356557377049175)


The optimal TextBlob Sentiment Classifer attains a 67% accuracy on the test set (e.g. all data)

#### Count Vectorized NB/ SVC Models

In [12]:
sw=ENGLISH_STOP_WORDS.union(["united","usairways","americanair","southwestair","jetblue","http","virginamerica","amp",\
                             "flight","flights","plane","gate","flightled","bag","airline","airport","fly"])

In [23]:
def fit_count_model(df, classifier):
    vectorizer=CountVectorizer(decode_error="ignore", ngram_range=(1,1), min_df=2, max_df=0.2,\
                               stop_words=sw, token_pattern="\\b[a-z][a-z]+\\b")
     
    vectorizer.fit(df["text"])
    X = vectorizer.transform(df["text"])
    y = df["airline_sentiment"].tolist()

    accuracy = cross_val_score(classifier, X, y, cv=5)
    
    return np.mean(accuracy)

In [24]:
print fit_count_model(corpus, BernoulliNB())
print fit_count_model(corpus, MultinomialNB())
print fit_count_model(corpus, LinearSVC())

0.814689647808
0.801847253187
0.784633553745


#### TD-IDF Vectorized NB/ SVC Models

In [26]:
def fit_tdidf_model(df, classifier):
    vectorizer=TfidfVectorizer(decode_error="ignore", ngram_range=(1,1), min_df=2, max_df=0.2,\
                               stop_words=sw, token_pattern="\\b[a-z][a-z]+\\b")
    
    vectorizer.fit(df["text"])
    X = vectorizer.transform(df["text"])
    y = df["airline_sentiment"].tolist()

    accuracy = cross_val_score(classifier, X, y, cv=5)
    
    return np.mean(accuracy)

In [27]:
print fit_tdidf_model(corpus, BernoulliNB())
print fit_tdidf_model(corpus, MultinomialNB())
print fit_tdidf_model(corpus, LinearSVC())

0.814689647808
0.788660762066
0.797269793753


#### Use LDA for Dimentionality Reduction

In [36]:
vectorizer=CountVectorizer(decode_error="ignore", ngram_range=(1,1), min_df=2, max_df=0.2,\
                           stop_words=sw, token_pattern="\\b[a-z][a-z]+\\b")

vectorizer.fit(corpus["text"])
X = vectorizer.transform(corpus["text"]).transpose() #need to flip this back at some point
y = corpus["airline_sentiment"].tolist()

X_2 = matutils.Sparse2Corpus(X)
id2word = dict((v, k) for k, v in vectorizer.vocabulary_.iteritems())
print len(id2word)

print X.shape, len(y)

 5482
(5482, 14640) 14640


In [42]:
lda = models.LdaModel(X_2, id2word=id2word, num_topics=3, passes=10)

In [43]:
for each in lda.print_topics(num_words=10, num_topics=5):
    print each

(0, u'0.019*thank + 0.013*thanks + 0.013*just + 0.011*great + 0.010*service + 0.010*know + 0.009*crew + 0.009*dm + 0.009*response + 0.008*work')
(1, u'0.018*hours + 0.015*thanks + 0.014*service + 0.013*cancelled + 0.012*delayed + 0.011*hour + 0.010*customer + 0.010*late + 0.010*waiting + 0.009*time')
(2, u'0.030*help + 0.022*cancelled + 0.018*need + 0.013*phone + 0.013*hold + 0.010*trying + 0.010*change + 0.009*number + 0.008*aa + 0.008*tomorrow')


In [73]:
lda_corpus = lda[X_2]

l = [doc for doc in lda_corpus]
print len(l[0])

3


In [76]:
l = [[(1,2),(3,4)], [(5,6),(7,8)]]

l2 = [x[:][:][1] for x in l]
print l2

[(3, 4), (7, 8)]


In [52]:
print cross_val_score(BernoulliNB(), x_3, y, cv=5)

ValueError: Found array with dim 3. Estimator expected <= 2.

#### Formal Grid Search for Optimal Parameters

In [None]:
# Author: Olivier Grisel <olivier.grisel@ensta.org>
#         Peter Prettenhofer <peter.prettenhofer@gmail.com>
#         Mathieu Blondel <mathieu@mblondel.org>
# License: BSD 3 clause

from __future__ import print_function

from pprint import pprint
from time import time
import logging

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline

print(__doc__)

# Display progress logs on stdout
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(message)s')


###############################################################################
# Load some categories from the training set
categories = [
    'alt.atheism',
    'talk.religion.misc',
]
# Uncomment the following to do the analysis on all the categories
#categories = None

print("Loading 20 newsgroups dataset for categories:")
print(categories)

data = fetch_20newsgroups(subset='train', categories=categories)
print("%d documents" % len(data.filenames))
print("%d categories" % len(data.target_names))
print()

###############################################################################
# define a pipeline combining a text feature extractor with a simple
# classifier
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier()),
])

# uncommenting more parameters will give better exploring power but will
# increase processing time in a combinatorial way
parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
    #'vect__max_features': (None, 5000, 10000, 50000),
    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    #'tfidf__use_idf': (True, False),
    #'tfidf__norm': ('l1', 'l2'),
    'clf__alpha': (0.00001, 0.000001),
    'clf__penalty': ('l2', 'elasticnet'),
    #'clf__n_iter': (10, 50, 80),
}

if __name__ == "__main__":
    # multiprocessing requires the fork to happen in a __main__ protected
    # block

    # find the best parameters for both the feature extraction and the
    # classifier
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

    print("Performing grid search...")
    print("pipeline:", [name for name, _ in pipeline.steps])
    print("parameters:")
    pprint(parameters)
    t0 = time()
    grid_search.fit(data.data, data.target)
    print("done in %0.3fs" % (time() - t0))
    print()

    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

In [None]:
def grid_search():
    templist = []
    
    for classifier in [BernoulliNB(), MultinomialNB()]:
        for vect in ["count","TD-IDF"]:
            for ngram_range in [(1,1),(1,2)]:    
                for min_df in range(1,21,10):
                    for max_df in np.array([0.1,0.2]):
                        for stop_word in [stop_words_1, stop_words_2]:
                            print "fitting: ", classifier, vect, ngram_range, min_df, max_df
                
                            accuracy = fit_model(classifier, vect, min_df, max_df, stop_word)
                
                            d = {"class":classifier, "vect":vect, "ngram":ngram_range, "min_df":min_df, "max_df":max_df,\
                                 "accuracy":accuracy}
                
                            templist.append(d)
                        
    grid = pd.DataFrame(templist)
    return grid

In [None]:
view1 =  grid_search()
#view2 =  grid_search(BernoulliNB(), "Bernoulli NB", "count", (1,2))

In [None]:
view1.sort_values(by="accuracy", ascending=False)

In [None]:
def grid_search(classifier, vect):
    templist = [] 
    
    for ngram_range in [(1,1),(1,2),(2,2)]:
        for min_df in range(1,101,10):
            for max_df in np.array([0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]):
                for stop_word in [None,"english",stop_words_1,stop_words_2]:
                        accuracy = fit_model(classifier, vect, ngram_range, min_df, max_df, stop_word)
                        
                        d = {"class":classifier, "vect":vect, "ngrams":ngram_range, "min_df":min_df, "max_df":max_df,\
                             "stop_words":stop_word, accuracy:"accuracy"}
                        templist.append(d)
                        
    grid = pd.DataFrame(templist)
    print grid.sort_values(by="accuracy", ascending=True).head(3)
    return grid
                        
print grid_search(BernoulliNB(), "count")

In [None]:
def create_corpus():
    corpus = []

    for tweet in twitter_samples.strings("negative_tweets.json"):
        corpus.append({"tweet":tweet, "sentiment":0})
    
    for tweet in twitter_samples.strings("positive_tweets.json"):
        corpus.append({"tweet":tweet, "sentiment":1})

    corpus=pd.DataFrame(corpus)
    print corpus["sentiment"].value_counts(dropna=False)
    print corpus.head(5)
    
    return corpus

#Train Multinomial NB Model on Twitter Data
def tweet_sentiment(airline, df, ngram_range=(1,1)):
    #Fit Model to NLTK Tweet Corpus
    vectorizer = text.TfidfVectorizer(stop_words="english", ngram_range=ngram_range) #use tweet tokenizer?
    train_X = vectorizer.fit_transform(df["tweet"])
    train_y = df["sentiment"]
    
    model = MultinomialNB().fit(train_X, train_y)
    
    #Read in Airline Tweets
    tweets=[]
    for tweet in col.find({"airline": airline}):
        tweets.append(tweet["text"])
    
    #Predict Sentiment for Tweets
    tweet_vector = vectorizer.transform(tweets)
    tweet_sentiment = model.predict_proba(tweet_vector)[:, 1]
    
    plt.hist(tweet_sentiment, bins=50, label=airline)
    return tweet_sentiment

In [None]:
#Train Multinomial NB Model on Twitter Data
def tweet_sentiment(airline, df, ngram_range=(1,1)):
    #Fit Model to NLTK Tweet Corpus
    vectorizer = text.TfidfVectorizer(stop_words="english", ngram_range=ngram_range) #use tweet tokenizer?
    train_X = vectorizer.fit_transform(df["tweet"])
    train_y = df["sentiment"]
    
    model = MultinomialNB().fit(train_X, train_y)
    
    #Read in Airline Tweets
    tweets=[]
    for tweet in col.find({"airline": airline}):
        tweets.append(tweet["text"])
    
    #Predict Sentiment for Tweets
    tweet_vector = vectorizer.transform(tweets)
    tweet_sentiment = model.predict_proba(tweet_vector)[:, 1]
    
    plt.hist(tweet_sentiment, bins=50, label=airline)
    return tweet_sentiment