### Importing libraries and defining utility functions

In [1]:
import pandas as pd
import numpy as np
import re, string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer  
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import confusion_matrix
import GetOldTweets3 as got
import warnings
pd.options.display.max_rows = 999
pd.options.display.max_columns = 999
pd.options.display.max_colwidth = 1000
warnings.filterwarnings('ignore')



In [2]:
def accuracy(actual, pred):
    true = 0
    for i in range(len(pred)):
        if actual[i] == round(pred[i]):
            true +=1
    return true/len(pred)

In [3]:
def lemmatizer(tweet):
    tweet = tweet.split()
    stemmer = WordNetLemmatizer()
    lemmed = [stemmer.lemmatize(word) for word in tweet]
    return ' '.join(lemmed)

In [4]:
def to_label(score):
    if score > .05:
        return 1
    elif score < -.05:
        return 2
    else:
        return 0

In [5]:
def clean_text(text):
    regex = re.compile('[%s]' % re.escape(string.punctuation))
    text = re.sub('((www\.[^\s]+)|(https?://[^\s]+))', '', text)
    text = re.sub('@[^\s]+', '', text)
    text = re.sub('#([^\s]+)', '', text)
    text = regex.sub('', text)
    text = text.strip()
    return text

### Reading in data and doing some preprocessing and cleaning

In [6]:
train = pd.read_csv('Tweets_airline.csv',encoding = "ISO-8859-1")

In [7]:
train['airline_sentiment'] = train['airline_sentiment'].replace('neutral', 0)

In [8]:
train['airline_sentiment'] = train['airline_sentiment'].replace('positive', 1)

In [9]:
train['airline_sentiment'] = train['airline_sentiment'].replace('negative', 2)

In [10]:
train['text'] = train['text'].map(lambda x: clean_text(x))

In [11]:
train['text'] = train['text'].map(lambda x: lemmatizer(x))

In [12]:
text = train['text'].values

In [13]:
text.shape

(14640,)

In [14]:
label = train['airline_sentiment'].values

In [15]:
label.shape

(14640,)

In [16]:
X_train, X_test, y_train, y_test = train_test_split(text, label)


### Defining pipelines and parameter grids to search over for logistic regression

Using TF-IDF

In [17]:
parameters_lr = {
    'lr__C': (.01,0.1, 0.5,1),
    'lr__solver' : ('newton-cg', 'sag', 'saga', 'lbfgs')
}

pipeline_lr = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words = 'english')),
    ('lr', LogisticRegression(max_iter = 400))
])

grid_lr = GridSearchCV(pipeline_lr, parameters_lr, cv = 3)
grid_lr.fit(X_train, y_train)

results_lr = pd.DataFrame.from_dict(grid_lr.cv_results_)
print(results_lr)

    mean_fit_time  std_fit_time  mean_score_time  std_score_time param_lr__C  \
0        0.303325      0.005662         0.078946        0.005677        0.01   
1        0.413327      0.002704         0.077473        0.004916        0.01   
2        0.481036      0.011645         0.083784        0.000687        0.01   
3        0.287553      0.004664         0.083721        0.003472        0.01   
4        0.421319      0.006690         0.086509        0.002552         0.1   
5        0.380386      0.010134         0.083227        0.003345         0.1   
6        0.473656      0.009565         0.083408        0.006640         0.1   
7        0.381925      0.009643         0.084027        0.001767         0.1   
8        0.433538      0.010262         0.083662        0.008754         0.5   
9        0.441913      0.014785         0.084766        0.005601         0.5   
10       0.493259      0.021914         0.084745        0.003772         0.5   
11       0.511842      0.041618         

Using BOW

In [18]:
parameters_lr = {
    'lr__C': (.01,0.1, 0.5,1),
    'lr__solver' : ('newton-cg', 'sag', 'saga', 'lbfgs')
}

pipeline_lr = Pipeline([
    ('bow',  CountVectorizer(max_features=5000, min_df=5, max_df=0.9, stop_words='english')  ),
    ('lr', LogisticRegression(max_iter = 400))
])

grid_lr = GridSearchCV(pipeline_lr, parameters_lr, cv = 3)
grid_lr.fit(X_train, y_train)

results_lr = pd.DataFrame.from_dict(grid_lr.cv_results_)
print(results_lr)

    mean_fit_time  std_fit_time  mean_score_time  std_score_time param_lr__C  \
0        0.287868      0.038684         0.054824        0.008836        0.01   
1        0.238169      0.010564         0.064181        0.008008        0.01   
2        0.991711      0.389782         0.078524        0.025684        0.01   
3        0.266873      0.036013         0.085512        0.019144        0.01   
4        0.453303      0.007257         0.087080        0.013331         0.1   
5        0.528335      0.031771         0.080835        0.003755         0.1   
6        1.059944      0.117705         0.082921        0.013470         0.1   
7        0.342139      0.032617         0.082172        0.014567         0.1   
8        0.545696      0.028041         0.070883        0.003793         0.5   
9        0.654355      0.081748         0.096409        0.027061         0.5   
10       1.328736      0.086946         0.084531        0.010781         0.5   
11       0.389565      0.011762         

#### Evaluate best LR model on test data

In [19]:
best_pipeline_lr = Pipeline([
    ('bow',  CountVectorizer(max_features=5000, min_df=5, max_df=0.9, stop_words='english')  ),
    ('lr', LogisticRegression( C = 1, solver = 'saga',max_iter = 400))
])

best_pipeline_lr.fit(X_train, y_train)
best_lr_preds = best_pipeline_lr.predict(X_test)

cr_best_lr = classification_report(y_test,best_lr_preds,output_dict=True)
pd.DataFrame(cr_best_lr).transpose()

Unnamed: 0,f1-score,precision,recall,support
0,0.580081,0.633824,0.534739,806.0
1,0.672582,0.713748,0.635906,596.0
2,0.857871,0.824418,0.894154,2258.0
micro avg,0.772951,0.772951,0.772951,3660.0
macro avg,0.703511,0.723996,0.688267,3660.0
weighted avg,0.766524,0.764424,0.772951,3660.0


### Defining pipelines and parameter grids to search over for naive bayes

Using TF-IDF

In [20]:
parameters_nb = {
    'nb__alpha': (0.00001, 0.5, 1),
    'nb__fit_prior' : (True, False)
}

pipeline_nb = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words = 'english')),
    ('nb', MultinomialNB())
])


grid_nb = GridSearchCV(pipeline_nb, parameters_nb, cv = 3)
grid_nb.fit(X_train, y_train)

results_nb = pd.DataFrame.from_dict(grid_nb.cv_results_)
print(results_nb)


   mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
0       0.109253      0.007952         0.043700        0.005418   
1       0.103535      0.005379         0.044257        0.005766   
2       0.106139      0.013787         0.053318        0.001405   
3       0.100632      0.003447         0.046944        0.003195   
4       0.120600      0.019182         0.056550        0.009661   
5       0.114895      0.013103         0.050716        0.006792   

  param_nb__alpha param_nb__fit_prior  \
0           1e-05                True   
1           1e-05               False   
2             0.5                True   
3             0.5               False   
4               1                True   
5               1               False   

                                         params  split0_test_score  \
0   {'nb__alpha': 1e-05, 'nb__fit_prior': True}           0.707457   
1  {'nb__alpha': 1e-05, 'nb__fit_prior': False}           0.699809   
2     {'nb__alpha': 0.5, 'nb__fi

Using BOW

In [21]:
parameters_nb = {
    'nb__alpha': (0.00001, 0.5, 1),
    'nb__fit_prior' : (True, False)
}

pipeline_nb = Pipeline([
    ('bow',  CountVectorizer(max_features=5000, min_df=5, max_df=0.9, stop_words='english') ),
    ('nb', MultinomialNB())
])

grid_nb = GridSearchCV(pipeline_nb, parameters_nb, cv = 3)
grid_nb.fit(X_train, y_train)

results_nb = pd.DataFrame.from_dict(grid_nb.cv_results_)
print(results_nb)


   mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
0       0.114632      0.009994         0.045163        0.005376   
1       0.130548      0.009923         0.059124        0.015030   
2       0.139077      0.013787         0.057653        0.006865   
3       0.117761      0.010797         0.053411        0.010458   
4       0.180616      0.010473         0.094216        0.010771   
5       0.204002      0.006947         0.092391        0.002514   

  param_nb__alpha param_nb__fit_prior  \
0           1e-05                True   
1           1e-05               False   
2             0.5                True   
3             0.5               False   
4               1                True   
5               1               False   

                                         params  split0_test_score  \
0   {'nb__alpha': 1e-05, 'nb__fit_prior': True}           0.731494   
1  {'nb__alpha': 1e-05, 'nb__fit_prior': False}           0.709096   
2     {'nb__alpha': 0.5, 'nb__fi

#### Evaluate best NB model on test data

In [22]:
best_pipeline_nb = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words = 'english')),
    ('nb', MultinomialNB(alpha = .5, fit_prior = False))
])

best_pipeline_nb.fit(X_train, y_train)
best_nb_preds = best_pipeline_nb.predict(X_test)

cr_best_nb = classification_report(y_test,best_nb_preds,output_dict=True)
pd.DataFrame(cr_best_nb).transpose()

Unnamed: 0,f1-score,precision,recall,support
0,0.550642,0.647651,0.478908,806.0
1,0.656557,0.700952,0.61745,596.0
2,0.857619,0.810161,0.910983,2258.0
micro avg,0.768033,0.768033,0.768033,3660.0
macro avg,0.688273,0.719588,0.669114,3660.0
weighted avg,0.757276,0.75659,0.768033,3660.0


### Defining pipelines and parameter grids to search over for adaboost

Using TF-IDF

In [23]:
parameters_ada = {
    'ada__n_estimators': (500,1000),
}

pipeline_ada = Pipeline([
    ('bow',  CountVectorizer(max_features=5000, min_df=5, max_df=0.9, stop_words='english') ),
    ('ada', AdaBoostClassifier())
])

grid_ada = GridSearchCV(pipeline_ada, parameters_ada, cv = 3)
grid_ada.fit(X_train, y_train)

results_ada = pd.DataFrame.from_dict(grid_ada.cv_results_)
print(results_ada)


   mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
0       3.124988      0.122759         0.549275        0.035159   
1       6.496741      0.185280         0.986089        0.071680   

  param_ada__n_estimators                       params  split0_test_score  \
0                     500   {'ada__n_estimators': 500}           0.738050   
1                    1000  {'ada__n_estimators': 1000}           0.720841   

   split1_test_score  split2_test_score  mean_test_score  std_test_score  \
0           0.739344           0.727521         0.734973        0.005294   
1           0.728689           0.712490         0.720674        0.006614   

   rank_test_score  split0_train_score  split1_train_score  \
0                1            0.791365            0.781148   
1                2            0.819238            0.809153   

   split2_train_score  mean_train_score  std_train_score  
0            0.793744          0.788752         0.005464  
1            0.815462          0

Using BOW

In [24]:
parameters_ada = {
    'ada__n_estimators': (500,1000),
}

pipeline_ada = Pipeline([
    ('bow', TfidfVectorizer(stop_words = 'english')) ,
    ('ada', AdaBoostClassifier())
])

grid_ada = GridSearchCV(pipeline_ada, parameters_ada, cv = 3)
grid_ada.fit(X_train, y_train)

results_ada = pd.DataFrame.from_dict(grid_ada.cv_results_)
print(results_ada)


   mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
0       7.158142      0.239082         0.455528        0.089754   
1      13.709453      0.998323         0.998519        0.050000   

  param_ada__n_estimators                       params  split0_test_score  \
0                     500   {'ada__n_estimators': 500}           0.698989   
1                    1000  {'ada__n_estimators': 1000}           0.704452   

   split1_test_score  split2_test_score  mean_test_score  std_test_score  \
0           0.696721           0.695819         0.697177        0.001334   
1           0.709290           0.685433         0.699727        0.010296   

   rank_test_score  split0_train_score  split1_train_score  \
0                2            0.786719            0.768989   
1                1            0.842875            0.835792   

   split2_train_score  mean_train_score  std_train_score  
0            0.788963          0.781557         0.008934  
1            0.846059          0

#### Evaluate best Adaboost model on test data

In [25]:
best_pipeline_ada = Pipeline([
    ('bow',  CountVectorizer(max_features=5000, min_df=5, max_df=0.9, stop_words='english') ),
    ('ada', AdaBoostClassifier(n_estimators = 500))
])

best_pipeline_ada.fit(X_train, y_train)
best_ada_preds = best_pipeline_ada.predict(X_test)

cr_best_ada = classification_report(y_test,best_ada_preds,output_dict=True)
pd.DataFrame(cr_best_ada).transpose()

Unnamed: 0,f1-score,precision,recall,support
0,0.571956,0.567073,0.576923,806.0
1,0.641343,0.677239,0.60906,596.0
2,0.842613,0.834201,0.851196,2258.0
micro avg,0.751366,0.751366,0.751366,3660.0
macro avg,0.685304,0.692838,0.67906,3660.0
weighted avg,0.750234,0.749815,0.751366,3660.0


### Evaluating the pre-trained model "Vader"

In [26]:
analyser = SentimentIntensityAnalyzer()
vader_preds_raw = train['text'].map(lambda x: analyser.polarity_scores(x)['compound'])
vader_preds = vader_preds_raw.map(lambda x: to_label(x)).tolist()
labels = train['airline_sentiment'].tolist()


In [27]:
cr_vader = classification_report(labels,vader_preds,output_dict=True)
pd.DataFrame(cr_vader).transpose()

Unnamed: 0,f1-score,precision,recall,support
0,0.402649,0.385205,0.421749,3099.0
1,0.472926,0.325965,0.861193,2363.0
2,0.6308,0.893885,0.487361,9178.0
micro avg,0.533811,0.533811,0.533811,14640.0
macro avg,0.502125,0.535018,0.590101,14640.0
weighted avg,0.557023,0.694541,0.533811,14640.0


### Assessing the model on my labeled data

In [28]:
tweets = pd.read_csv('Tweets.csv')

In [29]:
tweets['Tweet'] = tweets['Tweet'].map(lambda x: clean_text(x))

In [30]:
tweets['Tweet'] = tweets['Tweet'].map(lambda x: lemmatizer(x))

In [31]:
test_x = tweets['Tweet'].values

In [32]:
label = tweets['Sentiment']

In [33]:
label = list(map(lambda x: 0 if x == 0.1 else 1,label))

In [34]:
test_preds = best_pipeline_lr.predict(test_x)


In [35]:
test_preds = list(map(lambda x: 1 if x > 0 else 0,test_preds.tolist()))

In [36]:
accuracy(label,test_preds)

0.74

In [37]:
results = list(zip(label,test_preds,tweets['Tweet'].tolist()))

In [38]:
results = pd.DataFrame(results, columns = ['actual','predicted','tweet'])

In [39]:
results.loc[results['actual']!=results['predicted']]

Unnamed: 0,actual,predicted,tweet
11,0,1,Almost that time
15,0,1,Bruins sign Zdeno Chara to a oneyear contract extension through the 20192020 season worth 2 million plus an additional 175 million in performancebased incentive
20,1,0,Bruins defenseman Connor Clifton born in Long Branch NJ and raised in Matawan is expected to be in the lineup v the Devils Will mark his first NHL game against New Jersey Should have a nice following tonight
22,0,1,Best Records Since Trade Deadline 91 Vegas Golden Knights 102 Tampa Bay Lightning 821 Washington Capitals 721 Carolina Hurricanes 83 Boston Bruins
23,0,1,The Bruins look for their third win in a row tonight against New Jersey Tuukka Rask and Marbleheads Cory Schneider are the starting goalie 7pm on NESN BEEEEEE THERRRRRRE
27,0,1,Bruins announce that Lee Stempniak ha been assigned back to Providence
30,0,1,The Boston Bruins are 1934 since the AllStar break
32,1,0,Good news Bruins fan Marcus Johansson ha a chance to play tomorrow against the Lightning
40,0,1,Bruins captain Zdeno Chara had the ultimate Florida man experience … via
41,1,0,The future of Lebanese hockey is in good hand On this picture Antoine Waked Laval Rockets AHL affiliate of the Montreal Canadiens and Karl ElMir Providence Bruins AHL affiliate of the Boston Bruins Rivals on the ice brother off the ice Watch out world


In [102]:
feature_names = best_pipeline_lr.steps[0][1].get_feature_names()
for i, class_label in enumerate([0,1,2]):
    top = np.argsort(best_pipeline_lr.steps[1][1].coef_[i])[-50:]
    print("%s: %s" % (class_label," ".join(feature_names[j] for j in top)))
    print('\n')

0: ring gain near save negotiate phoenix upcoming spring song daily anytime resolution dmed fam fort possible brother ceo announced midnight winter hook jblu hawaii reserve eastern partnership promo allow nashville music conf atlanta rr meant volunteer saw golf photo revenue requires journal mexico grandma carry discount hi australia suggestion dal


1: quick compliment thursday view lt3 handled heart wish good sea win world saved snack las fav lady beautiful passbook thnx type incredible star happy enjoy perfect impressed comfortable cool rock appreciate worked loved worry love exceptional loving deserves best sweet thx kudos excited excellent great amazing wonderful thanks awesome thank


2: suitcase killing stranded lost 140 feedback ruining telling alternate situation story communication text wont stuck hate error joke crazy unless frustrated solution youâ havent unacceptable werent frustration lose lie paid fuck half stop unhappy worse hr screwed disappointed hire delayed answer l

### Get 2 test data sets for third party evaluators

In [73]:
tweetCriteria = got.manager.TweetCriteria().setQuerySearch('boston bruins')\
                                           .setSince("2019-03-01")\
                                           .setUntil("2019-04-01")\
                                           .setMaxTweets(100)\
                                           .setTopTweets(True)
tweets = got.manager.TweetManager.getTweets(tweetCriteria)

In [74]:
validation_tweets = []
for tweet in tweets:
    validation_tweets.append(tweet.text)
    

In [75]:
validation_tweets = pd.DataFrame(validation_tweets, columns = ['tweet'])

In [76]:
tweet_x = validation_tweets['tweet'].map(lambda x: clean_text(x))

In [77]:
tweet_x = tweet_x.map(lambda x: lemmatizer(x))

In [78]:
validation_x = tweet_x.values

In [79]:
validation_preds = best_pipeline_lr.predict(validation_x)

In [80]:
validation_preds = list(map(lambda x: 1 if x > 0 else 0,validation_preds.tolist()))

In [81]:
validation_tweets['prediction'] = validation_preds

In [83]:
validation_tweets.to_csv('validation.csv', index = False)

In [84]:
validated = pd.read_csv('validation.csv')

In [98]:
len(validated.loc[(validated['tester_one']==validated['tester_two']) & (validated['tester_one']==validated['prediction'])])

68

In [99]:
len(validated.loc[(validated['tester_one']==validated['tester_two'])])

87