### Importing libraries and defining utility functions

In [57]:
import pandas as pd
import numpy as np
import re, string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer  
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import confusion_matrix
import warnings
warnings.filterwarnings('ignore')



In [24]:
def lemmatizer(tweet):
    tweet = tweet.split()
    stemmer = WordNetLemmatizer()
    lemmed = [stemmer.lemmatize(word) for word in tweet]
    return ' '.join(lemmed)

In [52]:
def to_label(score):
    if score > .05:
        return 1
    elif score < -.05:
        return 2
    else:
        return 0

In [25]:
def clean_text(text):
    regex = re.compile('[%s]' % re.escape(string.punctuation))
    text = re.sub('((www\.[^\s]+)|(https?://[^\s]+))', '', text)
    text = re.sub('@[^\s]+', '', text)
    text = re.sub('#([^\s]+)', '', text)
    text = regex.sub('', text)
    text = text.strip()
    return text

### Reading in data and doing some preprocessing and cleaning

In [27]:
train = pd.read_csv('Tweets_airline.csv',encoding = "ISO-8859-1")

In [28]:
train['airline_sentiment'] = train['airline_sentiment'].replace('neutral', 0)

In [29]:
train['airline_sentiment'] = train['airline_sentiment'].replace('positive', 1)

In [30]:
train['airline_sentiment'] = train['airline_sentiment'].replace('negative', 2)

In [31]:
train['text'] = train['text'].map(lambda x: clean_text(x))

In [32]:
train['text'] = train['text'].map(lambda x: lemmatizer(x))

In [33]:
text = train['text'].values

In [34]:
label = train['airline_sentiment'].values

In [35]:
X_train, X_test, y_train, y_test = train_test_split(text, label)


### Defining pipelines and parameter grids to search over for logistic regression

Using TF-IDF

In [81]:
parameters_lr = {
    'lr__C': (.01,0.1, 0.5,1),
    'lr__solver' : ('newton-cg', 'sag', 'saga', 'lbfgs')
}

pipeline_lr = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words = 'english')),
    ('lr', LogisticRegression(max_iter = 400))
])

grid_lr = GridSearchCV(pipeline_lr, parameters_lr, cv = 3)
grid_lr.fit(X_train, y_train)

results_lr = pd.DataFrame.from_dict(grid_lr.cv_results_)
print(results_lr)

    mean_fit_time  std_fit_time  mean_score_time  std_score_time param_lr__C  \
0        0.267267      0.021719         0.067395        0.004660        0.01   
1        0.347720      0.010044         0.071187        0.001796        0.01   
2        0.440083      0.020647         0.070822        0.001770        0.01   
3        0.248199      0.007239         0.073977        0.000933        0.01   
4        0.343974      0.008709         0.070973        0.002231         0.1   
5        0.326821      0.003346         0.069016        0.003213         0.1   
6        0.411722      0.015955         0.082743        0.020205         0.1   
7        0.394413      0.027176         0.072666        0.002334         0.1   
8        0.371334      0.008866         0.076054        0.002013         0.5   
9        0.344479      0.007639         0.067461        0.008484         0.5   
10       0.398890      0.005183         0.070382        0.003498         0.5   
11       0.457890      0.006487         

Using BOW

In [37]:
parameters_lr = {
    'lr__C': (.01,0.1, 0.5,1),
    'lr__solver' : ('newton-cg', 'sag', 'saga', 'lbfgs')
}

pipeline_lr = Pipeline([
    ('bow',  CountVectorizer(max_features=5000, min_df=5, max_df=0.9, stop_words='english')  ),
    ('lr', LogisticRegression(max_iter = 400))
])

grid_lr = GridSearchCV(pipeline_lr, parameters_lr, cv = 3)
grid_lr.fit(X_train, y_train)

results_lr = pd.DataFrame.from_dict(grid_lr.cv_results_)
print(results_lr)

    mean_fit_time  std_fit_time  mean_score_time  std_score_time param_lr__C  \
0        0.220076      0.005778         0.046210        0.001243        0.01   
1        0.216421      0.024102         0.046220        0.003406        0.01   
2        0.486021      0.025301         0.046868        0.002826        0.01   
3        0.136303      0.002860         0.046209        0.001695        0.01   
4        0.269281      0.005339         0.044215        0.000470         0.1   
5        0.228721      0.003080         0.044223        0.001256         0.1   
6        0.425186      0.013513         0.047561        0.005245         0.1   
7        0.198793      0.027383         0.051871        0.004318         0.1   
8        0.315148      0.017031         0.043905        0.000801         0.5   
9        0.307835      0.026311         0.043882        0.000814         0.5   
10       0.640630      0.051647         0.046867        0.004214         0.5   
11       0.233376      0.001628         

#### Evaluate best LR model on test data

In [77]:
best_pipeline_lr = Pipeline([
    ('bow',  CountVectorizer(max_features=5000, min_df=5, max_df=0.9, stop_words='english')  ),
    ('lr', LogisticRegression( C = 1, solver = 'saga',max_iter = 400))
])

best_pipeline_lr.fit(X_train, y_train)
best_lr_preds = best_pipeline_lr.predict(X_test)

cr_best_lr = classification_report(y_test,best_lr_preds,output_dict=True)
pd.DataFrame(cr_best_lr).transpose()

Unnamed: 0,f1-score,precision,recall,support
0,0.577653,0.631336,0.532383,772.0
1,0.67288,0.74433,0.613946,588.0
2,0.862769,0.824485,0.904783,2300.0
micro avg,0.779508,0.779508,0.779508,3660.0
macro avg,0.704434,0.733384,0.683704,3660.0
weighted avg,0.772123,0.770867,0.779508,3660.0


### Defining pipelines and parameter grids to search over for naive bayes

Using TF-IDF

In [82]:
parameters_nb = {
    'nb__alpha': (0.00001, 0.5, 1),
    'nb__fit_prior' : (True, False)
}

pipeline_nb = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words = 'english')),
    ('nb', MultinomialNB())
])


grid_nb = GridSearchCV(pipeline_nb, parameters_nb, cv = 3)
grid_nb.fit(X_train, y_train)

results_nb = pd.DataFrame.from_dict(grid_nb.cv_results_)
print(results_nb)


   mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
0       0.170221      0.015330         0.071708        0.003099   
1       0.133043      0.001019         0.067854        0.009916   
2       0.157123      0.008614         0.070896        0.003583   
3       0.165459      0.008402         0.076485        0.002102   
4       0.162114      0.005817         0.073622        0.005198   
5       0.163985      0.008150         0.070206        0.002879   

  param_nb__alpha param_nb__fit_prior  \
0           1e-05                True   
1           1e-05               False   
2             0.5                True   
3             0.5               False   
4               1                True   
5               1               False   

                                         params  split0_test_score  \
0   {'nb__alpha': 1e-05, 'nb__fit_prior': True}           0.718383   
1  {'nb__alpha': 1e-05, 'nb__fit_prior': False}           0.701175   
2     {'nb__alpha': 0.5, 'nb__fi

Using BOW

In [39]:
parameters_nb = {
    'nb__alpha': (0.00001, 0.5, 1),
    'nb__fit_prior' : (True, False)
}

pipeline_nb = Pipeline([
    ('bow',  CountVectorizer(max_features=5000, min_df=5, max_df=0.9, stop_words='english') ),
    ('nb', MultinomialNB())
])

grid_nb = GridSearchCV(pipeline_nb, parameters_nb, cv = 3)
grid_nb.fit(X_train, y_train)

results_nb = pd.DataFrame.from_dict(grid_nb.cv_results_)
print(results_nb)


   mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
0       0.105043      0.004174         0.050863        0.005353   
1       0.107389      0.012948         0.044881        0.001629   
2       0.109699      0.009875         0.050862        0.004329   
3       0.107703      0.011318         0.050534        0.009402   
4       0.108045      0.008632         0.047216        0.004713   
5       0.097074      0.000941         0.044880        0.001411   

  param_nb__alpha param_nb__fit_prior  \
0           1e-05                True   
1           1e-05               False   
2             0.5                True   
3             0.5               False   
4               1                True   
5               1               False   

                                         params  split0_test_score  \
0   {'nb__alpha': 1e-05, 'nb__fit_prior': True}           0.737777   
1  {'nb__alpha': 1e-05, 'nb__fit_prior': False}           0.715105   
2     {'nb__alpha': 0.5, 'nb__fi

#### Evaluate best NB model on test data

In [78]:
best_pipeline_nb = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words = 'english')),
    ('nb', MultinomialNB(alpha = .5, fit_prior = False))
])

best_pipeline_nb.fit(X_train, y_train)
best_nb_preds = best_pipeline_nb.predict(X_test)

cr_best_nb = classification_report(y_test,best_nb_preds,output_dict=True)
pd.DataFrame(cr_best_nb).transpose()

Unnamed: 0,f1-score,precision,recall,support
0,0.53716,0.621806,0.472798,772.0
1,0.644007,0.679245,0.612245,588.0
2,0.859798,0.818718,0.905217,2300.0
micro avg,0.76694,0.76694,0.76694,3660.0
macro avg,0.680321,0.70659,0.66342,3660.0
weighted avg,0.757076,0.754776,0.76694,3660.0


### Defining pipelines and parameter grids to search over for adaboost

Using TF-IDF

In [40]:
parameters_ada = {
    'ada__n_estimators': (500,1000),
}

pipeline_ada = Pipeline([
    ('bow',  CountVectorizer(max_features=5000, min_df=5, max_df=0.9, stop_words='english') ),
    ('ada', AdaBoostClassifier())
])

grid_ada = GridSearchCV(pipeline_ada, parameters_ada, cv = 3)
grid_ada.fit(X_train, y_train)

results_ada = pd.DataFrame.from_dict(grid_ada.cv_results_)
print(results_ada)


   mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
0       1.867351      0.066080         0.303191        0.023974   
1       3.578128      0.164715         0.525573        0.010946   

  param_ada__n_estimators                       params  split0_test_score  \
0                     500   {'ada__n_estimators': 500}           0.733406   
1                    1000  {'ada__n_estimators': 1000}           0.717290   

   split1_test_score  split2_test_score  mean_test_score  std_test_score  \
0           0.721661           0.737288         0.730783        0.006644   
1           0.707184           0.722253         0.715574        0.006270   

   rank_test_score  split0_train_score  split1_train_score  \
0                1            0.795191            0.793005   
1                2            0.809673            0.808444   

   split2_train_score  mean_train_score  std_train_score  
0            0.792953          0.793716         0.001043  
1            0.811937          0

Using BOW

In [41]:
parameters_ada = {
    'ada__n_estimators': (500,1000),
}

pipeline_ada = Pipeline([
    ('bow', TfidfVectorizer(stop_words = 'english')) ,
    ('ada', AdaBoostClassifier())
])

grid_ada = GridSearchCV(pipeline_ada, parameters_ada, cv = 3)
grid_ada.fit(X_train, y_train)

results_ada = pd.DataFrame.from_dict(grid_ada.cv_results_)
print(results_ada)


   mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
0       4.086084      0.071377         0.289548        0.000478   
1       8.290173      0.375760         0.556179        0.005446   

  param_ada__n_estimators                       params  split0_test_score  \
0                     500   {'ada__n_estimators': 500}           0.706911   
1                    1000  {'ada__n_estimators': 1000}           0.699536   

   split1_test_score  split2_test_score  mean_test_score  std_test_score  \
0           0.702540           0.691088         0.700182        0.006671   
1           0.699262           0.703390         0.700729        0.001884   

   rank_test_score  split0_train_score  split1_train_score  \
0                2            0.779751            0.782347   
1                1            0.838639            0.841372   

   split2_train_score  mean_train_score  std_train_score  
0            0.777110          0.779736         0.002138  
1            0.834062          0

#### Evaluate best NB model on test data

In [79]:
best_pipeline_ada = Pipeline([
    ('bow',  CountVectorizer(max_features=5000, min_df=5, max_df=0.9, stop_words='english') ),
    ('ada', AdaBoostClassifier(n_estimators = 500))
])

best_pipeline_ada.fit(X_train, y_train)
best_ada_preds = best_pipeline_ada.predict(X_test)

cr_best_ada = classification_report(y_test,best_ada_preds,output_dict=True)
pd.DataFrame(cr_best_ada).transpose()

Unnamed: 0,f1-score,precision,recall,support
0,0.540506,0.528465,0.553109,772.0
1,0.628311,0.678501,0.585034,588.0
2,0.833154,0.82516,0.841304,2300.0
micro avg,0.739344,0.739344,0.739344,3660.0
macro avg,0.667324,0.677375,0.659816,3660.0
weighted avg,0.738517,0.739017,0.739344,3660.0


### Evaluating the pre-trained model "Vader"

In [47]:
analyser = SentimentIntensityAnalyzer()
vader_preds_raw = train['text'].map(lambda x: analyser.polarity_scores(x)['compound'])
vader_preds = vader_preds_raw.map(lambda x: to_label(x)).tolist()
labels = train['airline_sentiment'].tolist()


In [49]:
accuracy(labels, vader_preds)

0.5338114754098361

In [86]:
cr_vader = classification_report(labels,vader_preds,output_dict=True)
pd.DataFrame(cr_vader).transpose()

Unnamed: 0,f1-score,precision,recall,support
0,0.402649,0.385205,0.421749,3099.0
1,0.472926,0.325965,0.861193,2363.0
2,0.6308,0.893885,0.487361,9178.0
micro avg,0.533811,0.533811,0.533811,14640.0
macro avg,0.502125,0.535018,0.590101,14640.0
weighted avg,0.557023,0.694541,0.533811,14640.0
