<a href="https://colab.research.google.com/github/victoriaporter58/Airline-Sentiment-Analysis-using-Machine-Learning/blob/main/Naive_Bayes_Classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Naive Bayes Model
Our baseline model: the one the later models have to beat.

##Import Drive

In [None]:
# Mount Google Drive inside the Colab runtime at /content/gdrive.
# The dataset path used later in this notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


##Import libraries

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from collections import Counter
import pandas as pd
import numpy as np

##Read in cleaned tweet data

For our baseline model we are only going to consider the airline sentiment and the tweet text.

In [None]:
# Alternative input (unzipped original archive), kept for reference:
#   '/content/gdrive/My Drive/project_folder/Tweets.zip (Unzipped Files)/Cleaned.csv'
tweets_data_path = '/content/gdrive/My Drive/project_folder/Copy of CleanedSentimentAnalysis.csv'

# The first row of the CSV holds the column names.
tweets = pd.read_csv(tweets_data_path, header=0)

# The baseline only needs the label and the tweet text; take an independent
# copy so later edits never touch the frame that was read from disk.
data_frame = tweets[['airline_sentiment', 'text']].copy()

##Separate tweets by sentiment
We know that the total number of tweets is 14,641, so we use that figure as the maximum number of rows to keep per class. We now isolate the tweets by sentiment (positive, negative or neutral) and then recombine them into a single frame.

In [None]:
# Upper bound on the number of rows kept for each sentiment class.
num_tweet = 14641

# One boolean mask per sentiment label.
is_positive = data_frame['airline_sentiment'] == 'positive'
is_negative = data_frame['airline_sentiment'] == 'negative'
is_neutral = data_frame['airline_sentiment'] == 'neutral'

# Independent per-class frames, each truncated to at most num_tweet rows.
data_frame_pos = data_frame.loc[is_positive].head(num_tweet).copy()
data_frame_neg = data_frame.loc[is_negative].head(num_tweet).copy()
data_frame_neu = data_frame.loc[is_neutral].head(num_tweet).copy()

# Stack the classes back together (positive, negative, neutral) with a fresh
# 0..n-1 index. Rows whose label is none of the three are not carried over.
data_frame = pd.concat(
    [data_frame_pos, data_frame_neg, data_frame_neu],
    ignore_index=True,
)

##Split into train/test data
The test dataset makes up 10% of the total dataset.

* x_train, x_test: tweet text
* y_train, y_test: tweet sentiment

In [None]:
# Hold out 10% of the tweets for evaluation; the fixed random_state makes the
# shuffle (and therefore the split) repeatable between runs.
x_train, x_test, y_train, y_test = train_test_split(
    data_frame['text'],
    data_frame['airline_sentiment'],
    test_size=0.1,
    random_state=0,
)

# Re-pair each text with its label and renumber the rows from 0 within each
# partition so positional and label-based indexing agree.
data_frame_train = pd.DataFrame(
    {'text': x_train, 'airline_sentiment': y_train}
).reset_index(drop=True)

data_frame_test = pd.DataFrame(
    {'text': x_test, 'airline_sentiment': y_test}
).reset_index(drop=True)

##Define class
This class contains functions that allow us to fit the NB classifier on the training data and predict the sentiment.

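* **Fit function**: This function computes the Naive Bayes statistics from the training data: the prior probability of each class, the words seen in each class, the per-class word totals and the vocabulary size.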

* **Predict function**: This function allows us to use the fit data to predict the classes of our test data.


In [None]:
class Classifier(object):
    """Three-class multinomial Naive Bayes sentiment classifier.

    Tweets are tokenised by whitespace. Word likelihoods use add-one
    smoothing: (count of word in class + 1) / (words in class + vocabulary
    size). Scores are accumulated in log space and the class with the highest
    score wins; ties go to the first of positive, negative, neutral.

    The training frame must provide a string column ``text`` and a column
    ``airline_sentiment`` whose values are 'positive', 'negative' or
    'neutral'.
    """

    def __init__(self, data_frame_train):
        """Store the training frame and split it into one frame per class.

        Args:
            data_frame_train: DataFrame with 'text' and 'airline_sentiment'
                columns.
        """
        self.data_frame_train = data_frame_train
        self.data_frame_pos = data_frame_train.copy()[data_frame_train.airline_sentiment == 'positive']
        self.data_frame_neg = data_frame_train.copy()[data_frame_train.airline_sentiment == 'negative']
        self.data_frame_neu = data_frame_train.copy()[data_frame_train.airline_sentiment == 'neutral']

    def fit(self):
        """Compute priors, per-class word statistics and vocabulary size.

        Sets ``Prior``, ``pos_words``/``neg_words``/``neu_words``, ``vocab``
        and ``word_count`` (all ordered positive, negative, neutral).

        Returns:
            The classifier itself, so calls can be chained.
        """
        class_frames = (self.data_frame_pos, self.data_frame_neg, self.data_frame_neu)

        # Priors must come from the training subsets held on this instance.
        # Reading same-named notebook globals instead would mix in test rows
        # and produce "probabilities" that do not sum to one.
        n_train = self.data_frame_train.shape[0]
        self.Prior = tuple(frame.shape[0] / n_train for frame in class_frames)

        # Flat token list per class (kept as public attributes).
        self.pos_words, self.neg_words, self.neu_words = (
            ' '.join(frame['text'].tolist()).split() for frame in class_frames
        )

        # Frequency tables built once here so predict() does a dictionary
        # lookup per token instead of rescanning the whole token list.
        self._word_freqs = (
            Counter(self.pos_words),
            Counter(self.neg_words),
            Counter(self.neu_words),
        )

        # Vocabulary size = number of distinct tokens in the training text.
        all_words = ' '.join(self.data_frame_train['text'].tolist()).split()
        self.vocab = len(set(all_words))

        # Total number of tokens observed in each class.
        self.word_count = (len(self.pos_words), len(self.neg_words), len(self.neu_words))
        return self

    def predict(self, data_frame_test):
        """Predict a sentiment label for every row of ``data_frame_test``.

        Args:
            data_frame_test: DataFrame with a string 'text' column.

        Returns:
            A list of labels ('positive', 'negative' or 'neutral'), one per
            row, in row order.

        Raises:
            AttributeError: if fit() has not been called first.
        """
        class_choice = ['positive', 'negative', 'neutral']

        # Loop-invariant pieces of the score, hoisted out of the tweet loop.
        log_priors = np.log(self.Prior)
        denominators = [total + self.vocab for total in self.word_count]

        classification = []
        for tweet in data_frame_test['text']:
            tokens = tweet.split()

            scores = []
            for log_prior, freqs, denom in zip(log_priors, self._word_freqs, denominators):
                # Counter returns 0 for unseen words, so the +1 smoothing
                # keeps every likelihood strictly positive.
                log_likelihoods = [np.log((freqs[token] + 1) / denom) for token in tokens]
                # np.sum of an empty list is 0.0, so an empty tweet is
                # classified by the priors alone.
                scores.append(log_prior + np.sum(log_likelihoods))

            classification.append(class_choice[np.argmax(scores)])
        return classification

##Initialise the classifier

In [None]:
# Build the classifier around the training frame; the constructor also keeps
# one sub-frame per sentiment class for use by fit().
classifier = Classifier(data_frame_train)

##Train the classifier

In [None]:
# fit() returns the classifier itself, so rebinding the name keeps the same
# object and re-running this cell is harmless.
classifier = classifier.fit()

##Make predictions

In [None]:
# List of predicted labels, one per row of the test frame, in row order.
predict = classifier.predict(data_frame_test)

##Analyse model performance
Visualise how well the model performed in detail using:

* Accuracy
* Classification Report
* Confusion Matrix

In [None]:
# Ground-truth labels, extracted once and shared by all three metrics.
y_true = data_frame_test.airline_sentiment.tolist()

overall_accuracy = accuracy_score(y_true, predict)
print("Accuracy: ", overall_accuracy, "\n")

# Per-class precision / recall / F1 and support.
print("Classification Report:")
print(classification_report(y_true, predict), "\n")

# Counts of true label versus predicted label.
print("Confusion Matrix:")
print(confusion_matrix(y_true, predict))

Accuracy:  0.7668285912560722 

Classification Report:
              precision    recall  f1-score   support

    negative       0.79      0.93      0.85       889
     neutral       0.70      0.46      0.55       311
    positive       0.72      0.58      0.64       241

    accuracy                           0.77      1441
   macro avg       0.74      0.65      0.68      1441
weighted avg       0.76      0.77      0.75      1441
 

Confusion Matrix:
[[824  41  24]
 [138 142  31]
 [ 83  19 139]]
