## Simple Naive Bayes

In [None]:
# Importing libraries
import numpy as np 
import pandas as pd 
from numpy import array
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, classification_report 
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Reading the dataset
data = pd.read_csv('/home/alaa/Desktop/Data_Analytics/Tweets.csv')

---
### Preprocessing

In [3]:
# Data cleaning and preprocessing
# Removing mentions starting with '@'
def remove_mentions(input_tweet):
    for i in range(len(input_tweet)):
        input_tweet[i] = re.sub(r'@\w+', '', input_tweet[i])
    return input_tweet

# Removing hyperlinks starting with 'http'
def remove_links(input_tweet):
    for i in range(len(input_tweet)):
        input_tweet[i] = re.sub(r'http\S+', '',input_tweet[i])
    return input_tweet

# Removing stopwords available in stopwords library
def remove_stopwords(input_tweet):
    list_of_stopwords = stopwords.words('english')
    # Keeping meaningful stopwords
    important_stopwords = ["not", "no"]
    words = input_tweet.split()
    clean_stopwords = [word for word in words 
                       if (word not in list_of_stopwords or word in important_stopwords) and len(word) > 1]
    return " ".join(clean_stopwords)

# Removing all punctuation symbols
def remove_punctuation(input_tweet):
    for i in range(len(input_tweet)):
        input_tweet[i] = re.sub(r'[^\w\s]','',input_tweet[i])
    return input_tweet

# Converting text to lowercase
def lower_case(input_tweet):
    for i in range(len(input_tweet)):
        input_tweet[i] = input_tweet[i].lower()
    return input_tweet

In [4]:
# Pre-processing steps
data_new = data[['text', 'airline_sentiment']]
preprocessed_data = data_new.apply(remove_mentions).apply(remove_links).apply(remove_punctuation).apply(lower_case)
cleaned_tweets = []
for tweets in preprocessed_data.text:
    clean_stopwords = remove_stopwords(tweets)
    cleaned_tweets.append(clean_stopwords)

X = cleaned_tweets
Y = preprocessed_data['airline_sentiment']

# Mapping for Label Encoding
Y = Y.map({'negative':0, 'positive':1, 'neutral':2}).astype(int)

---
### Performing Naive Bayes

In [5]:
# Label Encoding for features
values = array(X)
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(values)
onehot_encoder = OneHotEncoder(sparse=False)
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
onehot_encoded = onehot_encoder.fit_transform(integer_encoded)

In [6]:
# Splitting dataset into train, test and validation sets
X_train, X_test, y_train, y_test = train_test_split(onehot_encoded, Y, test_size=0.15, random_state=20)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.18, random_state=20)

In [7]:
# Function to print results
def results(labels, pred):
    print(confusion_matrix(labels,pred))  
    print(classification_report(labels,pred))  
    print(accuracy_score(labels, pred))

In [8]:
# Initializing Multinomial Naive Bayes
clf = MultinomialNB()
clf.fit(X_train, y_train)

# Testing accuracy on Validation data
print ("Validation Accuracy of Naiive Bayes: %s" % ( accuracy_score(y_val, clf.predict(X_val))))

Validation Accuracy of Naiive Bayes: 0.6441964285714286


In [9]:
# Testing accuracy on test data and printing results
print ("Test Accuracy of Naiive Bayes: %s" % ( accuracy_score(y_test, clf.predict(X_test))))
results(y_test, clf.predict(X_test))

Test Accuracy of Naiive Bayes: 0.651183970856102
[[1394    0    0]
 [ 334   27    1]
 [ 430    1    9]]
              precision    recall  f1-score   support

           0       0.65      1.00      0.78      1394
           1       0.96      0.07      0.14       362
           2       0.90      0.02      0.04       440

    accuracy                           0.65      2196
   macro avg       0.84      0.37      0.32      2196
weighted avg       0.75      0.65      0.53      2196

0.651183970856102
