# Sentiment Analysis with Naive Bayes
We apply Naive Bayes on the Twitter Sentiment Analysis data. To ease the problem, we will filter the dataset to include only positive and negative tweets. 

# Prepare data

In [1]:
import numpy as np
import pandas as pd

In [2]:
train_data_path = "datasets/twitter_sentiment_analysis/twitter_training.csv"
train_data = pd.read_csv(train_data_path,header=None)
train_data.columns = ["Tweet_ID","entity","sentiment","Tweet_content"]

test_data_path = "datasets/twitter_sentiment_analysis/twitter_validation.csv"
test_data = pd.read_csv(test_data_path,header=None)
test_data.columns = ["Tweet_ID","entity","sentiment","Tweet_content"]

In [3]:
## Inlcude Only "Positive" and "Negatvie" twitts to form a binary classification problem
## Label Positve as 1 and Negative as 0
train_data = train_data[train_data.sentiment.isin(["Positive","Negative"])]
train_data["label"] = train_data.sentiment.map({"Positive":1, "Negative":0})
test_data = test_data[test_data.sentiment.isin(["Positive","Negative"])]
test_data["label"] = test_data.sentiment.map({"Positive":1, "Negative":0})

# Build Naive Bayes classifier

In [5]:
import spacy
import re

In [6]:
class TwitterNBClassifier:
    def __init__(self,nlp):
        self.nlp = nlp
        self.loglikelihood={}
        self.logprior = 0
    def process_tweet_spacy(self, tweet, lemmetize=True):
        # remove old sytle retweet text "RT"
        tweet = str(tweet)
        tweet2 = re.sub(r'^RT[\s]+','', tweet)
        # remove hyperlinks
        tweet2 = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet2)
        # remove hashtags
        # only removing the hash # sign from the word
        tweet2 = re.sub(r'#', '', tweet2)

        doc = self.nlp(tweet2)
        # remove stopworks and punctuation
        if lemmetize:
            return [token.lemma_.lower() for token in doc if (not token.is_stop) and (not token.is_punct) ]
        else:
            return [token.text.lower() for token in doc if (not token.is_stop) and (not token.is_punct) ]

    def freq_counts(self, X, y):
        freqs ={}
        vocab = set()
        neg_pos_count =[0, 0]
        for tweet, label in zip(X,y):
            tokenized_tweet = self.process_tweet_spacy(tweet)
            for token in tokenized_tweet:
                vocab.add(token)
                neg_pos_count[label] += 1
                pair = (label, token)
                if pair in freqs:
                    freqs[pair] += 1
                else:
                    freqs[pair] = 1    
        return freqs, vocab, neg_pos_count
    
    def fit(self, X,y):
        freqs, vocab, neg_pos_count = self.freq_counts(X,y)
        V = len(vocab)
        for word in vocab:
            p_pos = (freqs.get((1, word),0)+1)/(neg_pos_count[1]+V)
            p_neg = (freqs.get((0, word),0)+1)/(neg_pos_count[0]+V)
            self.loglikelihood[word] = np.log(p_pos)-np.log(p_neg)
        self.logprior = np.log(np.sum(y)) - np.log(np.sum(1-y))
    
    def predict(self, X):
        score = np.ones(len(X))*self.logprior
        for i, tweet in enumerate(X):
            tokenized_tweet = self.process_tweet_spacy(tweet)
            for token in tokenized_tweet:
                score[i] += self.loglikelihood.get(token, 0)
        return np.where(score>0, 1, 0)


In [7]:
nlp = spacy.load("en_core_web_sm")
mynb = TwitterNBClassifier(nlp)

In [8]:
mynb.fit(train_data.Tweet_content, train_data.label)

In [15]:
ypred = mynb.predict(test_data.Tweet_content)

# Test data Evaluation 

In [17]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

In [18]:
ytest = test_data.label.values

In [19]:
print(f"Accuracy over test data is {accuracy_score(ytest, ypred)}")
print(f"Precision over test data is {precision_score(ytest, ypred)}")
print(f"Recall over test data is {recall_score(ytest, ypred)}")
print(f"F1 score over test data is {f1_score(ytest, ypred)}")

Accuracy over test data is 0.9134438305709024
Precision over test data is 0.9259259259259259
Recall over test data is 0.9025270758122743
F1 score over test data is 0.9140767824497257


We achieve an accuracy score of >91% which is pretty good. Looks like for this dataset, the Naive Bayes classifier perfomrs much better than logistic regression.