In [159]:
# references
# https://networking.ringofsaturn.com/Web/removetags.php
# https://towardsdatascience.com/5-simple-ways-to-tokenize-text-in-python-92c6804edfc4
# https://www.geeksforgeeks.org/removing-stop-words-nltk-python/
# https://www.geeksforgeeks.org/python-lemmatization-with-nltk/
# https://stackoverflow.com/questions/16476924/how-to-iterate-over-rows-in-a-dataframe-in-pandas
# https://machinelearningmastery.com/train-test-split-for-evaluating-machine-learning-algorithms/
# https://en.wikipedia.org/wiki/Precision_and_recall
# https://towardsdatascience.com/multi-class-metrics-made-simple-part-i-precision-and-recall-9250280bddc2
# https://androidkt.com/micro-macro-averages-for-imbalance-multiclass-classification/
# https://towardsdatascience.com/multi-class-metrics-made-simple-part-ii-the-f1-score-ebe8b2c2ca1.

# Intro
In the Name of Allah

Sentiment analysis is a technique through which you can analyze a piece of text to determine the sentiment behind it. In this notebook, we're going to train a Naïve Bayes Classifier for the task of sentiment analysis on Imdb movie reviews dataset.

**Please pay attention to these notes:**

<br/>

- **Assignment Due:** 1400/09/19 23:59
- Write your code in the cells denoted by:
```
######## Your Code Here ########
```
- You can add more cells if necessary
- Finding any sort of copying will zero down your grade.
- When your solution is ready to submit, don't forget to set the name of this notebook like  "Name_StudentID.ipynb".
- If you have any questions about this assignment, feel free to drop us a line. You can also ask your questions on the telegram group.
- You must run this notebook on Google Colab platform.

<br/>



# Libraries

In [2]:
# importing the libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import numpy as np
import string
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
import string
from nltk.stem import WordNetLemmatizer
import collections
from collections import Counter
from sklearn.model_selection import train_test_split as tts

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Tohid\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Tohid\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Tohid\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# !wget https://raw.githubusercontent.com/Ankit152/IMDB-sentiment-analysis/master/IMDB-Dataset.csv
# I ran code on my laptop with Windows which does not have wget (can not easily used) so I commented this cell and used curl in git-bash to download it.
# !curl https://raw.githubusercontent.com/Ankit152/IMDB-sentiment-analysis/master/IMDB-Dataset.csv > IMDB-Dataset.csv

# Load data

In [4]:
imdb = pd.read_csv("IMDB-Dataset.csv")

In [5]:
imdb.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


# Preprocess
The first step of NLP is text preprocessing. Data cleaning is a very crucial step in any machine learning model, but more so for NLP. Without the cleaning process, the dataset is often a cluster of words that the computer doesn’t understand. Raw data over a properly or improperly formed sentence is not always desirable as it contains lot of unwanted components like null/html/links/url/emoji/stopwords etc. So in this step, this unwanted components are removed for better performance and accuracy.

In [6]:
# step 1: html tags removing
def html_tag_remover(text):
    text = re.sub('<[a-zA-Z\/][^>]*>', '', text)
    return text


# step 2: punctuations removing
def punctuations_remover(text):
    text = re.sub('[^\w\s\d]', '', text)
    return text


# step 3: lowercasing
def lowercaser(text):
    text = text.lower()
    return text



# step 4: tokenizing
def tokenizer(text):
    tokens = nltk.tokenize.word_tokenize(text)
    return tokens


# step 5: stopwords removing
def stopwords_remover(tokens):
    stop_words = nltk.corpus.stopwords.words('english')
    stop_words_removed_list = []
    for token in tokens:
        if token not in stop_words:
            stop_words_removed_list.append(token)
            
    return stop_words_removed_list


# step 6: lemmatizing
def lemmatizer(tokens):
    lemmatized_tokens = []
    for token in tokens:
        lemmatized_tokens.append(WordNetLemmatizer().lemmatize(token))
    
    return lemmatized_tokens

In [11]:
# Put it all together
def text_preprocessor(text):
    html_tag_removed = html_tag_remover(text)
    punctuations_removed = punctuations_remover(html_tag_removed)
    lowercased = lowercaser(punctuations_removed)
    tokens = tokenizer(lowercased)
    stopwords_removed_tokens = stopwords_remover(tokens)
    lemmatized_stopwords_removed_tokens = lemmatizer(stopwords_removed_tokens)

    return list(set(lemmatized_stopwords_removed_tokens))

In [12]:
# Test

# test_sample = imdb.iloc[1]["review"]
# html_tag_remover(test_sample)
# punctuations_remover(test_sample)
# lowercaser(test_sample)
# tokens = tokenizer(test_sample)
# tokens = stopwords_remover(tokens)
# lemmatizer(tokens)
# test_sample

# lemmatizer(stopwords_remover(tokenizer(lowercaser(punctuations_remover(html_tag_remover(test_sample))))))

# text_preprocessor(test_sample)

In [13]:
# Run on dataset and make a list of preprocessed tokens and labels
def dataset_preprocess(df):
    dataset_preprocessed_list = []
    labels = []
    for index, row in df.iterrows():
        if index%10000==0:
            print(index)
        dataset_preprocessed_list.append(text_preprocessor(row["review"]))
        labels.append(row["sentiment"])
    return dataset_preprocessed_list, labels

def labels_encoder(labels):
    encoded_labels = []
    for i in labels:
        if i == "positive":
            encoded_labels.append(1)
        elif i == "negative":
            encoded_labels.append(0)
            
    return encoded_labels

In [14]:
dataset, labels = dataset_preprocess(imdb)

0
10000
20000
30000
40000


In [15]:
encoded_labels = labels_encoder(labels)

In [16]:
print(len(encoded_labels))
print(dataset[0])

50000
['side', 'street', 'em', 'scene', 'reviewer', 'mainly', 'get', 'pretty', 'audience', 'wholl', 'mentioned', 'prison', 'experimental', 'inwards', 'section', 'middle', 'timid', 'classic', 'sold', 'first', 'viewingthats', 'called', 'punch', 'home', 'dealing', 'become', 'use', 'got', 'show', 'awayi', 'pull', 'manyaryans', 'faint', 'hooked', 'taste', 'developed', 'never', 'nickname', 'due', 'nasty', 'say', 'front', 'hearted', 'muslim', 'may', 'glass', 'surreal', 'mess', 'touch', 'uncomfortable', 'face', 'away', 'happened', 'painted', 'romanceoz', 'episode', 'death', 'agenda', 'charm', 'one', 'experience', 'youll', 'regard', 'set', 'comfortable', 'mannered', 'skill', 'crooked', 'graphic', 'inmate', 'watching', 'hardcore', 'go', 'security', 'doesnt', 'wouldnt', 'city', 'turned', 'class', 'penitentary', 'latino', 'emerald', 'dodgy', 'sex', 'fact', 'dare', 'italian', 'moreso', 'guard', 'bitch', 'exactly', 'brutality', 'drug', 'wordit', 'forget', 'unflinching', 'injustice', 'word', 'gangsta

<font size="5">Split the dataset</font>

Data splitting, or commonly known as train-test split, is the partitioning of data into subsets for model training and evaluation separately. Since the test set is not specified beforehand, we have to split the dataset into train and test set in an ideal proportion. 


In [17]:
X_train, X_test, y_train, y_test = tts(dataset, encoded_labels, test_size=0.2)

In [18]:
print(len(X_train))
print(len(X_test))
print(len(y_train))
print(len(y_test))

40000
10000
40000
10000


# Training
Use Naive Beyes algorithm to train a Language Model

In [132]:
class NB_classifier:
    def __init__(self):
        pass

         
    # compute classes prior probability
    def compute_classes_probability(self):
        classes_probability = np.zeros(self.no_classes)
        for i in range(len(classes_probability)):
            classes_probability[i] = self.train_labels.count(i) / self.train_len
            
        return classes_probability
            
        
    # compute dictionaries of words occurrence for each class 
    def compute_dictionaries(self):
        all_classes_dictionaries = [{} for i in range(self.no_classes)]
        for row in range(len(self.train_set)):
            for token in self.train_set[row]:
                if token in all_classes_dictionaries[self.train_labels[row]]:
                    all_classes_dictionaries[self.train_labels[row]][token]+=1
                else:
                    all_classes_dictionaries[self.train_labels[row]][token]=1
            
        return all_classes_dictionaries
               
    # computer vocab size
    def computer_vocab_size(self):
        all_words = list()
        for i in range(self.no_classes):
            all_words+=list(set(list(self.all_classes_dictionaries[i].keys())))
        
        vocab_size = len(set(all_words))
        
        return vocab_size
    
    # count all words in each class
    def count_classes_words(self):
        classes_words_count = np.zeros(self.no_classes)
        for i in range(self.no_classes):
            classes_words_count[i]= sum(self.all_classes_dictionaries[i].values())
            
        return classes_words_count
    
    
    # token probablity calculator with - log probability is used to prevent underflow
    def token_probability_calculator(self, token, class_no):
        if token in self.all_classes_dictionaries[class_no]:
            count = self.all_classes_dictionaries[class_no][token]
        else:
            count = 0 
        probability = (count + 1) / (self.vocab_size + self.classes_words_count[class_no])
        
        return np.log(probability)
        
        
       
    # train - put it all together
    def train(self, train_set, train_labels):
        print("Training: computing probabilities and required info")
        self.train_set = train_set
        self.train_labels = train_labels
        self.train_len = len(self.train_set)
        self.no_classes = len(set(self.train_labels))
        self.classes_probability = self.compute_classes_probability()
        
        self.all_classes_dictionaries = self.compute_dictionaries()
        self.vocab_size = self.computer_vocab_size()
        self.classes_words_count = self.count_classes_words()
        

    # compute probability for each class and return the max one as  the predicted one
    # mode==0: input is tokenized, mode==1: input is raw text
    # log==True: print the probabilities
    def predict_a_sentence_probability(self, sentence, mode=0, log=False):
        probabilities = np.zeros(self.no_classes)
        if mode==0:
            tokens = sentence
        elif mode==1:
            tokens = text_preprocessor(sentence)
            
        for i in range(self.no_classes):
            partial_probabilities = np.zeros(len(tokens)+1)
            for j in range(len(tokens)):
                partial_probabilities[j] = self.token_probability_calculator(tokens[j], i)
                
            partial_probabilities[-1] = np.log(self.classes_probability[i])
            probabilities[i] = np.sum(partial_probabilities)
            
        if log:
            print(probabilities)
        
        return np.argmax(probabilities)
    
    # test on all samples of a dataset 
    # mode==0: input is tokenized, mode==1: input is raw text
    # log==True: print the probabilities
    def evaluate_on_a_dataset(self, test_set, mode=0, log=False):
        predicted_classes = np.zeros(len(test_set))
        for i in range(len(test_set)):
            predict = self.predict_a_sentence_probability(test_set[i], mode=mode, log=log)
            predicted_classes[i] = predict
        return predicted_classes

In [133]:
NB = NB_classifier()
NB.train(X_train, y_train)

Training: computing probabilities and required info


In [134]:
print("NB.no_classes: ",NB.no_classes)
print("NB.classes_probability: ",NB.classes_probability)
# print(NB.all_classes_dictionaries[0])
print("NB.vocab_size: ",NB.vocab_size)
print("NB.classes_words_count: ",NB.classes_words_count)

NB.no_classes:  2
NB.classes_probability:  [0.500975 0.499025]
NB.vocab_size:  181839
NB.classes_words_count:  [1949003. 1966939.]


# Test
Now you need to run inference on your test set

In [135]:
# one sample
print(NB.predict_a_sentence_probability(X_test[999],log=True))
print("-------------------------------------")
# all dataset
y_hat_test = NB.evaluate_on_a_dataset(X_test, mode=0, log=False)

[-1477.70301043 -1448.72599749]
1
-------------------------------------


In [136]:
print(len(y_hat_test))
print(y_hat_test[:20])
# 1 is positive - 0 is negative

10000
[1. 1. 0. 0. 0. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1.]


# Evaluation
After training is finished, we need some metrics to evaluate the trained model on the test set. Here, you need to write code for utilizing the metrics bellow without the sklearn libraries!

Precision

In [221]:
# note that the imports from sklearn are only for showing that the results are correct!
from sklearn import metrics
print("sklearn-acc      :",metrics.accuracy_score(y_test, y_hat_test))
print("sklearn-presicion:",metrics.precision_score(y_test, y_hat_test))
print("sklearn-recall   :",metrics.recall_score(y_test, y_hat_test))
print("sklearn-f1       :",metrics.f1_score(y_test, y_hat_test))
print("sklearn-confusion matric:\n",metrics.confusion_matrix(y_test, y_hat_test))


# calculate confusion matrix by comparing the predicted results and real values
def confusion_matrix_calculator(y_test, y_pred, no_classes=2):
    assert len(y_test)==len(y_pred), "Lengths are not equal!"
    if no_classes==None:
        no_classes = len(set(y_test))

    confusion_matrix = np.zeros((no_classes,no_classes))
    
    for i in range(len(y_test)):
        confusion_matrix[int(y_test[i]), int(y_pred[i])] +=1
            
    return confusion_matrix


def precision_calculator(confusion_matrix):
    # precision = tp/(tp+fp)
    tp = confusion_matrix[1,1]
    fp = confusion_matrix[0,1]
    precision = tp/(tp+fp)
    return precision

sklearn-acc      : 0.8537
sklearn-presicion: 0.8726552730304293
sklearn-recall   : 0.8309188331018059
sklearn-f1       : 0.8512757954660976
sklearn-confusion matric:
 [[4350  611]
 [ 852 4187]]


In [222]:
confusion_matrix = confusion_matrix_calculator(y_test, y_hat_test, no_classes=2)
precision = precision_calculator(confusion_matrix)
print("precision: ", precision)

precision:  0.8726552730304293


Recall

In [223]:
def recall_calculator(confusion_matrix):
    # recall = tp/(tp+fn)
    tp = confusion_matrix[1,1]
    fn = confusion_matrix[1,0]
    recall = tp/(tp+fn)
    return recall

recall = recall_calculator(confusion_matrix)
print("recall: ", recall)

recall:  0.8309188331018059


F-measure

In [224]:
def f1_measure_calculator(confusion_matrix):
    # f1 = 2 * (precision*recall) / (precision+recall)
    precision = precision_calculator(confusion_matrix)    
    recall    = recall_calculator(confusion_matrix)
    f1 = 2 * (precision*recall) / (precision+recall)
    return f1

f1_score = f1_measure_calculator(confusion_matrix)
print("f1_score: ", f1_score)

f1_score:  0.8512757954660976


Confustion matrix

In [225]:
confusion_matrix

array([[4350.,  611.],
       [ 852., 4187.]])