In [1]:
import re
import string
from time import time 
import math as m

import torch
import torch.nn as nn
from torchsummary import summary

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer


In [2]:
nltk.download('stopwords')
punct = string.punctuation
stopwords_english = stopwords.words('english')
stemmer = PorterStemmer()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ayush\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
dataset = pd.read_csv('imdb_complete.csv')
dataset.head(10)

Unnamed: 0,text,polarity
0,"first think another Disney movie, might good, ...",1
1,"Put aside Dr. House repeat missed, Desperate H...",0
2,"big fan Stephen King's work, film made even gr...",1
3,watched horrid thing TV. Needless say one movi...,0
4,truly enjoyed film. acting terrific plot. Jeff...,1
5,"memory ""The Last Hunt"" stuck since saw 1956 13...",1
6,"Shakespeare fan, appreciate Ken Branagh done b...",0
7,privilege watching Scarface big screen beautif...,1
8,real classic. shipload sailors trying get town...,1
9,Serials short subjects originally shown theate...,1


## TEXT PREPROCESSING

In [4]:
def getkey(dict_, key):
    if key in dict_.keys():
        return dict_[key]
    else:
        return 0

In [5]:
def preprocessing(string, stopwords, stemmer):
    '''We can do all the preprocessing in just one step by creating a pipeline
    First, we have to make all the words in lowercase,
    then we have to tokenize the string,
    then we have to remove stopwords and 
    finally we have to stem all the words.
    This is how it will be ready to be analyzed further'''
    string  = string.lower()
    tokens = re.split('\s|(?<!\d)[,.](?!\d)', string)
    clean_tokens = []
    for word in tokens:
        if word not in stopwords:
            clean_tokens.append(word)
    
    stemmed_words = []
    for word in clean_tokens:
        stemmed_words.append(stemmer.stem(word))
    
    preprocessed_array = []
    for word in stemmed_words:
        if word!='':
            preprocessed_array.append(word)
            
    return preprocessed_array
#########################################
dataset['preprocessed'] = dataset['text'].apply(lambda x:preprocessing(x, stopwords_english, stemmer))    
dataset.head(10)

Unnamed: 0,text,polarity,preprocessed
0,"first think another Disney movie, might good, ...",1,"[first, think, anoth, disney, movi, might, goo..."
1,"Put aside Dr. House repeat missed, Desperate H...",0,"[put, asid, dr, hous, repeat, miss, desper, ho..."
2,"big fan Stephen King's work, film made even gr...",1,"[big, fan, stephen, king', work, film, made, e..."
3,watched horrid thing TV. Needless say one movi...,0,"[watch, horrid, thing, tv, needless, say, one,..."
4,truly enjoyed film. acting terrific plot. Jeff...,1,"[truli, enjoy, film, act, terrif, plot, jeff, ..."
5,"memory ""The Last Hunt"" stuck since saw 1956 13...",1,"[memori, ""the, last, hunt"", stuck, sinc, saw, ..."
6,"Shakespeare fan, appreciate Ken Branagh done b...",0,"[shakespear, fan, appreci, ken, branagh, done,..."
7,privilege watching Scarface big screen beautif...,1,"[privileg, watch, scarfac, big, screen, beauti..."
8,real classic. shipload sailors trying get town...,1,"[real, classic, shipload, sailor, tri, get, to..."
9,Serials short subjects originally shown theate...,1,"[serial, short, subject, origin, shown, theate..."


## BUILDING THE VOCABULARY 

In [6]:
def build_frequency(processed_strings, polarities):
    vocab = {}
    for string, polarity in zip(processed_strings, polarities):
        for word in string:
            pair = (word, polarity)
            if pair in vocab:
                vocab[pair]+=1
            else:
                vocab[pair] = 1
    return vocab
#####################################
vocab = build_frequency(dataset['preprocessed'], dataset['polarity'])

## FORMING THE PROBABILITY DICTIONARY 

In [7]:
# now building up the probability dictionary
'''
# example for testing 
# first testing it on a small dataset and verifying it
sample_tweets = ["I am happy because i am learning NLP", "I am happy, not sad"
         ,"I am sad, i am learning NLP", "I am sad, not happy"]
sample_labels = [1,1,0,0]
sample_tweets = [tweet.lower() for tweet in sample_tweets]
sample_tweets = [re.split('\s|(?<!\d)[,.](?!\d)', tweet) for tweet in sample_tweets]
sample_processed_tweets = []
for i in range(len(sample_tweets)):
    temp = []
    for word in sample_tweets[i]:
        if word!='':
            temp.append(stemmer.stem(word))
    sample_processed_tweets.append(temp)
sample_vocab = build_frequency(sample_processed_tweets, sample_labels)
'''
def log_likelihood(sample_vocab):
    num_unique_words = 0
    num_positive_occurences = 0
    num_negative_occurences = 0
    #####
    unique_words = set()
    for key in sample_vocab.keys():
        unique_words.add(key[0])
        if key[1]==0:
            num_negative_occurences+= sample_vocab[key]
        else:
            num_positive_occurences+= sample_vocab[key]
    num_unique_words = len(unique_words)

    # building the dictionary (with laplacian smoothing)
    sample_prob_dict_lap = {}
    for word in unique_words:
        sample_prob_dict_lap[(word, 0)] =  (1 + getkey(sample_vocab, (word, 0)))/(num_unique_words + num_negative_occurences)   
        sample_prob_dict_lap[(word, 1)] =  (1 + getkey(sample_vocab, (word, 1)))/(num_unique_words + num_positive_occurences)   

    likelihood_dict = {}
    for word in unique_words:
        likelihood_dict[word] = m.log(sample_prob_dict_lap[(word, 1)]/sample_prob_dict_lap[(word, 0)])
    
    return likelihood_dict
            
prob_dict = log_likelihood(vocab)

In [8]:
def predict(prob_dict, processed_tweet, log_prior):
    # processed_tweet will be a list of stemmed tokens
    # positive log_value means a positive tweet and vice_versa
    log_value = log_prior # to overcome class imbalance
    for word in processed_tweet:
        log_value+= getkey(prob_dict, word) 
    return log_value

In [9]:
num_positive_tweets = sum(dataset['polarity']==1)
num_negative_tweets = sum(dataset['polarity']==0)
log_prior = m.log(num_positive_tweets/num_negative_tweets)

In [11]:
# sample_example
index = 5
tweet, label = dataset['preprocessed'][index], dataset['polarity'][index] 
print(f"tweet = {tweet}, \n\n\nlabel = {label}")
log_value = predict(prob_dict, tweet, log_prior)
print(f"log_value = {log_value}\n\n\n")

index = 1
tweet, label = dataset['preprocessed'][index], dataset['polarity'][index] 
print(f"tweet = {tweet}, \n\n\nlabel = {label}")
log_value = predict(prob_dict, tweet, log_prior)
print(f"log_value = {log_value}")

tweet = ['memori', '"the', 'last', 'hunt"', 'stuck', 'sinc', 'saw', '1956', '13.', 'movi', 'far', 'ahead', 'other', 'time', 'address', 'treatment', 'nativ', 'environ', 'ever', 'present', 'contrast', 'short', 'long', 'term', 'effect', 'greed', 'relev', 'today', '1956,', 'cinemagraph', 'discuss', 'utmost', 'depth', 'relev', 'top', 'set', 'beauti', 'cinematographi', 'excel', 'memori', 'movi', 'end', 'day'], 


label = 1
log_value = 14.376497091451666



tweet = ['put', 'asid', 'dr', 'hous', 'repeat', 'miss', 'desper', 'housew', '(new)', 'watch', 'one', 'know', 'exactli', 'plagu', 'movi', 'never', 'thought', "i'd", 'say', 'want', '15', 'minut', 'fame', 'back', '<br', '/><br', '/>script', 'direct', "can't", 'say', 'recogn', 'stabl', 'actor', '(the', 'usual', 'suspects)', 'thought', 'herbert', 'marshal', 'class', 'addit', 'sat', 'good', 'cheesi', 'flick', 'boy', 'wrong', 'dullsvil', '<br', '/><br', '/>mi', 'favorit', 'parts:', '"offic', 'girl"', 'make', '029', 'keypunch', 'put', 'card', '087

In [12]:
# let us check accuracy on this dataset
labels = dataset['polarity'].iloc[:].values
tweets = dataset['preprocessed'].iloc[:].values
accuracy = 0
num_examples = labels.shape[0]
for index in range(num_examples):
    if predict(prob_dict, tweets[index], log_prior)>=0:
        accuracy+= labels[index]==1
    else:
        accuracy+= labels[index]==0
accuracy/= num_examples

print(f"Accuacry on the training set is {accuracy*100}")

Accuacry on the training set is 95.45


### Naive Bayes can go wrong due to many reasons, some of them can be - 
* Word Order changes the meaning of sentence.
* Preprocessing step removing something important words which might play important role in determining.
* These models can't understand sarcasm(obviously)