In [18]:
import pandas as pd
import numpy as np
import nltk
import string
from sklearn.model_selection import train_test_split

In [19]:
# Load the labelled reviews; later cells read the 'review' and 'sentiment' columns.
df = pd.read_csv('MovieReviewTrainingDatabase.csv')

# Hold out 1% of the rows for evaluation. Fixing random_state makes the split
# (and therefore the reported accuracy) reproducible across kernel restarts.
train, test = train_test_split(df, test_size=0.01, random_state=42)

In [20]:
# Token streams (repeats kept) for each class, plus per-class document counts.
concatenated_positive = []
concatenated_negative = []
num_positive_documents = 0
num_negative_documents = 0

for _, review_row in train.iterrows():
    label = review_row['sentiment']
    raw_text = review_row['review']
    # Strip punctuation characters, lowercase, then tokenize.
    cleaned_text = "".join(ch for ch in raw_text if ch not in string.punctuation).lower()
    tokens = nltk.tokenize.word_tokenize(cleaned_text)
    if label == 'Positive':
        concatenated_positive.extend(tokens)
        num_positive_documents += 1
    else:
        # Any label other than 'Positive' is counted as a negative document.
        concatenated_negative.extend(tokens)
        num_negative_documents += 1

# Unique tokens seen in each class, and the combined vocabulary.
positive_bow = list(set(concatenated_positive))
negative_bow = list(set(concatenated_negative))
vocab = list(set(positive_bow + negative_bow))

### The equation for the class chosen by a Naive Bayes classifier:

$$c_{NB} = \underset{c \in C}{\arg\max}\left[\log P(c) + \sum_{i \in \text{positions}} \log P(w_{i} \mid c)\right]$$

In [21]:
# Log prior of each class: log of the fraction of training documents with that label.
num_train_documents = num_positive_documents + num_negative_documents
log_prior_positive = np.log(num_positive_documents / num_train_documents)
log_prior_negative = np.log(num_negative_documents / num_train_documents)

In [23]:
# Evaluate the classifier on the held-out reviews.
num_correct = 0
num_total = 0

# Per-class word counts over the training tokens.
fdist_positive = nltk.FreqDist(concatenated_positive)
word_freqs_positive = dict(fdist_positive)

fdist_negative = nltk.FreqDist(concatenated_negative)
word_freqs_negative = dict(fdist_negative)

# `vocab` is a list, so `word in vocab` is a linear scan per test token.
# A set gives the same membership answers in constant time.
vocab_lookup = set(vocab)

# Add-one smoothing denominators (class token count + vocabulary size).
# They do not change inside the loop, so compute them once.
positive_denominator = len(concatenated_positive) + len(vocab)
negative_denominator = len(concatenated_negative) + len(vocab)

for index, row in test.iterrows():
    sentiment = row['sentiment']
    unprocessed_review = row['review']
    # Same preprocessing as training: strip punctuation, then lowercase.
    processed_review = "".join([char for char in unprocessed_review if char not in string.punctuation])
    processed_review = processed_review.lower()

    # Start each class score from its log prior.
    positive_sum = log_prior_positive
    negative_sum = log_prior_negative

    for word in nltk.tokenize.word_tokenize(processed_review):
        # Words never seen in training are skipped entirely.
        if word not in vocab_lookup:
            continue
        # A word absent from one class contributes a smoothed count of 1.
        positive_sum += np.log((word_freqs_positive.get(word, 0) + 1) / positive_denominator)
        negative_sum += np.log((word_freqs_negative.get(word, 0) + 1) / negative_denominator)

    # An exact tie between the two scores is counted as incorrect.
    if (positive_sum > negative_sum and sentiment == 'Positive') or (negative_sum > positive_sum and sentiment == 'Negative'):
        num_correct += 1

    num_total += 1

In [24]:
# Fraction of held-out reviews whose predicted label matched the true label.
accuracy = num_correct / num_total
print('Accuracy: ', accuracy)

Accuracy:  0.836
