In [4]:
import collections

import numpy as np

import util
import svm


In [10]:
train_positive_messages,train_negative_messages = util.load_reviews_from_folder('../aclImdb/train/pos'),util.load_reviews_from_folder('../aclImdb/train/neg')
test_positive_messages,test_negative_messages = util.load_reviews_from_folder('../aclImdb/test/pos'),util.load_reviews_from_folder('../aclImdb/test/neg')

In [14]:
def get_words_into_list(review):
    normalized_words = []
    for word in review.split(' '):
        normalized_words.append(word.lower())
    return normalized_words

In [15]:
def create_dictionary(reviews):
    from collections import defaultdict
    word_to_message_count = defaultdict(int)

    for review in reviews:
        unique_words = set(get_words_into_list(review))
        for word in unique_words:
            word_to_message_count[word] += 1
    filtered_words = [word for word, count in word_to_message_count.items() if count >= 5]
    word_to_index = {word: idx for idx, word in enumerate(sorted(filtered_words))}
    return word_to_index

In [19]:
word_dictionary = create_dictionary(train_positive_messages)

In [20]:
def transform_text(messages, word_dictionary):
    arr_to_return = []
    for message in messages:
        word_counts = np.zeros(len(word_dictionary), dtype=int)
        for word in get_words_into_list(message):
            if word in word_dictionary:
                word_counts[word_dictionary[word]] += 1
        arr_to_return.append(word_counts)
    return np.array(arr_to_return, dtype=int)

In [22]:
new_train_positive_matrix = transform_text(train_positive_messages, word_dictionary)
new_train_negative_matrix = transform_text(train_negative_messages, word_dictionary)


(12500, 28241)


In [24]:
#construct prior probabilities for the model
model = {}
model['positive_prior'] = np.log(len(train_positive_messages) / (len(train_positive_messages) + len(train_negative_messages)))
model['negative_prior'] = np.log(len(train_negative_messages) / (len(train_positive_messages) + len(train_negative_messages)))

#compute the conditional probabilities for each word given positive and negative labels
positive_word_counts = new_train_positive_matrix.sum(axis=0)
negative_word_counts = new_train_negative_matrix.sum(axis=0)

# Calculate total words in each class
total_positive_words = positive_word_counts.sum()
total_negative_words = negative_word_counts.sum()

# Apply Laplace smoothing to get probabilities
vocab_size = len(word_dictionary)

probability_positive = (positive_word_counts + 1) / (total_positive_words + vocab_size)
probability_negative = (negative_word_counts + 1) / (total_negative_words + vocab_size)

model['word_given_positive'] = np.log(probability_positive)
model['word_given_negative'] = np.log(probability_negative)


print(model)





{'positive_prior': 0.5, 'negative_prior': 0.5, 'word_given_positive': array([2.36035691e-04, 1.00440720e-05, 6.81562025e-06, ...,
       1.25550899e-05, 5.02203598e-06, 1.43486742e-05]), 'word_given_negative': array([2.36035691e-04, 1.00440720e-05, 6.81562025e-06, ...,
       1.25550899e-05, 5.02203598e-06, 1.43486742e-05])}


In [28]:
#predict the test set
new_test_positive_matrix = transform_text(test_positive_messages, word_dictionary)
new_test_negative_matrix = transform_text(test_negative_messages, word_dictionary)

number_of_accurate_predictions = 0
number_of_total_predictions = len(new_test_positive_matrix) + len(new_test_negative_matrix)
for i in range(len(new_test_positive_matrix)):
    positive_score = np.log(model['positive_prior'] + np.sum(new_test_positive_matrix[i] * model['word_given_positive']))
    negative_score = np.log(model['negative_prior'] + np.sum(new_test_negative_matrix[i] * model['word_given_negative']))
    
    if positive_score > negative_score:
        number_of_accurate_predictions += 1
for i in range(len(new_test_negative_matrix)):
    positive_score = np.log(model['positive_prior'] + np.sum(new_test_positive_matrix[i] * model['word_given_positive']))
    negative_score = np.log(model['negative_prior'] + np.sum(new_test_negative_matrix[i] * model['word_given_negative']))
    
    if negative_score > positive_score:
        number_of_accurate_predictions += 1
print(f'Accuracy: {number_of_accurate_predictions / number_of_total_predictions * 100:.2f}%')



Accuracy: 50.00%
