In [7]:
import csv
import re
import math
from collections import Counter
import random 

with open('rt_reviews.csv', 'r', newline='', encoding='cp1252') as csvfile:
    reader = csv.DictReader(csvfile)
    merged_data = []
    for row in reader:
        review = row['Review'].lower()
        row['Review'] = review
        merged_data.append(row)

In [8]:
# Divide the dataset into train, development and test sets
random.shuffle(merged_data)
train_data = merged_data[:int(0.8*len(merged_data))]
dev_data = merged_data[int(0.8*len(merged_data)):int(0.9*len(merged_data))]
test_data = merged_data[int(0.9*len(merged_data)):]




In [9]:
# Build a vocabulary as a list
word_freq = {}
for review in train_data:
    words = review['Review'].split()
    for word in words:
        if word in word_freq:
            word_freq[word] += 1
        else:
            word_freq[word] = 1

vocabulary = [word for word in word_freq if word_freq[word] >= 5]


In [10]:
# Create a reverse index
reverse_index = {}
for i, word in enumerate(vocabulary):
    reverse_index[word] = i

In [11]:
#Part C
# Count the number of documents containing 'the'
doc_containing_the = 0
for review in train_data:
    if 'the' in review['Review']:
        doc_containing_the += 1

# Calculate the probability of occurrence
P_the = doc_containing_the / len(train_data)

print(f"Probability of occurrence of 'the': {P_the:.4f}")

Probability of occurrence of 'the': 0.7131


In [12]:
# Count the number of positive reviews containing 'the'
pos_doc_containing_the = 0
for review in train_data:
    if review['Freshness'] == 'fresh' and 'the' in review['Review']:
        pos_doc_containing_the += 1

# Calculate the number of positive reviews
num_pos_docs = sum([1 for review in train_data if review['Freshness'] == 'fresh'])

# Calculate the conditional probability
if num_pos_docs != 0:
    P_the_given_pos = pos_doc_containing_the / num_pos_docs
else:
    P_the_given_pos = 0

print(f"Conditional probability of 'the' given Positive: {P_the_given_pos:.4f}")


Conditional probability of 'the' given Positive: 0.7068


In [14]:
import math

class NaiveBayesClassifier:
    def __init__(self):
        self.vocab = set()
        self.class_prior = {}
        self.word_freq = {}
        self.class_word_count = {}
        self.class_total_count = {}
        self.num_classes = 0

    def tokenize(self, review):
        # convert to lowercase and split into words
        words = review.lower().split()
        # remove any non-alphanumeric characters
        words = [word.strip('.,!?()[]{}') for word in words]
        # remove any empty strings
        words = [word for word in words if len(word) > 0]
        return words
    def fit(self, reviews, labels):
        self.num_classes = len(set(labels))
        # compute class prior probabilities
        for label in labels:
            if label not in self.class_prior:
                self.class_prior[label] = 0
            self.class_prior[label] += 1
        for label in self.class_prior:
            self.class_prior[label] /= len(labels)
        # compute word frequency and class word count
        for i in range(len(reviews)):
            words = self.tokenize(reviews[i])
            label = labels[i]
            self.class_word_count[label] = self.class_word_count.get(label, 0) + len(words)
            self.class_total_count[label] = self.class_total_count.get(label, 0) + 1
            for word in words:
                self.vocab.add(word)
                if word not in self.word_freq:
                    self.word_freq[word] = {}
                if label not in self.word_freq[word]:
                    self.word_freq[word][label] = 0
                self.word_freq[word][label] += 1

    def predict(self, reviews):
        predictions = []
        for review in reviews:
            words = self.tokenize(review)
            class_scores = {}
            for label in self.class_prior:
                score = math.log(self.class_prior[label])
                for word in words:
                    if word in self.vocab and label in self.word_freq[word]:
                        word_prob = self.word_freq[word][label] / self.class_word_count[label]
                        score += math.log(word_prob)
                class_scores[label] = score
            pred_label = max(class_scores, key=class_scores.get)
            predictions.append(pred_label)
        return predictions

In [15]:
model = NaiveBayesClassifier()
train_reviews = [row['Review'] for row in train_data]
train_labels = [row['Freshness'] for row in train_data]
model.fit(train_reviews, train_labels)

In [None]:
# make predictions on the dev set
dev_reviews = [row['Review'] for row in dev_data]
dev_labels = [row['Freshness'] for row in dev_data]
dev_predictions = model.predict(dev_reviews)
accuracy = sum(1 for i in range(len(dev_labels)) if dev_labels[i] == dev_predictions[i]) / len(dev_labels)
print(f"Accuracy on dev data: {accuracy}")