In [13]:
import re
from collections import defaultdict
import math



In [14]:
# Step 1: Sample dataset
# Tiny hand-labelled training corpus. Each entry is a (sentence, label) tuple
# and every label is either "positive" or "negative".
data = [
    ("I loved the movie", "positive"),
    ("I hated the movie", "negative"),
    ("A great movie , good movie", "positive"),
    ("poor acting", "negative"),
    ("great acting a good movie", "positive")
]



In [15]:
# Step 2: Tokenization
def tokenize(text):
    """Split ``text`` into lowercase word tokens.

    A token is a maximal run of ``\\w`` characters (letters, digits,
    underscore); punctuation and whitespace act as separators and are dropped.

    Args:
        text: The raw sentence to split.

    Returns:
        A list of lowercase tokens in their original order (empty if ``text``
        contains no word characters).
    """
    lowered = text.lower()
    return [match.group(0) for match in re.finditer(r'\b\w+\b', lowered)]


In [16]:
# Step 3: Vocabulary & counts
# Training statistics for the classifier, each keyed by class label:
#   word_counts[label][token] -> occurrences of token in that class
#   class_word_totals[label]  -> total number of tokens in that class
#   class_counts[label]       -> number of training sentences in that class
vocab = set()  # every distinct token seen anywhere in the training data
word_counts = {name: defaultdict(int) for name in ('positive', 'negative')}
class_word_totals = dict.fromkeys(('positive', 'negative'), 0)
class_counts = dict.fromkeys(('positive', 'negative'), 0)

for sentence, label in data:
    tokens = tokenize(sentence)
    vocab.update(tokens)
    class_counts[label] = class_counts[label] + 1
    class_word_totals[label] = class_word_totals[label] + len(tokens)
    for token in tokens:
        word_counts[label][token] += 1

# Number of distinct tokens; used as the smoothing term in the likelihoods.
vocab_size = len(vocab)



In [17]:
# Step 4: Probability calculation for a new test sentence
def calculate_class_probability(test_tokens, label):
    """Return the Naive Bayes log-score of ``test_tokens`` under class ``label``.

    The score is ``log P(label) + sum(log P(word | label))``, where each word
    likelihood uses add-one smoothing:
    ``(count(word, label) + 1) / (total words in label + vocab_size)``.
    The prior and every word likelihood are printed so the calculation can be
    followed by hand.

    Reads the module-level training statistics ``word_counts``,
    ``class_word_totals``, ``class_counts`` and ``vocab_size`` without
    modifying them.

    Args:
        test_tokens: Iterable of tokens of the sentence being classified.
        label: Class label; must be a key of the training statistics.

    Returns:
        The unnormalised log-probability as a float, or ``-inf`` when the
        class has no training sentences (prior of zero).
    """
    total_words = class_word_totals[label]
    prior = class_counts[label] / sum(class_counts.values())
    print(f"\nClass: {label.upper()}")
    print(f"P({label}) = {prior:.4f}")

    if prior == 0:
        # math.log(0) raises ValueError; a class with no training sentences
        # can never be the best match, so score it as negative infinity.
        return float('-inf')

    log_score = math.log(prior)  # sum logs instead of multiplying to prevent underflow
    label_word_counts = word_counts[label]
    for word in test_tokens:
        # .get() rather than [] so that unseen words are not inserted into the
        # defaultdict as a side effect of scoring.
        count = label_word_counts.get(word, 0)
        prob = (count + 1) / (total_words + vocab_size)
        log_score += math.log(prob)
        print(f"P({word}|{label}) = ({count}+1)/({total_words}+{vocab_size}) = {prob:.4f}")

    return log_score


In [18]:
# Step 5: Predict Function
def predict(sentence):
    """Classify ``sentence`` as 'positive' or 'negative' and print the working.

    The sentence is tokenized, scored against both classes with
    ``calculate_class_probability`` (positive first, then negative), and the
    class with the strictly higher log-score wins; a tie goes to 'negative'.

    Args:
        sentence: Raw text to classify.

    Returns:
        The predicted label, either 'positive' or 'negative'.
    """
    print(f"\nInput Sentence: \"{sentence}\"\n" + "-"*50)
    words = tokenize(sentence)

    # Dict literals evaluate in order, so the positive class is scored (and
    # its trace printed) before the negative one.
    log_scores = {
        'positive': calculate_class_probability(words, 'positive'),
        'negative': calculate_class_probability(words, 'negative'),
    }

    print(f"\nLog(P(+|doc)) = {log_scores['positive']:.6f}")
    print(f"Log(P(-|doc)) = {log_scores['negative']:.6f}")

    if log_scores['positive'] > log_scores['negative']:
        predicted_class = 'positive'
    else:
        predicted_class = 'negative'
    print(f"\n→ Predicted Class: {predicted_class.upper()}")
    return predicted_class

In [19]:
# Step 6: Run Prediction
# Classify one sentence end-to-end. "activity" does not occur in the training
# data, so its count is 0 for both classes and only the +1 smoothing term
# contributes to its likelihood.
test_sentence = "I hated the poor activity"
# Left as a bare last expression so the notebook displays the returned label.
predict(test_sentence)



Input Sentence: "I hated the poor activity"
--------------------------------------------------

Class: POSITIVE
P(positive) = 0.6000
P(i|positive) = (1+1)/(14+10) = 0.0833
P(hated|positive) = (0+1)/(14+10) = 0.0417
P(the|positive) = (1+1)/(14+10) = 0.0833
P(poor|positive) = (0+1)/(14+10) = 0.0417
P(activity|positive) = (0+1)/(14+10) = 0.0417

Class: NEGATIVE
P(negative) = 0.4000
P(i|negative) = (1+1)/(6+10) = 0.1250
P(hated|negative) = (1+1)/(6+10) = 0.1250
P(the|negative) = (1+1)/(6+10) = 0.1250
P(poor|negative) = (1+1)/(6+10) = 0.1250
P(activity|negative) = (0+1)/(6+10) = 0.0625

Log(P(+|doc)) = -15.014800
Log(P(-|doc)) = -12.006646

→ Predicted Class: NEGATIVE


'negative'