In [1]:
import re
import math
from collections import defaultdict
from sklearn.model_selection import KFold

# Tokenizer
def tokenize(text):
    """Return the lower-cased word tokens of *text*, in order of appearance.

    A token is a maximal run of regex word characters delimited by word
    boundaries, so punctuation and whitespace separate tokens and are
    dropped. An input without any word characters yields an empty list.
    """
    lowered = text.lower()
    return [hit.group() for hit in re.finditer(r'\b\w+\b', lowered)]

# Dataset
# Toy sentiment corpus: ten (sentence, label) pairs, five per class.
# The label strings are spelled once here so a typo cannot create a third class.
_POS, _NEG = "positive", "negative"

dataset = [
    ("I loved the good activity", _POS),
    ("The movie was good and enjoyable", _POS),
    ("I hated the bad activity", _NEG),
    ("The movie was boring and poor", _NEG),
    ("Enjoyable experience with good movie", _POS),
    ("Terrible boring story and poor acting", _NEG),
    ("I loved the characters and plot", _POS),
    ("I hated the ending and acting", _NEG),
    ("What a good film it was", _POS),
    ("Bad script and poor direction", _NEG),
]

# Build vocabulary
def build_model(data):
    """Gather the count statistics used by the Naive Bayes classifier.

    Args:
        data: Iterable of ``(sentence, label)`` pairs. Each label must be
            ``'positive'`` or ``'negative'``; any other label raises
            ``KeyError``.

    Returns:
        A 4-tuple ``(word_counts, class_counts, class_word_totals,
        vocab_size)`` where

        * ``word_counts[label][token]`` is how often ``token`` occurred in
          sentences of that label (a ``defaultdict(int)`` per label),
        * ``class_counts[label]`` is the number of sentences of that label,
        * ``class_word_totals[label]`` is the number of tokens seen in
          sentences of that label,
        * ``vocab_size`` is the number of distinct tokens over all of *data*.
    """
    labels = ('positive', 'negative')
    word_counts = {name: defaultdict(int) for name in labels}
    class_counts = dict.fromkeys(labels, 0)
    class_word_totals = dict.fromkeys(labels, 0)
    seen_words = set()

    for text, tag in data:
        words = tokenize(text)
        class_counts[tag] += 1
        class_word_totals[tag] += len(words)

        per_class = word_counts[tag]
        for w in words:
            per_class[w] += 1
        seen_words.update(words)

    return word_counts, class_counts, class_word_totals, len(seen_words)

# Predict using log probabilities
def predict(sentence, word_counts, class_counts, class_word_totals, vocab_size):
    """Classify *sentence* as ``'positive'`` or ``'negative'``.

    Scores each class with multinomial Naive Bayes in log space:
    ``log P(class) + sum(log P(word | class))``, using add-one (Laplace)
    smoothing ``(count + 1) / (class_word_total + vocab_size)`` for every
    token of the sentence.

    Args:
        sentence: Raw text to classify; it is tokenized with ``tokenize``.
        word_counts: Mapping ``label -> {token: count}``.
        class_counts: Mapping ``label -> number of training sentences``.
        class_word_totals: Mapping ``label -> number of training tokens``.
        vocab_size: Number of distinct tokens in the training data.

    Returns:
        The label with the highest log score. On a tie ``'positive'`` wins
        because it is scored first.

    Raises:
        ValueError: If ``class_counts`` contains no training sentences at
            all, so no prior can be computed.
    """
    tokens = tokenize(sentence)

    total_docs = sum(class_counts.values())
    if total_docs == 0:
        raise ValueError("cannot predict: the model has no training sentences")

    scores = {}
    for label in ['positive', 'negative']:
        docs = class_counts[label]
        if docs == 0:
            # A class absent from the training data has prior 0; log(0) is
            # undefined, so give it the lowest possible score instead of
            # letting math.log raise "math domain error".
            scores[label] = float('-inf')
            continue

        log_prob = math.log(docs / total_docs)
        counts = word_counts[label]
        denominator = class_word_totals[label] + vocab_size
        for word in tokens:
            # .get() instead of counts[word]: indexing a defaultdict would
            # insert a zero entry for every unseen word and so mutate (and
            # grow) the trained model on each prediction.
            log_prob += math.log((counts.get(word, 0) + 1) / denominator)
        scores[label] = log_prob

    return max(scores, key=scores.get)

# 5-Fold Cross Validation
def cross_validate(dataset, k=5):
    """Run shuffled k-fold cross-validation and print the accuracies.

    For each fold a model is built from the training split with
    ``build_model`` and every held-out sentence is classified with
    ``predict``. The per-fold accuracy is printed, followed by the overall
    accuracy (correct predictions over all held-out sentences).

    Args:
        dataset: Indexable sequence of ``(sentence, label)`` pairs.
        k: Number of folds passed to ``KFold`` as ``n_splits``.

    Returns:
        None. Results are reported on stdout only.
    """
    # Fixed random_state keeps the shuffled split reproducible between runs.
    kf = KFold(n_splits=k, shuffle=True, random_state=42)
    correct_total = 0
    total_total = 0

    for fold, (train_idx, test_idx) in enumerate(kf.split(dataset)):
        train_data = [dataset[i] for i in train_idx]
        test_data = [dataset[i] for i in test_idx]

        word_counts, class_counts, class_word_totals, vocab_size = build_model(train_data)

        correct = sum(
            predict(sentence, word_counts, class_counts, class_word_totals, vocab_size) == label
            for sentence, label in test_data
        )
        accuracy = correct / len(test_data)
        total_total += len(test_data)
        correct_total += correct
        print(f"Fold {fold + 1}: Accuracy = {accuracy:.2f}")

    overall = correct_total / total_total
    # Report the fold count actually used rather than a hard-coded "5".
    print(f"\nOverall {k}-Fold Accuracy: {overall:.2f}")

# Evaluate the classifier on the toy corpus with five folds.
cross_validate(dataset, k=5)


Fold 1: Accuracy = 1.00
Fold 2: Accuracy = 1.00
Fold 3: Accuracy = 0.50
Fold 4: Accuracy = 1.00
Fold 5: Accuracy = 0.50

Overall 5-Fold Accuracy: 0.80
