In [14]:
import re
import os
import nltk
import numpy as np
import pandas as pd
from collections import Counter, defaultdict
from sklearn.model_selection import train_test_split
nltk.download('punkt')

[nltk_data] Downloading package punkt to C:\Users\Tushar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

Data Preprocessing

In [15]:
def preprocess_data(corpus_path):
    with open(corpus_path, 'r', encoding='utf-8') as f:
        text = f.read()
    
    sentences = nltk.sent_tokenize(text)
    
    processed_sentences = []
    for sentence in sentences:
        tokens = [token.lower() for token in nltk.word_tokenize(sentence)]
        processed_sentences.append(['<s>'] + tokens + ['</s>'])
        
    return processed_sentences


n-gram Model Implementation

In [16]:
class NGramLanguageModel:
    
    def __init__(self, n, sentences, k=1.0):
        self.n = n
        self.k = k
        self.counts = defaultdict(Counter)
        self.vocab = set()
        self.train(sentences)

    def train(self, sentences):
        all_tokens = []
        for sentence in sentences:
            all_tokens.extend(sentence)
            padded_sentence = ['<s>'] * (self.n - 1) + sentence
            for i in range(len(padded_sentence) - self.n + 1):
                history = tuple(padded_sentence[i : i + self.n - 1])
                word = padded_sentence[i + self.n - 1]
                self.counts[history][word] += 1
        
        self.vocab = set(all_tokens)
        self.vocab_size = len(self.vocab)

    def calculate_prob(self, history, word):
        history = tuple(history)
        
        
        numerator = self.counts[history].get(word, 0) + self.k
        
        history_count = sum(self.counts[history].values())
        
        denominator = history_count + (self.k * self.vocab_size)

        if denominator == 0:

            return 1 / self.vocab_size

        return numerator / denominator

    def calculate_sentence_perplexity(self, sentence):
        padded_sentence = ['<s>'] * (self.n - 1) + sentence
        log_prob_sum = 0.0
        
        num_tokens = len(sentence)

        if num_tokens == 0:
            return float('inf')

        for i in range(len(padded_sentence) - self.n + 1):
            history = tuple(padded_sentence[i : i + self.n - 1])
            word = padded_sentence[i + self.n - 1]
            
            prob = self.calculate_prob(history, word)
            
            if prob == 0:
                return float('inf')
            
            log_prob_sum += np.log2(prob)


        perplexity = 2 ** (-log_prob_sum / num_tokens)
        return perplexity

Data Ingestion with k_values

In [17]:
bbc_folder = 'BBC'
all_sentences = []
for filename in os.listdir(bbc_folder):
    if filename.endswith('.txt'):
        file_path = os.path.join(bbc_folder, filename)
        sentences = preprocess_data(file_path)
        all_sentences.extend(sentences)

train_sentences, test_sentences = train_test_split(
    all_sentences, test_size=0.2, random_state=42 
)

print(f"Total sentences: {len(all_sentences)}")
print(f"Training sentences: {len(train_sentences)}")
print(f"Testing sentences: {len(test_sentences)}")

models_to_evaluate = {
    "Unigram": 1,
    "Bigram": 2,
    "Trigram": 3
}
k_values = [0.0, 0.5, 1.0, 5.0] 

results = []

Total sentences: 239
Training sentences: 191
Testing sentences: 48


Evaluation Run

In [18]:
for model_name, n in models_to_evaluate.items():
    for k in k_values:
        
        if n == 1:

            class UnigramModel(NGramLanguageModel):
                def calculate_prob(self, history, word):
                    numerator = self.counts[()].get(word, 0) + self.k
                    total_tokens = sum(self.counts[()].values())
                    denominator = total_tokens + (self.k * self.vocab_size)
                    return numerator / denominator
            
            model = UnigramModel(n, train_sentences, k=k)
        else:
            model = NGramLanguageModel(n, train_sentences, k=k)

        
        total_perplexity = 0
        sentence_perplexities = []
        
        for sentence in test_sentences:
            ppl = model.calculate_sentence_perplexity(sentence)
            sentence_perplexities.append(ppl)

        finite_ppls = [p for p in sentence_perplexities if p != float('inf')]
        if not finite_ppls:
            mean_perplexity = float('inf')
        else:
            mean_perplexity = np.mean(finite_ppls)
            
        smoothing_type = f"Add-k (k={k})" if k > 0 else "No Smoothing (MLE)"
        results.append({
            "Model": model_name,
            "Smoothing": smoothing_type,
            "Mean Perplexity": mean_perplexity
        })
        print(f"Model: {model_name}, Smoothing: {smoothing_type}, Mean Perplexity: {mean_perplexity:.4f}")

Model: Unigram, Smoothing: No Smoothing (MLE), Mean Perplexity: 197.9589
Model: Unigram, Smoothing: Add-k (k=0.5), Mean Perplexity: 408.3495
Model: Unigram, Smoothing: Add-k (k=1.0), Mean Perplexity: 381.7473
Model: Unigram, Smoothing: Add-k (k=5.0), Mean Perplexity: 402.1481
Model: Bigram, Smoothing: No Smoothing (MLE), Mean Perplexity: 8.2587
Model: Bigram, Smoothing: Add-k (k=0.5), Mean Perplexity: 389.8500
Model: Bigram, Smoothing: Add-k (k=1.0), Mean Perplexity: 463.1084
Model: Bigram, Smoothing: Add-k (k=5.0), Mean Perplexity: 672.7263
Model: Trigram, Smoothing: No Smoothing (MLE), Mean Perplexity: 1.7481
Model: Trigram, Smoothing: Add-k (k=0.5), Mean Perplexity: 642.7935
Model: Trigram, Smoothing: Add-k (k=1.0), Mean Perplexity: 702.5768
Model: Trigram, Smoothing: Add-k (k=5.0), Mean Perplexity: 850.2898


Results

In [19]:
results_df = pd.DataFrame(results)
print("\n Perplexity Evaluation Report ")
print(results_df.to_string(index=False))


 Perplexity Evaluation Report 
  Model          Smoothing  Mean Perplexity
Unigram No Smoothing (MLE)       197.958921
Unigram      Add-k (k=0.5)       408.349471
Unigram      Add-k (k=1.0)       381.747349
Unigram      Add-k (k=5.0)       402.148113
 Bigram No Smoothing (MLE)         8.258747
 Bigram      Add-k (k=0.5)       389.849993
 Bigram      Add-k (k=1.0)       463.108376
 Bigram      Add-k (k=5.0)       672.726309
Trigram No Smoothing (MLE)         1.748121
Trigram      Add-k (k=0.5)       642.793459
Trigram      Add-k (k=1.0)       702.576781
Trigram      Add-k (k=5.0)       850.289760
