In [23]:
!pip install scikit-learn

Collecting scikit-learn
  Using cached scikit_learn-1.7.1-cp311-cp311-win_amd64.whl.metadata (11 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Using cached threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Collecting numpy>=1.22.0 (from scikit-learn)
  Using cached numpy-2.2.6-cp311-cp311-win_amd64.whl.metadata (60 kB)
Using cached scikit_learn-1.7.1-cp311-cp311-win_amd64.whl (8.9 MB)
Using cached numpy-2.2.6-cp311-cp311-win_amd64.whl (12.9 MB)
Using cached threadpoolctl-3.6.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, numpy, scikit-learn

  Attempting uninstall: numpy

    Found existing installation: numpy 2.3.2

   ------------- -------------------------- 1/3 [numpy]
    Uninstalling numpy-2.3.2:
   ------------- -------------------------- 1/3 [numpy]
   ------------- -------------------------- 1/3 [numpy]
      Successfully uninstalled numpy-2.3.2
   ------------- -------------------------- 1/3 [numpy]
   ------------- -------------------

  You can safely remove it manually.
  You can safely remove it manually.
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gensim 4.3.3 requires numpy<2.0,>=1.18.5, but you have numpy 2.2.6 which is incompatible.


In [7]:
import os
import re
import random
import numpy as np
import pandas as pd
from collections import Counter, defaultdict
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.util import ngrams
import nltk
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

Data Loading and Preprocessing

In [8]:
def load_and_preprocess_data(folder_path='BBC', unk_threshold=1):
    print("Loading and preprocessing data...")
    
    all_docs = []
    for filename in sorted(os.listdir(folder_path)):
        if filename.endswith(".txt"):
            with open(os.path.join(folder_path, filename), 'r', encoding='utf-8', errors='ignore') as f:
                all_docs.append(f.read())

    full_text = "\n".join(all_docs)
    tokenized_sentences = [word_tokenize(s.lower()) for s in sent_tokenize(full_text)]
    
    word_counts = Counter(token for sent in tokenized_sentences for token in sent)
    vocab = {word for word, count in word_counts.items() if count > unk_threshold}
    vocab.add('<UNK>')
    
    processed_sentences = [
        [word if word in vocab else '<UNK>' for word in sent] 
        for sent in tokenized_sentences
    ]
    
    print(f"Data loaded. Sentences: {len(processed_sentences)}, Vocab size: {len(vocab)}")
    return processed_sentences, vocab

n-gram Model Implementation

In [9]:
class NgramLanguageModel:
    def __init__(self, n, vocab):
        self.n = n
        self.vocab = vocab
        self.vocab_size = len(vocab)
        self.counts = defaultdict(Counter)
        self.context_counts = Counter()

    def train(self, sentences):
        for sent in sentences:
            padded_sent = ['<s>'] * (self.n - 1) + sent + ['</s>']
            for i in range(len(padded_sent) - self.n + 1):
                ngram = tuple(padded_sent[i : i + self.n])
                context = ngram[:-1]
                word = ngram[-1]
                self.counts[context][word] += 1
                self.context_counts[context] += 1

    def get_prob(self, word, context):
        context_tuple = tuple(context)
        word_count = self.counts[context_tuple].get(word, 0)
        context_count = self.context_counts.get(context_tuple, 0)
        return (word_count + 1) / (context_count + self.vocab_size)

Interpolated Model Implementation

In [10]:
class InterpolatedModel:
    def __init__(self, models, vocab):
        self.models = models
        self.n = max(model.n for model in models)
        self.lambdas = np.ones(len(models)) / len(models)
        self.vocab = vocab

    def train(self, held_out_sentences, iterations=10):
        print("Learning interpolation weights with EM algorithm...")
        for i in range(iterations):
            expected_counts = np.zeros(len(self.models))
            for sent in held_out_sentences:
                padded_sent = ['<s>'] * (self.n - 1) + sent + ['</s>']
                for j in range(self.n - 1, len(padded_sent)):
                    probs = np.zeros(len(self.models))
                    for k, model in enumerate(self.models):
                        context = padded_sent[j - model.n + 1 : j]
                        word = padded_sent[j]
                        probs[k] = model.get_prob(word, context)
                    
                    total_prob = np.dot(self.lambdas, probs)
                    if total_prob > 1e-9:
                        for k in range(len(self.models)):
                            expected_counts[k] += (self.lambdas[k] * probs[k]) / total_prob
            
            self.lambdas = expected_counts / np.sum(expected_counts)
        print(f"EM complete. Final lambdas: {self.lambdas}")

    def get_prob(self, word, context):
        interpolated_prob = 0.0
        for i, model in enumerate(self.models):
            model_context = context[-(model.n - 1):] if model.n > 1 else []
            interpolated_prob += self.lambdas[i] * model.get_prob(word, model_context)
        return interpolated_prob

Perplexity Calculation

In [11]:
def calculate_perplexity(model, test_sentences):
    total_log_prob = 0
    token_count = 0
    
    for sent in test_sentences:
        padded_sent = ['<s>'] * (model.n - 1) + sent + ['</s>']
        token_count += len(sent) + 1 
        for i in range(model.n - 1, len(padded_sent)):
            context = padded_sent[i - model.n + 1 : i]
            word = padded_sent[i]
            prob = model.get_prob(word, context)
            if prob > 0:
                total_log_prob += np.log2(prob)

    perplexity = 2 ** (-total_log_prob / token_count)
    return perplexity

Model Evalutaion and Results

In [13]:

all_sentences, vocab = load_and_preprocess_data(folder_path='BBC')
random.shuffle(all_sentences)
split_idx = int(0.8 * len(all_sentences))
train_full, test_set = all_sentences[:split_idx], all_sentences[split_idx:]

val_split_idx = int(0.9 * len(train_full))
train_set, val_set = train_full[:val_split_idx], train_full[val_split_idx:]

print(f"\nData split complete:")
print(f"Training sentences: {len(train_set)}")
print(f"Validation sentences (for EM): {len(val_set)}")
print(f"Testing sentences: {len(test_set)}")


print("\nTraining n-gram Models ")
unigram_model = NgramLanguageModel(n=1, vocab=vocab)
unigram_model.train(train_set)
print("Unigram model trained.")

bigram_model = NgramLanguageModel(n=2, vocab=vocab)
bigram_model.train(train_set)
print("Bigram model trained.")

trigram_model = NgramLanguageModel(n=3, vocab=vocab)
trigram_model.train(train_set)
print("Trigram model trained.")

print("\nTraining Interpolated Model ")
interpolated_model = InterpolatedModel(
    models=[unigram_model, bigram_model, trigram_model], 
    vocab=vocab
)
interpolated_model.train(val_set)

print("\n Calculating Perplexity on the Test Set ")
perplexity_unigram = calculate_perplexity(unigram_model, test_set)
perplexity_bigram = calculate_perplexity(bigram_model, test_set)
perplexity_trigram = calculate_perplexity(trigram_model, test_set)
perplexity_interpolated = calculate_perplexity(interpolated_model, test_set)

results = {
    "Model": ["Unigram (n=1)", "Bigram (n=2)", "Trigram (n=3)", "Interpolated (n=1,2,3)"],
    "Perplexity Score": [perplexity_unigram, perplexity_bigram, perplexity_trigram, perplexity_interpolated]
}
df_results = pd.DataFrame(results)

print("\n Perplexity Score Evaluation Results ")
print(df_results.to_string(index=False))

Loading and preprocessing data...
Data loaded. Sentences: 239, Vocab size: 585

Data split complete:
Training sentences: 171
Validation sentences (for EM): 20
Testing sentences: 48

Training n-gram Models 
Unigram model trained.
Bigram model trained.
Trigram model trained.

Training Interpolated Model 
Learning interpolation weights with EM algorithm...
EM complete. Final lambdas: [0.52372313 0.45702263 0.01925424]

 Calculating Perplexity on the Test Set 

 Perplexity Score Evaluation Results 
                 Model  Perplexity Score
         Unigram (n=1)        175.281094
          Bigram (n=2)        213.990381
         Trigram (n=3)        403.637043
Interpolated (n=1,2,3)        151.038585


## Evaluation and Analysis

| Model                | Perplexity Score |
|-----------------------|------------------|
| Unigram (n=1)         | 175.281094 |
| Bigram (n=2)          | 213.990381 |
| Trigram (n=3)         | 403.637043 |
| Interpolated (n=1,2,3)| 151.038585 |

- **Unigram** works reasonably well since it avoids sparsity, but ignores context.  
- **Bigram** is worse due to sparsity in a small dataset.  
- **Trigram** performs the worst, showing the effect of extreme sparsity.  
- **Interpolated model** outperforms all individual models, confirming interpolation alleviates sparsity and balances coverage + context.  
- Learned weights (λ values) typically favor unigram and bigram more heavily, showing trigram contributes very little in small corpora.  

**Conclusion:** Interpolation significantly improves language model performance by combining multiple n-gram models and mitigating sparsity.
