In [1]:
import math
from collections import Counter
import numpy as np

In [2]:
# --- 1. Data Preprocessing ---
class Corpus:
    """Loads a train/validation text pair and maps rare or unseen words to <UNK>.

    Every non-empty line of a file becomes one sentence: the line is
    lower-cased, split on whitespace, and wrapped in '<s>' ... '</s>'.
    The vocabulary is derived from the training file only; the validation
    sentences are then mapped onto that vocabulary.
    """

    def __init__(self, train_path, val_path, unk_threshold=1):
        """Store the file locations and the rare-word cutoff; no file is read here.

        Args:
            train_path: Path of the training text file.
            val_path: Path of the validation text file.
            unk_threshold: Training words with a count <= this value become <UNK>.
        """
        self.unk_threshold = unk_threshold
        self.val_path = val_path
        self.train_path = train_path

        # Populated by load_and_prepare_data().
        self.vocab = set()
        self.train_corpus_unk = []
        self.val_corpus_unk = []

    def load_and_prepare_data(self):
        """Read both files, build the vocabulary, and store the <UNK>-mapped corpora."""
        print("Loading and preparing data...")
        train_sentences = self._preprocess_file(self.train_path)
        val_sentences = self._preprocess_file(self.val_path)

        # The vocabulary comes from the training sentences alone ...
        self.vocab, self.train_corpus_unk = self._handle_unknowns(train_sentences)
        # ... and anything in validation outside of it is out-of-vocabulary.
        self.val_corpus_unk = self._replace_oov(val_sentences, self.vocab)
        print("Data preparation complete.")

    def _preprocess_file(self, file_path):
        """Return the file's non-empty lines as token lists wrapped in <s>/</s>."""
        with open(file_path, 'r', encoding='utf-8') as handle:
            tokenized = (raw.lower().strip().split() for raw in handle)
            # Lines that contain only whitespace yield no tokens and are skipped.
            return [['<s>', *tokens, '</s>'] for tokens in tokenized if tokens]

    def _handle_unknowns(self, corpus):
        """Build the vocabulary from `corpus` and replace rare words with <UNK>.

        Returns:
            (vocab, processed_corpus): the set of kept words (always including
            '<s>', '</s>' and '<UNK>') and the corpus with every other word
            replaced by '<UNK>'.
        """
        frequencies = Counter()
        for sentence in corpus:
            frequencies.update(sentence)

        # The special tokens are always part of the vocabulary.
        vocab = {'<s>', '</s>', '<UNK>'}
        for token, freq in frequencies.items():
            # Strictly greater: a word seen exactly `unk_threshold` times is dropped.
            if freq > self.unk_threshold:
                vocab.add(token)

        return vocab, self._replace_oov(corpus, vocab)

    def _replace_oov(self, corpus, vocab):
        """Return a copy of `corpus` where words missing from `vocab` are '<UNK>'."""
        mapped = []
        for sentence in corpus:
            mapped.append([token if token in vocab else '<UNK>' for token in sentence])
        return mapped

In [3]:
# --- 2. N-gram Language Model Class (based on your provided logic) ---

class NgramLanguageModel:
    """Count-based unigram/bigram language model with add-k smoothing.

    Sentences are lists of tokens. Unigram counts are always collected;
    bigram counts are collected only when ``n >= 2``. Probabilities and
    perplexity are implemented for ``n == 1`` and ``n == 2``.
    """

    # Probability substituted for zero-probability events while computing
    # perplexity, so that log2 stays defined. This keeps the reported
    # perplexity finite (but very large) instead of infinite.
    _ZERO_PROB_FLOOR = 1e-10

    def __init__(self, n=1, k=0):
        """Create an untrained model.

        Args:
            n: Model order (1 = unigram, 2 = bigram).
            k: Add-k smoothing constant; 0 means no smoothing.
        """
        self.n = n
        self.k = k
        self.unigram_counts = Counter()
        self.bigram_counts = Counter()
        self.total_tokens = 0
        self.vocab_size = 0

    def train(self, corpus):
        """Accumulate n-gram counts from `corpus` (a list of token lists).

        Counts are added to whatever the model already holds, so calling
        this repeatedly trains incrementally. `total_tokens` counts every
        token of every sentence, and `vocab_size` is the number of distinct
        tokens seen so far.
        """
        for sentence in corpus:
            self.total_tokens += len(sentence)
            self.unigram_counts.update(sentence)

            if self.n >= 2:
                # Adjacent pairs (w[i], w[i+1]) within the sentence.
                self.bigram_counts.update(zip(sentence, sentence[1:]))

        self.vocab_size = len(self.unigram_counts)

    def get_smoothed_unigram_prob(self, word):
        """Return the add-k smoothed unigram probability of `word`.

        Computes (count(word) + k) / (total_tokens + k * vocab_size).
        Returns 0.0 when the denominator is zero (untrained model with
        k == 0) instead of raising ZeroDivisionError.
        """
        numerator = self.unigram_counts.get(word, 0) + self.k
        denominator = self.total_tokens + (self.k * self.vocab_size)
        if denominator == 0:
            return 0.0
        return numerator / denominator

    def get_smoothed_bigram_prob(self, prev_word, word):
        """Return the add-k smoothed probability of `word` following `prev_word`.

        Computes (count(prev_word, word) + k) / (count(prev_word) + k * vocab_size).
        When the denominator is zero (the context was never seen and k == 0)
        the result falls back to the uniform 1 / vocab_size, or to 0.0 if the
        model has no vocabulary at all.
        """
        numerator = self.bigram_counts.get((prev_word, word), 0) + self.k
        denominator = self.unigram_counts.get(prev_word, 0) + (self.k * self.vocab_size)
        if denominator == 0:
            if self.vocab_size == 0:
                return 0.0
            return 1 / self.vocab_size
        return numerator / denominator

    def calculate_perplexity(self, validation_corpus):
        """Return the perplexity of the model on `validation_corpus`.

        The first token of every sentence (index 0) is not scored; it only
        serves as context for the bigram model. Perplexity is 2 ** (-l)
        where l is the mean log2 probability over the scored tokens.
        Zero-probability tokens contribute log2(_ZERO_PROB_FLOOR).

        Returns:
            The perplexity, or float('inf') if there is no token to score.

        Raises:
            ValueError: if `self.n` is neither 1 nor 2.
        """
        if self.n not in (1, 2):
            # Previously this surfaced as an UnboundLocalError deep in the loop.
            raise ValueError(
                f"calculate_perplexity supports n=1 or n=2, got n={self.n!r}"
            )

        total_log_prob = 0.0
        scored_tokens = 0  # number of tokens scored, i.e. every token except index 0

        for sentence in validation_corpus:
            scored_tokens += len(sentence) - 1
            for i in range(1, len(sentence)):
                if self.n == 1:
                    prob = self.get_smoothed_unigram_prob(sentence[i])
                else:
                    prob = self.get_smoothed_bigram_prob(sentence[i - 1], sentence[i])

                if prob > 0:
                    total_log_prob += math.log2(prob)
                else:
                    total_log_prob += math.log2(self._ZERO_PROB_FLOOR)

        if scored_tokens == 0:
            return float('inf')

        mean_log_prob = total_log_prob / scored_tokens
        return 2 ** (-mean_log_prob)

In [4]:
# --- Main Execution Code ---
# Both files are read line by line; each non-empty line is treated as one sentence.
train_path = '/train.txt'  # your path for training data
val_path = '/val.txt'  # your path for validation data

# Words seen at most `unk_threshold` times in the training file become <UNK>.
corpus = Corpus(train_path=train_path, val_path=val_path, unk_threshold=1)
corpus.load_and_prepare_data()

Loading and preparing data...
Data preparation complete.


In [16]:
# Sanity check: unsmoothed (k=0) unigram probabilities of two training words.
model = NgramLanguageModel(n=1, k=0)
model.train(corpus.train_corpus_unk)
print(
    model.get_smoothed_unigram_prob('the'),
    model.get_smoothed_unigram_prob('hotel'),
    sep='\n',
)

0.05834105040349252
0.011388190677779247


In [22]:
# Sanity check: unsmoothed (k=0) bigram probabilities of two word pairs.
model = NgramLanguageModel(n=2, k=0)
model.train(corpus.train_corpus_unk)
print(
    model.get_smoothed_bigram_prob('the', 'hotel'),
    model.get_smoothed_bigram_prob('it', 'was'),
    sep='\n',
)

0.07804232804232804
0.24157303370786518


In [5]:
# Grid of model orders and add-k smoothing constants to evaluate.
n_vec = [1, 2]
k_vec = [1, 0.1, 0.01, 0.001, 0]

# One row per (n, k) configuration: [n, k, perplexity].
results_train = np.zeros((len(n_vec) * len(k_vec), 3), dtype=float)
results_val = np.zeros_like(results_train)

idx = 0
for n in n_vec:
    for k in k_vec:
        # Train each configuration once and score it on both corpora;
        # calculate_perplexity only reads the counts, so the model can be reused.
        model = NgramLanguageModel(n=n, k=k)
        model.train(corpus.train_corpus_unk)

        ppl = model.calculate_perplexity(corpus.train_corpus_unk)
        results_train[idx] = [n, k, ppl]

        ppl = model.calculate_perplexity(corpus.val_corpus_unk)
        results_val[idx] = [n, k, ppl]

        idx += 1

In [6]:
# Print one line per (n, k) configuration, training-set table first.
print("## LANGUAGE MODEL PERPLEXITY EVALUATION ##")
print("--- MODEL TRAINING SET EVALUATION ---")

for heading, table in (
    ("--- Perplexity on Training Set ---", results_train),
    ("\n--- Perplexity on Validation Set ---", results_val),
):
    print(heading)
    for row in table:
        # row = [n, k, perplexity]; n is stored as a float in the results array.
        if row[0] == 1:
            print(f"Unigram | k = {row[1]:<6.3f} | Perplexity: {row[2]:.4f}")
        elif row[0] == 2:
            print(f"Bigram  | k = {row[1]:<6.3f} | Perplexity: {row[2]:.4f}")

## LANGUAGE MODEL PERPLEXITY EVALUATION ##
--- MODEL TRAINING SET EVALUATION ---
--- Perplexity on Training Set ---
Unigram | k = 1.000  | Perplexity: 336.5974
Unigram | k = 0.100  | Perplexity: 335.6611
Unigram | k = 0.010  | Perplexity: 335.6542
Unigram | k = 0.001  | Perplexity: 335.6547
Unigram | k = 0.000  | Perplexity: 335.6547
Bigram  | k = 1.000  | Perplexity: 380.1728
Bigram  | k = 0.100  | Perplexity: 108.1736
Bigram  | k = 0.010  | Perplexity: 49.1018
Bigram  | k = 0.001  | Perplexity: 35.8265
Bigram  | k = 0.000  | Perplexity: 33.0065

--- Perplexity on Validation Set ---
Unigram | k = 1.000  | Perplexity: 295.4775
Unigram | k = 0.100  | Perplexity: 293.3962
Unigram | k = 0.010  | Perplexity: 293.2443
Unigram | k = 0.001  | Perplexity: 293.2298
Unigram | k = 0.000  | Perplexity: 293.2282
Bigram  | k = 1.000  | Perplexity: 429.0841
Bigram  | k = 0.100  | Perplexity: 185.0601
Bigram  | k = 0.010  | Perplexity: 142.0454
Bigram  | k = 0.001  | Perplexity: 175.9258
Bigram  | k =