In [13]:
!pip install emoji contractions



In [1]:
import contractions
import demoji
import re
import string
import numpy as np
from collections import Counter, defaultdict
import nltk
from nltk.util import ngrams
from nltk.tokenize import word_tokenize

nltk.data.path.append("C:/nltk_data")
try:
    nltk.data.find('corpora/brown')
except LookupError:
    nltk.download('brown')

Part A: Apply the preprocessing pipeline

In [2]:
class Preprocessor:
    def __init__(self):
        demoji.download_codes()

    def expand_contractions(self, text):
        return contractions.fix(text)

    def handle_emojis(self, text):
        return demoji.replace(text, " ")

    def normalize_text(self, text):
        text = text.lower()
        text = re.sub(r'\b\d{4}[-/]\d{2}[-/]\d{2}\b|\b\d{2}[-/]\d{2}[-/]\d{4}\b', ' DATE ', text)
        text = re.sub(r'\b\d+[\d,.]*\b', ' NUM ', text)
        text = text.translate(str.maketrans('', '', string.punctuation))
        text = re.sub(r'\s+', ' ', text).strip()
        return text

    def process(self, text):
        text = self.expand_contractions(text)
        text = self.handle_emojis(text)
        text = self.normalize_text(text)
        tokens = word_tokenize(text)
        return tokens

Part B: Training the n-gram model from scratch

In [3]:
class NgramLanguageModel:
    def __init__(self, n=3):
        self.n = n
        self.counts = {i: defaultdict(Counter) for i in range(1, n + 1)}
        self.lambdas = np.ones(n) / n
        self.vocab = set()
        self.total_words = 0

    def train(self, sentences, held_out_sentences, em_iterations=10):
        print("Starting training...")
        padded_sentences = []
        for sentence in sentences:
            padded_sentence = ['<s>'] * (self.n - 1) + sentence + ['</s>']
            padded_sentences.append(padded_sentence)
            for token in padded_sentence:
                self.vocab.add(token)
        self.total_words = sum(len(s) for s in padded_sentences)
        for sentence in padded_sentences:
            for i in range(1, self.n + 1):
                for gram in ngrams(sentence, i):
                    if i == 1: 
                        self.counts[1][()][gram[0]] += 1
                    else: 
                        self.counts[i][gram[:-1]][gram[-1]] += 1
        print(f"Training complete. Vocabulary size: {len(self.vocab)}")
        print("Learning interpolation weights using EM algorithm...")
        self.learn_lambdas_em(held_out_sentences, em_iterations)

    def get_ngram_prob(self, word, context, order):
        if order == 1:
            return self.counts[1][()][word] / self.total_words
        
        context_count = sum(self.counts[order][context].values())
        if context_count == 0:
            return 0
        
        word_count_in_context = self.counts[order][context].get(word, 0)
        return word_count_in_context / context_count

    def learn_lambdas_em(self, held_out, iterations):
        for i in range(iterations):
            expected_counts = np.zeros(self.n)

            for sentence in held_out:
                padded_sentence = ['<s>'] * (self.n - 1) + sentence + ['</s>']
                for j in range(self.n - 1, len(padded_sentence)):
                    history = tuple(padded_sentence[j - self.n + 1 : j])
                    word = padded_sentence[j]
                    
                    probs = np.zeros(self.n)
                    for k in range(self.n):
                        order = k + 1
                        context = history[-(order-1):] if order > 1 else ()
                        probs[k] = self.get_ngram_prob(word, context, order)
                    
                    total_prob = np.dot(self.lambdas, probs)
                    if total_prob > 1e-9:
                        for k in range(self.n):
                            expected_counts[k] += (self.lambdas[k] * probs[k]) / total_prob
            
            self.lambdas = expected_counts / np.sum(expected_counts)
            print(f"EM Iteration {i+1}/{iterations}, New Lambdas: {self.lambdas}")

    def get_interpolated_prob(self, word, history):
        prob = 0.0
        for i in range(self.n):
            order = self.n - i
            context = tuple(history[-(order-1):]) if order > 1 else ()
            mle_prob = self.get_ngram_prob(word, context, order)
            prob += self.lambdas[i] * mle_prob
        return prob

#Part C: Sentence Completion Task

    def generate_sentence(self, prefix, max_length=20, k=5):
        preprocessor = Preprocessor()
        tokens = preprocessor.process(prefix)
        sentence = ['<s>'] * (self.n - 1) + tokens
            
        for _ in range(max_length):
            history = sentence[-(self.n - 1):]
            probs = {word: self.get_interpolated_prob(word, history) for word in self.vocab}
            probs.pop('<s>', None)
                
            sorted_probs = sorted(probs.items(), key=lambda item: item[1], reverse=True)
            top_k_words = [item[0] for item in sorted_probs[:k]]
            top_k_probs = [item[1] for item in sorted_probs[:k]]
                
            prob_sum = sum(top_k_probs)
            if prob_sum == 0: 
                break
            renormalized_probs = [p / prob_sum for p in top_k_probs]
                
            next_word = np.random.choice(top_k_words, p=renormalized_probs)
            if next_word == '</s>': 
                break
            sentence.append(next_word)
            
        return ' '.join(sentence[self.n - 1:])

Sampling

In [5]:
print("\n Task B: N-gram Language Model ")
from nltk.corpus import brown
corpus = brown.sents()[:5000]
preprocessor = Preprocessor()
processed_corpus = [preprocessor.process(' '.join(s)) for s in corpus]
train_data, held_out_data = processed_corpus[:4500], processed_corpus[4500:]

trigram_model = NgramLanguageModel(n=3)
trigram_model.train(train_data, held_out_data, em_iterations=5)

print("\n Sentence Completion Examples ")
prefixes = ["I'm director of IIT Ropar announced that", "After the club, they went to"]
for p in prefixes:
    generated_text = trigram_model.generate_sentence(p, max_length=15, k=10)
    print(f"Prefix: '{p}'\nCompleted: '{generated_text}'\n")


 Task B: N-gram Language Model 


  demoji.download_codes()


Starting training...
Training complete. Vocabulary size: 11950
Learning interpolation weights using EM algorithm...
EM Iteration 1/5, New Lambdas: [0.60280252 0.31212704 0.08507044]
EM Iteration 2/5, New Lambdas: [0.65416869 0.30291201 0.04291931]
EM Iteration 3/5, New Lambdas: [0.66667662 0.3049985  0.02832488]
EM Iteration 4/5, New Lambdas: [0.66967099 0.30889284 0.02143617]
EM Iteration 5/5, New Lambdas: [0.67019348 0.31218089 0.01762563]

 Sentence Completion Examples 
Prefix: 'I'm director of IIT Ropar announced that'
Completed: 'i am director of iit ropar announced that the united nations member with a few soldiers from the antitrust legislation together in yous'

Prefix: 'After the club, they went to'
Completed: 'after the club they went to press were NUM NUM and the other of the other side in hudson of sherman'

