<a href="https://colab.research.google.com/github/uttamgchauhan/NLP/blob/main/bigram_language_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import math
from collections import defaultdict
from nltk import ngrams
from nltk.tokenize import word_tokenize
import nltk

In [None]:
# word_tokenize relies on the Punkt sentence-tokenizer data. Older NLTK
# releases load it from the pickled 'punkt' package, while recent releases
# (3.9+) load it from the pickle-free 'punkt_tab' package, so fetch both to
# keep the notebook runnable across NLTK versions.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Function to build a bigram language model with Add-One (Laplace) smoothing
def build_bigram_language_model(corpus):
    """
    Builds a bigram language model with Add-One (Laplace) smoothing using the given corpus.

    The corpus is tokenized with ``word_tokenize`` and treated as a single
    sequence. ``ngrams`` pads it on both sides with its default pad symbol,
    ``None``, so ``None`` appears as the previous word of the first token and
    as the next word of the last token.

    Args:
        corpus (str): The input corpus.

    Returns:
        dict: A nested dictionary (``defaultdict`` of ``defaultdict``)
              representing the bigram language model. The keys are previous
              words, and the values are dictionaries mapping every possible
              next word (every corpus token type plus the ``None`` boundary
              marker) to its smoothed probability. Each inner dictionary
              sums to 1.
    """
    tokens = word_tokenize(corpus)

    # Raw bigram counts, kept separate from the probabilities so that no raw
    # count can be left behind in the returned model.
    bigram_counts = defaultdict(lambda: defaultdict(int))
    for prev_word, word in ngrams(tokens, 2, pad_left=True, pad_right=True):
        bigram_counts[prev_word][word] += 1

    # The right padding makes None an observable "next word" (end of text),
    # so it has to be part of the smoothing vocabulary. Leaving it out keeps
    # its count in the denominator but never converts it to a probability.
    vocabulary = set(tokens)
    vocabulary.add(None)
    vocabulary_size = len(vocabulary)

    # Apply Add-One (Laplace) smoothing:
    #   P(word | prev_word) = (count(prev_word, word) + 1) / (count(prev_word) + V)
    language_model = defaultdict(lambda: defaultdict(int))
    for prev_word, next_counts in bigram_counts.items():
        denominator = sum(next_counts.values()) + vocabulary_size
        row = language_model[prev_word]
        for word in vocabulary:
            # .get avoids inserting zero entries into the count table.
            row[word] = (next_counts.get(word, 0) + 1) / denominator

    return language_model

In [None]:
# Function to calculate perplexity for bigram language model
def calculate_bigram_perplexity(language_model, test_corpus):
    """
    Calculates the perplexity of a bigram language model on a test corpus.

    The test corpus is tokenized with ``word_tokenize`` and padded on both
    sides by ``ngrams`` (default pad symbol ``None``), so a corpus of N tokens
    yields N + 1 bigrams. Perplexity is ``2 ** (-average log2 probability)``,
    averaged over those N + 1 scored bigrams.

    Args:
        language_model (dict): Nested mapping
            ``{prev_word: {word: probability}}``, e.g. the result of
            build_bigram_language_model. It is only read, never modified.
        test_corpus (str): The text to evaluate.

    Returns:
        float: The perplexity of the model on the test corpus.

    Raises:
        ValueError: If ``language_model`` contains no contexts at all, or
            (from ``math.log2``) if a stored probability is not positive.
    """
    if not language_model:
        raise ValueError("language_model is empty; cannot compute perplexity")

    tokens = word_tokenize(test_corpus)
    bigram_tokens = list(ngrams(tokens, 2, pad_left=True, pad_right=True))

    # Size used for the back-off floor when the context itself was never
    # seen: the widest next-word distribution stored in the model.
    fallback_support = max(len(row) for row in language_model.values())

    log_sum = 0.0
    for prev_word, word in bigram_tokens:
        # .get instead of indexing: indexing a defaultdict with an unseen
        # context would insert an empty row into the caller's model, and the
        # empty row would then yield a floor probability of 1.
        row = language_model.get(prev_word)
        if row is not None and word in row:
            probability = row[word]
        else:
            # Heuristic floor for unseen events: 1 / (2 * support + 1),
            # where support is the size of the context's distribution, or
            # the model-wide size when the context is unknown.
            support = len(row) if row else fallback_support
            probability = 1 / (2 * support + 1)
        log_sum += math.log2(probability)

    # Normalise by the number of bigrams actually scored (tokens + the
    # end-of-text prediction), not by the token count alone. This also
    # avoids a division by zero for an empty test corpus.
    perplexity = 2 ** (-log_sum / len(bigram_tokens))
    return perplexity

In [None]:
# Example usage: build a bigram model from a small training text and report
# its perplexity on a different test text.
corpus = "I love programming. Programming is fun. Programming is creative."
test_corpus = "I enjoy programming. Programming is my passion."

# Train on `corpus`, then score `test_corpus` with the resulting model.
language_model = build_bigram_language_model(corpus)
perplexity = calculate_bigram_perplexity(language_model, test_corpus)

print(f"Perplexity: {perplexity}")

Perplexity: 3.462469001722811
