In [29]:
from __future__ import annotations

import collections
import itertools
import math
from collections import defaultdict, deque
from pathlib import Path
from typing import Dict, Tuple, Deque, Generator, List

# --- Corpus input -----------------------------------------------------------
# Text file that is read line by line, and the cap on how many lines are read.
SENTENCES_FILE = "/content/drive/MyDrive/dataset/sentences_of_hindi_corpus_1.txt"
SENTENCE_LIMIT = 1_000_000

# --- Model settings ---------------------------------------------------------
# Highest n-gram order that is counted, and the constant used by add-k smoothing.
MAX_N = 4
ADD_K = 0.5

# --- Sentence boundary markers ----------------------------------------------
START_TOKEN, END_TOKEN = "<s>", "</s>"

In [30]:
def stream_sentences_with_boundaries(file_path: str, limit: int) -> Generator[str, None, None]:
    """Lazily yield the corpus as one flat token stream with sentence markers.

    Every non-blank line of the file is treated as one sentence and is emitted
    as ``START_TOKEN``, its whitespace-separated tokens, then ``END_TOKEN``.

    Args:
        file_path: Path of the UTF-8 text file to read. Bytes that cannot be
            decoded are dropped (``errors="ignore"``).
        limit: Maximum number of *lines* read from the file. Blank lines count
            towards this limit even though they produce no sentence.

    Yields:
        Tokens, one at a time, in file order.

    Side effects:
        Prints a summary line once the stream has been fully consumed, or a
        diagnostic line if the file does not exist (in which case nothing is
        yielded).
    """
    sentences_read = 0
    try:
        with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
            for line in itertools.islice(f, limit):
                # split() with no separator already discards surrounding and
                # repeated whitespace, so it never returns empty tokens.
                tokens = line.split()
                if not tokens:
                    continue

                yield START_TOKEN
                yield from tokens
                yield END_TOKEN
                sentences_read += 1

    except FileNotFoundError:
        # Report the problem instead of ending silently: callers only see an
        # empty stream and would otherwise have no hint about the cause.
        print(f"Corpus file not found: {file_path}")
        return

    print(f"Successfully processed {sentences_read} sentences.")

In [31]:
def mle_conditional(ngram: Tuple[str, ...], counts_n: Dict[Tuple[str, ...], int], counts_prev: Dict[Tuple[str, ...], int] | int, total_tokens: int) -> float:
    """Unsmoothed maximum-likelihood estimate P(w_n | h) = C(h, w_n) / C(h).

    Args:
        ngram: The full n-gram (history followed by the predicted word).
        counts_n: Counts of n-grams of the same order as ``ngram``.
        counts_prev: Counts of the (n-1)-grams, looked up with the history.
            Not consulted for unigrams.
        total_tokens: Corpus size N, used as the denominator for unigrams.

    Returns:
        The relative frequency, or 0.0 when the denominator is zero.
    """
    numerator = counts_n.get(ngram, 0)

    # A unigram has no history, so it is normalised by the corpus size;
    # anything else is normalised by how often its history was seen.
    denominator = total_tokens if len(ngram) == 1 else counts_prev.get(ngram[:-1], 0)

    if not denominator:
        return 0.0
    return numerator / denominator


def add_one_conditional(ngram: Tuple[str, ...], counts_n: Dict[Tuple[str, ...], int], counts_prev: Dict[Tuple[str, ...], int] | int, vocab_size: int, total_tokens: int) -> float:
    """Add-one estimate P*(w_n | h) = (C(h, w_n) + 1) / (C(h) + V).

    Args:
        ngram: The full n-gram (history followed by the predicted word).
        counts_n: Counts of n-grams of the same order as ``ngram``.
        counts_prev: Counts of the (n-1)-grams, looked up with the history.
            Not consulted for unigrams.
        vocab_size: Vocabulary size V added to the denominator.
        total_tokens: Corpus size N, used as C(h) for unigrams.

    Returns:
        The smoothed probability, or 0.0 when the denominator is zero.
    """
    seen = counts_n.get(ngram, 0)

    if len(ngram) != 1:
        history_total = counts_prev.get(ngram[:-1], 0)
    else:
        # Unigrams are normalised by the corpus size.
        history_total = total_tokens

    smoothed_total = history_total + vocab_size
    if not smoothed_total:
        return 0.0
    return (seen + 1) / smoothed_total


def add_k_conditional(ngram: Tuple[str, ...], counts_n: Dict[Tuple[str, ...], int], counts_prev: Dict[Tuple[str, ...], int] | int, vocab_size: int, k: float, total_tokens: int) -> float:
    """Add-k estimate P*(w_n | h) = (C(h, w_n) + k) / (C(h) + k * V).

    Args:
        ngram: The full n-gram (history followed by the predicted word).
        counts_n: Counts of n-grams of the same order as ``ngram``.
        counts_prev: Counts of the (n-1)-grams, looked up with the history.
            Not consulted for unigrams.
        vocab_size: Vocabulary size V.
        k: Pseudo-count added to every n-gram count.
        total_tokens: Corpus size N, used as C(h) for unigrams.

    Returns:
        The smoothed probability, or 0.0 when the denominator is zero.
    """
    seen = counts_n.get(ngram, 0)

    if len(ngram) != 1:
        history_total = counts_prev.get(ngram[:-1], 0)
    else:
        # Unigrams are normalised by the corpus size.
        history_total = total_tokens

    smoothed_total = history_total + (k * vocab_size)
    if not smoothed_total:
        return 0.0
    return (seen + k) / smoothed_total


def token_type_score(ngram: Tuple[str, ...], counts_n: Dict[Tuple[str, ...], int]) -> Tuple[float, float]:
    """Custom "token type" score: C(ngram) + number of distinct characters in the predicted word.

    This is an unnormalised score, not a probability.

    Args:
        ngram: The full n-gram; its last element is the predicted word. Must be
            non-empty.
        counts_n: Counts of n-grams of the same order as ``ngram``.

    Returns:
        A pair ``(score, type_part)`` where ``type_part`` is the number of
        distinct characters in the predicted word and ``score`` is the n-gram
        count plus ``type_part``.
    """
    count_ngram = counts_n.get(ngram, 0)
    predicted = ngram[-1]
    # set() over a str yields distinct code points, so a combining mark (for
    # example a Devanagari vowel sign) counts as a character of its own.
    tts_score_part = float(len(set(predicted)))
    return count_ngram + tts_score_part, tts_score_part

In [32]:
def print_demonstration(n: int, counts_n: Dict[Tuple[str, ...], int], counts_prev: Dict[Tuple[str, ...], int] | int, vocab_size: int, total_tokens: int, gram: Tuple[str, ...], context: Tuple[str, ...], label: str):
    """Print the raw counts and every estimate for one example n-gram.

    Args:
        n: Order of the model being demonstrated.
        counts_n: Counts of the n-grams of order ``n``.
        counts_prev: Counts of the (n-1)-grams; only looked up when ``n`` is
            not 1.
        vocab_size: Vocabulary size V passed to the smoothed estimators.
        total_tokens: Corpus size N, reported as the context count when
            ``n`` is 1.
        gram: The n-gram whose probability is shown.
        context: History part of ``gram``, used to look up the context count.
        label: Human-readable name of the probability, shown in the header.
    """
    seen = counts_n.get(gram, 0)
    # For display only: unigrams report N, higher orders report C(history).
    history_total = total_tokens if n == 1 else counts_prev.get(context, 0)

    unsmoothed = mle_conditional(gram, counts_n, counts_prev, total_tokens)
    laplace = add_one_conditional(gram, counts_n, counts_prev, vocab_size, total_tokens)
    lidstone = add_k_conditional(gram, counts_n, counts_prev, vocab_size, ADD_K, total_tokens)
    score, type_part = token_type_score(gram, counts_n)

    report = [
        f"\n--- {n}-gram Model ({label}) ---",
        f"  Counts: C(Ngram)={seen}, C(Context)={history_total}",
        f"  Unsmoothed (MLE): {unsmoothed:.10f}",
        f"  Add-One (k=1): {laplace:.10f}",
        f"  Add-K (k={ADD_K}): {lidstone:.10f}",
        f"  Add Token Type Score: {score:.4f} (C(Ngram) + {type_part:.2f})",
    ]
    print("\n".join(report))


In [33]:
def main():
    """Count 1..MAX_N-grams over the corpus and print example estimates.

    Streams tokens from ``SENTENCES_FILE`` (at most ``SENTENCE_LIMIT`` lines),
    accumulates n-gram counts for every order up to ``MAX_N``, prints a summary
    of the counts, and then prints the MLE / add-one / add-k / token-type
    figures for a fixed set of example n-grams. Examples whose order exceeds
    ``MAX_N`` are skipped because no counts exist for them.
    """
    counts: Dict[int, Dict[Tuple[str, ...], int]] = {n: defaultdict(int) for n in range(1, MAX_N + 1)}
    total_tokens = 0
    # Sliding history of the MAX_N - 1 most recent tokens. It is never cleared,
    # so higher-order n-grams can span an END_TOKEN -> START_TOKEN boundary.
    window: Deque[str] = deque(maxlen=MAX_N - 1)

    # 1. Load Data (Q1) and Count N-grams (Q2)
    for tok in stream_sentences_with_boundaries(SENTENCES_FILE, SENTENCE_LIMIT):
        total_tokens += 1

        # History plus the current token: its last n items are exactly the
        # n-gram that ends at this token, for every order seen so far.
        recent = tuple(window) + (tok,)
        for n in range(1, min(len(recent), MAX_N) + 1):
            counts[n][recent[-n:]] += 1

        window.append(tok)

    if total_tokens == 0:
        # Say why nothing follows instead of ending without any output.
        print(f"No tokens were read from {SENTENCES_FILE}; nothing to report.")
        return

    # Each distinct token is one key of the unigram table, so no separate
    # vocabulary set has to be kept in memory.
    vocab_size = len(counts[1])

    print("\n" + "="*50)
    print("MODEL SUMMARY")
    print(f"Total tokens (N): {total_tokens}; Vocabulary size (V): {vocab_size}")
    for n in range(1, MAX_N + 1):
        print(f"Unique {n}-grams: {len(counts[n])}")
    print("="*50)

    # 2. Demonstration of Probability Calculation (Q2)

    # Example N-grams to test, each paired with its display label. The order
    # and the history are derived from the n-gram itself.
    examples = [
        (("खाना",), "P(खाना)"),
        (("आज", "खाना"), "P(खाना | आज)"),
        (("मैं", "आज", "खाना"), "P(खाना | मैं, आज)"),
        ((START_TOKEN, "मैं", "आज", "खाना"), f"P(खाना | {START_TOKEN}, मैं, आज)"),
    ]

    for gram, label in examples:
        n = len(gram)
        if n > MAX_N:
            # No table was built for this order; looking it up would fail.
            continue

        # Pass total_tokens for N=1, or the counts dictionary for N>1
        counts_prev = total_tokens if n == 1 else counts[n - 1]

        print_demonstration(
            n, counts[n], counts_prev, vocab_size, total_tokens, gram, gram[:-1], label
        )


# Run the pipeline only when this code executes as the top-level module,
# not when it is imported from elsewhere.
if __name__ == '__main__':
    main()

Successfully processed 1000000 sentences.

MODEL SUMMARY
Total tokens (N): 19112465; Vocabulary size (V): 260592
Unique 1-grams: 260592
Unique 2-grams: 3161479
Unique 3-grams: 8887725
Unique 4-grams: 13350139

--- 1-gram Model (P(खाना)) ---
  Counts: C(Ngram)=1503, C(Context)=19112465
  Unsmoothed (MLE): 0.0000786398
  Add-One (k=1): 0.0000776336
  Add-K (k=0.5): 0.0000781333
  Add Token Type Score: 1506.0000 (C(Ngram) + 3.00)

--- 2-gram Model (P(खाना | आज)) ---
  Counts: C(Ngram)=1, C(Context)=15657
  Unsmoothed (MLE): 0.0000638692
  Add-One (k=1): 0.0000072398
  Add-K (k=0.5): 0.0000102773
  Add Token Type Score: 4.0000 (C(Ngram) + 3.00)

--- 3-gram Model (P(खाना | मैं, आज)) ---
  Counts: C(Ngram)=0, C(Context)=68
  Unsmoothed (MLE): 0.0000000000
  Add-One (k=1): 0.0000038364
  Add-K (k=0.5): 0.0000038354
  Add Token Type Score: 3.0000 (C(Ngram) + 3.00)

--- 4-gram Model (P(खाना | <s>, मैं, आज)) ---
  Counts: C(Ngram)=0, C(Context)=19
  Unsmoothed (MLE): 0.0000000000
  Add-One (k=1)