In [None]:
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
import pandas as pd
import csv
import numpy as np
import heapq
import math
from collections import defaultdict

In [None]:
from collections import defaultdict
import math
from itertools import islice

In [None]:
!kaggle datasets download -d sudalairajkumar/telugu-nlp

Dataset URL: https://www.kaggle.com/datasets/sudalairajkumar/telugu-nlp
License(s): copyright-authors
Downloading telugu-nlp.zip to /content
  0% 0.00/88.7M [00:00<?, ?B/s]
100% 88.7M/88.7M [00:00<00:00, 1.40GB/s]


In [None]:
!unzip telugu-nlp.zip -d telugu-nlp

Archive:  telugu-nlp.zip
  inflating: telugu-nlp/telugu_books/telugu_books.csv  
  inflating: telugu-nlp/telugu_news/test_telugu_news.csv  
  inflating: telugu-nlp/telugu_news/train_telugu_news.csv  


In [None]:
from datasets import load_dataset
dataset=load_dataset("ai4bharat/IndicCorpV2","indiccorp_v2",split="tel_Telu",streaming=True)
import regex as re

def telugu_sentence_tokenizer(text):
    text = re.sub(r'\s+', ' ', text).strip()
    sentence_endings = r'(?<=[.!?।])\s'
    sentences = re.split(sentence_endings, text)
    return [s.strip() for s in sentences if s.strip()]

def telugu_word_tokenizer(text):
    pattern = r'''
        (?:\p{Script=Telugu}(?:[\p{M}\u200C\u200D])*)+                 # Telugu words
        | (?:\d+\.\d+)                         # Decimal numbers
        | (?:\d{1,2}[-/]\d{1,2}[-/]\d{2,4})    # Dates
        | (?:[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}) # Emails
        | (?:(?:[https?://|www\.])[^\s]+)                  # URLs
        | (?:\d+)                              # Numbers
        | [^\s\p{L}\p{N}]                      # Punctuation/symbols
    '''
    tokens = re.findall(pattern, text, flags=re.VERBOSE)
    tokens = [t.replace("\u200c", "").replace("\u200d", "") for t in tokens]
    return tokens

In [None]:
def count_ngrams_stream(dataset, max_n, limit=500000):
    counts = {n: defaultdict(int) for n in range(1, max_n + 1)}
    vocab = set()

    for example in islice(dataset, limit):
        text = example.get("text") or example.get("content") or ""
        if not text.strip():
            continue
        for sentence in telugu_sentence_tokenizer(text):
            tokens = (["<bos>"] * (max_n - 1)) + telugu_word_tokenizer(sentence) + ["<eos>"]
            vocab.update(tokens)

            for n in range(1, max_n + 1):
                for i in range(len(tokens) - n + 1):
                    ngram = tuple(tokens[i:i+n])
                    counts[n][ngram] += 1

    ngram_counts = {n: dict(counts[n]) for n in range(1, max_n + 1)}

    context_counts = {n: ngram_counts.get(n - 1, {}) for n in range(2, max_n + 1)}

    return ngram_counts, context_counts, vocab

In [None]:
class Ngram:
    def __init__(self, counts, context_counts, vocab, n):
        self.counts = counts
        self.context_counts = context_counts
        self.vocab = vocab
        self.V = len(vocab)
        self.n = n

    def prob(self, ngram):
        if self.n == 1:
            return self.counts.get(ngram, 0) / sum(self.counts.values())
        context = ngram[:-1]
        return self.counts.get(ngram, 0) / max(1, self.context_counts.get(context, 0))

    def prob_addone(self, ngram):
        if self.n == 1:
            return (self.counts.get(ngram, 0) + 1) / (sum(self.counts.values()) + self.V)
        context = ngram[:-1]
        return (self.counts.get(ngram, 0) + 1) / (self.context_counts.get(context, 0) + self.V)

    def prob_addk(self, ngram, k=0.5):
        if self.n == 1:
            return (self.counts.get(ngram, 0) + k) / (sum(self.counts.values()) + self.V * k)
        context = ngram[:-1]
        return (self.counts.get(ngram, 0) + k) / (self.context_counts.get(context, 0) + self.V * k)

    def prob_addtokentype(self, ngram):
        if self.n == 1:
            return (self.counts.get(ngram, 0) + random.r) / (sum(self.counts.values()) + self.V)
        context = ngram[:-1]
        return (self.counts.get(ngram, 0) + self.V) / (self.context_counts.get(context, 0) + self.V)

    def prob_sentences(self, tokens, smoothing="raw", k=0.5, log_space=True):
      if log_space:
          log_prob = 0.0
          for i in range(len(tokens) - self.n + 1):
              ngram = tuple(tokens[i:i+self.n])
              if smoothing == "raw":
                  p = self.prob(ngram)
              elif smoothing == "addone":
                  p = self.prob_addone(ngram)
              elif smoothing == "addk":
                  p = self.prob_addk(ngram, k)
              else:
                  p = self.prob_addtokentype(ngram)
              if p > 0:
                  log_prob += math.log(p)
              else:
                  log_prob += float('-inf')
          return log_prob
      else:
          prob = 1.0
          for i in range(len(tokens) - self.n + 1):
              ngram = tuple(tokens[i:i+self.n])
              if smoothing == "raw":
                  prob *= self.prob(ngram)
              elif smoothing == "addone":
                  prob *= self.prob_addone(ngram)
              elif smoothing == "addk":
                  prob *= self.prob_addk(ngram, k)
              else:
                  prob *= self.prob_addtokentype(ngram)
          return prob

In [None]:
print("Counting unigrams...")
uni_counts, uni_context, vocab = count_ngrams_stream(dataset, 1)
unigram = Ngram(uni_counts, {}, vocab, 1)

print("Counting bigrams...")
bi_counts, bi_context, _ = count_ngrams_stream(dataset, 2)
bigram = Ngram(bi_counts, uni_counts, vocab, 2)

print("Counting trigrams...")
tri_counts, tri_context, _ = count_ngrams_stream(dataset, 3)
trigram = Ngram(tri_counts, bi_counts, vocab, 3)

print("Counting quadgrams...")
quad_counts, quad_context, _ = count_ngrams_stream(dataset, 4)
quadgram = Ngram(quad_counts, tri_counts, vocab, 4)

Counting unigrams...
Counting bigrams...
Counting trigrams...
Counting quadgrams...


In [None]:
df = pd.read_csv("/content/telugu-nlp/telugu_news/test_telugu_news.csv")
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

val_df = df.iloc[:1000]
test_df = df.iloc[1000:2000]
train_df = df.iloc[2000:]

print("Train size:", len(train_df))
print("Validation size:", len(val_df))
print("Test size:", len(test_df))

Train size: 2329
Validation size: 1000
Test size: 1000


In [None]:
for sentence in test_df['body'][:10]:
    tokens = ["<bos>", *telugu_word_tokenizer(sentence), "<eos>"]
    print("Sentence:", sentence[:60], "...")
    print("Unigram raw:", unigram.prob_sentences(tokens))
    tokens = ["<bos>", *telugu_word_tokenizer(sentence), "<eos>"]
    print("Bigram add-one:", bigram.prob_sentences(tokens, "addone"))
    tokens = ["<bos>" * 2 , *telugu_word_tokenizer(sentence), "<eos>"]
    print("Trigram add-k:", trigram.prob_sentences(tokens, "addk", k=0.5))
    tokens = ["<bos>" * 3, *telugu_word_tokenizer(sentence), "<eos>"]
    print("Quadgram token-type:", quadgram.prob_sentences(tokens, "addtokentype"))
    print("="*60)

In [None]:
class Ngram:
    def __init__(self, counts, context_counts, vocab, n):
        self.counts = counts
        self.context_counts = context_counts
        self.vocab = vocab
        self.V = len(vocab)
        self.n = n
        self.Nc = defaultdict(int)
        for c in counts.values():
            self.Nc[c] += 1

        self.N = sum(counts.values())

        self.top100={}

    def prob_good_turing(self, ngram):
        C = self.counts.get(ngram, 0)
        Nc = self.Nc.get(C, 0)
        Nc1 = self.Nc.get(C+1, 0)

        if C == 0:
            N1 = self.Nc.get(1, 0)
            return N1 / (self.N * ((self.V ** self.n) - self.N)) if self.N > 0 else 0.0

        if Nc == 0:
            return C / self.N

        C_star = (C+1) * (Nc1 / Nc)

        self.top100[C]=[Nc,C_star]

        return C_star / self.N

    def prob_sentences(self, tokens, use_log=True):
        log_prob = 0.0
        prob = 1.0

        for i in range(len(tokens) - self.n + 1):
            ngram = tuple(tokens[i:i+self.n])
            p = self.prob_good_turing(ngram)

            if use_log:
                if p > 0:
                    log_prob += math.log(p)
                else:
                    log_prob += float("-inf")
            else:
                prob *= p

        return log_prob if use_log else prob

    def print_top100(self):
      rows = []
      for C, Nc in self.Nc.items():
          Nc1 = self.Nc.get(C+1, 0)
          if C == 0:
              continue
          C_star = (C+1) * (Nc1 / Nc) if Nc > 0 else C
          rows.append((C, Nc, round(C_star,3)))

      rows = sorted(rows, key=lambda x: x[0], reverse=True)[:100]
      print("C (MLE)   Nc   C*")
      for C, Nc, C_star in rows:
          print(f"{C:<8} {Nc:<4} {C_star}")

In [None]:
print("Counting unigrams...")
uni_counts, uni_context, vocab = count_ngrams_stream(dataset, 1)
uni_counts = uni_counts[1]
unigram = Ngram(uni_counts, {}, vocab, 1)

print("Counting bigrams...")
bi_counts, bi_context, _ = count_ngrams_stream(dataset, 2)
bi_counts = bi_counts[1]
bigram = Ngram(bi_counts, uni_counts, vocab, 2)

print("Counting trigrams...")
tri_counts, tri_context, _ = count_ngrams_stream(dataset, 3)
tri_counts = tri_counts[1]
trigram = Ngram(tri_counts, bi_counts, vocab, 3)

print("Counting quadgrams...")
quad_counts, quad_context, _ = count_ngrams_stream(dataset, 4)
quad_counts = quad_counts[1]
quadgram = Ngram(quad_counts, tri_counts, vocab, 4)

Counting unigrams...
Counting bigrams...
Counting trigrams...
Counting quadgrams...


In [None]:
for sentence in test_df['body'][:10]:
    tokens = ["<bos>", *telugu_word_tokenizer(sentence), "<eos>"]
    print("Sentence:", sentence[:60], "...")
    print("Unigram raw:", unigram.prob_sentences(tokens))
    print("Bigram add-one:", bigram.prob_sentences(tokens))
    print("Trigram add-k:", trigram.prob_sentences(tokens))
    print("Quadgram token-type:", quadgram.prob_sentences(tokens))
    print("="*60)

Sentence: కరాచీ: చాంపియన్స్ ట్రోఫీలో భారత్‌ను ఓడించే సత్తా పాకిస్థాన్‌ ...
Unigram raw: -inf
Bigram add-one: -inf
Trigram add-k: -inf
Quadgram token-type: -5832.666522225466
Sentence: చాలా ప్రతిష్టాత్మకంగా తెరకెక్కనున్న చిరంజీవి 151వ చిత్రం సై. ...
Unigram raw: -inf
Bigram add-one: -inf
Trigram add-k: -inf
Quadgram token-type: -5763.490267141461
Sentence: అమెరికా కొత్త అధ్యక్షుడు ట్రంప్‌కు చెందిన దుబాయిలోని గోల్ఫ్‌ ...
Unigram raw: -inf
Bigram add-one: -inf
Trigram add-k: -5971.641103469907
Quadgram token-type: -8454.942746253628
Sentence: రాహుల్‌కు ఇద్దరు విదేశీ గర్ల్‌ఫ్రెండ్స్‌.. పెళ్లిపై వేదాంత ధ ...
Unigram raw: -inf
Bigram add-one: -inf
Trigram add-k: -inf
Quadgram token-type: -16346.462049199172
Sentence: 
నీరజ్‌ శ్యామ్‌, నైరా షా జంటగా నటించిన చిత్రం ‘ఇ.ఈ’. రామ్‌ గ ...
Unigram raw: -inf
Bigram add-one: -inf
Trigram add-k: -inf
Quadgram token-type: -4194.908117069552
Sentence: చండీగఢ్: అత్యాచారం కేసుల్లో కారాగార శిక్ష అనుభవిస్తున్న డేరా ...
Unigram raw: -inf
Bigram add-one: -inf
T

In [None]:
print("top 100 frequencies of unigram")
unigram.print_top100()

top 100 frequencies of unigram
C (MLE)   Nc   C*
722767   1    0.0
348303   1    0.0
107152   1    0.0
70566    1    0.0
48747    1    0.0
36937    1    0.0
34873    1    0.0
33380    1    0.0
29231    1    0.0
29052    1    0.0
28639    1    0.0
28305    1    0.0
27982    1    0.0
26522    1    0.0
25353    1    0.0
24409    1    0.0
22876    1    0.0
22805    1    0.0
22683    1    0.0
22352    1    0.0
21720    1    0.0
20583    1    0.0
20352    1    0.0
20097    1    0.0
19662    1    0.0
18792    1    0.0
18457    1    0.0
17942    1    0.0
17603    1    0.0
17507    1    0.0
16760    1    0.0
16341    1    0.0
15577    1    0.0
15483    1    0.0
15434    1    0.0
15101    1    0.0
14567    1    0.0
14246    1    0.0
13917    1    0.0
13718    1    0.0
13709    1    0.0
13554    1    0.0
13305    1    0.0
13182    1    0.0
12956    1    0.0
12547    1    0.0
12363    1    0.0
11838    1    0.0
11649    1    0.0
11551    1    0.0
11547    1    0.0
11310    1    0.0
11243    1    0

In [None]:
print("top 100 frequencies of bigram")
bigram.print_top100()

top 100 frequencies of bigram
C (MLE)   Nc   C*
722760   1    0.0
70558    1    0.0
47934    1    0.0
23415    1    0.0
20136    1    0.0
17658    1    0.0
17340    1    0.0
15482    1    0.0
15434    1    0.0
13200    1    0.0
11193    1    0.0
9800     1    0.0
9785     1    0.0
9741     1    0.0
9415     1    0.0
8588     1    0.0
8384     1    0.0
8258     1    0.0
8231     1    0.0
8105     1    0.0
7988     1    0.0
7921     1    0.0
7686     1    0.0
7294     1    0.0
7263     1    0.0
6544     1    0.0
6415     1    0.0
6007     1    0.0
5923     1    0.0
5672     1    0.0
5481     1    0.0
4985     1    0.0
4906     1    0.0
4535     1    0.0
4411     1    0.0
4230     1    0.0
4083     1    0.0
4037     1    0.0
4000     1    0.0
3992     1    0.0
3912     1    0.0
3799     1    0.0
3797     1    0.0
3770     1    0.0
3725     1    0.0
3655     1    0.0
3607     1    0.0
3577     1    0.0
3575     1    0.0
3536     1    0.0
3522     1    0.0
3512     1    0.0
3485     1    0.

In [None]:
print("top 100 frequencies of trigram")
trigram.print_top100()

top 100 frequencies of trigram
C (MLE)   Nc   C*
722760   1    0.0
70558    1    0.0
47934    1    0.0
23415    1    0.0
20136    1    0.0
17658    1    0.0
17340    1    0.0
15482    1    0.0
15434    1    0.0
13200    1    0.0
11193    1    0.0
9800     1    0.0
9785     1    0.0
9741     1    0.0
9415     1    0.0
8588     1    0.0
8384     1    0.0
8258     1    0.0
8230     1    0.0
7881     1    0.0
7686     1    0.0
7294     1    0.0
7263     1    0.0
6544     1    0.0
6415     1    0.0
6007     1    0.0
5923     1    0.0
5577     1    0.0
5481     1    0.0
4985     1    0.0
4906     1    0.0
4577     1    0.0
4371     1    0.0
4232     1    0.0
4230     1    0.0
4083     1    0.0
4037     1    0.0
4000     1    0.0
3992     1    0.0
3912     1    0.0
3799     1    0.0
3797     1    0.0
3770     1    0.0
3725     1    0.0
3575     1    0.0
3536     1    0.0
3515     1    0.0
3512     1    0.0
3451     1    0.0
3422     1    0.0
3421     1    3422.0
3420     1    3421.0
3411     

In [None]:
print("top 100 frequencies of quadgram")
quadgram.print_top100()

top 100 frequencies of quadgram
C (MLE)   Nc   C*
722760   1    0.0
70558    1    0.0
47934    1    0.0
23415    1    0.0
20136    1    0.0
17658    1    0.0
17340    1    0.0
15482    1    0.0
15434    1    0.0
13200    1    0.0
11193    1    0.0
9800     1    0.0
9785     1    0.0
9741     1    0.0
9415     1    0.0
8588     1    0.0
8384     1    0.0
8258     1    0.0
8230     1    0.0
7686     1    0.0
7294     1    0.0
7263     1    0.0
6544     1    0.0
6415     1    0.0
6007     1    0.0
5923     1    0.0
5577     1    0.0
5481     1    0.0
4985     1    0.0
4906     1    0.0
4577     1    0.0
4371     1    0.0
4351     1    0.0
4232     1    0.0
4230     1    0.0
4083     1    0.0
4037     1    0.0
4000     1    0.0
3992     1    0.0
3912     1    0.0
3799     1    0.0
3797     1    0.0
3770     1    0.0
3725     1    0.0
3575     1    0.0
3536     1    0.0
3512     1    0.0
3451     1    0.0
3422     1    0.0
3420     2    0.0
3411     1    0.0
3340     1    0.0
3331     1    

In [None]:
lam={'l1':0,'l2':0,'l3':0,'l4':0}
for example in islice(dataset, 500000):
        text = example.get("text") or example.get("content") or ""
        if not text.strip():
            continue
        for sentence in telugu_sentence_tokenizer(text):
          tokens = ["<bos>", *telugu_word_tokenizer(sentence), "<eos>"]
          lam['l1']+=unigram.prob_sentences(tokens,use_log=True)
          lam['l2']+=bigram.prob_sentences(tokens,use_log=True)
          lam['l3']+=trigram.prob_sentences(tokens,use_log=True)
          lam['l4']+=quadgram.prob_sentences(tokens,use_log=True)

sum=lam['l1']+lam['l2']+lam['l3']+lam['l4']
lam['l1']=lam['l1']/sum
lam['l2']=lam['l2']/sum
lam['l3']=lam['l3']/sum
lam['l4']=lam['l4']/sum

# def deleted_interpolation(lam,n,tokens):
  # return (lam['l1'] if n>=1 else 0)*unigram.prob_sentences(tokens)+ (lam['l2'] if n>=2 else 0)*bigram.prob_sentences(tokens)+ (lam['l3'] if n>=3 else 0)*trigram.prob_sentences(tokens)+ (lam['l4'] if n>=4 else 0)*quadgram.prob_sentences(tokens)

In [None]:
print(lam)

{'l1': nan, 'l2': nan, 'l3': nan, 'l4': nan}


In [None]:
def deleted_interpolation(tokens, lam, models, n):
    score = 0.0
    for order in range(1, n+1):
        score += lam[f"l{order}"] * models[order].prob_sentences(tokens)
    return score

In [None]:
models = {1: unigram, 2: bigram, 3: trigram, 4: quadgram}

for sentence in test_df['body'][:10]:
    tokens = ["<bos>", *telugu_word_tokenizer(sentence), "<eos>"]
    print("Sentence:", sentence[:60], "...")
    print("Deleted interpolation (n=4):", deleted_interpolation(tokens, lam, models, 4))
    print("="*60)

Sentence: కరాచీ: చాంపియన్స్ ట్రోఫీలో భారత్‌ను ఓడించే సత్తా పాకిస్థాన్‌ ...
Deleted interpolation (n=4): nan
Sentence: చాలా ప్రతిష్టాత్మకంగా తెరకెక్కనున్న చిరంజీవి 151వ చిత్రం సై. ...
Deleted interpolation (n=4): nan
Sentence: అమెరికా కొత్త అధ్యక్షుడు ట్రంప్‌కు చెందిన దుబాయిలోని గోల్ఫ్‌ ...
Deleted interpolation (n=4): nan
Sentence: రాహుల్‌కు ఇద్దరు విదేశీ గర్ల్‌ఫ్రెండ్స్‌.. పెళ్లిపై వేదాంత ధ ...
Deleted interpolation (n=4): nan
Sentence: 
నీరజ్‌ శ్యామ్‌, నైరా షా జంటగా నటించిన చిత్రం ‘ఇ.ఈ’. రామ్‌ గ ...
Deleted interpolation (n=4): nan
Sentence: చండీగఢ్: అత్యాచారం కేసుల్లో కారాగార శిక్ష అనుభవిస్తున్న డేరా ...
Deleted interpolation (n=4): nan
Sentence: కొందరికీ అన్నీ ఉన్నా సక్సెస్‌లు రావడం కష్టమవుతోంది. మెగా కాం ...
Deleted interpolation (n=4): nan
Sentence: ప్రసాద్‌కు ద్రోణాచార్య, హకీమ్‌కు ధ్యాన్‌చంద్‌ సాకేత్‌, జ్యోత ...
Deleted interpolation (n=4): nan
Sentence: 
రహానె అవుట్‌.. ధవన్‌ ఇన్‌ఆసీస్‌తో టీ20లకు భారత జట్టున్యూఢిల ...
Deleted interpolation (n=4): nan
Sentence: లక్నో: హిందువుల మన

In [None]:
class KatzNgram:
  def __init__(self,n,counts,context_counts,lower_order_model,k=5):
    self.n=n
    self.k=k
    self.counts=counts
    self.context_counts=context_counts
    self.lower_order_model=lower_order_model

    self.N=sum(self.counts.values())
    self.Nc=defaultdict(int)
    for c in self.counts.values():
      if c<=self.k+1:
       self.Nc[c]+=1

    self._calculate_discounts()
    if self.lower_order_model:
      self._calculate_alphas()

  def _get_discounts(self,count):
    if count==0:
      return 1.0

    if count>self.k:
      return 1.0

    Nc=self.Nc.get(count,0)
    Nc1=self.Nc.get(count+1,0)

    if Nc==0 or Nc1==0:
      return 1.0

    C_star = (count+1)*(Nc1/Nc)
    return C_star/count

  def _calculate_discounts(self):
    self.discounts={c:self._get_discounts(c) for c in range(1,self.k+1)}

  def _calculate_alphas(self):
    self.alphas={}
    context_groups=defaultdict(list)
    for ngram in self.counts.keys():
      context_groups[ngram[:-1]].append(ngram)

    for context, context_count in self.context_counts.items():
            prob_mass_seen = 0.0
            for ngram in context_groups[context]:
                count = self.counts.get(ngram, 0)
                discount = self.discounts.get(count, 1.0)
                prob_mass_seen += (count * discount) / context_count

            lower_order_prob_mass = 0.0
            for ngram in context_groups[context]:
                lower_order_prob_mass += self.lower_order_model.prob(ngram[1:])

            numerator = 1.0 - prob_mass_seen
            denominator = 1.0 - lower_order_prob_mass

            if denominator == 0:
                self.alphas[context] = 1.0
            else:
                self.alphas[context] = numerator / denominator

  def prob(self, ngram):
        if self.n == 1:
            count = self.counts.get(ngram, 0)

            if count == 0:
                N1 = self.Nc.get(1, 0)
                return N1 / self.N if self.N > 0 else 0

            return count / self.N

        count = self.counts.get(ngram, 0)
        context = ngram[:-1]
        context_count = self.context_counts.get(context, 0)

        if count > 0:
            discount = self.discounts.get(count, 1.0)
            if context_count == 0:
               return 0.0
            return (count * discount) / context_count

        else:
            alpha = self.alphas.get(context, 1.0)
            return alpha * self.lower_order_model.prob(ngram[1:])

  def sentence_log_prob(self, tokens):
        log_prob = 0.0
        for i in range(len(tokens) - self.n + 1):
            ngram = tuple(tokens[i:i+self.n])
            p = self.prob(ngram)
            if p > 0:
                log_prob += math.log(p)
            else:
                return float('-inf')
        return log_prob

In [None]:
unigram_katz = KatzNgram(1, uni_counts, {}, lower_order_model=None)
bigram_katz = KatzNgram(2, bi_counts, uni_counts, lower_order_model=unigram_katz)
trigram_katz = KatzNgram(3, tri_counts, bi_counts, lower_order_model=bigram_katz)
quadgram_katz = KatzNgram(4, quad_counts, tri_counts, lower_order_model=trigram_katz)
print("Models built.")

for sentence in test_df['body'][:10]:
    print("Sentence:", sentence[:60], "...")
    tokens_uni = telugu_word_tokenizer(sentence) + ["<eos>"]

    tokens_bi = ["<bos>"] + telugu_word_tokenizer(sentence) + ["<eos>"]

    tokens_tri = ["<bos>"] * 2 + telugu_word_tokenizer(sentence) + ["<eos>"]

    tokens_quad = ["<bos>"] * 3 + telugu_word_tokenizer(sentence) + ["<eos>"]

    print(f"Unigram Log Prob: {unigram_katz.sentence_log_prob(tokens_uni):.2f}")
    print(f"Bigram Log Prob:  {bigram_katz.sentence_log_prob(tokens_bi):.2f}")
    print(f"Trigram Log Prob: {trigram_katz.sentence_log_prob(tokens_tri):.2f}")
    print(f"Quadgram Log Prob:{quadgram_katz.sentence_log_prob(tokens_quad):.2f}")
    print("="*60)

Models built.
Sentence: కరాచీ: చాంపియన్స్ ట్రోఫీలో భారత్‌ను ఓడించే సత్తా పాకిస్థాన్‌ ...
Unigram Log Prob: -1007.72
Bigram Log Prob:  -1007.72
Trigram Log Prob: -1007.72
Quadgram Log Prob:-1007.72
Sentence: చాలా ప్రతిష్టాత్మకంగా తెరకెక్కనున్న చిరంజీవి 151వ చిత్రం సై. ...
Unigram Log Prob: -974.14
Bigram Log Prob:  -974.14
Trigram Log Prob: -974.14
Quadgram Log Prob:-974.14
Sentence: అమెరికా కొత్త అధ్యక్షుడు ట్రంప్‌కు చెందిన దుబాయిలోని గోల్ఫ్‌ ...
Unigram Log Prob: -1448.16
Bigram Log Prob:  -1448.16
Trigram Log Prob: -1448.16
Quadgram Log Prob:-1448.16
Sentence: రాహుల్‌కు ఇద్దరు విదేశీ గర్ల్‌ఫ్రెండ్స్‌.. పెళ్లిపై వేదాంత ధ ...
Unigram Log Prob: -2553.84
Bigram Log Prob:  -2553.84
Trigram Log Prob: -2553.84
Quadgram Log Prob:-2553.84
Sentence: 
నీరజ్‌ శ్యామ్‌, నైరా షా జంటగా నటించిన చిత్రం ‘ఇ.ఈ’. రామ్‌ గ ...
Unigram Log Prob: -696.23
Bigram Log Prob:  -696.23
Trigram Log Prob: -696.23
Quadgram Log Prob:-696.23
Sentence: చండీగఢ్: అత్యాచారం కేసుల్లో కారాగార శిక్ష అనుభవిస్తున్న డేరా ...
Unig

In [None]:
class KneserNeyNgram:
    def __init__(self, n, counts, context_counts, lower_order_model=None, all_ngrams_by_order=None, d=0.75):
        self.n = n
        self.d = d
        self.counts = counts
        self.context_counts = context_counts
        self.lower_order_model = lower_order_model
        self.all_ngrams_by_order = all_ngrams_by_order

        if self.n > 1:
            self.lambda_numerator_counts = defaultdict(set)
            for ngram in self.counts:
                context = ngram[:-1]
                word = ngram[-1]
                self.lambda_numerator_counts[context].add(word)
        else:
            self.continuation_counts = defaultdict(set)
            bigrams = self.all_ngrams_by_order.get(2, set())
            for w1, w2 in bigrams:
                self.continuation_counts[w2].add(w1)
            self.total_bigrams = len(bigrams)

    def _get_continuation_prob(self, ngram_tuple):
        if self.n == 1:
            word = ngram_tuple[0]
            numerator = len(self.continuation_counts.get(word, set()))
            denominator = self.total_bigrams if self.total_bigrams > 0 else 1
            return numerator / denominator
        else:
            raise NotImplementedError("Continuation prob is handled by recursion.")

    def prob(self, ngram):
        if self.n == 1:
            return self._get_continuation_prob(ngram)

        context = ngram[:-1]
        context_count = self.context_counts.get(context, 0)

        if context_count == 0:
            return self.lower_order_model.prob(ngram[1:])

        ngram_count = self.counts.get(ngram, 0)
        first_term = max(ngram_count - self.d, 0) / context_count

        num_unique_following_words = len(self.lambda_numerator_counts.get(context, set()))
        lambda_weight = (self.d / context_count) * num_unique_following_words

        lower_order_prob = self.lower_order_model.prob(ngram[1:])

        return first_term + (lambda_weight * lower_order_prob)

    def sentence_log_prob(self, tokens):
        log_prob = 0.0
        for i in range(len(tokens) - self.n + 1):
            ngram = tuple(tokens[i : i + self.n])
            p = self.prob(ngram)
            if p > 0:
                log_prob += math.log(p)
            else:
                return float('-inf')
        return log_prob

In [None]:
def count_all_ngrams_stream(dataset, max_n, limit=500000):
    counts = {n: defaultdict(int) for n in range(1, max_n + 1)}
    vocab = set()
    dataset = dataset.iter(batch_size=1)

    for example in islice(dataset, limit):
        raw_text = example.get("text") or example.get("content") or ""

        if isinstance(raw_text, list):
            text = " ".join(raw_text)
        else:
            text = raw_text
        if not text.strip():
            continue

        for sentence in telugu_sentence_tokenizer(text):
            tokens = (["<bos>"] * (max_n - 1)) + telugu_word_tokenizer(sentence) + ["<eos>"]
            vocab.update(tokens)
            for n in range(1, max_n + 1):
                for i in range(len(tokens) - n + 1):
                    ngram = tuple(tokens[i:i+n])
                    counts[n][ngram] += 1

    ngram_counts = {n: dict(counts[n]) for n in range(1, max_n + 1)}
    return ngram_counts, vocab

In [None]:
print("Counting all n-grams in a single pass...")
ngram_counts, vocab = count_all_ngrams_stream(dataset, max_n=4)
print("Counting complete.")

uni_counts = ngram_counts[1]
bi_counts = ngram_counts[2]
tri_counts = ngram_counts[3]
quad_counts = ngram_counts[4]

all_ngrams_by_order = {
    1: set(uni_counts.keys()),
    2: set(bi_counts.keys()),
    3: set(tri_counts.keys()),
    4: set(quad_counts.keys())
}

print("Building Kneser-Ney backoff models...")
unigram_kn = KneserNeyNgram(1, uni_counts, {},
                            lower_order_model=None,
                            all_ngrams_by_order=all_ngrams_by_order)

bigram_kn = KneserNeyNgram(2, bi_counts, uni_counts,
                           lower_order_model=unigram_kn,
                           all_ngrams_by_order=all_ngrams_by_order)

trigram_kn = KneserNeyNgram(3, tri_counts, bi_counts,
                            lower_order_model=bigram_kn,
                            all_ngrams_by_order=all_ngrams_by_order)

quadgram_kn = KneserNeyNgram(4, quad_counts, tri_counts,
                             lower_order_model=trigram_kn,
                             all_ngrams_by_order=all_ngrams_by_order)
print("Models built successfully.")

for sentence in test_df['body'][:10]:
    print("Sentence:", sentence[:60], "...")
    tokens = ["<bos>"] * 3 + telugu_word_tokenizer(sentence) + ["<eos>"]
    log_probability = quadgram_kn.sentence_log_prob(tokens)
    print(f"Quadgram Kneser-Ney Log Prob: {log_probability:.2f}")
    print("="*60)

Counting all n-grams in a single pass...
Counting complete.
Building Kneser-Ney backoff models...


In [None]:
import heapq

class NgramGenerator:
    def __init__(self, unigram, bigram, trigram, quadgram):
        self.models = {
            1: unigram,
            2: bigram,
            3: trigram,
            4: quadgram
        }

    def _get_next_word_candidates(self, model, context):
        if not isinstance(context, tuple):
            context = tuple(context)

        candidates = []
        if model.n == 1:
            for ngram, count in model.counts.items():
                log_prob = math.log(model.prob(ngram))
                candidates.append((log_prob, ngram[0]))
            return candidates

        for ngram, count in model.counts.items():
            if ngram[:-1] == context:
                log_prob = math.log(model.prob(ngram))
                candidates.append((log_prob, ngram[-1]))

        return sorted(candidates, key=lambda x: x[0], reverse=True)

    def generate_greedy(self, model_order, max_len=25):
        model = self.models[model_order]
        n = model.n

        sentence = ["<bos>"] * (n - 1) if n > 1 else []

        for _ in range(max_len):
            context = tuple(sentence[-(n - 1):]) if n > 1 else tuple()

            candidates = self._get_next_word_candidates(model, context)

            if not candidates:
                break
            best_word = candidates[0][1]

            if best_word == "<eos>":
                break

            sentence.append(best_word)

        return " ".join(sentence[(n-1):])

    def generate_beam(self, model_order, beam_size=20, max_len=30):
        model = self.models[model_order]
        n = model.n

        initial_tokens = ["<bos>"] * (n - 1) if n > 1 else []
        active_beams = [(0.0, initial_tokens)]
        completed_beams = []

        for _ in range(max_len):
            new_beams = []
            for log_prob, tokens in active_beams:
                if tokens[-1] == "<eos>":
                    completed_beams.append((log_prob, tokens))
                    continue

                context = tuple(tokens[-(n - 1):]) if n > 1 else tuple()

                candidates = self._get_next_word_candidates(model, context)

                for cand_log_prob, cand_word in candidates[:beam_size]:
                    new_tokens = tokens + [cand_word]
                    new_log_prob = log_prob + cand_log_prob
                    new_beams.append((new_log_prob, new_tokens))
            active_beams = heapq.nlargest(beam_size, new_beams, key=lambda x: x[0])

            if not active_beams:
                break

        all_candidates = completed_beams + active_beams
        if not all_candidates:
            return ""
        best_log_prob, best_sentence = max(all_candidates, key=lambda x: x[0])
        start_index = (n-1) if n > 1 else 0
        if best_sentence[-1] == "<eos>":
            best_sentence = best_sentence[:-1]

        return " ".join(best_sentence[start_index:])

In [None]:
generator = NgramGenerator(unigram, bigram, trigram, quadgram)
generation_tasks = {
    "Unigram": 1,
    "Bigram": 2,
    "Trigram": 3,
    "Quadgram": 4
}

for name, order in generation_tasks.items():
    print(f"\n{'='*20} Generating for {name} Model {'='*20}")

    print(f"\n--- Method: Greedy Approach ---")
    greedy_sentences = [generator.generate_greedy(order) for _ in range(100)]
    for i, sentence in enumerate(greedy_sentences[:3]):
        print(f"  {i+1}: {sentence}")
    if len(greedy_sentences) > 3:
        print("  ...")

    print(f"\n--- Method: Beam Search (beam_size=20) ---")
    beam_sentences = [generator.generate_beam(order, beam_size=20) for _ in range(100)]
    for i, sentence in enumerate(beam_sentences[:3]):
        print(f"  {i+1}: {sentence}")
    if len(beam_sentences) > 3:
        print("  ...")