In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:

import random, math, itertools
from pathlib import Path
from collections import defaultdict, Counter
from typing import Dict, Tuple, Generator, List
import numpy as np

INPUT_SOURCE_PATH = "/content/drive/MyDrive/dataset/sentences_of_hindi_corpus_1.txt"
TRAIN_FILENAME = "train.txt"
VAL_FILENAME = "val.txt"
TEST_FILENAME = "test.txt"

MAX_SENTENCE_LIMIT = 1000000
MAX_N = 4
START_TOKEN = "<s>"
END_TOKEN = "</s>"

VAL_SIZE = 1000
TEST_SIZE = 1000
STEPS = [i / 10 for i in range(11)]


In [3]:

def stream_tokens(path: Path) -> Generator[str, None, None]:
    with path.open("r", encoding="utf-8", errors="ignore") as f:
        for line in f:
            line = line.strip()
            if line:
                yield START_TOKEN
                for tok in line.split():
                    t = tok.strip()
                    if t:
                        yield t
                yield END_TOKEN

def create_data_splits(input_path: str, max_limit: int):
    with open(input_path, "r", encoding="utf-8") as f:
        all_sentences = [line for line in itertools.islice(f, max_limit)]

    random.seed(42)
    random.shuffle(all_sentences)

    val_sentences = all_sentences[:VAL_SIZE]
    test_sentences = all_sentences[VAL_SIZE:VAL_SIZE+TEST_SIZE]
    train_sentences = all_sentences[VAL_SIZE+TEST_SIZE:]

    with open(TRAIN_FILENAME, "w", encoding="utf-8") as f: f.writelines(train_sentences)
    with open(VAL_FILENAME, "w", encoding="utf-8") as f: f.writelines(val_sentences)
    with open(TEST_FILENAME, "w", encoding="utf-8") as f: f.writelines(test_sentences)

create_data_splits(INPUT_SOURCE_PATH, MAX_SENTENCE_LIMIT)
print("Data split done.")


Data split done.


In [4]:
# Cell 3: Optimized N-gram Counter
def count_ngrams(tokens: List[str], max_n: int = 4):
    counts = {n: defaultdict(int) for n in range(1, max_n+1)}
    window = []

    for tok in tokens:
        counts[1][(tok,)] += 1
        window.append(tok)

        for n in range(2, max_n+1):
            if len(window) >= n:
                gram = tuple(window[-n:])
                counts[n][gram] += 1

        if len(window) > max_n:
            window.pop(0)

    return counts

train_tokens = list(stream_tokens(Path(TRAIN_FILENAME)))
train_counts = count_ngrams(train_tokens)

vocab = set(train_tokens)
vocab_size = len(vocab)

print("N-gram counting done.")


N-gram counting done.


In [5]:
# Optimized Good-Turing
def good_turing_probs(counts: Dict[Tuple[str, ...], int], n: int, vocab_size: int):
    Nc = Counter(counts.values())
    N = sum(counts.values())
    N1 = Nc.get(1, 0)

    if n == 1:
        U = len(counts)
        num_unseen = max(vocab_size - U, 1)
    else:
        num_unseen = max(1, vocab_size**n - len(counts))

    P_unseen_individual = N1 / (N * num_unseen)

    by_count = defaultdict(list)
    for gram, c in counts.items():
        by_count[c].append(gram)

    max_c = max(Nc)
    cstar_table = {}
    probs = {}

    for c in range(max_c + 1):
        Nc_c = Nc.get(c, 0)
        Nc1 = Nc.get(c + 1, 0)

        if c == 0:
            cstar = 0.0
        elif Nc_c > 0 and Nc1 > 0:
            cstar = (c + 1) * Nc1 / Nc_c
        else:
            cstar = float(c)

        cstar_table[c] = (Nc_c, cstar)

        if c in by_count and c > 0:
            p_star = cstar / N
            for gram in by_count[c]:
                probs[gram] = p_star

    return probs, P_unseen_individual, cstar_table

gt_probs_models = {}
p_unseen_map = {}

for n in range(1, MAX_N+1):
    print("Good-Turing for n =", n)
    probs, p_u, cstar_table = good_turing_probs(train_counts[n], n, vocab_size)
    gt_probs_models[n] = probs
    p_unseen_map[n] = p_u

print("Good-Turing done.")


Good-Turing for n = 1
Good-Turing for n = 2
Good-Turing for n = 3
Good-Turing for n = 4
Good-Turing done.


In [6]:
# Top 100 Table
Nc_unigram = Counter(train_counts[1].values())
cstar_unigram = {}

max_c = max(Nc_unigram)
for c in range(max_c+1):
    Nc_c = Nc_unigram.get(c,0)
    Nc1 = Nc_unigram.get(c+1,0)
    if c==0:
        cstar = p_unseen_map[1]
    elif Nc_c>0 and Nc1>0:
        cstar = (c+1)*Nc1/Nc_c
    else:
        cstar = float(c)
    cstar_unigram[c] = (Nc_c, cstar)

print("C (MLE)\tNc\tC*")
print(f"0\t{cstar_unigram[0][0]}\t{cstar_unigram[0][1]:.6f}")

items = sorted([(c, Nc_unigram[c]) for c in Nc_unigram if c>0], key=lambda x: -x[1])[:99]
for c, Nc_val in items:
    print(f"{c}\t{Nc_val}\t{cstar_unigram[c][1]:.6f}")


C (MLE)	Nc	C*
0	0	0.007399
1	141131	0.470046
2	33169	1.407610
3	15563	2.449656
4	9531	3.418319
5	6516	4.364641
6	4740	5.607384
7	3797	6.455623
8	3064	7.499021
9	2553	8.139444
10	2078	9.983638
11	1886	10.759279
12	1691	11.370195
13	1479	11.898580
14	1257	13.078759
15	1096	15.036496
16	1030	15.993204
17	969	16.179567
18	871	17.822044
19	817	18.188494
20	743	19.728129
21	698	19.762178
22	627	20.432217
24	584	21.104452
23	557	25.163375
26	502	24.956175
25	493	26.474645
28	473	26.731501
27	464	28.543103
29	436	27.178899
30	395	30.607595
31	390	28.307692
33	356	32.949438
34	345	31.144928
32	345	34.052174
35	307	34.710098
36	296	34.375000
39	289	34.048443
38	288	39.135417
37	275	39.796364
41	254	40.677165
42	246	39.678862
40	246	42.333333
44	236	42.711864
43	227	45.744493
48	227	43.171806
45	224	43.741071
46	213	46.558685
47	211	51.639810
49	200	40.000000
51	180	45.933333
53	178	50.359551
57	167	54.526946
54	166	51.686747
50	160	57.375000
52	159	59.333333
59	157	54.267516
58	157	59.000000
55	

In [7]:
# Perplexity
def calculate_log_prob(sentences, model_probs, vocab_size, n, punseen):
    total_log = 0
    total_words = 0

    for file_name in sentences:
        tokens = list(stream_tokens(Path(file_name)))
        total_words += len(tokens)-1

        for i in range(1, len(tokens)):
            cur_n = min(n, i+1)
            gram = tuple(tokens[i-cur_n+1:i+1])
            p = model_probs[n].get(gram, punseen)
            total_log += math.log(max(p, 1e-12))

    ppl = math.exp(-total_log / max(total_words,1))
    return total_log, ppl, total_words

for n in range(1, MAX_N+1):
    for fname, label in [(VAL_FILENAME,"VAL"), (TEST_FILENAME,"TEST")]:
        lp, ppl, tw = calculate_log_prob([fname], gt_probs_models, vocab_size, n, p_unseen_map[n])
        print(n, label, lp, ppl, tw)


1 VAL -138153.81354047495 1260.160137835918 19352
1 TEST -137698.58100238082 1293.470024984462 19218
2 VAL -250680.97030591636 422417.7430454817 19352
2 TEST -251180.52265973605 474523.158490282 19218
3 VAL -360760.0282736218 124772456.42342398 19352
3 TEST -361329.77639926795 146367721.97153842 19218
4 VAL -438769.8990259785 7027546691.647323 19352
4 TEST -438702.05169908085 8202167269.215983 19218


In [8]:
# Precompute MLE
mle = {1:{}, 2:{}, 3:{}, 4:{}}

tot1 = sum(train_counts[1].values())
for g,c in train_counts[1].items():
    mle[1][g] = c / tot1

for n in [2,3,4]:
    for g,c in train_counts[n].items():
        hist = g[:-1]
        denom = train_counts[n-1].get(hist,0)
        mle[n][g] = c/denom if denom>0 else 0


In [9]:
# Prepare quad validation data
val_tokens = list(stream_tokens(Path(VAL_FILENAME)))
val_counts = count_ngrams(val_tokens)

quad_list = []
for gram, count in val_counts[4].items():
    w = gram[-1]
    p1 = mle[1].get((w,),0)
    p2 = mle[2].get(gram[2:],0)
    p3 = mle[3].get(gram[1:],0)
    p4 = mle[4].get(gram,0)
    quad_list.append((count, p1, p2, p3, p4))

quad_array = np.array(quad_list)
counts = quad_array[:,0]
p1s = quad_array[:,1]
p2s = quad_array[:,2]
p3s = quad_array[:,3]
p4s = quad_array[:,4]

print("Prepared validation quad data.")


Prepared validation quad data.


In [10]:
# Deleted Interpolation
best_logprob = -1e300
best_lambdas = (0,0,0,0)

for l4 in STEPS:
    for l3 in STEPS:
        for l2 in STEPS:
            l1 = 1 - (l4 + l3 + l2)
            if l1 < 0: continue

            p = l1*p1s + l2*p2s + l3*p3s + l4*p4s
            p = np.clip(p, 1e-12, 1)
            logprob = np.sum(counts * np.log(p))

            if logprob > best_logprob:
                best_logprob = logprob
                best_lambdas = (l1, l2, l3, l4)

print("Best lambdas:", best_lambdas)


Best lambdas: (0.19999999999999996, 0.5, 0.2, 0.1)


In [11]:
# Evaluate on test set
test_tokens = list(stream_tokens(Path(TEST_FILENAME)))

ll = 0
total = 0
window = []

l1,l2,l3,l4 = best_lambdas

for tok in test_tokens:
    if tok == START_TOKEN:
        window = []
        continue

    total += 1
    window.append(tok)

    if len(window) >= 4:
        gram = tuple(window[-4:])
        w = gram[-1]
        p4 = mle[4].get(gram,0)
        p3 = mle[3].get(gram[1:],0)
        p2 = mle[2].get(gram[2:],0)
        p1 = mle[1].get((w,),0)
        p = l1*p1 + l2*p2 + l3*p3 + l4*p4
        ll += math.log(max(p,1e-12))

    if len(window)>4:
        window.pop(0)

ppl = math.exp(-ll/max(total,1))
print("Test LogP:", ll)
print("Test Perplexity:", ppl)


Test LogP: -81423.6035077737
Test Perplexity: 87.28326622987194
