In [None]:
# Function to load counts from a CSV (handles quotes, spaces)
def load_csv_counts(filename):
    counts = {}
    with open(filename, "r", encoding="utf-8") as f:
        header = f.readline()  # skip header
        for line in f:
            parts = line.strip().split(",")
            if len(parts) < 2:
                continue
            ngram_str = parts[0].strip().strip('"').strip()
            count_str = parts[1].strip().strip('"').strip()
            try:
                count = int(count_str)
            except:
                continue
            ngram = tuple(ngram_str.split())
            counts[ngram] = count
    return counts

# Load n-gram counts
uni_counts  = load_csv_counts(r"C:\Users\dubey\OneDrive\Desktop\Coding\NLP-lab\LAB4\unigram.csv")
bi_counts   = load_csv_counts(r"C:\Users\dubey\OneDrive\Desktop\Coding\NLP-lab\LAB4\bigram.csv")
tri_counts  = load_csv_counts(r"C:\Users\dubey\OneDrive\Desktop\Coding\NLP-lab\LAB4\trigram.csv")
quad_counts = load_csv_counts(r"C:\Users\dubey\OneDrive\Desktop\Coding\NLP-lab\LAB4\quadrigram.csv")

ngram_counts = {1: uni_counts, 2: bi_counts, 3: tri_counts, 4: quad_counts}


In [29]:
# Get count of an n-gram
def get_count(ngram, counts_dict):
    return counts_dict.get(ngram, 0)

# Get denominator = prefix count
def get_prefix_count(ngram, counts_dict):
    prefix = ngram[:-1]
    prefix_counts = 0
    for g, c in counts_dict.items():
        if g[:-1] == prefix:
            prefix_counts += c
    return prefix_counts


In [30]:
def katz_backoff_prob(ngram, counts_dicts, d=0.5):
    if len(ngram) == 0:
        return 0.0  # safety: no tokens left

    n = len(ngram)
    if n == 1:  # unigram
        total_uni = sum(counts_dicts[1].values())
        return get_count(ngram, counts_dicts[1]) / total_uni if total_uni > 0 else 0

    # Counts
    c_ngram = get_count(ngram, counts_dicts[n])
    c_prefix = get_prefix_count(ngram, counts_dicts[n])

    if c_ngram > 0:
        return (c_ngram - d) / c_prefix if c_prefix > 0 else 0
    else:
        # backoff
        seen = [g for g in counts_dicts[n] if g[:-1] == ngram[:-1]] if c_prefix > 0 else []
        alpha = (d * len(seen) / c_prefix) if c_prefix > 0 else 1
        return alpha * katz_backoff_prob(ngram[1:], counts_dicts, d)


In [None]:
# Write first 50,000 quadrigrams + Katz probability
output_file = r"C:\Users\evilk\OneDrive\Desktop\III YEAR\LABS\NLP\LAB6\quadrigram_katz.csv"

with open(output_file, "w", encoding="utf-8") as f:
    f.write("Ngram,Count,Katz_Prob\n")
    for i, (ngram, count) in enumerate(quad_counts.items()):
        if i >= 10000:   
            break
        prob = katz_backoff_prob(ngram, ngram_counts, d=0.5)
        ngram_str = " ".join(ngram)
        f.write(f'"{ngram_str}",{count},{prob}\n')

print("✅ Katz Backoff probabilities for first 50,000 quadrigrams written to:", output_file)


✅ Katz Backoff probabilities for first 50,000 quadrigrams written to: C:\Users\evilk\OneDrive\Desktop\III YEAR\LABS\NLP\LAB6\quadrigram_katz_50k.csv
