In [None]:
# Function to load counts from a CSV (handles quotes, spaces)
def load_csv_counts(filename):
    counts = {}
    with open(filename, "r", encoding="utf-8") as f:
        header = f.readline()  # skip header
        for line in f:
            parts = line.strip().split(",")
            if len(parts) < 2:
                continue
            ngram_str = parts[0].strip().strip('"').strip()
            count_str = parts[1].strip().strip('"').strip()
            try:
                count = int(count_str)
            except:
                continue
            ngram = tuple(ngram_str.split())
            counts[ngram] = count
    return counts

# Load n-gram counts
uni_counts  = load_csv_counts(r"C:\Users\dubey\OneDrive\Desktop\Coding\NLP-lab\LAB4\unigram.csv")
bi_counts   = load_csv_counts(r"C:\Users\dubey\OneDrive\Desktop\Coding\NLP-lab\LAB4\bigram.csv")
tri_counts  = load_csv_counts(r"C:\Users\dubey\OneDrive\Desktop\Coding\NLP-lab\LAB4\trigram.csv")
quad_counts = load_csv_counts(r"C:\Users\dubey\OneDrive\Desktop\Coding\NLP-lab\LAB4\quadrigram.csv")

ngram_counts = {1: uni_counts, 2: bi_counts, 3: tri_counts, 4: quad_counts}


In [15]:
# Get count safely
def get_count(ngram, counts_dict):
    return counts_dict.get(ngram, 0)

# Get prefix count
def get_prefix_count(ngram, counts_dict):
    prefix = ngram[:-1]
    prefix_counts = 0
    for g, c in counts_dict.items():
        if g[:-1] == prefix:
            prefix_counts += c
    return prefix_counts

# Get number of unique continuations (N+)
def unique_continuations(prefix, counts_dict):
    seen = set()
    for g in counts_dict.keys():
        if g[:-1] == prefix:
            seen.add(g[-1])
    return len(seen)

# Get number of unique histories for continuation probability
def unique_histories(word, counts_dict):
    seen = set()
    for g in counts_dict.keys():
        if g[-1] == word:
            seen.add(g[:-1])
    return len(seen)


In [16]:
def kn_prob(ngram, counts_dicts, d=0.75):
    """
    Recursive Kneser–Ney probability with safety for empty or short ngrams.
    """
    if len(ngram) == 0:
        return 0.0  # safety for empty ngram

    n = len(ngram)

    # Unigram case: use continuation probability
    if n == 1:
        word = ngram[0]
        if not counts_dicts[2]:  # no bigram counts
            return 0.0
        # compute total number of unique histories
        total_types = 0
        for g in counts_dicts[2]:
            if len(g) < 2:
                continue
            total_types += 1
        if total_types == 0:
            return 0.0
        # number of histories where last word = word
        word_count = sum(1 for g in counts_dicts[2] if len(g) >= 2 and g[-1] == word)
        return word_count / total_types

    # Counts
    c_ngram = counts_dicts[n].get(ngram, 0)
    c_prefix = sum(c for g, c in counts_dicts[n].items() if len(g) >= n-1 and g[:-1] == ngram[:-1])

    # First term: discounted MLE
    first = max(c_ngram - d, 0) / c_prefix if c_prefix > 0 else 0

    # Lambda: backoff weight
    N_plus = len([g for g in counts_dicts[n] if len(g) >= n-1 and g[:-1] == ngram[:-1]])
    lam = (d * N_plus / c_prefix) if c_prefix > 0 else 1

    # Recursive backoff
    backoff = kn_prob(ngram[1:], counts_dicts, d)

    return first + lam * backoff


In [None]:
output_file_kn = r"C:\Users\dubey\OneDrive\Desktop\Coding\NLP-lab\LAB6\quadrigram_kneserney.csv"

with open(output_file_kn, "w", encoding="utf-8") as f:
    f.write("Ngram,Count,KneserNey_Prob\n")
    written = 0
    for ngram, count in quad_counts.items():
        if written >= 5000:
            break
        if len(ngram) < 4:
            continue
        prob = kn_prob(ngram, ngram_counts, d=0.75)
        ngram_str = " ".join(ngram)
        f.write(f'"{ngram_str}",{count},{prob}\n')
        written += 1

print("✅ Kneser–Ney probabilities for first 10,000 quadrigrams written to:", output_file_kn)


✅ Kneser–Ney probabilities for first 10,000 quadrigrams written to: C:\Users\evilk\OneDrive\Desktop\III YEAR\LABS\NLP\LAB6\quadrigram_kneserney.csv
