In [None]:
# Function to load counts from CSV
def load_csv_counts(filename):
    """
    Load n-gram counts from a two-column CSV file (n-gram text, count).

    The first row is treated as a header and discarded. Each remaining row is
    parsed with the csv module, so quoted fields may contain commas and
    doubled quotes. Rows with fewer than two columns, a count that is not an
    integer, or an n-gram with no tokens are skipped.

    Args:
        filename: Path to a UTF-8 encoded CSV file.

    Returns:
        dict mapping n-gram tuples (one string per whitespace-separated token
        of the first column) to integer counts. If the same n-gram occurs on
        several rows, the last row wins.
    """
    # Local import: the notebook only imports csv in a later cell.
    import csv

    counts = {}
    # newline="" is what the csv module expects so it can handle line endings itself.
    with open(filename, "r", encoding="utf-8", newline="") as f:
        reader = csv.reader(f)
        next(reader, None)  # skip the header row (no error on an empty file)
        for row in reader:
            if len(row) < 2:
                continue
            # csv.reader has already removed field quoting, so any quote
            # character still present in the n-gram is a real token.
            ngram_str = row[0].strip()
            # Tolerate counts written as ` "5"` (space before the quote),
            # which csv.reader does not unquote.
            count_str = row[1].strip().strip('"')
            try:
                count = int(count_str)
            except ValueError:
                # Non-numeric count: ignore the row.
                continue
            ngram = tuple(ngram_str.split())
            if not ngram:
                continue
            counts[ngram] = count
    return counts

# Read the trigram and quadrigram count tables from the LAB4 CSV files
tri_counts = load_csv_counts(
    r"C:\Users\dubey\OneDrive\Desktop\Coding\NLP-lab\LAB4\trigram.csv"
)
quad_counts = load_csv_counts(
    r"C:\Users\dubey\OneDrive\Desktop\Coding\NLP-lab\LAB4\quadrigram.csv"
)

# Index the tables by n-gram order so later cells can select one with `n`
ngram_counts = dict([(3, tri_counts), (4, quad_counts)])


In [2]:
import random

def get_prob(word, context, counts_dicts, n):
    """
    Return the maximum-likelihood probability of `word` given `context`.

    Starting at order `n`, the estimate is count(context + word) divided by
    the total count of n-grams sharing that context. If the current order has
    no (non-empty) table in `counts_dicts`, or the context was never observed
    in it, the function backs off to the next lower order. When no order above
    1 yields an estimate, it falls back to a unigram-style estimate: the share
    of trigram counts whose last token is `word`.

    Args:
        word: Candidate next token.
        context: Sequence of preceding tokens; only the last `order - 1` are used.
        counts_dicts: dict mapping n-gram order -> {n-gram tuple: count}.
        n: Highest n-gram order to try.

    Returns:
        The estimated probability as a float, or 0 when the fallback table is
        missing or has a non-positive total count.
    """
    for order in range(n, 1, -1):
        # Orders without a table (e.g. bigrams when only 3 and 4 are loaded)
        # are skipped instead of raising KeyError.
        table = counts_dicts.get(order)
        if not table:
            continue
        ctx = tuple(context[-(order - 1):])
        c_prefix = sum(c for g, c in table.items() if g[:-1] == ctx)
        if c_prefix != 0:
            return table.get(ctx + (word,), 0) / c_prefix

    # Unigram-style fallback based on last-token frequencies in the trigram table.
    fallback = counts_dicts.get(3, {})
    total = sum(fallback.values())
    word_counts = sum(c for g, c in fallback.items() if g[-1] == word)
    return word_counts / total if total > 0 else 0


In [3]:
def get_start_context(counts_dicts, n):
    """
    Pick a starting context for generation from the order-`n` table.

    Every n-gram in `counts_dicts[n]` whose first token is "<s>" contributes
    its tokens minus the last one as a possible start; one of these is chosen
    at random (a prefix shared by several n-grams appears once per n-gram).
    If no n-gram starts with "<s>", the result is `n - 1` copies of "<s>".

    Returns:
        A new list of tokens.
    """
    starts = []
    for gram in counts_dicts[n]:
        if gram[0] == "<s>":
            starts.append(gram[:-1])

    if not starts:
        return ["<s>"] * (n - 1)
    return list(random.choice(starts))


In [4]:
def generate_greedy_ng(counts_dicts, n, max_len=15):
    """
    Generate one sentence from the order-`n` count table.

    Despite the name, the next word is not the single most frequent
    continuation: it is sampled with probability proportional to its count
    after the current context (the last `n - 1` tokens).

    If the context has no continuation in the order-`n` table, a word is
    drawn uniformly from the last tokens of the trigram table entries. If no
    trigram table is available, or it is empty, generation stops early.

    Args:
        counts_dicts: dict mapping n-gram order -> {n-gram tuple: count}.
        n: Order of the table to sample from.
        max_len: Maximum number of tokens appended after the start context.

    Returns:
        The start context plus the generated tokens, joined with spaces.
        Generation stops once "</s>" has been appended.
    """
    sentence = get_start_context(counts_dicts, n)
    table = counts_dicts[n]
    # Fallback vocabulary is loop-invariant; .get avoids a KeyError when the
    # caller supplied no trigram table.
    fallback_words = [g[-1] for g in counts_dicts.get(3, {})]

    for _ in range(max_len):
        ctx = tuple(sentence[-(n-1):])
        candidates_dict = {g[-1]: c for g, c in table.items() if g[:-1] == ctx}

        if candidates_dict:
            # Probabilistic sampling weighted by observed counts
            words, counts = zip(*candidates_dict.items())
            total = sum(counts)
            probs = [c / total for c in counts]
            next_word = random.choices(words, probs)[0]
        else:
            if not fallback_words:
                # Nothing to continue with: return what we have instead of
                # letting random.choice fail on an empty list.
                break
            next_word = random.choice(fallback_words)

        sentence.append(next_word)
        if next_word == "</s>":
            break
    return " ".join(sentence)


In [5]:
def generate_beam_ng(counts_dicts, n, beam_size=20, max_len=15):
    """
    Generate up to `beam_size` sentences with beam search over the order-`n` table.

    Each hypothesis carries the product of the conditional probabilities
    (count of continuation / total count for the context) of its tokens.
    At every step each unfinished hypothesis is expanded with all observed
    continuations of its last `n - 1` tokens, and the `beam_size` most
    probable hypotheses are kept.

    A hypothesis is finished once "</s>" has been appended to it. Finished
    hypotheses are carried forward unchanged and keep competing for a beam
    slot; they are never extended past the end marker.

    If a context has no continuation in the order-`n` table, one word is drawn
    uniformly from the last tokens of the trigram table entries and appended
    without changing the hypothesis probability. If no trigram table is
    available, or it is empty, the hypothesis is treated as finished.

    Args:
        counts_dicts: dict mapping n-gram order -> {n-gram tuple: count}.
        n: Order of the table to search.
        beam_size: Number of hypotheses kept after each step.
        max_len: Maximum number of expansion steps.

    Returns:
        List of sentences (tokens joined with spaces), most probable first.
    """
    # Each hypothesis is (tokens, probability, finished).
    beams = [(get_start_context(counts_dicts, n), 1.0, False)]
    table = counts_dicts[n]
    # Fallback vocabulary is loop-invariant; .get avoids a KeyError when the
    # caller supplied no trigram table.
    fallback_words = [g[-1] for g in counts_dicts.get(3, {})]

    for _ in range(max_len):
        all_candidates = []
        for seq, seq_prob, finished in beams:
            if finished:
                # Do not expand past "</s>"; keep the hypothesis as it is.
                all_candidates.append((seq, seq_prob, True))
                continue

            ctx = tuple(seq[-(n-1):])
            candidates_dict = {g[-1]: c for g, c in table.items() if g[:-1] == ctx}

            if not candidates_dict:
                if not fallback_words:
                    # Cannot be extended at all: freeze it.
                    all_candidates.append((seq, seq_prob, True))
                    continue
                # fallback: pick any last word from trigram
                next_word = random.choice(fallback_words)
                all_candidates.append((seq + [next_word], seq_prob, next_word == "</s>"))
                continue

            total = sum(candidates_dict.values())
            for w, c in candidates_dict.items():
                all_candidates.append((seq + [w], seq_prob * (c / total), w == "</s>"))

        # Stable sort: ties keep their generation order.
        all_candidates.sort(key=lambda cand: cand[1], reverse=True)
        beams = all_candidates[:beam_size]

        if all(done for _seq, _prob, done in beams):
            break
    return [" ".join(seq) for seq, _prob, _done in beams]


In [None]:
import csv

n_values = [3, 4]     # n-gram orders to generate from: trigram and quadrigram
num_sentences = 100   # sentences written per order and per decoding method

for n in n_values:
    # --- generate_greedy_ng: one sentence per call ---
    greedy_sentences = [generate_greedy_ng(ngram_counts, n) for _ in range(num_sentences)]
    output_file = f"C:\\Users\\Dubey\\OneDrive\\Desktop\\Coding\\NLP-Lab\\LAB6\\greedy_{n}gram_100.csv"
    with open(output_file, "w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["Sentence"])
        writer.writerows([sentence] for sentence in greedy_sentences)

    # --- generate_beam_ng: each call returns a batch of sentences, so keep
    # calling until enough have been collected, then trim the surplus ---
    beam_sentences = []
    while len(beam_sentences) < num_sentences:
        seqs = generate_beam_ng(ngram_counts, n, beam_size=20)
        beam_sentences += seqs
    beam_sentences = beam_sentences[:num_sentences]

    output_file_beam = f"C:\\Users\\Dubey\\OneDrive\\Desktop\\Coding\\NLP-Lab\\LAB6\\beam_{n}gram_100.csv"
    with open(output_file_beam, "w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["Sentence"])
        writer.writerows([sentence] for sentence in beam_sentences)

    print(f"✅ {n}-gram greedy & beam sentences saved.")


✅ 3-gram greedy & beam sentences saved.
✅ 4-gram greedy & beam sentences saved.
