In [None]:
# language_models_full.py
import pandas as pd
from collections import Counter, defaultdict
import random
import re
import numpy as np
import time
import math
import csv
import os

# -------------------------
# Configuration
# -------------------------
# Path of the Parquet file that main() passes to load_data_from_parquet().
PARQUET_FILE_PATH = '../ass1/gujarati_sentence_tokenized.parquet'  # update if needed
# Name of the DataFrame column the sentences are read from.
COLUMN_NAME = 'sentence'
# Seed given to both `random` and `numpy.random` at the start of main().
RANDOM_SEED = 42
# Optional cap on the number of sentences loaded; None loads everything.
DEBUG_LIMIT = None   # set to an int for quick debugging (e.g. 5000). Set to None to use full file.
# Destination of the per-sentence log-probability table written by main().
RESULTS_CSV = 'lm_evaluation_results.csv'

# -------------------------
# Tokenizer / Preprocessing
# -------------------------
# Token alternatives, tried left to right at every position, so the more
# specific shapes (URL, e-mail, date, number) win over plain word runs.
# Every piece is a raw string: the regex escapes (\w, \s, \b, ...) must reach
# the `re` module untouched instead of being parsed as string escapes.
_GUJARATI_TOKEN_RE = re.compile(
    '|'.join([
        r'https?://\S+|www\.\S+',                      # URL
        r'\b[\w\.-]+@[\w\.-]+\.\w+\b',                 # e-mail address
        r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b',          # numeric date, e.g. 12/05/2021
        r'\b\d{1,2}(?:st|nd|rd|th)?\s+\w+\s+\d{4}\b',  # written date, e.g. 12th March 2021
        r'\b\d+(?:[\.,]\d+)?\b',                       # integer or decimal number
        r'[a-zA-Z]+',                                  # run of ASCII letters
        r'[\u0A80-\u0AFF]+',                           # run of Gujarati-block characters
        r'[^\w\s]',                                    # any single punctuation/symbol
    ]),
    re.UNICODE,
)
_WHITESPACE_RUN_RE = re.compile(r'\s+')


def gujarati_word_tokenizer(sentence):
    """
    Split a sentence into tokens with a single regex pass.

    Leading/trailing whitespace is stripped and internal whitespace runs are
    collapsed to one space before matching. Recognised tokens are URLs,
    e-mail addresses, dates, numbers, runs of ASCII letters, runs of
    characters in the Gujarati Unicode block (U+0A80-U+0AFF) and single
    non-word, non-space characters. Text that matches none of these
    alternatives is skipped.

    sentence: the text to tokenize; any non-str value yields [].
    returns: list of token strings in order of appearance.
    """
    if not isinstance(sentence, str):
        return []
    normalized = _WHITESPACE_RUN_RE.sub(' ', sentence.strip())
    # The pattern has no capturing groups, so findall returns whole matches.
    return _GUJARATI_TOKEN_RE.findall(normalized)

def load_data_from_parquet(file_path, column_name, debug_limit=None):
    """
    Load the sentences stored in one column of a Parquet file.

    Rows whose value in the column is missing (None/NaN) are skipped, so they
    are not converted into the literal strings 'None' / 'nan' and scored as
    if they were real sentences.

    file_path: path of the Parquet file to read.
    column_name: column holding the sentences.
    debug_limit: if truthy, only the first `debug_limit` non-missing
        sentences are returned; None (or 0) returns all of them.
    returns: list of str.
    raises: FileNotFoundError if the file does not exist,
        KeyError if `column_name` is not a column of the file.
    """
    print(f"Reading parquet from: {file_path}")
    try:
        df = pd.read_parquet(file_path)
    except FileNotFoundError as err:
        # Same exception type and message as before, with the cause kept.
        raise FileNotFoundError(f"File not found: {file_path}") from err

    if column_name not in df.columns:
        raise KeyError(f"Column '{column_name}' not found. Available: {df.columns.tolist()}")

    column = df[column_name]
    present = column.dropna()
    skipped = len(column) - len(present)
    if skipped:
        print(f"Skipped {skipped} rows with missing values in '{column_name}'.")

    if debug_limit:
        # Slice before converting so a debug run does not stringify every row.
        present = present.iloc[:debug_limit]
        print(f"DEBUG: limiting to first {debug_limit} sentences.")

    sentences = present.astype(str).tolist()
    print(f"Loaded {len(sentences)} sentences.")
    return sentences

def prepare_sentences(sentences):
    """
    Tokenize every sentence and wrap it in boundary markers.

    Each output list is three '<s>' markers, then the tokens produced by
    gujarati_word_tokenizer, then one '</s>' marker. Three start markers
    give even the first real token a full three-token history, which is
    what the quadrigram model needs.

    sentences: iterable of raw sentence strings.
    returns: list of token lists, one per input sentence.
    """
    print("Tokenizing and preparing sentences (adding <s> x3 and </s>) ...")
    start_pad = ['<s>'] * 3
    end_pad = ['</s>']
    # List concatenation builds a fresh list per sentence; the pads are never shared.
    padded = [start_pad + gujarati_word_tokenizer(text) + end_pad for text in sentences]
    print(f"Prepared {len(padded)} sentences.")
    return padded

# -------------------------
# N-gram builders
# -------------------------
def generate_ngrams(tokens, n):
    """
    Lazily produce every contiguous window of `n` tokens as a tuple.

    tokens: sequence of tokens (must support len() and slicing).
    n: window length.
    yields: tuples of length n, left to right; nothing when the sequence
        is shorter than n.
    """
    last_start = len(tokens) - n
    start = 0
    while start <= last_start:
        yield tuple(tokens[start:start + n])
        start += 1

def build_ngram_models(prepared_sentences):
    """
    Count n-grams of order 1 to 4 over a collection of token lists.

    Unigram keys are the token strings themselves; bigram, trigram and
    quadrigram keys are tuples of tokens. The token total is the summed
    length of all sentences, so any boundary markers already present in
    the sentences are counted too.

    prepared_sentences: iterable of token lists.
    returns: (counts, total_tokens) where counts is a dict with the keys
        'unigram', 'bigram', 'trigram' and 'quadrigram', each a Counter.
    """
    print("Building n-gram counts on full dataset...")
    higher_orders = {'bigram': 2, 'trigram': 3, 'quadrigram': 4}
    tables = {'unigram': Counter()}
    for name in higher_orders:
        tables[name] = Counter()

    token_total = 0
    for tokens in prepared_sentences:
        token_total += len(tokens)
        tables['unigram'].update(tokens)
        for name, order in higher_orders.items():
            tables[name].update(generate_ngrams(tokens, order))

    print("Done building n-gram counts.")
    return tables, token_total

# -------------------------
# Smoothing probability functions
# -------------------------
def prob_add_k_unigram(word, k, vocab_size, unigram_counts, total_tokens):
    """
    Add-k smoothed unigram probability:
    (count(word) + k) / (total_tokens + k * vocab_size).

    Returns 0.0 when the denominator is not positive.
    """
    smoothed_count = unigram_counts.get(word, 0) + k
    normalizer = total_tokens + k * vocab_size
    if normalizer > 0:
        return smoothed_count / normalizer
    return 0.0

def prob_add_k_bigram(word, prev_word, k, vocab_size, unigram_counts, bigram_counts):
    """
    Add-k smoothed bigram probability of `word` following `prev_word`:
    (count(prev_word, word) + k) / (count(prev_word) + k * vocab_size).

    Returns 0.0 when the denominator is not positive.
    """
    smoothed_count = bigram_counts.get((prev_word, word), 0) + k
    normalizer = unigram_counts.get(prev_word, 0) + k * vocab_size
    if normalizer > 0:
        return smoothed_count / normalizer
    return 0.0

def prob_add_k_trigram(word, p1, p2, k, vocab_size, bigram_counts, trigram_counts):
    """
    Add-k smoothed trigram probability of `word` after the history (p1, p2):
    (count(p1, p2, word) + k) / (count(p1, p2) + k * vocab_size).

    Returns 0.0 when the denominator is not positive.
    """
    smoothed_count = trigram_counts.get((p1, p2, word), 0) + k
    normalizer = bigram_counts.get((p1, p2), 0) + k * vocab_size
    if normalizer > 0:
        return smoothed_count / normalizer
    return 0.0

def prob_add_k_quadrigram(word, p1, p2, p3, k, vocab_size, trigram_counts, quadrigram_counts):
    """
    Add-k smoothed quadrigram probability of `word` after the history
    (p1, p2, p3):
    (count(p1, p2, p3, word) + k) / (count(p1, p2, p3) + k * vocab_size).

    Returns 0.0 when the denominator is not positive.
    """
    smoothed_count = quadrigram_counts.get((p1, p2, p3, word), 0) + k
    normalizer = trigram_counts.get((p1, p2, p3), 0) + k * vocab_size
    if normalizer > 0:
        return smoothed_count / normalizer
    return 0.0

# Token-Type smoothing utilities
def compute_follower_counts(ngram_counts):
    """
    Count, for every context, how many distinct tokens follow it.

    The context of an n-gram is all of its tokens except the last one.
    Because the n-grams are unique, the number of distinct followers of a
    context equals the number of n-grams that start with it, so a single
    integer tally per context is enough and no per-context set of follower
    strings has to be kept in memory.

    ngram_counts: Counter/dict keyed by n-gram tuples (the counts themselves
        are not used). Any other iterable of n-gram tuples is accepted as
        well; duplicates in it are ignored.
    returns: dict mapping context tuple (length n-1) -> number of unique
        follower types.
    """
    if isinstance(ngram_counts, dict):
        unique_ngrams = ngram_counts
    else:
        # De-duplicate while keeping first-seen order.
        unique_ngrams = dict.fromkeys(ngram_counts)
    return dict(Counter(ngram[:-1] for ngram in unique_ngrams))

def prob_token_type_bigram(word, prev_word, k, unigram_counts, bigram_counts, follower_counts, fallback_vocab_size):
    """
    Token-type smoothed bigram probability of `word` following `prev_word`:
    (count(prev_word, word) + k) / (count(prev_word) + k * T),
    where T is the number of distinct follower types recorded for the
    context (prev_word,), or `fallback_vocab_size` when none are recorded.

    Returns 0.0 when the denominator is not positive.
    """
    types_after_context = follower_counts.get((prev_word,), 0)
    if types_after_context > 0:
        scale = types_after_context
    else:
        # Context has no recorded followers: spread the mass over the fallback size.
        scale = fallback_vocab_size
    smoothed_count = bigram_counts.get((prev_word, word), 0) + k
    normalizer = unigram_counts.get(prev_word, 0) + k * scale
    if normalizer > 0:
        return smoothed_count / normalizer
    return 0.0

def prob_token_type_trigram(word, p1, p2, k, bigram_counts, trigram_counts, follower_counts, fallback_vocab_size):
    """
    Token-type smoothed trigram probability of `word` after (p1, p2):
    (count(p1, p2, word) + k) / (count(p1, p2) + k * T),
    where T is the number of distinct follower types recorded for the
    context (p1, p2), or `fallback_vocab_size` when none are recorded.

    Returns 0.0 when the denominator is not positive.
    """
    history = (p1, p2)
    types_after_context = follower_counts.get(history, 0)
    if types_after_context > 0:
        scale = types_after_context
    else:
        # Context has no recorded followers: spread the mass over the fallback size.
        scale = fallback_vocab_size
    smoothed_count = trigram_counts.get((p1, p2, word), 0) + k
    normalizer = bigram_counts.get(history, 0) + k * scale
    if normalizer > 0:
        return smoothed_count / normalizer
    return 0.0

def prob_token_type_quadrigram(word, p1, p2, p3, k, trigram_counts, quadrigram_counts, follower_counts, fallback_vocab_size):
    """
    Token-type smoothed quadrigram probability of `word` after (p1, p2, p3):
    (count(p1, p2, p3, word) + k) / (count(p1, p2, p3) + k * T),
    where T is the number of distinct follower types recorded for the
    context (p1, p2, p3), or `fallback_vocab_size` when none are recorded.

    Returns 0.0 when the denominator is not positive.
    """
    history = (p1, p2, p3)
    types_after_context = follower_counts.get(history, 0)
    if types_after_context > 0:
        scale = types_after_context
    else:
        # Context has no recorded followers: spread the mass over the fallback size.
        scale = fallback_vocab_size
    smoothed_count = quadrigram_counts.get((p1, p2, p3, word), 0) + k
    normalizer = trigram_counts.get(history, 0) + k * scale
    if normalizer > 0:
        return smoothed_count / normalizer
    return 0.0

# -------------------------
# Sentence log-prob calculation
# -------------------------
def calculate_sentence_log_prob(sentence_tokens, model_name, smoothing_type, k, params):
    """
    Sum the natural-log probabilities of the tokens of one prepared sentence.

    Scoring starts at index 3, i.e. after the three '<s>' markers the
    sentence is expected to begin with, and runs to the last token, so the
    closing '</s>' is scored as well.

    sentence_tokens: token list already prepared with <s> x3 and </s>.
    model_name: 'unigram'|'bigram'|'trigram'|'quadrigram'.
    smoothing_type: 'add_k'|'token_type'. Ignored for the unigram model,
        which always uses add-k.
    k: smoothing parameter.
    params: tuple (counts, total_tokens, vocab_size, follower_maps), where
        counts and follower_maps are dicts keyed by model name.
    returns: log probability (natural log) as a float.
    raises: ValueError for an unknown model name, or for an unknown
        smoothing type with a non-unigram model. Both are checked once,
        before any token is scored.
    """
    counts, total_tokens, vocab_size, follower_maps = params
    unigrams = counts['unigram']
    bigrams = counts['bigram']
    trigrams = counts['trigram']
    quadrigrams = counts['quadrigram']

    # model -> (history length, add-k fn, token-type fn, history counts, n-gram counts)
    model_specs = {
        'unigram': (0, None, None, None, unigrams),
        'bigram': (1, prob_add_k_bigram, prob_token_type_bigram, unigrams, bigrams),
        'trigram': (2, prob_add_k_trigram, prob_token_type_trigram, bigrams, trigrams),
        'quadrigram': (3, prob_add_k_quadrigram, prob_token_type_quadrigram, trigrams, quadrigrams),
    }
    if model_name not in model_specs:
        raise ValueError("Unknown model name")
    history_len, add_k_fn, token_type_fn, history_counts, ngram_counts = model_specs[model_name]

    # Pick the per-token scorer once instead of re-dispatching on strings
    # for every token of the sentence.
    if history_len == 0:
        def token_prob(word, history):
            return prob_add_k_unigram(word, k, vocab_size, unigrams, total_tokens)
    elif smoothing_type == 'add_k':
        def token_prob(word, history):
            return add_k_fn(word, *history, k, vocab_size, history_counts, ngram_counts)
    elif smoothing_type == 'token_type':
        followers = follower_maps[model_name]

        def token_prob(word, history):
            return token_type_fn(word, *history, k, history_counts, ngram_counts, followers, vocab_size)
    else:
        raise ValueError(f"Unknown smoothing for {model_name}")

    # Floor used only when a probability is not positive (possible with k == 0).
    # It is deliberately NOT added to positive probabilities, which would
    # bias every term of the sum.
    eps = 1e-12
    log_prob = 0.0
    for i in range(3, len(sentence_tokens)):
        # i >= 3 and history_len <= 3, so the slice start is never negative.
        prob = token_prob(sentence_tokens[i], sentence_tokens[i - history_len:i])
        log_prob += math.log(prob if prob > 0 else eps)

    return log_prob

# -------------------------
# Main pipeline
# -------------------------
def main():
    """
    Run the whole experiment: load, count, score, and write the results.

    Steps, in order:
      1. seed `random` and `numpy.random` with RANDOM_SEED;
      2. load the sentences and prepare them (tokens plus boundary markers);
      3. build the 1- to 4-gram counts and the follower maps;
      4. draw up to 1000 evaluation sentences from the same prepared list
         the counts were built from;
      5. score each of them under every model/smoothing combination;
      6. write one row per (sentence, model) to RESULTS_CSV and print the
         first 20 rows.
    """
    random.seed(RANDOM_SEED)
    np.random.seed(RANDOM_SEED)
    start_time = time.time()

    # Load (full file unless DEBUG_LIMIT is set) and prepare the corpus.
    raw_sentences = load_data_from_parquet(PARQUET_FILE_PATH, COLUMN_NAME, debug_limit=DEBUG_LIMIT)
    prepared = prepare_sentences(raw_sentences)

    # N-gram statistics; the vocabulary includes the <s> and </s> markers.
    counts, total_tokens = build_ngram_models(prepared)
    vocab_size = len(counts['unigram'])
    print(f"Vocab size: {vocab_size}, total tokens: {total_tokens}")

    print("Computing follower counts for token-type smoothing...")
    follower_maps = {
        order: compute_follower_counts(counts[order])
        for order in ('bigram', 'trigram', 'quadrigram')
    }
    print("Follower maps ready.")

    # Evaluation sample: 1000 random sentences, or everything if fewer exist.
    N_TEST = 1000
    if len(prepared) < N_TEST:
        test_sentences = prepared
        print(f"Warning: only {len(prepared)} sentences available for testing.")
    else:
        test_sentences = random.sample(prepared, N_TEST)
    print(f"Selected {len(test_sentences)} sentences for evaluation.")

    # Every scoring configuration as (label, model, smoothing, k), listed in
    # the order its row is emitted for each sentence. The unigram model only
    # gets the add-k variants; the other models get add-1, add-k and
    # token-type smoothing with k=1.
    k_for_add_k = 0.5
    params = (counts, total_tokens, vocab_size, follower_maps)
    configurations = [
        (f'unigram_add_{k}', 'unigram', 'add_k', k) for k in (1.0, k_for_add_k)
    ]
    for model in ('bigram', 'trigram', 'quadrigram'):
        configurations.append((f'{model}_add_1', model, 'add_k', 1.0))
        configurations.append((f'{model}_add_{k_for_add_k}', model, 'add_k', k_for_add_k))
        configurations.append((f'{model}_token_type_k1', model, 'token_type', 1.0))

    results = []
    print("Evaluating...")
    for idx, sent in enumerate(test_sentences, 1):
        # Shortened text for the report only; scoring uses the full token list.
        joined = ' '.join(sent)
        sent_brief = joined if len(joined) <= 120 else joined[:117] + '...'

        if idx % 100 == 0:
            print(f"Processed {idx}/{len(test_sentences)}")

        for label, model, smoothing, k in configurations:
            logp = calculate_sentence_log_prob(sent, model, smoothing, k, params)
            results.append({
                'Sentence_ID': idx,
                'Sentence': sent_brief,
                'Model': label,
                'LogProb': logp
            })

    # Persist the table and show its first rows.
    print(f"Saving {len(results)} evaluation rows to {RESULTS_CSV} ...")
    out_dir = os.path.dirname(os.path.abspath(RESULTS_CSV))
    os.makedirs(out_dir, exist_ok=True)

    df_results = pd.DataFrame(results)
    df_results.to_csv(RESULTS_CSV, index=False)
    print(df_results.head(20).to_string(index=False))

    end_time = time.time()
    print(f"Completed in {end_time - start_time:.2f} seconds.")

# Run the pipeline only when this file is executed directly, not when imported.
if __name__ == '__main__':
    main()


Reading parquet from: ../ass1/gujarati_sentence_tokenized.parquet
Loaded 12118231 sentences.
Tokenizing and preparing sentences (adding <s> x3 and </s>) ...
