In [1]:
import pandas as pd
from collections import Counter, defaultdict
import random
import re
import numpy as np
import time

# --- 1. Data Loading and Preparation ---

# Location of the Parquet file that main() passes to load_data_from_parquet().
PARQUET_FILE_PATH = '../ass1/gujarati_sentence_tokenized.parquet'
# Name of the column in that file from which the sentences are read.
COLUMN_NAME = 'sentence'

def load_data_from_parquet(file_path, column_name, max_sentences=100000):
    """
    Loads sentences from a specified column in a Parquet file.

    Args:
        file_path: Path of the Parquet file to read.
        column_name: Name of the column holding the sentences.
        max_sentences: Upper bound on the number of rows returned, taken from
            the start of the column. Defaults to 100000 (the cap this function
            has always applied); pass None to return every row.

    Returns:
        A list with the column's values (at most ``max_sentences`` of them),
        or None when the file is missing, the column does not exist, or any
        other error occurs while reading. Problems are reported on stdout
        rather than raised.
    """
    try:
        print(f"Attempting to read data from '{file_path}'...")
        df = pd.read_parquet(file_path)

        if column_name not in df.columns:
            print(f"Error: Column '{column_name}' not found in the Parquet file.")
            print(f"Available columns are: {df.columns.tolist()}")
            return None

        # Only the first `max_sentences` rows are kept; slicing with None
        # keeps the whole column.
        sentences = df[column_name].tolist()[:max_sentences]
        print(f"Successfully loaded {len(sentences)} sentences for the training set.")
        return sentences

    except FileNotFoundError:
        print(f"Error: The file '{file_path}' was not found.")
        print("Please make sure the PARQUET_FILE_PATH is correct.")
        return None
    except Exception as e:
        # Deliberate best-effort boundary: any other failure (bad file format,
        # missing Parquet engine, ...) is reported and signalled with None.
        print(f"An unexpected error occurred: {e}")
        return None

# Token pattern, compiled once at import time instead of on every call.
# Alternatives are tried left to right, so the multi-character token kinds
# (URL, e-mail, date, number) win over the plain word/punctuation fallbacks.
# Every piece is a raw string so that regex escapes such as \w and \s reach
# the `re` module untouched; `re` itself understands the \uXXXX escapes.
_GUJARATI_TOKEN_RE = re.compile(
    '|'.join((
        r'https?://\S+|www\.\S+',                      # URLs
        r'\b[\w\.-]+@[\w\.-]+\.\w+\b',                 # e-mail addresses
        r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b'           # dates like 12/05/2020
        r'|\b\d{1,2}(?:st|nd|rd|th)?\s+\w+\s+\d{4}\b',  # dates like 5th March 2020
        r'\b\d+(?:[\.,]\d+)?\b',                       # integers / decimals
        r'[a-zA-Z]+',                                  # runs of ASCII letters
        r'[\u0A80-\u0AFF]+',                           # runs in U+0A80..U+0AFF (Gujarati)
        r'[^\w\s]',                                    # any other single symbol
    )),
    re.UNICODE,
)


def gujarati_word_tokenizer(sentence):
    """
    Tokenizes a Gujarati sentence using comprehensive regex logic.

    Whitespace runs are first collapsed to single spaces and the ends are
    stripped. The text is then split into URLs, e-mail addresses, dates,
    numbers, runs of ASCII letters, runs of characters in U+0A80-U+0AFF, and
    individual punctuation/symbol characters.

    Args:
        sentence: The text to tokenize.

    Returns:
        A list of token strings in order of appearance; an empty list when
        ``sentence`` is not a ``str``.
    """
    if not isinstance(sentence, str):
        return []

    normalized = re.sub(r'\s+', ' ', sentence.strip())
    return _GUJARATI_TOKEN_RE.findall(normalized)

def prepare_sentences(sentences):
    """
    Turns raw sentences into token lists wrapped in boundary markers.

    Each sentence is tokenized with gujarati_word_tokenizer and the result is
    enclosed in a single '<s>' ... '</s>' pair. Progress messages are printed
    before and after the work.

    Returns:
        A list with one token list per input sentence.
    """
    print("Tokenizing sentences and adding start/end tokens...")
    # Exactly one start marker is stored per sentence; higher-order models
    # add further start-token padding themselves when they score a sentence.
    wrapped = [
        ['<s>', *gujarati_word_tokenizer(text), '</s>']
        for text in sentences
    ]
    print(f"Finished preparing {len(wrapped)} sentences.")
    return wrapped

# --- 2. N-gram Model Building ---

def generate_ngrams(words, n):
    """Lazily yield every window of ``n`` consecutive items of ``words`` as a tuple."""
    last_start = len(words) - n  # highest index at which a full window still fits
    start = 0
    while start <= last_start:
        yield tuple(words[start:start + n])
        start += 1

def build_ngram_models(sentences):
    """
    Counts unigrams, bigrams, trigrams and quadrigrams over prepared sentences.

    Args:
        sentences: Iterable of token lists.

    Returns:
        A tuple ``(models, total_words)`` where ``models`` maps 'unigram',
        'bigram', 'trigram' and 'quadrigram' to a Counter of the respective
        n-grams, and ``total_words`` is the summed length of all sentences.
    """
    print("Building language models...")
    tables = {
        'unigram': Counter(),
        'bigram': Counter(),
        'trigram': Counter(),
        'quadrigram': Counter(),
    }
    higher_orders = (('bigram', 2), ('trigram', 3), ('quadrigram', 4))

    token_total = 0
    for tokens in sentences:
        token_total += len(tokens)
        # Unigrams are keyed by the bare token, higher orders by tuples.
        tables['unigram'].update(tokens)
        for label, order in higher_orders:
            tables[label].update(generate_ngrams(tokens, order))

    print("Models built successfully.")
    return tables, token_total

# --- 3. Smoothing Implementations ---

# 3a. Add-K Smoothing Probability Functions
def prob_add_k_unigram(word, k, vocab_size, unigram_counts, total_word_count):
    """Return P(word) under Add-K smoothing.

    The value is (count(word) + k) / (total_word_count + k * vocab_size);
    0.0 is returned when that denominator is not positive.
    """
    smoothed_total = total_word_count + (k * vocab_size)
    if smoothed_total > 0:
        return (unigram_counts.get(word, 0) + k) / smoothed_total
    return 0.0

def prob_add_k_bigram(word, prev_word, k, vocab_size, unigram_counts, bigram_counts):
    """Return P(word | prev_word) under Add-K smoothing.

    The value is (count(prev_word, word) + k) / (count(prev_word) + k * vocab_size);
    0.0 is returned when that denominator is not positive.
    """
    context_mass = unigram_counts.get(prev_word, 0) + (k * vocab_size)
    if context_mass > 0:
        pair_count = bigram_counts.get((prev_word, word), 0)
        return (pair_count + k) / context_mass
    return 0.0

def prob_add_k_trigram(word, p1, p2, k, vocab_size, bigram_counts, trigram_counts):
    """Return P(word | p1, p2) under Add-K smoothing.

    The value is (count(p1, p2, word) + k) / (count(p1, p2) + k * vocab_size);
    0.0 is returned when that denominator is not positive.
    """
    history = (p1, p2)
    context_mass = bigram_counts.get(history, 0) + (k * vocab_size)
    if context_mass > 0:
        return (trigram_counts.get(history + (word,), 0) + k) / context_mass
    return 0.0

def prob_add_k_quadrigram(word, p1, p2, p3, k, vocab_size, trigram_counts, quadrigram_counts):
    """Return P(word | p1, p2, p3) under Add-K smoothing.

    The value is (count(p1, p2, p3, word) + k) / (count(p1, p2, p3) + k * vocab_size);
    0.0 is returned when that denominator is not positive.
    """
    history = (p1, p2, p3)
    context_mass = trigram_counts.get(history, 0) + (k * vocab_size)
    if context_mass > 0:
        return (quadrigram_counts.get(history + (word,), 0) + k) / context_mass
    return 0.0

# 3b. Token Type Smoothing Preparation and Probability Functions
def compute_follower_counts(ngram_counts):
    """Map every context (an n-gram minus its last item) to its number of distinct followers.

    Only the n-grams themselves are inspected; their counts are ignored.
    """
    distinct_followers = {}
    for ngram in ngram_counts:
        # Context = everything but the final item; follower = the final item.
        distinct_followers.setdefault(ngram[:-1], set()).add(ngram[-1])

    sizes = {}
    for history, tokens in distinct_followers.items():
        sizes[history] = len(tokens)
    return sizes

def prob_token_type_bigram(word, prev_word, k, unigram_counts, bigram_counts, follower_counts):
    """Return P(word | prev_word) under Token Type Smoothing.

    Like Add-K, except that k is scaled by the number of distinct token types
    recorded after ``prev_word`` in ``follower_counts`` (keyed by the 1-tuple
    context) rather than by the vocabulary size. Returns 0.0 when the
    denominator is not positive.
    """
    type_count = follower_counts.get((prev_word,), 0)
    context_mass = unigram_counts.get(prev_word, 0) + (k * type_count)
    if context_mass > 0:
        return (bigram_counts.get((prev_word, word), 0) + k) / context_mass
    return 0.0

def prob_token_type_trigram(word, p1, p2, k, bigram_counts, trigram_counts, follower_counts):
    """Return P(word | p1, p2) under Token Type Smoothing.

    Like Add-K, except that k is scaled by the number of distinct token types
    recorded after the context (p1, p2) in ``follower_counts`` rather than by
    the vocabulary size. Returns 0.0 when the denominator is not positive.
    """
    history = (p1, p2)
    type_count = follower_counts.get(history, 0)
    context_mass = bigram_counts.get(history, 0) + (k * type_count)
    if context_mass > 0:
        return (trigram_counts.get(history + (word,), 0) + k) / context_mass
    return 0.0

def prob_token_type_quadrigram(word, p1, p2, p3, k, trigram_counts, quadrigram_counts, follower_counts):
    """Return P(word | p1, p2, p3) under Token Type Smoothing.

    Like Add-K, except that k is scaled by the number of distinct token types
    recorded after the context (p1, p2, p3) in ``follower_counts`` rather than
    by the vocabulary size. Returns 0.0 when the denominator is not positive.
    """
    history = (p1, p2, p3)
    type_count = follower_counts.get(history, 0)
    context_mass = trigram_counts.get(history, 0) + (k * type_count)
    if context_mass > 0:
        return (quadrigram_counts.get(history + (word,), 0) + k) / context_mass
    return 0.0


# --- 4. Sentence Probability Calculation ---

def calculate_sentence_log_prob(sentence, model_name, smoothing_type, k, params):
    """Calculates the log probability of a sentence using a specified model and smoothing.

    Args:
        sentence: Token list whose first element is the start marker; every
            token after it is scored.
        model_name: 'unigram', 'bigram', 'trigram' or 'quadrigram'.
        smoothing_type: 'add_k' or 'token_type'. The unigram model always
            uses Add-K, whatever is passed here.
        k: Smoothing constant forwarded to the probability function.
        params: Tuple ``(counts, total_words, vocab_size, follower_maps)``
            where ``counts`` holds the 'unigram'/'bigram'/'trigram'/
            'quadrigram' count tables and ``follower_maps`` the follower
            counts per model name.

    Returns:
        The sum of log(p + 1e-10) over the scored tokens (0.0 when there are
        none). An unrecognised model/smoothing combination leaves p at 0.0,
        so each token then contributes log(1e-10).
    """
    log_prob = 0.0
    epsilon = 1e-10  # keeps log() finite when a probability is exactly 0

    # Unpack parameters
    counts, total_words, vocab_size, follower_maps = params
    unigrams = counts['unigram']
    bigrams = counts['bigram']
    trigrams = counts['trigram']
    quadrigrams = counts['quadrigram']

    # Pad with three start tokens (the most a quadrigram context needs).
    # With this layout sentence[i] is located at padded_sentence[i + 2].
    padded_sentence = ['<s>'] * 3 + sentence[1:]

    for i in range(1, len(sentence)):  # Start from first word after <s>
        word = sentence[i]
        prob = 0.0

        # The history of sentence[i] therefore ENDS at padded_sentence[i + 1].
        # (Indexing i + 2 here would make the word its own context.)
        # p1, p2, p3 = w_{i-3}, w_{i-2}, w_{i-1}
        p1, p2, p3 = padded_sentence[i - 1:i + 2]
        # NOTE(review): contexts made only of padding, e.g. ('<s>', '<s>'),
        # can only have non-zero counts if the count tables were built from
        # equally padded sentences - confirm against the training code.

        # Select the correct function based on model and smoothing
        if model_name == 'unigram':
            prob = prob_add_k_unigram(word, k, vocab_size, unigrams, total_words)
        elif model_name == 'bigram':
            if smoothing_type == 'add_k':
                prob = prob_add_k_bigram(word, p3, k, vocab_size, unigrams, bigrams)
            elif smoothing_type == 'token_type':
                prob = prob_token_type_bigram(word, p3, k, unigrams, bigrams, follower_maps['bigram'])
        elif model_name == 'trigram':
            if smoothing_type == 'add_k':
                prob = prob_add_k_trigram(word, p2, p3, k, vocab_size, bigrams, trigrams)
            elif smoothing_type == 'token_type':
                prob = prob_token_type_trigram(word, p2, p3, k, bigrams, trigrams, follower_maps['trigram'])
        elif model_name == 'quadrigram':
            if smoothing_type == 'add_k':
                prob = prob_add_k_quadrigram(word, p1, p2, p3, k, vocab_size, trigrams, quadrigrams)
            elif smoothing_type == 'token_type':
                prob = prob_token_type_quadrigram(word, p1, p2, p3, k, trigrams, quadrigrams, follower_maps['quadrigram'])

        log_prob += np.log(prob + epsilon)

    return log_prob

# --- 5. Main Execution Block ---
def main():
    """Run the pipeline end to end: load, count, smooth, score, report.

    Sentences are read from PARQUET_FILE_PATH, tokenized and counted into
    n-gram tables. Up to 1000 of the prepared sentences are then drawn at
    random and scored with every model/smoothing combination, and the first
    rows of the resulting table are printed together with the elapsed time.
    """
    started_at = time.time()

    # 1. Load and prepare the corpus
    raw_sentences = load_data_from_parquet(PARQUET_FILE_PATH, COLUMN_NAME)
    if not raw_sentences:
        print("\nExecution stopped: No data was loaded.")
        return

    corpus = prepare_sentences(raw_sentences)

    # 2. Count n-grams over every loaded sentence
    ngram_counts, token_total = build_ngram_models(corpus)
    vocabulary = len(ngram_counts['unigram'])
    print(f"Vocabulary Size (V): {vocabulary}")
    print(f"Total tokens in corpus (N): {token_total}")

    # 3. Distinct-follower tables needed by Token Type Smoothing
    print("\n--- Pre-computing Follower Counts for Token Type Smoothing ---")
    follower_maps = {
        order: compute_follower_counts(ngram_counts[order])
        for order in ('bigram', 'trigram', 'quadrigram')
    }
    print("Follower counts computed.")

    # 4. Draw the evaluation sample (taken from the prepared corpus itself)
    print("\n--- Selecting 1000 Sentences for Evaluation ---")
    if len(corpus) < 1000:
        eval_sentences = corpus
        print(f"Warning: Fewer than 1000 sentences. Using all {len(eval_sentences)} for testing.")
    else:
        eval_sentences = random.sample(corpus, 1000)
        print(f"Randomly selected {len(eval_sentences)} sentences.")

    # 5. Score every sampled sentence under each model/smoothing combination
    print("\n--- Applying Smoothed Models to Test Sentences ---")
    add_k_value = 0.5  # A common choice for K in Add-K smoothing
    model_bundle = (ngram_counts, token_total, vocabulary, follower_maps)

    # Each scheme is (smoothing_type, k, label suffix). The unigram model is
    # only evaluated with Add-1; the others get Add-1, Add-K and Token Type
    # (k=1), in that order.
    add_one = ('add_k', 1, 'add_1')
    all_schemes = [
        add_one,
        ('add_k', add_k_value, f'add_{add_k_value}'),
        ('token_type', 1, 'token_type'),
    ]
    schemes_by_model = {
        'unigram': [add_one],
        'bigram': all_schemes,
        'trigram': all_schemes,
        'quadrigram': all_schemes,
    }

    rows = []
    for position, sentence in enumerate(eval_sentences, start=1):
        if position % 100 == 0:
            print(f"Processing sentence {position}/{len(eval_sentences)}...")

        # Shorten long sentences so the result table stays readable.
        preview = ' '.join(sentence)
        if len(preview) > 70:
            preview = preview[:67] + '...'

        for model, schemes in schemes_by_model.items():
            for smoothing, scheme_k, suffix in schemes:
                score = calculate_sentence_log_prob(sentence, model, smoothing, scheme_k, model_bundle)
                rows.append({'Sentence': preview, 'Model': f'{model}_{suffix}', 'Log Probability': score})

    print("Evaluation complete.")

    # 6. Show the first rows of the result table
    print("\n--- Sample of Evaluation Results ---")
    results_df = pd.DataFrame(rows)
    pd.set_option('display.max_rows', 24)
    pd.set_option('display.width', 120)
    print(results_df.head(22))  # Display results for a couple of sentences

    finished_at = time.time()
    print(f"\nTotal execution time: {finished_at - started_at:.2f} seconds.")


# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

Attempting to read data from '../ass1/gujarati_sentence_tokenized.parquet'...
Successfully loaded 100000 sentences for the training set.
Tokenizing sentences and adding start/end tokens...
Finished preparing 100000 sentences.
Building language models...
Models built successfully.
Vocabulary Size (V): 143826
Total tokens in corpus (N): 2008988

--- Pre-computing Follower Counts for Token Type Smoothing ---
Follower counts computed.

--- Selecting 1000 Sentences for Evaluation ---
Randomly selected 1000 sentences.

--- Applying Smoothed Models to Test Sentences ---
Processing sentence 100/1000...
Processing sentence 200/1000...
Processing sentence 300/1000...
Processing sentence 400/1000...
Processing sentence 500/1000...
Processing sentence 600/1000...
Processing sentence 700/1000...
Processing sentence 800/1000...
Processing sentence 900/1000...
Processing sentence 1000/1000...
Evaluation complete.

--- Sample of Evaluation Results ---
                                             Sente