# 50.007 Machine Learning, Spring 2025
# Design Project

Due 27 Apr 2025, 5:00pm

By: Aishwarya Iyer (1007141) and Khoo Zi Qi (1006984)

## Part 1 (30 points)

In [None]:
import os

# Build the path from its components instead of hard-coding a backslash.
# os.path.join inserts the separator of the current platform, so the value is
# still "EN\train" on Windows but also resolves correctly on macOS/Linux.
# This also avoids the escape pitfall: in a normal (non-raw) string literal,
# the "\t" in "EN\train" would be read as a tab character.
train_file_path = os.path.join("EN", "train")

Write a function that estimates the emission parameters from the training set using MLE (maximum likelihood estimation): 

In [11]:
"""
Computes emission parameters for an HMM: e(x|y) = Count(y → x) / Count(y)
where:
- x: observed word
- y: corresponding tag (e.g., 'B-NP', 'I-VP', 'O')
"""
# Use defaultdict so that missing keys are initialised automatically
from collections import defaultdict

def compute_emission_parameters(train_file_path):
    """
    Estimate HMM emission probabilities by maximum likelihood:
    e(word | tag) = Count(tag -> word) / Count(tag).

    Blank lines are ignored. Any non-blank line that does not split into
    exactly two whitespace-separated fields is reported on stdout and skipped.

    Args:
        train_file_path: Path to the training file, one "word tag" pair per line

    Returns:
        Dictionary of dictionaries: emission_parameters[tag][word] = probability
    """
    # pair_tally[tag][word] -> number of times the tag emitted the word
    # tag_totals[tag]       -> number of tokens carrying the tag
    pair_tally = defaultdict(lambda: defaultdict(int))
    tag_totals = defaultdict(int)

    with open(train_file_path, 'r', encoding='utf-8') as handle:
        for raw in handle:
            entry = raw.strip()
            if not entry:
                # Blank lines are skipped without being counted
                continue

            fields = entry.split()
            if len(fields) != 2:
                print(f"Skipping invalid line: {entry}")
                continue

            observed, label = fields
            pair_tally[label][observed] += 1
            tag_totals[label] += 1

    # Normalise every (tag, word) count by the total count of its tag
    emission_parameters = defaultdict(dict)
    for label, per_word in pair_tally.items():
        denominator = tag_totals[label]
        emission_parameters[label] = {
            observed: hits / denominator for observed, hits in per_word.items()
        }

    return emission_parameters

# Estimate the unsmoothed MLE emission table from the training file.
emission_parameters = compute_emission_parameters(train_file_path)
# print(emission_parameters)  # uncomment to inspect the table

Use smoothing (10 points)
- Identify words that appear fewer than k = 3 times in the training set
- Replace those words with the special token #UNK#


In [None]:
def compute_emission_parameters_smoothing(train_file_path, k):
    """
    Estimate HMM emission probabilities with rare-word smoothing.

    Every word that occurs fewer than k times in the whole training file
    (summed over all tags) is replaced by the token '#UNK#' before the
    maximum-likelihood estimate e(word | tag) = Count(tag -> word) / Count(tag)
    is computed.

    Blank lines are ignored. Any non-blank line that does not split into
    exactly two whitespace-separated fields is reported on stdout and skipped;
    such a line contributes to neither the word counts nor the emission counts.

    Args:
        train_file_path: Path to training file (word-tag pairs separated by whitespace)
        k: minimum count of word. If less, replace word with #UNK#.

    Returns:
        Dictionary of dictionaries: emission_parameters[tag][word] = probability
    """
    # The file is parsed once, collecting:
    # - word_counts[word]     = occurrences of the word under any tag
    # - raw_counts[tag][word] = times the (unmodified) word appears with tag
    # - tag_counts[tag]       = total occurrences of tag
    word_counts = defaultdict(int)
    raw_counts = defaultdict(lambda: defaultdict(int))
    tag_counts = defaultdict(int)

    with open(train_file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if not line:  # Skip empty lines
                continue

            fields = line.split()  # Split by any whitespace
            if len(fields) != 2:
                # Malformed lines are skipped here, once, so the rare-word
                # statistics and the emission counts are built from exactly
                # the same set of pairs.
                print(f"Skipping invalid line: {line}")
                continue

            word, tag = fields
            word_counts[word] += 1
            raw_counts[tag][word] += 1
            tag_counts[tag] += 1

    # Fold rare words into '#UNK#'. Rarity uses the global word count,
    # not the per-tag count.
    emission_counts = defaultdict(lambda: defaultdict(int))
    for tag, words in raw_counts.items():
        for word, count in words.items():
            processed_word = '#UNK#' if word_counts[word] < k else word
            emission_counts[tag][processed_word] += count

    # Calculate emission probabilities
    emission_parameters = defaultdict(dict)
    for tag, words in emission_counts.items():
        total_tag_occurrences = tag_counts[tag]
        emission_parameters[tag] = {
            word: count / total_tag_occurrences for word, count in words.items()
        }

    return emission_parameters

# Re-estimate the emission table with rare-word smoothing (threshold k = 3).
# This rebinds emission_parameters, replacing the unsmoothed table computed earlier.
emission_parameters = compute_emission_parameters_smoothing(train_file_path, k = 3)
# print(emission_parameters)  # uncomment to inspect the table

