In [59]:
# imports
from collections import Counter, defaultdict
import numpy as np

# Pre-process Data

In [94]:
def preprocess_data(train_file, test_file):
    # Read and process the training data
    with open(train_file, 'r', encoding='utf-8') as file:
        train_lines = file.readlines()

    # Lowercase and count word occurrences
    word_count = {}
    train_processed = []
    for line in train_lines:
        # Pad and lowercase the line
        padded_line = '<s> ' + line.strip().lower() + ' </s>'
        train_processed.append(padded_line)
        
        # Count words
        words = padded_line.split()
        for word in words:
            if word not in word_count:
                word_count[word] = 0
            word_count[word] += 1

    # Replace single-occurrence words with <unk>
    train_final = []
    for line in train_processed:
        new_line = ' '.join('<unk>' if word_count[word] == 1 else word for word in line.split())
        train_final.append(new_line)

    # Process the test data, replacing unseen words with <unk>
    with open(test_file, 'r', encoding='utf-8') as file:
        test_lines = file.readlines()

    test_processed = []
    for line in test_lines:
        # Pad and lowercase the line
        padded_line = '<s> ' + line.strip().lower() + ' </s>'
        # Replace unseen words
        new_line = ' '.join('<unk>' if word not in word_count else word for word in padded_line.split())
        test_processed.append(new_line)

    train_split = [train_sentence.split() for train_sentence in train_final]
    test_split = [test_sentence.split() for test_sentence in test_processed]
    return train_split, test_split

In [83]:
train_file = 'brown.train.txt'
test_file = 'brown.test.txt'

train_data, test_data = preprocess_data(train_file, test_file)

In [84]:
# prints out 5 lines from the processed test and training txt file
def print_txt(data, num_samples=5):
    for i in range(num_samples):
        print(data[i])


# Display processed training and test data
print("Training Data Samples:")
print_txt(train_data)
print("\nTest Data Samples:")
print_txt(test_data)

Training Data Samples:
['<s>', 'the', 'fulton', 'county', 'grand', 'jury', 'said', 'friday', 'an', 'investigation', 'of', "atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.', '</s>']
['<s>', 'the', '<unk>', 'term', 'jury', 'had', 'been', 'charged', 'by', 'fulton', 'superior', 'court', 'judge', '<unk>', '<unk>', 'to', 'investigate', 'reports', 'of', 'possible', '``', 'irregularities', "''", 'in', 'the', 'hard-fought', 'primary', 'which', 'was', 'won', 'by', '<unk>', 'ivan', 'allen', 'jr.', '.', '</s>']
['<s>', 'the', 'jury', 'said', 'it', 'did', 'find', 'that', 'many', 'of', "georgia's", 'registration', 'and', 'election', 'laws', '``', 'are', 'outmoded', 'or', 'inadequate', 'and', 'often', 'ambiguous', "''", '.', '</s>']
['<s>', 'it', 'recommended', 'that', 'fulton', 'legislators', 'act', '``', 'to', 'have', 'these', 'laws', 'studied', 'and', 'revised', 'to', 'the', 'end', 'of', 'modernizing', 'and

# 1.2 Training the Models

## Helper functions to generate unigrams and bigrams

In [85]:
# Helper function to generate unigrams and bigrams
def generate_ngrams(sentences):
    unigrams = Counter()
    bigrams = defaultdict(Counter)

    for sentence in sentences:
        # Generate unigrams
        for word in sentence:
            unigrams[word] += 1
        # Generate bigrams
        for w1, w2 in zip(sentence[:-1], sentence[1:]):
            bigrams[w1][w2] += 1

    return unigrams, bigrams


# Generate the unigrams and bigrams from the sentences
unigrams, bigrams = generate_ngrams(train_data)

# Total number of words (tokens) in the corpus
total_tokens = sum(unigrams.values())
print("Total # of Tokens: ", total_tokens)

# Vocabulary size
vocabulary_size = len(unigrams)
print("Total # of Unique Words: ", vocabulary_size)

Total # of Tokens:  1018784
Total # of Unique Words:  24796


In [88]:
# Unigram Maximum Likelihood Estimate

def unigram_mle(word):
    return unigrams[word] / total_tokens

In [89]:
# Bigram Maximum Likelihood Estimate

def bigram_mle(w1, w2):
    if unigrams[w1] == 0: return 0
    return bigrams[w1][w2] / unigrams[w1]

In [90]:
# Bigram Add-One Smoothing

def bigram_add_one(w1, w2):
    return (bigrams[w1][w2] + 1) / (unigrams[w1] + vocabulary_size)

In [91]:
# Katz Backoff

def bigram_katz_backoff(w1, w2, discount=0.5):
    bigram_count = bigrams[w1][w2]
    unigram_count = unigrams[w1]

    # Epsilon is a small value to handle unseen words and avoid division by zero
    epsilon = 1e-6

    if bigram_count > 0:
        return max((bigram_count - discount) / (unigram_count if unigram_count > 0 else epsilon), epsilon)
    else:
        # Calculate backoff weight only if w1 is seen
        backoff_weight = (discount * len(bigrams[w1]) / (unigram_count if unigram_count > 0 else epsilon)) if unigram_count > 0 else epsilon
        unigram_probability = unigrams[w2] / (total_tokens if total_tokens > 0 else epsilon)
        return max(backoff_weight * unigram_probability, epsilon)

# 1.3: Questions

In [95]:
def read_corpus(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return [line.strip().lower() for line in file]


def count_words(corpus):
    word_count = Counter()
    for line in corpus:
        words = line.split()
        word_count.update(words)
    return word_count


train_corpus = read_corpus(train_file)
test_corpus = read_corpus(test_file)

## 1. Number of Word Types

In [97]:
# Number of word types in training
train_word_count = count_words(train_corpus)
# Adding the unknown token
train_word_count['<unk>'] += 1

# Count unique word types including padding symbols and the unknown token
train_types_including_padding_and_unk = len(train_word_count)


#######################################################################################
# Number of word tokens in training

# The sum of the values in train_word_count gives the number of tokens
train_tokens_including_padding = sum(train_word_count.values())


######################################################################################## 
# Percent of unseen word tokens + types in test
test_word_count = count_words(test_corpus)

# Identify unseen words and count unseen tokens
unseen_test_types = set(test_word_count.keys()) - set(train_word_count.keys())
unseen_test_tokens_count = sum(test_word_count[word] for word in unseen_test_types)

# Calculate percentages
percentage_unseen_test_types = (len(unseen_test_types) / len(test_word_count)) * 100
percentage_unseen_test_tokens = (unseen_test_tokens_count / sum(test_word_count.values())) * 100

In [98]:
print(f"Number of word types in the training corpus (including padding and <unk>): {train_types_including_padding_and_unk}")
print(f"Number of word tokens in the training corpus (including padding): {train_tokens_including_padding}")
print(f"Percentage of unseen word types in test data: {percentage_unseen_test_types:.2f}%")
print(f"Percentage of unseen word tokens in test data: {percentage_unseen_test_tokens:.2f}%")

Number of word types in the training corpus (including padding and <unk>): 44978
Number of word tokens in the training corpus (including padding): 927141
Percentage of unseen word types in test data: 15.63%
Percentage of unseen word tokens in test data: 2.19%


In [100]:
import math

# Define the sentences
sentences = [
    "<s> he was laughed off the screen . </s>",
    "<s> there was no compulsion behind them . </s>",
    "<s> i look forward to hearing your reply . </s>"
]

# Assumed precomputed counts from training data
# Assuming dictionaries `unigrams` and `bigrams` are already filled from previous code
# Assuming `vocabulary_size` and `total_tokens` are defined from training data preprocessing


def compute_unigram_log_prob(sentence, unigrams, total_tokens):
    words = sentence.split()
    log_prob = sum(math.log(unigrams[word] / total_tokens)for word in words)
    return log_prob


def compute_bigram_log_prob(sentence, unigrams, bigrams):
    words = sentence.split()
    log_prob = 0
    for w1, w2 in zip(words[:-1], words[1:]):
        bigram_count = bigrams[w1][w2]
        if bigram_count == 0:
            # Log probability is -inf if any bigram count is zero
            return float('-inf')
        log_prob += math.log(bigram_count / unigrams[w1])
    return log_prob


def compute_bigram_add_one_log_prob(sentence, unigrams, bigrams, vocabulary_size):
    words = sentence.split()
    log_prob = 0
    for w1, w2 in zip(words[:-1], words[1:]):
        bigram_count = bigrams[w1][w2]
        unigram_count = unigrams[w1]
        log_prob += math.log((bigram_count + 1) / (unigram_count + vocabulary_size))
    return log_prob


# Compute and display the log probabilities
for sentence in sentences:
    print(f"Sentence: {sentence}")
    unigram_log_prob = compute_unigram_log_prob(sentence, unigrams, total_tokens)
    bigram_log_prob = compute_bigram_log_prob(sentence, unigrams, bigrams)
    bigram_add_one_log_prob = compute_bigram_add_one_log_prob(sentence, unigrams, bigrams, vocabulary_size)
    print(f"Unigram Model Log Probability: {unigram_log_prob}")
    print(f"Bigram Model Log Probability: {bigram_log_prob}")
    print(f"Bigram with Add-One Smoothing Log Probability: {bigram_add_one_log_prob}")
    print()

Sentence: <s> he was laughed off the screen . </s>
Unigram Model Log Probability: -50.00253069510074
Bigram Model Log Probability: -29.226429204762596
Bigram with Add-One Smoothing Log Probability: -48.567846423407694

Sentence: <s> there was no compulsion behind them . </s>
Unigram Model Log Probability: -53.82193870947094
Bigram Model Log Probability: -24.057083130827618
Bigram with Add-One Smoothing Log Probability: -46.04452641155733

Sentence: <s> i look forward to hearing your reply . </s>
Unigram Model Log Probability: -63.4380424815825
Bigram Model Log Probability: -inf
Bigram with Add-One Smoothing Log Probability: -66.80947224624923



In [102]:
def compute_unigram_perplexity(sentence, unigrams, total_tokens):
    words = sentence.split()
    log_prob = sum(math.log2(unigrams[word] / total_tokens)for word in words)
    perplexity = 2 ** (-log_prob / len(words))
    return perplexity


def compute_bigram_perplexity(sentence, unigrams, bigrams):
    words = sentence.split()
    log_probs = []
    for w1, w2 in zip(words[:-1], words[1:]):
        bigram_count = bigrams[w1][w2]
        if bigram_count == 0:
            # Perplexity is infinite if any bigram count is zero
            return float('inf')
        log_probs.append(math.log2(bigram_count / unigrams[w1]))
    perplexity = 2 ** (-sum(log_probs) / len(words))
    return perplexity


def compute_bigram_add_one_perplexity(sentence, unigrams, bigrams, vocabulary_size):
    words = sentence.split()
    log_probs = []
    for w1, w2 in zip(words[:-1], words[1:]):
        bigram_count = bigrams[w1][w2]
        unigram_count = unigrams[w1]
        log_probs.append(math.log2((bigram_count + 1) /(unigram_count + vocabulary_size)))
    perplexity = 2 ** (-sum(log_probs) / len(words))
    return perplexity


# Compute and display the perplexities
sentences = [
    "<s> he was laughed off the screen . </s>",
    "<s> there was no compulsion behind them . </s>",
    "<s> i look forward to hearing your reply . </s>"
]

for sentence in sentences:
    print(f"Sentence: {sentence}")
    unigram_perplexity = compute_unigram_perplexity(sentence, unigrams, total_tokens)
    bigram_perplexity = compute_bigram_perplexity(sentence, unigrams, bigrams)
    bigram_add_one_perplexity = compute_bigram_add_one_perplexity(sentence, unigrams, bigrams, vocabulary_size)
    print(f"Unigram Model Perplexity: {unigram_perplexity:.2f}")
    print(f"Bigram Model Perplexity: {bigram_perplexity if bigram_perplexity != float('inf') else 'inf'}")
    print(f"Bigram with Add-One Smoothing Perplexity: {bigram_add_one_perplexity:.2f}")
    print()

Sentence: <s> he was laughed off the screen . </s>
Unigram Model Perplexity: 258.74
Bigram Model Perplexity: 25.72288397506204
Bigram with Add-One Smoothing Perplexity: 220.62

Sentence: <s> there was no compulsion behind them . </s>
Unigram Model Perplexity: 395.53
Bigram Model Perplexity: 14.483487925135439
Bigram with Add-One Smoothing Perplexity: 166.68

Sentence: <s> i look forward to hearing your reply . </s>
Unigram Model Perplexity: 568.96
Bigram Model Perplexity: inf
Bigram with Add-One Smoothing Perplexity: 797.07



In [105]:
def safe_log(x):
    if x <= 0:
        return float('-inf')
    return math.log(x)

# Calculate total log probability for unigram model


def unigram_corpus_log_prob(corpus, unigrams, total_tokens):
    total_log_prob = 0
    word_count = 0
    for sentence in corpus:
        for word in sentence:
            prob = unigrams.get(word, 1) / total_tokens
            total_log_prob += safe_log(prob)
            word_count += 1
    return total_log_prob, word_count

# Calculate total log probability for bigram model


def bigram_corpus_log_prob(corpus, unigrams, bigrams):
    total_log_prob = 0
    word_count = 0
    for sentence in corpus:
        for w1, w2 in zip(sentence[:-1], sentence[1:]):
            bigram_count = bigrams.get((w1, w2), 0)
            unigram_count = unigrams.get(w1, 1)
            prob = bigram_count / unigram_count if unigram_count != 0 else 0
            total_log_prob += safe_log(prob)
            word_count += 1
    return total_log_prob, word_count

# Calculate total log probability for bigram model with Add-One smoothing


def bigram_add_one_corpus_log_prob(corpus, unigrams, bigrams, vocabulary_size):
    total_log_prob = 0
    word_count = 0
    for sentence in corpus:
        for w1, w2 in zip(sentence[:-1], sentence[1:]):
            bigram_count = bigrams.get((w1, w2), 0)
            unigram_count = unigrams.get(w1, 0)
            prob = (bigram_count + 1) / (unigram_count + vocabulary_size)
            total_log_prob += safe_log(prob)
            word_count += 1
    return total_log_prob, word_count

# Calculate perplexity for the entire corpus


def corpus_perplexity(total_log_prob, total_word_count):
    return 2 ** (-total_log_prob / total_word_count)


# Compute log probabilities and word counts
total_log_prob_unigram, total_words_unigram = unigram_corpus_log_prob(
    test_corpus, unigrams, total_tokens)
total_log_prob_bigram, total_words_bigram = bigram_corpus_log_prob( test_corpus, unigrams, bigrams)
total_log_prob_bigram_add_one, total_words_bigram_add_one = bigram_add_one_corpus_log_prob(test_corpus, unigrams, bigrams, vocabulary_size)

# Compute perplexities
perplexity_unigram = corpus_perplexity(total_log_prob_unigram, total_words_unigram)
perplexity_bigram = corpus_perplexity(total_log_prob_bigram, total_words_bigram)
perplexity_bigram_add_one = corpus_perplexity(total_log_prob_bigram_add_one, total_words_bigram_add_one)

print(f"Unigram Model Perplexity for the Test Corpus: {perplexity_unigram}")
print(f"Bigram Model Perplexity for the Test Corpus: {perplexity_bigram if perplexity_bigram != float('inf') else 'inf'}")
print(f"Bigram with Add-One Smoothing Perplexity for the Test Corpus: {perplexity_bigram_add_one}")

Unigram Model Perplexity for the Test Corpus: 1261.2473476641203
Bigram Model Perplexity for the Test Corpus: inf
Bigram with Add-One Smoothing Perplexity for the Test Corpus: 1156.4532096440262


### Results and Conclusions

#### Overview
In this project, we analyzed a training corpus consisting of 927,141 word tokens and 44,978 unique word types (including padding symbols and a special `<unk>` token for unknown words). We applied three statistical language models — unigram, bigram, and bigram with Add-One smoothing — to compute the log probabilities and perplexities of various test sentences. Additionally, we explored the model performances across the entire test corpus.

#### Vocabulary and Coverage
The training corpus showed a rich vocabulary with nearly 45,000 unique types. However, when applying the models to the test data, 15.63% of word types and 2.19% of word tokens were unseen in the training data. This indicates a significant diversity in language usage between the training and test sets, which is a common challenge in language modeling.

#### Sentence-wise Analysis
For individual sentences, the log probability and perplexity metrics highlighted the strengths and weaknesses of each model:

- **Sentence: "He was laughed off the screen."**
  - The bigram model significantly outperformed the unigram model in terms of lower perplexity (25.72 vs. 258.74), indicating better context sensitivity.
  - The Add-One smoothing increased the perplexity relative to the bigram model, showing the trade-off between smoothing and model precision.

- **Sentence: "There was no compulsion behind them."**
  - Again, the bigram model achieved much lower perplexity (14.48) compared to the unigram model (395.53), highlighting its effectiveness in handling specific phrase structures.
  - The smoothed bigram model showed improved performance over the unigram but was not as effective as the unigram model in capturing sentence structure.

- **Sentence: "I look forward to hearing your reply."**
  - This sentence revealed limitations in the bigram model, which registered an infinite perplexity due to encountering an unseen bigram combination, showing the model's vulnerability to data sparsity.
  - The smoothed bigram model's high perplexity (797.07) indicates that even smoothing techniques may not always compensate adequately for sparse data.

#### Overall Corpus Analysis
- **Unigram Model Perplexity:** 1261.25
  - Indicates poor predictive performance across the test corpus, likely due to its ignorance of word order and context.
  
- **Bigram Model Perplexity:** ∞
  - Suggests the presence of many unseen bigrams in the test data, leading to infinite perplexity and highlighting the model's inability to handle new word combinations effectively without smoothing.
  
- **Bigram with Add-One Smoothing Perplexity:** 1156.45
  - Although better than the unigram model, the high perplexity indicates that even with smoothing, the model struggles with the diversity and complexity of the test corpus.

### Conclusion
The experiment underscores the critical importance of context in language processing, as demonstrated by the superior performance of the bigram model over the unigram model in most scenarios where the data was seen during training. However, the presence of unseen data poses significant challenges, particularly for the non-smoothed bigram model. The use of smoothing techniques, like Add-One smoothing, but may not be sufficient for highly diverse datasets.