In [28]:
from nltk.corpus import brown
import math
from collections import defaultdict

tagged_sents = brown.tagged_sents(categories='news')
split_point = math.floor(len(tagged_sents) * 0.9)

train_sents = tagged_sents[:split_point]
test_sents = tagged_sents[split_point:]

In [29]:
word_tag_counts = defaultdict(lambda: defaultdict(int))
for sent in train_sents:
    for word, tag in sent:
        word_tag_counts[word][tag] += 1

word_most_freq_tag = {}

for word, tag_counts in word_tag_counts.items():
    most_freq_tag = max(tag_counts.items(), key=lambda item: item[1])[0]
    word_most_freq_tag[word] = most_freq_tag

known_words = set(word_most_freq_tag.keys())

# Part (b)ii: Evaluate the baseline on the test set
total_known = 0
errors_known = 0

total_unknown = 0
errors_unknown = 0

total = 0
errors = 0

for sent in test_sents:
    for word, true_tag in sent:
        total += 1
        if word in known_words:
            predicted_tag = word_most_freq_tag[word]
            total_known += 1
            if predicted_tag != true_tag:
                errors_known += 1
        else:
            predicted_tag = 'NN'
            total_unknown += 1
            if predicted_tag != true_tag:
                errors_unknown += 1

error_rate_known = errors_known / total_known if total_known > 0 else 0
error_rate_unknown = errors_unknown / total_unknown if total_unknown > 0 else 0
total_error_rate = (errors_known + errors_unknown) / total if total > 0 else 0

print("=== Most Likely Tag Baseline Results ===")
print(f"Error Rate for Known Words: {error_rate_known:.4f}")
print(f"Error Rate for Unknown Words: {error_rate_unknown:.4f}")
print(f"Total Error Rate: {total_error_rate:.4f}")

=== Most Likely Tag Baseline Results ===
Error Rate for Known Words: 0.0832
Error Rate for Unknown Words: 0.7897
Total Error Rate: 0.1639


In [31]:
import math

tagged_sents = brown.tagged_sents(categories='news')
split_point = math.floor(len(tagged_sents) * 0.9)

train_sents = tagged_sents[:split_point]
test_sents = tagged_sents[split_point:]

# Part (c)i: Training phase
transition_counts = defaultdict(lambda: defaultdict(int))
emission_counts = defaultdict(lambda: defaultdict(int))
tag_counts = defaultdict(int)

START = "<s>"

for sent in train_sents:
    prev_tag = START
    tag_counts[prev_tag] += 1
    for word, tag in sent:
        transition_counts[prev_tag][tag] += 1
        emission_counts[tag][word] += 1
        tag_counts[tag] += 1
        prev_tag = tag

transition_probs = defaultdict(dict)
for prev_tag in transition_counts:
    total = sum(transition_counts[prev_tag].values())
    for tag in transition_counts[prev_tag]:
        transition_probs[prev_tag][tag] = transition_counts[prev_tag][tag] / total

emission_probs = defaultdict(dict)
for tag in emission_counts:
    total = sum(emission_counts[tag].values())
    for word in emission_counts[tag]:
        emission_probs[tag][word] = emission_counts[tag][word] / total


# Part (c)ii: Viterbi algorithm
def viterbi(sentence, transition_probs, emission_probs, tags, start_tag=START, unknown_tag='NN'):
    MIN_PROB = 1e-6
    trellis = [{}]
    paths = {}

    first_word = sentence[0]
    for tag in tags:
        transition_probability = transition_probs[start_tag].get(tag, MIN_PROB)
        emission_probability = emission_probs[tag].get(first_word, emission_probs[tag].get(unknown_tag, MIN_PROB))
        trellis[0][tag] = transition_probability * emission_probability
        paths[tag] = [tag]

    for position in range(1, len(sentence)):
        trellis.append({})
        new_paths = {}
        current_word = sentence[position]
        for current_tag in tags:
            emission_probability = emission_probs[current_tag].get(
                current_word,
                emission_probs[current_tag].get(unknown_tag, MIN_PROB)
            )
            max_probability = 0
            best_previous_tag = None
            for previous_tag in tags:
                transition_probability = transition_probs[previous_tag].get(current_tag, MIN_PROB)
                probability = trellis[position - 1][previous_tag] * transition_probability * emission_probability
                if probability > max_probability:
                    max_probability = probability
                    best_previous_tag = previous_tag
            trellis[position][current_tag] = max_probability
            new_paths[current_tag] = paths[best_previous_tag] + [current_tag]
        paths = new_paths

    best_final_tag = max(trellis[-1], key=trellis[-1].get)
    return paths[best_final_tag]


# Part (c)iii: Running on test set and computing error rates

known_words = set()
for tag in emission_counts:
    for word in emission_counts[tag]:
        known_words.add(word)

tags = list(tag_counts.keys())
unknown_tag = 'NN'  # Arbitrary tag for unknown words

total = 0
errors = 0

total_known = 0
errors_known = 0

total_unknown = 0
errors_unknown = 0

for sent in test_sents:
    words, true_tags = zip(*sent)
    predicted_tags = viterbi(words, transition_probs, emission_probs, tags, unknown_tag=unknown_tag)
    for word, pred, true in zip(words, predicted_tags, true_tags):
        total += 1
        if pred != true:
            errors += 1
        if word in known_words:
            total_known += 1
            if pred != true:
                errors_known += 1
        else:
            total_unknown += 1
            if pred != true:
                errors_unknown += 1

error_rate_known = errors_known / total_known if total_known > 0 else 0
error_rate_unknown = errors_unknown / total_unknown if total_unknown > 0 else 0
total_error_rate = errors / total if total > 0 else 0

print(f"Error Rate for Known Words: {error_rate_known:.4f}")
print(f"Error Rate for Unknown Words: {error_rate_unknown:.4f}")
print(f"Total Error Rate: {total_error_rate:.4f}")


Error Rate for Known Words: 0.0479
Error Rate for Unknown Words: 0.7190
Total Error Rate: 0.1246


In [34]:
V = len(known_words) + 1  # +1 for handling unknown words as '<UNK>'
emission_probs_smooth = defaultdict(dict)

for tag in emission_counts:
    denominator = tag_counts[tag] + V
    for word in emission_counts[tag]:
        emission_probs_smooth[tag][word] = (emission_counts[tag][word] + 1) / denominator
    emission_probs_smooth[tag]['<UNK>'] = 1 / denominator

def replace_unknowns(sentence, known_words):
    return [word if word in known_words else '<UNK>' for word in sentence]

total = 0
errors = 0

total_known = 0
errors_known = 0

total_unknown = 0
errors_unknown = 0

for sent in test_sents:
    words, true_tags = zip(*sent)
    processed_words = replace_unknowns(words, known_words)
    predicted_tags = viterbi(processed_words, transition_probs, emission_probs_smooth, tags, unknown_tag='<UNK>')
    for word, pred, true in zip(words, predicted_tags, true_tags):
        total += 1
        if pred != true:
            errors += 1
        if word in known_words:
            total_known += 1
            if pred != true:
                errors_known += 1
        else:
            total_unknown += 1
            if pred != true:
                errors_unknown += 1

error_rate_known = errors_known / total_known if total_known > 0 else 0
error_rate_unknown = errors_unknown / total_unknown if total_unknown > 0 else 0
total_error_rate = errors / total if total > 0 else 0

print("=== Add-One Smoothing Results ===")
print(f"Error Rate for Known Words: {error_rate_known:.4f}")
print(f"Error Rate for Unknown Words: {error_rate_unknown:.4f}")
print(f"Total Error Rate: {total_error_rate:.4f}")

=== Add-One Smoothing Results ===
Error Rate for Known Words: 0.1788
Error Rate for Unknown Words: 0.7522
Total Error Rate: 0.2443


In [26]:
FREQUENCY_THRESHOLD = 5

def create_pseudo_word(word, frequency):
    if any(char.isdigit() for char in word):
        return '_NUM'
    elif word.isupper():
        return '_ALLCAPS'
    elif word[0].isupper():
        return '_CAPITAL'
    elif word.endswith('ing'):
        return '_ING'
    elif word.endswith('ed'):
        return '_ED'
    elif word.endswith('s'):
        return '_S'
    elif word.endswith('ly'):
        return '_LY'
    elif frequency < FREQUENCY_THRESHOLD:
        return '_UNK'
    else:
        return word

word_freq = defaultdict(int)
for sent in train_sents:
    for word, tag in sent:
        word_freq[word] += 1

processed_train_sents_pseudo = []
for sent in train_sents:
    new_sent = []
    for word, tag in sent:
        pseudo_word = create_pseudo_word(word, word_freq[word])
        new_sent.append((pseudo_word, tag))
    processed_train_sents_pseudo.append(new_sent)

processed_test_sents_pseudo = []
for sent in test_sents:
    new_sent = []
    for word, tag in sent:
        pseudo_word = create_pseudo_word(word, word_freq.get(word, 0))
        new_sent.append((pseudo_word, tag))
    processed_test_sents_pseudo.append(new_sent)

transition_counts_pseudo = defaultdict(lambda: defaultdict(int))
emission_counts_pseudo = defaultdict(lambda: defaultdict(int))
tag_counts_pseudo = defaultdict(int)

for sent in processed_train_sents_pseudo:
    prev_tag = START
    tag_counts_pseudo[prev_tag] += 1
    for word, tag in sent:
        transition_counts_pseudo[prev_tag][tag] += 1
        emission_counts_pseudo[tag][word] += 1
        tag_counts_pseudo[tag] += 1
        prev_tag = tag

transition_probs_pseudo = defaultdict(dict)
for prev_tag in transition_counts_pseudo:
    total = sum(transition_counts_pseudo[prev_tag].values())
    for tag in transition_counts_pseudo[prev_tag]:
        transition_probs_pseudo[prev_tag][tag] = transition_counts_pseudo[prev_tag][tag] / total

emission_probs_pseudo = defaultdict(dict)
for tag in emission_counts_pseudo:
    total = sum(emission_counts_pseudo[tag].values())
    for word in emission_counts_pseudo[tag]:
        emission_probs_pseudo[tag][word] = emission_counts_pseudo[tag][word] / total

known_pseudo_words = set()
for tag in emission_counts_pseudo:
    for word in emission_counts_pseudo[tag]:
        known_pseudo_words.add(word)

tags_pseudo = list(tag_counts_pseudo.keys())

unknown_tag_pseudo = 'NN'

total_pseudo = 0
errors_pseudo = 0

total_known_pseudo = 0
errors_known_pseudo = 0

total_unknown_pseudo = 0
errors_unknown_pseudo = 0

for sent in processed_test_sents_pseudo:
    words, true_tags = zip(*sent)
    predicted_tags = viterbi(words, transition_probs_pseudo, emission_probs_pseudo, tags_pseudo, unknown_tag=unknown_tag_pseudo)
    for word, pred, true in zip(words, predicted_tags, true_tags):
        total_pseudo += 1
        if pred != true:
            errors_pseudo += 1
        if word in known_pseudo_words:
            total_known_pseudo += 1
            if pred != true:
                errors_known_pseudo += 1
        else:
            total_unknown_pseudo += 1
            if pred != true:
                errors_unknown_pseudo += 1

error_rate_known_pseudo = errors_known_pseudo / total_known_pseudo if total_known_pseudo > 0 else 0
error_rate_unknown_pseudo = errors_unknown_pseudo / total_unknown_pseudo if total_unknown_pseudo > 0 else 0
total_error_rate_pseudo = errors_pseudo / total_pseudo if total_pseudo > 0 else 1

# Print the results
print("=== Pseudo-Words with MLE Results ===")
print(f"Error Rate for Known Words: {error_rate_known_pseudo:.4f}")
print(f"Error Rate for Unknown Words: {error_rate_unknown_pseudo:.4f}")
print(f"Total Error Rate: {total_error_rate_pseudo:.4f}")

=== Pseudo-Words with MLE Results ===
Error Rate for Known Words: 0.1858
Error Rate for Unknown Words: 0.0000
Total Error Rate: 0.1858


In [25]:
import pandas as pd

unique_test_tags_pseudo = set(tag for sent in processed_test_sents_pseudo for _, tag in sent)
unique_all_tags_pseudo = sorted(list(set(tags_pseudo).union(unique_test_tags_pseudo)))
confusion_matrix = pd.DataFrame(0, index=unique_all_tags_pseudo, columns=unique_all_tags_pseudo)

word_freq_pseudo = defaultdict(int)
for sent in train_sents:
    for word, tag in sent:
        word_freq_pseudo[word] += 1

processed_train_sents_pseudo = []
for sent in train_sents:
    new_sent = []
    for word, tag in sent:
        pseudo_word = create_pseudo_word(word, word_freq_pseudo[word])
        new_sent.append((pseudo_word, tag))
    processed_train_sents_pseudo.append(new_sent)

processed_test_sents_pseudo = []
for sent in test_sents:
    new_sent = []
    for word, tag in sent:
        pseudo_word = create_pseudo_word(word, word_freq_pseudo.get(word, 0))
        new_sent.append((pseudo_word, tag))
    processed_test_sents_pseudo.append(new_sent)

transition_counts_pseudo = defaultdict(lambda: defaultdict(int))
emission_counts_pseudo = defaultdict(lambda: defaultdict(int))
tag_counts_pseudo = defaultdict(int)

for sent in processed_train_sents_pseudo:
    prev_tag = START
    tag_counts_pseudo[prev_tag] += 1
    for word, tag in sent:
        transition_counts_pseudo[prev_tag][tag] += 1
        emission_counts_pseudo[tag][word] += 1
        tag_counts_pseudo[tag] += 1
        prev_tag = tag

transition_probs_pseudo = defaultdict(dict)
for prev_tag in transition_counts_pseudo:
    total = sum(transition_counts_pseudo[prev_tag].values())
    for tag in transition_counts_pseudo[prev_tag]:
        transition_probs_pseudo[prev_tag][tag] = transition_counts_pseudo[prev_tag][tag] / total

emission_probs_pseudo_smooth = defaultdict(dict)
V_pseudo = len(set(word for tag in emission_counts_pseudo for word in emission_counts_pseudo[tag])) + 1  # +1 for '<UNK>'

for tag in emission_counts_pseudo:
    total = sum(emission_counts_pseudo[tag].values()) + V_pseudo  # Add-One Smoothing
    for word in emission_counts_pseudo[tag]:
        emission_probs_pseudo_smooth[tag][word] = (emission_counts_pseudo[tag][word] + 1) / total
    # Probability for unknown words
    emission_probs_pseudo_smooth[tag]['<UNK>'] = 1 / total

known_pseudo_words = set()
for tag in emission_counts_pseudo:
    for word in emission_counts_pseudo[tag]:
        known_pseudo_words.add(word)

tags_pseudo = list(tag_counts_pseudo.keys())

unknown_tag_pseudo = 'NN'

total_pseudo_smooth = 0
errors_pseudo_smooth = 0

total_known_pseudo_smooth = 0
errors_known_pseudo_smooth = 0

total_unknown_pseudo_smooth = 0
errors_unknown_pseudo_smooth = 0

for sent in processed_test_sents_pseudo:
    words, true_tags = zip(*sent)
    processed_words = [word if word in known_pseudo_words else '<UNK>' for word in words]
    predicted_tags = viterbi(processed_words, transition_probs_pseudo, emission_probs_pseudo_smooth, tags_pseudo, unknown_tag=unknown_tag_pseudo)
    for word, pred, true in zip(words, predicted_tags, true_tags):
        total_pseudo_smooth += 1
        if pred != true:
            errors_pseudo_smooth += 1
        if word in known_pseudo_words:
            total_known_pseudo_smooth += 1
            if pred != true:
                errors_known_pseudo_smooth += 1
        else:
            total_unknown_pseudo_smooth += 1
            if pred != true:
                errors_unknown_pseudo_smooth += 1
        confusion_matrix.loc[true, pred] += 1

error_rate_known_pseudo_smooth = errors_known_pseudo_smooth / total_known_pseudo_smooth if total_known_pseudo_smooth > 0 else 0
error_rate_unknown_pseudo_smooth = errors_unknown_pseudo_smooth / total_unknown_pseudo_smooth if total_unknown_pseudo_smooth > 0 else 0
total_error_rate_pseudo_smooth = errors_pseudo_smooth / total_pseudo_smooth if total_pseudo_smooth > 0 else 0

print("=== Pseudo-Words with Add-One Smoothing Results ===")
print(f"Error Rate for Known Words: {error_rate_known_pseudo_smooth:.4f}")
print(f"Error Rate for Unknown Words: {error_rate_unknown_pseudo_smooth:.4f}")
print(f"Total Error Rate: {total_error_rate_pseudo_smooth:.4f}")


print("\n=== Confusion Matrix ===")
print(confusion_matrix)


=== Pseudo-Words with Add-One Smoothing Results ===
Error Rate for Known Words: 0.1903
Error Rate for Unknown Words: 0.0000
Total Error Rate: 0.1903

=== Confusion Matrix ===
         '  ''   (  (-HL   )  )-HL  *  *-HL  ,  ,-HL  ...  VBZ-HL  WDT  \
'        5   0   0     0   0     0  0     0  0     0  ...       0    0   
''       0  50   0     0   0     0  0     0  0     0  ...       0    0   
(        0   0  17     0   0     0  0     0  0     0  ...       0    0   
(-HL     0   0   0     0   0     0  0     0  0     0  ...       0    0   
)        0   0   0     0  13     0  0     0  3     0  ...       0    0   
...     ..  ..  ..   ...  ..   ... ..   ... ..   ...  ...     ...  ...   
WPS      0   0   0     0   0     0  0     0  0     0  ...       0    0   
WPS+BEZ  0   0   0     0   0     0  0     0  0     0  ...       0    0   
WQL      0   0   0     0   0     0  0     0  0     0  ...       0    0   
WRB      0   0   0     0   0     0  0     0  0     0  ...       0    0   
``       0 

=== Pseudo-Words with Add-One Smoothing Results ===
Error Rate for Known Words: 0.1903
Error Rate for Unknown Words: 0.0000
Total Error Rate: 0.1903

=== Confusion Matrix ===
         '  ''   (  (-HL   )  )-HL  *  *-HL  ,  ,-HL  ...  VBZ-HL  WDT  \
'        5   0   0     0   0     0  0     0  0     0  ...       0    0
''       0  50   0     0   0     0  0     0  0     0  ...       0    0
(        0   0  17     0   0     0  0     0  0     0  ...       0    0
(-HL     0   0   0     0   0     0  0     0  0     0  ...       0    0
)        0   0   0     0  13     0  0     0  3     0  ...       0    0
...     ..  ..  ..   ...  ..   ... ..   ... ..   ...  ...     ...  ...
WPS      0   0   0     0   0     0  0     0  0     0  ...       0    0
WPS+BEZ  0   0   0     0   0     0  0     0  0     0  ...       0    0
WQL      0   0   0     0   0     0  0     0  0     0  ...       0    0
WRB      0   0   0     0   0     0  0     0  0     0  ...       0    0
``       0   0   0     0   0     0  0     0  0     0  ...       0    0