In [82]:
import nltk
import numpy as np
from collections import defaultdict
from copy import deepcopy

# nltk.download("brown")
tagged_sents = list(nltk.corpus.brown.tagged_sents(categories="news"))
training_size = int(len(tagged_sents) * 0.9)
training_data = tagged_sents[:training_size]
test_data = tagged_sents[training_size:]

tags_histogram = defaultdict(int)
words_map = defaultdict(lambda: defaultdict(int))
for sentence in training_data:
    for word, part_of_speech in sentence:
        words_map[word][part_of_speech] += 1
        tags_histogram[part_of_speech] += 1



In [83]:
# Baseline - most likely tag with no other assumptions
most_probable_tag = dict()
for word in words_map:
    most_probable_tag[word] = max(words_map[word].items(), key=lambda pair: pair[1])[0]

known_words_count = 0
known_hits_count = 0
unknown_words_count = 0
unknown_hits_count = 0

for sentence in test_data:
    for word, real_tag in sentence:
        if word in most_probable_tag:
            predicted_tag = most_probable_tag[word]
            known_words_count += 1
            if predicted_tag == real_tag:
                known_hits_count += 1
        else:
            predicted_tag = "NN"
            unknown_words_count += 1
            if predicted_tag == real_tag: 
                unknown_hits_count += 1
known_words_accuracy = known_hits_count / known_words_count
unknown_words_accuracy = unknown_hits_count / unknown_words_count
total_accuracy = (known_hits_count + unknown_hits_count) / (known_words_count + unknown_words_count)

print("Known words prediction err:", 1 - known_words_accuracy)
print("Unknown words prediction err:", 1 - unknown_words_accuracy)
print("Total prediction err:", 1 - total_accuracy)

Known words prediction err: 0.08315517047372567
Unknown words prediction err: 0.7897033158813264
Total prediction err: 0.16385926442738963


In [84]:
# Bigram HMM
emissions = defaultdict(lambda: defaultdict(float))
for word in words_map:
    for part_of_speech in words_map[word]:
        emissions[word][part_of_speech] = words_map[word][part_of_speech] / tags_histogram[part_of_speech]

transitions = defaultdict(float)
for sentence in training_data:
    previous_tag = '*'
    for word, tag in sentence:
        transitions[(previous_tag, tag)] += 1
        previous_tag = tag
    transitions[(previous_tag, "STOP")] += 1
for transition in transitions:
    transitions[transition] /= tags_histogram[transition[0]]

def viterbi(sentence, transitions, emissions):
    n = len(sentence)
    # Calling pi pie so I won't confuse it with pi
    pie = [defaultdict(float) for i in range(n + 1)]
    for tag in tags_histogram.keys():
        pie[0][tag] = (None, 1.0)
    for k in range(1, n + 1):
        for tag in tags_histogram.keys():
            max_arg = "NN"
            max_value = 0
            for previous_tag in tags_histogram.keys():
                emission = emissions[sentence[k - 1][0]][tag]
                transition = transitions[(previous_tag, tag)]
                pr = pie[k - 1][previous_tag][1] * emission * transition
                if pr > max_value:
                    max_arg = previous_tag
                    max_value = pr
            pie[k][tag] = (max_arg, max_value)

viterbi(training_data[3], transitions, emissions)





1.7328530181074028e-69 CC NNS 37
defaultdict(<class 'float'>, {'AT': ('NN', 0), 'NP-TL': ('NN', 0), 'NN-TL': ('NN', 0), 'JJ-TL': ('NN', 0), 'VBD': ('NN', 0), 'NR': ('NN', 0), 'NN': ('JJ', 4.726809495838191e-15), 'IN': ('NN', 0), 'NP$': ('NN', 0), 'JJ': ('NN', 0), '``': ('NN', 0), "''": ('NN', 0), 'CS': ('NN', 0), 'DTI': ('NN', 0), 'NNS': ('NN', 0), '.': ('NN', 0), 'RBR': ('NN', 0), ',': ('NN', 0), 'WDT': ('NN', 0), 'HVD': ('NN', 0), 'VBZ': ('NN', 0), 'CC': ('NN', 0), 'IN-TL': ('NN', 0), 'BEDZ': ('NN', 0), 'VBN': ('NN', 0), 'NP': ('NN', 0), 'BEN': ('NN', 0), 'TO': ('NN', 0), 'VB': ('NN', 0), 'RB': ('NN', 0), 'DT': ('NN', 0), 'PPS': ('NN', 0), 'DOD': ('NN', 0), 'AP': ('NN', 0), 'BER': ('NN', 0), 'HV': ('NN', 0), 'DTS': ('NN', 0), 'VBG': ('NN', 0), 'PPO': ('NN', 0), 'QL': ('NN', 0), 'JJT': ('NN', 0), 'ABX': ('NN', 0), 'NN-HL': ('NN', 0), 'VBN-HL': ('NN', 0), 'WRB': ('NN', 0), 'CD': ('NN', 0), 'MD': ('NN', 0), 'BE': ('NN', 0), 'JJR': ('NN', 0), 'VBG-TL': ('NN', 0), 'BEZ': ('NN', 0), 'NN$-T