<a href="https://colab.research.google.com/github/thedatadj/natural-language-processing/blob/main/pos_tagging_markov_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Use a hidden Markov model for Part-of-Speech tagging.

# Datasets

In [3]:
# Download the training corpus (saved as /content/WSJ_02-21.pos)
!gdown 18yqFMkgBIjKOiDZ43s9JXfClO3YNZotf
# Load training corpus: one raw line of the file per list element
with open("/content/WSJ_02-21.pos", "r") as file0:
    training_data = file0.readlines()
# Download the vocabulary file (saved as /content/hmm_vocab.txt)
!gdown 1FtzoPTuRqF6DIgvWSRIJZLnlp959uiAr
# Load the vocabulary, one entry per line of the file.
# NOTE(review): splitting on '\n' yields a trailing '' entry when the file
# ends with a newline -- confirm that this entry is intended.
with open("/content/hmm_vocab.txt", 'r') as file1:
    vocab = file1.read().split('\n')
# {word: index} dictionary; indices follow the sorted order of the vocabulary.
# NOTE(review): a duplicated vocabulary entry would keep only its last index,
# leaving gaps in the index range -- presumably the vocabulary is unique.
wid = {}
for i, word in enumerate(sorted(vocab)):
    wid[word] = i
# Download the tagged test corpus (saved as /content/WSJ_24.pos)
!gdown 1f-tIhCz9A6Kj9kqrpnhNbma4qPrJDnZL
# Load the tagged test corpus: one raw line per list element
with open('/content/WSJ_24.pos', 'r') as file2:
    testdata = file2.readlines()
# Download and import preprocess function (saved as /content/utils_pos.py)
!gdown 1fes2W5p9zRVvJxpr9IE7MsIUd459N5BE
from utils_pos import preprocess
# Download the test words file (saved as /content/test.words)
!gdown 1jBel8t5KpXi0tXXFoB6rD9cf6NCcDu5X
# Preprocess the words of test.words against the vocabulary. Only the second
# return value is kept; it is indexed later as the sequence of test words.
# NOTE(review): exact preprocessing rules live in utils_pos -- confirm there.
_, testcorp = preprocess(vocab, "/content/test.words")

Downloading...
From: https://drive.google.com/uc?id=18yqFMkgBIjKOiDZ43s9JXfClO3YNZotf
To: /content/WSJ_02-21.pos
100% 8.28M/8.28M [00:00<00:00, 132MB/s]
Downloading...
From: https://drive.google.com/uc?id=1FtzoPTuRqF6DIgvWSRIJZLnlp959uiAr
To: /content/hmm_vocab.txt
100% 197k/197k [00:00<00:00, 108MB/s]
Downloading...
From: https://drive.google.com/uc?id=1f-tIhCz9A6Kj9kqrpnhNbma4qPrJDnZL
To: /content/WSJ_24.pos
100% 286k/286k [00:00<00:00, 103MB/s]
Downloading...
From: https://drive.google.com/uc?id=1fes2W5p9zRVvJxpr9IE7MsIUd459N5BE
To: /content/utils_pos.py
100% 8.09k/8.09k [00:00<00:00, 35.9MB/s]
Downloading...
From: https://drive.google.com/uc?id=1jBel8t5KpXi0tXXFoB6rD9cf6NCcDu5X
To: /content/test.words
100% 180k/180k [00:00<00:00, 87.0MB/s]


# Helper dictionaries
These dictionaries hold the transition, emission, and tag counts from which the probability matrices below are built.

In [4]:
# Use the defaultdict class
from collections import defaultdict

In [5]:
# Helper function
from utils_pos import get_word_tag

In [6]:
# Count tables built from the training corpus. All default to 0 for unseen keys.
tcounts = defaultdict(int)    # (previous tag, tag) -> number of occurrences
ecounts = defaultdict(int)    # (tag, word)         -> number of occurrences
tagcounts = defaultdict(int)  # tag                 -> number of occurrences

# Initialize the previous tag with the start-of-sentence tag.
# This must be the same "--s--" tag that the Viterbi initialization looks up
# in `states` as the start state. Any other marker (e.g. '<s>') never appears
# in `tagcounts`, so the first transition of the corpus would be counted under
# a key that the transition matrix never reads.
prev_tag = '--s--'

# Loop over every line of the training corpus
for wordtag in training_data:

    # Get word and tag for this line.
    # NOTE(review): get_word_tag comes from utils_pos; presumably it maps
    # out-of-vocabulary words to unknown-word tokens and blank lines to the
    # "--s--" tag -- confirm in utils_pos.
    word, tag = get_word_tag(wordtag, wid)

    # Update the transition, emission and tag counts
    tcounts[(prev_tag, tag)] += 1
    ecounts[(tag, word)] += 1
    tagcounts[tag] += 1

    # The current tag becomes the previous tag of the next line
    prev_tag = tag

# States of the HMM: every tag seen in training, in sorted order
states = sorted(tagcounts.keys())

# Transition probabilities matrix

In [7]:
import numpy as np

In [8]:
# Additive smoothing constant, shared with the emission matrix
alpha = 0.001

# Row/column order of the matrix: the sorted list of tags seen in training
all_tags = sorted(tagcounts.keys())
num_tags = len(all_tags)

# A[i, j] = smoothed P(tag j | previous tag i)
A = np.zeros((num_tags, num_tags))

# Set of observed (previous tag, tag) pairs; not read in this cell
trans_keys = set(tcounts.keys())

for i, prev in enumerate(all_tags):
    # The denominator depends only on the previous tag
    count_prev_tag = tagcounts[prev]
    for j, cur in enumerate(all_tags):
        key = (prev, cur)
        # .get() avoids inserting unseen pairs into the defaultdict
        count = tcounts.get(key, 0)
        # Smoothing
        A[i,j] = (count + alpha) / (count_prev_tag + alpha * num_tags)

# Show the top-left corner as a sanity check
A[:2, :2]

array([[7.03997297e-06, 7.03997297e-06],
       [1.35647553e-07, 1.35647553e-07]])

# Emission probabilities matrix

In [9]:
# One column per vocabulary entry
num_words = len(wid)

# B[i, j] = smoothed P(word j | tag i)
B = np.zeros((num_tags, num_words))

# Set of observed (tag, word) pairs; not read in this cell
emis_keys = set(list(ecounts.keys()))

# Vocabulary words in the insertion order of `wid`
# NOTE(review): column j is later addressed through wid[word], so this
# assumes position j in this list equals wid[vc[j]] -- confirm.
vc = list(wid)

for i, tag_name in enumerate(all_tags):
    # The denominator depends only on the tag
    count_tag = tagcounts[tag_name]
    for j, vocab_word in enumerate(vc):
        key = (tag_name, vocab_word)
        # .get() avoids inserting unseen pairs into the defaultdict
        count = ecounts.get(key, 0)
        B[i,j] = (count + alpha) / (count_tag + alpha * num_words)

In [10]:
# Spot-check one smoothed emission probability (tag row 0, word column 0)
B[0, 0]

6.032199882975323e-06

In [11]:
# Spot-check another smoothed emission probability (tag row 3, word column 1)
B[3,1]

7.195398974080014e-07

# Viterbi algorithm

## Initialization

In [12]:
import math

# Viterbi tables, one row per tag and one column per word of the test corpus:
#   best_probs holds log probabilities, best_paths holds tag (row) indices.
best_probs = np.zeros((num_tags, len(testcorp)))
best_paths = np.zeros((num_tags, len(testcorp)), dtype=int)

# Row of A that belongs to the start tag
s_idx = states.index("--s--")

# Fill the first column: log of (transition from the start tag to each tag)
# times (emission of the first test word by that tag)
first_word_idx = wid[testcorp[0]]
for tag_idx in range(num_tags):
    best_probs[tag_idx, 0] = math.log(A[s_idx, tag_idx] * B[tag_idx, first_word_idx])

In [13]:
# First column has been initialized, so this is a log probability
print(best_probs[0, 0])
# Columns after the first still hold the zeros from np.zeros at this point
print(best_probs[2, 3])

-22.60982633354825
0.0


## Forward pass

In [77]:
# Viterbi forward pass in log space.
# For every word i (from the second word on) and every tag j, find the
# previous tag k that maximizes
#     best_probs[k, i-1] + log A[k, j] + log B[j, word i]
# and store the maximum in best_probs[j, i] and the maximizing k in
# best_paths[j, i]. Ties keep the lowest k (strict '>' comparison).

# Log of every transition probability, computed once here instead of once
# per (word, tag, previous tag) triple inside the loops below.
log_A = [[math.log(A[k, j]) for j in range(num_tags)] for k in range(num_tags)]

# Loop words
for i in range(1, len(testcorp)):
    # Column of B for the current word; identical for every pair of tags
    word_idx = wid[testcorp[i]]
    # Loop tags
    for j in range(num_tags):
        # The emission term does not depend on the previous tag
        log_emission = math.log(B[j, word_idx])
        best_prob_i = float("-inf")
        best_path_i = None
        # Loop previous word tags
        for k in range(num_tags):
            # Same left-to-right sum as before, so results are unchanged
            prob = best_probs[k, i-1] + log_A[k][j] + log_emission
            if prob > best_prob_i:
                best_prob_i = prob
                best_path_i = k
        best_probs[j,i] = best_prob_i
        best_paths[j,i] = best_path_i

In [78]:
# Spot-check two log probabilities of tag row 0 after the forward pass
print(f"best_probs[0,1]: {best_probs[0,1]:.4f}")
print(f"best_probs[0,4]: {best_probs[0,4]:.4f}")

best_probs[0,1]: -24.7822
best_probs[0,4]: -49.5601


## Backward pass

In [80]:
# Number of columns of best_paths, i.e. one per word of the test corpus
m = best_paths.shape[1]

In [81]:
# z[i] will hold the row (tag) index chosen for the word in column i;
# every entry starts as None until the backward pass fills it in
z = [None] * m

In [82]:
# pred[i] will hold the predicted tag name for the word in column i
pred = [None] * m

In [83]:
# Number of rows of best_paths, i.e. one per tag
n = best_paths.shape[0]

Get the best probability for the last column/word.

In [84]:
# Running maximum of the last column of best_probs; starting at -inf lets
# any finite log probability replace it
bplw = float("-inf")

In [85]:
# Scan the last column of best_probs for its largest value; on ties the
# lowest row index wins because the comparison is strict
for row in range(n):
    candidate = best_probs[row, -1]
    if candidate > bplw:
        bplw = candidate
        # Remember which row (tag) achieved the maximum
        z[-1] = row

Add tag of last word to predicted sequence of tags.

In [86]:
# Translate the winning row index of the last word into its tag name
pred[-1] = states[z[-1]]
# Display the predicted tag of the last word
pred[-1]

'--s--'

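Trace back through the `best_paths` matrix (the D matrix of the Viterbi algorithm) to recover the tag of each preceding word.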

In [87]:
# Backtrace: walk from the last word down to the second word.
# At step i the tag index of word i is already known (z[i]), and
# best_paths[z[i], i] stores the tag index of word i-1 that the forward pass
# selected for it.
# The loop stops at i = 1. Running it for i = 0 would write to z[-1] and
# pred[-1] (negative indexing wraps to the end of the list) and overwrite the
# prediction of the last word with the unused first column of best_paths.
for i in range(m - 1, 0, -1):
    # Index of tag of previous word
    z[i - 1] = best_paths[z[i], i]

    # Save tag of previous word
    pred[i - 1] = states[z[i - 1]]

In [92]:
# Show a few words next to their predicted tags, words first and tags second
# in both prints. The second label names the slice that is actually printed.
print('The prediction for pred[-7:m-1] is: \n', testcorp[-7:m-1], "\n", pred[-7:m-1], "\n")
print('The prediction for pred[0:7] is: \n', testcorp[0:7], "\n", pred[0:7])

The prediction for pred[-7:m-1] is: 
 ['see', 'them', 'here', 'with', 'us', '.'] 
 ['VB', 'PRP', 'RB', 'IN', 'PRP', '.'] 

The prediction for pred[0:8] is: 
 ['DT', 'NN', 'POS', 'NN', 'MD', 'VB', 'VBN'] 
 ['The', 'economy', "'s", 'temperature', 'will', 'be', 'taken']


# Evaluation
Compute accuracy

In [124]:
# Compare each predicted tag with the gold tag on the matching line of the
# tagged test file. Lines that do not split into exactly two fields are
# skipped and count neither as correct nor towards the total.
# NOTE(review): zip pairs pred and testdata by position -- this assumes both
# have one entry per line of the test corpus; confirm.
num_correct = 0
total = 0
for prediction, y in zip(pred, testdata):
    fields = y.split()
    if len(fields) != 2:
        continue
    word, tag = fields
    total += 1
    num_correct += int(prediction == tag)
accuracy = num_correct / total

In [125]:
# Fraction of scored test lines whose predicted tag equals the gold tag
accuracy

0.953063647155511

<table>
    <tr>
        <td>
            Based on
        </td>
        <td>
            Assignment from the Natural Language Processing Specialization
        </td>
    </tr>
</table>