# Data Preparation

## Import libraries

In [1]:
# Importing libraries
import pandas as pd
import numpy as np
import random
import time
import nltk
import string

In [2]:
def read(file):
    """Load a tagged corpus file into a list of sentences.

    Every line of the file is treated as one sentence made of
    whitespace-separated ``word/TAG`` tokens. Each token is split at its
    last ``/`` so the word part may itself contain slashes.

    Parameters
    ----------
    file : path of the UTF-8 encoded corpus file.

    Returns
    -------
    A list with one entry per line; each entry is a list of (word, tag)
    tuples (an empty list for a blank line).

    Raises
    ------
    IndexError
        If a token contains no ``/``.
    """
    with open(file, 'r', encoding='utf8') as handle:
        lines = handle.read().splitlines()

    sentences = []
    for line in lines:
        sentence = []
        for token in line.split():
            # split only at the last slash: everything before it is the word
            parts = token.rsplit('/', 1)
            sentence.append((parts[0], parts[1]))
        sentences.append(sentence)
    return sentences

In [3]:
# load the tagged corpora: each is a list of sentences of (word, tag) tuples
train_set = read(file='corpus/train.txt')
test_set = read(file='corpus/test.txt')

In [4]:
# flatten the per-sentence lists into corpus-wide lists:
# training keeps the (word, tag) pairs, the test list keeps only the word of each pair
train_tagged_words = [pair for sentence in train_set for pair in sentence]
test_tagged_words = [pair[0] for sentence in test_set for pair in sentence]
print(len(train_tagged_words), len(test_tagged_words), sep='\n')

2781
1121


In [5]:
# NOTE(review): ad-hoc sample sentence. This overwrites the test word list built
# from test_set in the previous cell; that list is rebuilt from test_set in the
# evaluation cell before the tagger is scored.
test_tagged_words = 'I love you.'.split()

In [6]:
# display the current contents of test_tagged_words
test_tagged_words

['I', 'love', 'you.']

In [7]:
# collect the distinct tags that occur in the training data, then report
# how many there are and which ones
tags = set(tag for (word, tag) in train_tagged_words)
print(len(tags), tags, sep='\n')

16
{'SYM', 'VERB', 'ADP', 'ADJ', 'ADV', 'NOUN', 'SCONJ', 'AUX', 'CCONJ', 'PRON', 'NUM', 'X', 'PROPN', 'PUNCT', 'DET', 'PART'}


In [8]:
# display test_tagged_words again (no cell has reassigned it since the sample sentence)
test_tagged_words

['I', 'love', 'you.']

In [9]:
# vocabulary = the distinct words seen in the training data; report its size
vocab = set(word for (word, tag) in train_tagged_words)
print(len(vocab))

1028


### POS Tagging algorithm using Hidden Markov Model (HMM)

We'll use the HMM algorithm to tag the words. Given a sequence of words to be tagged, the task is to assign the most probable tag to the word. 
In other words, to every **word w**, assign **the tag t** that maximises the likelihood **P(t/w)**. 

Since P(t/w) = P(w/t) · P(t) / P(w) by Bayes' rule, and P(w) is the same for every candidate tag, we can ignore P(w) and only have to compute P(w/t) and P(t).

Now:
* **P(w/t): is the emission probability** of a given word for a given tag. It is computed as the number of times the word occurs with the tag divided by the total count of that tag, i.e. P(w/t) = count(w, t) / count(t).

* **P(t): is the probability of tag (also transition probability)**. In a tagging task we assume that a tag depends only on the previous tag (first-order Markov assumption). In other words, the probability of a tag being, say, NOUN depends only on the previous tag t(n-1). For example, if t(n-1) is ADJ, then t(n) is likely to be NOUN, since adjectives often precede nouns (blue coat, tall building, etc.).

### Build the Vanilla Viterbi based POS tagger

#### Function to compute emission probabilities for a given word

In [10]:
# compute emission probability for a given word for a given tag
def word_given_tag(word, tag, train_bag=None, smoothing_classes=16):
    """Return the add-one smoothed emission counts of ``word`` under ``tag``.

    The emission probability P(word/tag) is the ratio of the two returned
    integers; they are returned separately and the caller does the division.

    Parameters
    ----------
    word : the observed word.
    tag : the candidate tag.
    train_bag : iterable of (word, tag) pairs. When omitted, the notebook's
        global ``train_tagged_words`` is looked up at call time, so re-running
        the cell that builds it is picked up without redefining this function.
    smoothing_classes : constant added to the tag count in the denominator.
        The default 16 matches the number of distinct tags the notebook
        reports for the training corpus.
        NOTE(review): textbook add-one smoothing of emissions adds the
        vocabulary size instead -- confirm which value is intended.

    Returns
    -------
    (word_count_given_tag, tag_count)
        ``word_count_given_tag`` is 1 + the number of pairs equal to
        (word, tag); ``tag_count`` is ``smoothing_classes`` + the number of
        pairs carrying ``tag``.
    """
    if train_bag is None:
        train_bag = train_tagged_words

    # start from the smoothing pseudo-counts and add the observed counts
    # in a single pass over the training pairs
    tag_count = smoothing_classes
    word_count_given_tag = 1
    for pair in train_bag:
        if pair[1] == tag:
            tag_count += 1
            if pair[0] == word:
                word_count_given_tag += 1

    return (word_count_given_tag, tag_count)

#### Function to compute transition probabilities for a given tag and previous tag

In [11]:
# compute transition probabilities of a previous and next tag
def t2_given_t1(t2, t1, train_bag=None):
    """Return the add-one smoothed transition counts for ``t1`` -> ``t2``.

    The transition probability P(t2/t1) is the ratio of the two returned
    integers; they are returned separately and the caller does the division.

    Parameters
    ----------
    t2 : the next tag.
    t1 : the previous tag.
    train_bag : iterable of (word, tag) pairs in corpus order. When omitted,
        the notebook's global ``train_tagged_words`` is looked up at call
        time, so re-running the cell that builds it is picked up without
        redefining this function.

    Returns
    -------
    (count_t2_given_t1, count_of_t1)
        ``count_t2_given_t1`` is 1 + the number of adjacent positions where
        ``t1`` is immediately followed by ``t2``; ``count_of_t1`` is the
        number of occurrences of ``t1`` + the number of distinct tags in
        ``train_bag`` (0 for an empty ``train_bag``).
    """
    if train_bag is None:
        train_bag = train_tagged_words

    tag_sequence = [pair[1] for pair in train_bag]

    # add-one smoothing: one pseudo-count per distinct tag that could follow t1.
    # Derived from the data instead of a hard-coded 16 so it stays correct if
    # the tag set changes (it is 16 for the notebook's training corpus).
    count_of_t1 = tag_sequence.count(t1) + len(set(tag_sequence))

    # count adjacent (previous, next) tag pairs equal to (t1, t2)
    count_t2_given_t1 = 1 + sum(
        1
        for previous_tag, next_tag in zip(tag_sequence, tag_sequence[1:])
        if previous_tag == t1 and next_tag == t2
    )

    return (count_t2_given_t1, count_of_t1)

In [12]:
# spot-check: smoothed (transition count, previous-tag count) pair for NUM followed by NUM
t2_given_t1('NUM','NUM')

(1, 56)

In [13]:
# creating t x t transition matrix of tags
# each column is t2, each row is t1
# thus M(i, j) represents P(tj given ti)

# fix the tag order once so rows and columns are enumerated identically
tag_list = list(tags)
tags_matrix = np.zeros((len(tag_list), len(tag_list)), dtype='float32')
for i, t1 in enumerate(tag_list):
    for j, t2 in enumerate(tag_list):
        # t2_given_t1 walks the whole training list on every call, so call it
        # once per cell and unpack the (count, total) pair
        transition_count, t1_total = t2_given_t1(t2, t1)
        tags_matrix[i, j] = transition_count / t1_total

In [14]:
# wrap the matrix in a DataFrame labelled by tag on both axes for better readability
tags_df = pd.DataFrame(data=tags_matrix, index=list(tags), columns=list(tags))

In [15]:
# display the transition table (rows: previous tag t1, columns: next tag t2)
tags_df

Unnamed: 0,SYM,VERB,ADP,ADJ,ADV,NOUN,SCONJ,AUX,CCONJ,PRON,NUM,X,PROPN,PUNCT,DET,PART
SYM,0.052632,0.052632,0.052632,0.052632,0.052632,0.105263,0.052632,0.052632,0.052632,0.052632,0.052632,0.052632,0.157895,0.052632,0.052632,0.052632
VERB,0.002985,0.029851,0.241791,0.062687,0.092537,0.140299,0.047761,0.00597,0.00597,0.104478,0.00597,0.002985,0.01194,0.071642,0.170149,0.002985
ADP,0.003289,0.121711,0.016447,0.078947,0.032895,0.118421,0.006579,0.016447,0.003289,0.049342,0.029605,0.006579,0.088816,0.016447,0.407895,0.003289
ADJ,0.004975,0.014925,0.089552,0.039801,0.00995,0.606965,0.024876,0.014925,0.024876,0.004975,0.024876,0.004975,0.0199,0.099502,0.00995,0.004975
ADV,0.005882,0.3,0.058824,0.111765,0.070588,0.029412,0.047059,0.029412,0.017647,0.041176,0.005882,0.005882,0.011765,0.211765,0.047059,0.005882
NOUN,0.001776,0.097691,0.202487,0.003552,0.031972,0.166963,0.01421,0.063943,0.046181,0.030195,0.005329,0.001776,0.017762,0.277087,0.017762,0.021314
SCONJ,0.013514,0.054054,0.040541,0.067568,0.013514,0.121622,0.067568,0.040541,0.013514,0.243243,0.013514,0.013514,0.027027,0.054054,0.202703,0.013514
AUX,0.007752,0.310078,0.031008,0.139535,0.100775,0.031008,0.007752,0.046512,0.015504,0.069767,0.023256,0.007752,0.015504,0.03876,0.100775,0.054264
CCONJ,0.011236,0.101124,0.044944,0.089888,0.078652,0.179775,0.022472,0.05618,0.011236,0.067416,0.022472,0.011236,0.146067,0.05618,0.089888,0.011236
PRON,0.005348,0.347594,0.053476,0.032086,0.090909,0.085561,0.010695,0.208556,0.005348,0.042781,0.005348,0.005348,0.005348,0.074866,0.02139,0.005348


#### Viterbi Algorithm

The steps are as follows:

1. Given a sequence of words
2. iterate through the sequence
3. for each word (starting from first word in sequence) calculate the product of emission probabilities and transition probabilities for all possible tags.
4. assign the tag which has maximum probability obtained in step 3 above.
5. move to the next word in sequence to repeat steps 3 and 4 above.

In [None]:
# Vanilla Viterbi Algorithm
def Viterbi(words, train_bag=None):
    """Tag ``words`` left to right, committing the best tag for each word in turn.

    For every word, each candidate tag is scored as
    ``emission_p * transition_p``. The transition probability is read from
    the global ``tags_df`` table using the tag chosen for the previous word
    ('PUNCT' for the first word); the emission probability is the ratio of
    the pair returned by ``word_given_tag``. The highest-scoring tag is
    appended to the result before moving on; earlier choices are never
    revisited.

    Parameters
    ----------
    words : sequence of words to tag.
    train_bag : iterable of (word, tag) pairs supplying the candidate tags and
        the emission counts. When omitted, the notebook's global
        ``train_tagged_words`` is looked up at call time.
        NOTE(review): transitions always come from the global ``tags_df``;
        a custom ``train_bag`` should use the same tag set.

    Returns
    -------
    list of (word, tag) tuples, one per input word, in input order.
    """
    if train_bag is None:
        train_bag = train_tagged_words

    # sorted() fixes the candidate order, so ties between equally scored tags
    # are broken the same way on every kernel (set order of strings is not stable)
    T = sorted(set(pair[1] for pair in train_bag))

    state = []
    for word in words:
        # the first word has no predecessor; it is scored as if it followed 'PUNCT'
        previous_tag = state[-1] if state else 'PUNCT'

        # score of every candidate tag for this word, aligned with T
        p = []
        for tag in T:
            transition_p = tags_df.loc[previous_tag, tag]

            # one call per (word, tag); forward train_bag so emissions use the
            # same data the candidate tags were taken from
            word_count, tag_count = word_given_tag(word, tag, train_bag=train_bag)
            emission_p = word_count / tag_count

            p.append(emission_p * transition_p)

        # keep the first tag (in T order) that reaches the maximum score
        state.append(T[p.index(max(p))])
    return list(zip(words, state))

#### Testing Vanilla Viterbi Algorithm on sampled test data

In [None]:
random.seed(1234)

# gold-standard (word, tag) pairs of the whole test corpus, in order
test_run_base = [pair for sentence in test_set for pair in sentence]

# the same tokens with the tags stripped off: input for the tagger
test_tagged_words = [pair[0] for pair in test_run_base]
tagged_seq = Viterbi(test_tagged_words)

# predictions whose (word, tag) pair equals the gold pair at the same position
check = [
    predicted
    for predicted, gold in zip(tagged_seq, test_run_base)
    if predicted == gold
]

vanilla_viterbi_accuracy = len(check)/len(tagged_seq)

print("The accuracy of the Vanilla Viterbi Algorithm is -", vanilla_viterbi_accuracy)