<a href="https://colab.research.google.com/github/tubagokhan/DeepLearningNLPFoundations/blob/main/POSwithHMMabdViterbi.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Importing libraries
import numpy as np
import pandas as pd
import requests
import pprint
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')
nltk.download('treebank')

In [None]:
# Load the tagged Treebank sample shipped with NLTK as a list of sentences,
# where each sentence is a sequence of (word, tag) pairs.
train_set = list(nltk.corpus.treebank.tagged_sents())

# Flatten the sentences into a single corpus-order list of (word, tag) pairs.
train_tagged_words = [
    tagged_word
    for sentence in train_set
    for tagged_word in sentence
]

# Word column of the corpus and the distinct words it contains (vocabulary).
tokens = [word for word, _ in train_tagged_words]
V = set(tokens)
print("Total vocabularies: ", len(V))

# Distinct tags observed in the corpus (the tag set).
T = {tag for _, tag in train_tagged_words}
print("Total tags: ", len(T))

# Emission Probability

In [None]:
# Zero-filled (number of tags) x (vocabulary size) array reserved for the
# emission probabilities P(word | tag).
# NOTE(review): nothing in the visible notebook writes into this array; the
# emission counts are computed on demand by word_given_tag instead.
t, v = len(T), len(V)
w_given_t = np.zeros(shape=(t, v))

# Emission counts: how often `word` appears with `tag`
def word_given_tag(word, tag, train_bag = train_tagged_words):
    """Return the raw counts behind the emission probability P(word | tag).

    Args:
        word: the word to look for.
        tag: the tag the word must carry.
        train_bag: list of (word, tag) pairs to count over.

    Returns:
        A tuple ``(count_w_given_tag, count_tag)``: the number of pairs equal
        to ``(word, tag)`` and the number of pairs carrying ``tag``. The
        caller divides the two; ``count_tag`` is 0 when ``tag`` never occurs
        in ``train_bag``.
    """
    count_tag = 0
    count_w_given_tag = 0
    for pair in train_bag:
        if pair[1] != tag:
            continue
        count_tag += 1
        if pair[0] == word:
            count_w_given_tag += 1

    return (count_w_given_tag, count_tag)


# Transition Probability

In [None]:
# Transition counts: how often tag t2 immediately follows tag t1

def t2_given_t1(t2, t1, train_bag = train_tagged_words):
    """Return the raw counts behind the transition probability P(t2 | t1).

    Args:
        t2: the following tag.
        t1: the preceding tag.
        train_bag: list of (word, tag) pairs, read as one continuous tag
            sequence (adjacent pairs are counted wherever they occur in the
            list, with no notion of sentence boundaries).

    Returns:
        A tuple ``(count_t2_t1, count_t1)``: the number of positions where
        ``t1`` is directly followed by ``t2`` and the total number of
        occurrences of ``t1`` (including one in the final position, which has
        no successor).
    """
    tag_sequence = [tagged_word[1] for tagged_word in train_bag]
    count_t1 = tag_sequence.count(t1)
    # Pair every tag with its successor and count the (t1, t2) bigrams.
    count_t2_t1 = sum(
        1
        for current_tag, next_tag in zip(tag_sequence, tag_sequence[1:])
        if current_tag == t1 and next_tag == t2
    )
    return (count_t2_t1, count_t1)

# Build the (number of tags) x (number of tags) transition matrix.
# Rows are the preceding tag t1, columns the following tag t2, so
# tags_matrix[i, j] holds P(tag j | tag i).

# Fix one ordering of the tag set and reuse it for the rows, the columns and
# the DataFrame labels.
tag_order = list(T)

tags_matrix = np.zeros((len(tag_order), len(tag_order)), dtype='float32') # transition matrix
for i, t1 in enumerate(tag_order):
    for j, t2 in enumerate(tag_order):
        # t2_given_t1 scans the whole corpus, so call it once per cell and
        # unpack both counts instead of calling it separately for each.
        count_t2_t1, count_t1 = t2_given_t1(t2, t1)
        tags_matrix[i, j] = count_t2_t1 / count_t1

# Label rows and columns with the tags so entries can be looked up by name,
# e.g. tags_df.loc[t1, t2].
tags_df = pd.DataFrame(tags_matrix, columns=tag_order, index=tag_order)

# Viterbi 

In [None]:
# Viterbi Heuristic (greedy, one tag committed per word)
def Viterbi(words, train_bag = train_tagged_words):
    """Tag each word with its locally most probable tag.

    Every candidate tag for a word is scored as
    ``P(word | tag) * P(tag | previous tag)`` and the highest-scoring tag is
    committed before moving on to the next word. This is a greedy
    left-to-right pass: no alternative paths or backpointers are kept.

    Args:
        words: sequence of word tokens to tag.
        train_bag: list of (word, tag) pairs supplying the candidate tag set
            and the emission counts. Transition probabilities are always read
            from the module-level ``tags_df`` table, so every tag in
            ``train_bag`` must be a label of ``tags_df``.

    Returns:
        A list of ``(word, tag)`` tuples, one per input word.
    """
    state = []
    tagset = list({pair[1] for pair in train_bag})

    for word in words:
        # The first word is scored as if it followed a sentence-final '.'.
        previous_tag = state[-1] if state else '.'

        # Score of every candidate tag for this word, in tagset order.
        p = []
        for tag in tagset:
            transition_p = tags_df.loc[previous_tag, tag]

            # word_given_tag scans the whole bag, so call it once and unpack
            # both counts. Forward train_bag so the emission counts come from
            # the same data the tag set was built from.
            count_w_given_tag, count_tag = word_given_tag(word, tag, train_bag)
            emission_p = count_w_given_tag / count_tag
            p.append(emission_p * transition_p)

        # Commit the best-scoring tag. Ties resolve to the first tag in
        # tagset; that includes a word never seen in train_bag, for which
        # every score is 0.
        state.append(tagset[p.index(max(p))])
    return list(zip(words, state))

In [None]:
## Testing
# Tokenize a short four-sentence paragraph, tag it with Viterbi (using the
# default training bag) and print the resulting (word, tag) pairs.
sentence_test = 'Twitter is the best networking social site. Man is a social animal. Data science is an emerging field. Data science jobs are high in demand.'
words = word_tokenize(sentence_test)
tagged_seq = Viterbi(words)
print(tagged_seq)

In [None]:
## Testing
# Tag a sentence that starts with the name 'Tuba' and print the result.
# NOTE(review): presumably 'Tuba' does not occur in the Treebank sample, so
# this shows how a word with no emission counts is tagged -- confirm.
sentence_test = 'Tuba have an interview  for a postdoc position. '
words = word_tokenize(sentence_test)
tagged_seq = Viterbi(words)
print(tagged_seq)

In [None]:
# Make sure the pair ('Tuba', 'NNP') is present in the training data.
# Appending mutates train_tagged_words in place, which is the same list object
# word_given_tag and t2_given_t1 captured as their default train_bag, so later
# emission counts see the new pair. tags_df is not rebuilt by this cell.
newWord = ('Tuba', 'NNP')

if newWord not in train_tagged_words:
    train_tagged_words.append(newWord)
    print("Added")
else:
    print("Yes")

In [None]:
## Testing
# Tag the 'Tuba' sentence again and print the result, for comparison with the
# earlier run now that ('Tuba', 'NNP') has been ensured in train_tagged_words.
sentence_test = 'Tuba have an interview  for a postdoc position.'
words = word_tokenize(sentence_test)
tagged_seq = Viterbi(words)
print(tagged_seq)