## POS tagging using modified Viterbi

### Data Preparation

In [1]:
#Importing libraries
#Importing libraries
import nltk, re, pprint
import numpy as np
import pandas as pd
import requests
import matplotlib.pyplot as plt
import seaborn as sns
import pprint, time
import random
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize

In [2]:
# reading the Treebank tagged sentences
nltk_data = list(nltk.corpus.treebank.tagged_sents(tagset='universal'))

In [3]:
# Splitting into train and test
random.seed(1234)
train_set, test_set = train_test_split(nltk_data,test_size=0.05)

print(len(train_set))
print(len(test_set))
print(train_set[:40])

3718
196
[[('In', 'ADP'), ('composite', 'ADJ'), ('trading', 'NOUN'), ('on', 'ADP'), ('the', 'DET'), ('New', 'NOUN'), ('York', 'NOUN'), ('Stock', 'NOUN'), ('Exchange', 'NOUN'), (',', '.'), ('Telerate', 'NOUN'), ('shares', 'NOUN'), ('closed', 'VERB'), ('at', 'ADP'), ('$', '.'), ('19.50', 'NUM'), ('*U*', 'X'), (',', '.'), ('up', 'ADV'), ('12.5', 'NUM'), ('cents', 'NOUN'), ('.', '.')], [('``', '.'), ('Professional', 'ADJ'), ('sugar', 'NOUN'), ('people', 'NOUN'), ('here', 'ADV'), ('who', 'PRON'), ('*T*-1', 'X'), ('have', 'VERB'), ('strong', 'ADJ'), ('contacts', 'NOUN'), ('with', 'ADP'), ('the', 'DET'), ('Brazilian', 'ADJ'), ('sugar', 'NOUN'), ('industry', 'NOUN'), ('have', 'VERB'), ('been', 'VERB'), ('unable', 'ADJ'), ('*-3', 'X'), ('to', 'PRT'), ('confirm', 'VERB'), ('the', 'DET'), ('reports', 'NOUN'), ('or', 'CONJ'), ('get', 'VERB'), ('enough', 'ADJ'), ('information', 'NOUN'), ('0', 'X'), ('*', 'X'), ('to', 'PRT'), ('clarify', 'VERB'), ('the', 'DET'), ('situation', 'NOUN'), ('*T*-4', 'X')

In [4]:
# Getting list of tagged words
train_tagged_words = [tup for sent in train_set for tup in sent]
len(train_tagged_words)

95621

In [5]:
# tokens 
tokens = [pair[0] for pair in train_tagged_words]
tokens[:10]

['In',
 'composite',
 'trading',
 'on',
 'the',
 'New',
 'York',
 'Stock',
 'Exchange',
 ',']

In [6]:
# vocabulary
V = set(tokens)
print(len(V))

12076


In [7]:
# number of tags
T = set([pair[1] for pair in train_tagged_words])
len(T)

12

In [8]:
print(T)

{'ADV', 'ADJ', '.', 'ADP', 'CONJ', 'X', 'NUM', 'PRON', 'DET', 'NOUN', 'VERB', 'PRT'}


### Build the vanilla Viterbi based POS tagger

In [9]:
# computing P(w/t) and storing in T x V matrix
t = len(T)
v = len(V)
w_given_t = np.zeros((t, v))

In [10]:
# compute word given tag: Emission Probability
def word_given_tag(word, tag, train_bag = train_tagged_words):
    tag_list = [pair for pair in train_bag if pair[1]==tag]
    count_tag = len(tag_list)
    w_given_tag_list = [pair[0] for pair in tag_list if pair[0]==word]
    count_w_given_tag = len(w_given_tag_list)
    
    return (count_w_given_tag, count_tag)

In [11]:
# compute tag given tag: tag2(t2) given tag1 (t1), i.e. Transition Probability

def t2_given_t1(t2, t1, train_bag = train_tagged_words):
    tags = [pair[1] for pair in train_bag]
    count_t1 = len([t for t in tags if t==t1])
    count_t2_t1 = 0
    for index in range(len(tags)-1):
        if tags[index]==t1 and tags[index+1] == t2:
            count_t2_t1 += 1
    return (count_t2_t1, count_t1)

In [12]:
# creating t x t transition matrix of tags
# each column is t2, each row is t1
# thus M(i, j) represents P(tj given ti)
tags_matrix = np.zeros((len(T), len(T)), dtype='float32')
for i, t1 in enumerate(list(T)):
    for j, t2 in enumerate(list(T)): 
        tags_matrix[i, j] = t2_given_t1(t2, t1)[0]/t2_given_t1(t2, t1)[1]

In [13]:
# convert the matrix to a df for better readability
tags_df = pd.DataFrame(tags_matrix, columns = list(T), index=list(T))

In [14]:
tags_df.loc['.', :]

ADV     0.053297
ADJ     0.043786
.       0.093943
ADP     0.090624
CONJ    0.058232
X       0.027008
NUM     0.080126
PRON    0.065590
DET     0.173800
NOUN    0.221893
VERB    0.089098
PRT     0.002512
Name: ., dtype: float32

In [None]:
# heatmap of tags matrix
# T(i, j) means P(tag j given tag i)
plt.figure(figsize=(18, 12))
sns.heatmap(tags_df)
plt.show()

In [None]:
# frequent tags
# filter the df to get P(t2, t1) > 0.5
tags_frequent = tags_df[tags_df>0.5]
plt.figure(figsize=(18, 12))
sns.heatmap(tags_frequent)
plt.show()

In [15]:
# Viterbi Heuristic
def Viterbi(words, train_bag = train_tagged_words):
    state = []
    T = list(set([pair[1] for pair in train_bag]))
    
    for key, word in enumerate(words):
        #initialise list of probability column for a given observation
        p = [] 
        for tag in T:
            if key == 0:
                transition_p = tags_df.loc['.', tag]
            else:
                transition_p = tags_df.loc[state[-1], tag]
                
            # compute emission and state probabilities
            emission_p = word_given_tag(words[key], tag)[0]/word_given_tag(words[key], tag)[1]
            state_probability = emission_p * transition_p    
            p.append(state_probability)
            
        pmax = max(p)
        # getting state for which probability is maximum
        state_max = T[p.index(pmax)] 
        state.append(state_max)
    return list(zip(words, state))

In [16]:
with open('Test_sentences_1.txt','r',encoding='utf8') as f:
    test_set = f.read()
# list of untagged words
sent_text = nltk.sent_tokenize(str(test_set))
print(sent_text)

['Android is a mobile operating system developed by Google.', 'Android has been the best-selling OS worldwide on smartphones since 2011 and on tablets since 2013.', "Google and Twitter made a deal in 2015 that gave Google access to Twitter's firehose.", 'Twitter is an online news and social networking service on which users post and interact with messages known as tweets.', 'Before entering politics, Donald Trump was a domineering businessman and a television personality.', 'The 2018 FIFA World Cup is the 21st FIFA World Cup, an international football tournament contested once every four years.', 'This is the first World Cup to be held in Eastern Europe and the 11th time that it has been held in Europe.', 'Show me the cheapest round trips from Dallas to Atlanta\nI would like to see flights from Denver to Philadelphia.', 'Show me the price of the flights leaving Atlanta at about 3 in the afternoon and arriving in San Francisco.', 'NASA invited social media users to experience the launch

In [49]:
# now loop over each sentence and tokenize it separately
start = time.time()
tagged_data = []
for sentence in sent_text:
    tokenized_text = nltk.word_tokenize(sentence)
    #print(tokenized_text)
    tagged = Viterbi(tokenized_text)
    tagged_data.append(tagged)
    print(tagged)
end = time.time()
difference = end-start
print("Time taken in seconds: ", difference)

TypeError: 'list' object is not callable

### Solve the problem of unknown words

In [18]:
a = []
nltk_word =[]
for i in range(len(nltk_data)):
    a.append([pair[0] for pair in nltk_data[i]])
for list in a:
    for word in list:
        nltk_word.append(word) 

In [19]:
b = []
tagged_word =[]
for i in range(len(tagged_data)):
    b.append([pair[0] for pair in tagged_data[i]])
for list in b:
    for word in list:
        tagged_word.append(word) 

In [20]:
#Finding the unknown keywords
unknownwords = set(tagged_word).union(set(nltk_word))  - set(tagged_word).intersection(set(nltk_word)) - set(nltk_word)

In [21]:
unknownwords = [ k for k in unknownwords ]
print(unknownwords)

['domineering', 'Google', 'arriving', 'invited', 'interact', 'Cup', 'FIFA', 'online', 'Satellite', 'NASA', 'trips', 'OS', '21st', 'Android', 'ICESAT-2', 'smartphones', '2011', 'messages', '2018', '2015', 'firehose', 'worldwide', 'Twitter', 'personality', '2013', 'contested', 'tweets', 'tournament']


In [48]:
c = []
tagged =[]
for i in range(len(tagged_data)):
    c.append([pair for pair in tagged_data[i]])
for list in c:
   for word in list:
       tagged.append(word)
for tag in tagged:
    for word in unknownwords:
        if word in tag:
            print(word,',',tag[1])

Android , ADV
Google , ADV
Android , ADV
OS , ADV
worldwide , ADV
smartphones , ADV
2011 , ADV
2013 , ADV
Google , ADV
Twitter , ADV
2015 , ADV
Google , ADV
Twitter , ADV
firehose , ADV
Twitter , ADV
online , ADV
interact , ADV
messages , ADV
tweets , ADV
domineering , ADV
personality , ADV
2018 , ADV
FIFA , ADV
Cup , ADV
21st , ADV
FIFA , ADV
Cup , ADV
tournament , ADV
contested , ADV
Cup , ADV
trips , ADV
arriving , ADV
NASA , ADV
invited , ADV
ICESAT-2 , ADV
Satellite , ADV


In [50]:
'''
Assign NOUN if capitalized
'''
def apply_capitalization_assignment(word=None, pos=-1):
    if len(word.pos_tags) == 0:
        if word.orig_text[0].isupper():
            word.pos_tags = ['NOUN']

    if pos > 0:
        if word.orig_text[0].isupper():
            word.pos_tags = ['NOUN']

    return word

In [54]:
'''
Assign possible tags helper function: Dictionary Search
'''
from cebdict import dictionary
def dictionary_search(word=None):
    if word.is_entry:
        # pos_tags = search_term(entries=stemmer.entries, term=word.root)
        pos_tags = dictionary.search(word.root)
        if pos_tags:
            word.pos_tags = pos_tags
            word.root_tags = word.pos_tags

    return word

In [61]:
'''
POS Tag Assignment
Assigns all possible POS tags per token
'''
from cebstemmer import stemmer
import string
def assign_pos_tags(tokens=[]):
    words = []
    for idx, token in enumerate(tokens):
        #stem = stemmer.stem_word(word=token, as_object=True)
        #word = dictionary_search(word=token)
        #word = function_words_search(word=stem)
        #word = apply_lexical_rules_assignment(word=stem)
        word = apply_capitalization_assignment(word=token, pos=idx)

        if len(word.pos_tags) == 0:
            if stem.text.isdigit():
                # Assign num if the word is a numerical value
                word.pos_tags = ['NUM']
            elif stem.text in string.punctuation:
                word.pos_tags = ['SYM']
            else:
                word.pos_tags = ['OTH']

        words.append(word)
    return words

In [62]:
'''
Given a Cebuano sentence, it will tag all words with its corresponding POS tags
'''
def tag_sentence(tokens):
    #tokens = tokenize(text=text)
    words = assign_pos_tags(tokens=tokens)
    
    sentence = []
    for word in words:
        sentence.append((word.orig_text.encode('utf-8'), word.pos_tags[0]))

    return sentence

In [63]:
tag_sentence(unknownwords)

AttributeError: 'str' object has no attribute 'pos_tags'

#### Evaluating tagging accuracy

### Compare the tagging accuracies of the modifications with the vanilla Viterbi algorithm

### List down cases which were incorrectly tagged by original POS tagger and got corrected by your modifications