## POS tagging using modified Viterbi

### Data Preparation

In [1]:
#Importing libraries
import nltk
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import random
from operator import itemgetter

### Downloading - nltk 'universal tagset'

In [2]:
import nltk
nltk.download('universal_tagset')

[nltk_data] Downloading package universal_tagset to
[nltk_data]     C:\Users\viki\AppData\Roaming\nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


True

In [3]:
# reading the Treebank tagged sentences
nltk_data = list(nltk.corpus.treebank.tagged_sents(tagset='universal'))

In [4]:
nltk_data[:5]

[[('Pierre', 'NOUN'),
  ('Vinken', 'NOUN'),
  (',', '.'),
  ('61', 'NUM'),
  ('years', 'NOUN'),
  ('old', 'ADJ'),
  (',', '.'),
  ('will', 'VERB'),
  ('join', 'VERB'),
  ('the', 'DET'),
  ('board', 'NOUN'),
  ('as', 'ADP'),
  ('a', 'DET'),
  ('nonexecutive', 'ADJ'),
  ('director', 'NOUN'),
  ('Nov.', 'NOUN'),
  ('29', 'NUM'),
  ('.', '.')],
 [('Mr.', 'NOUN'),
  ('Vinken', 'NOUN'),
  ('is', 'VERB'),
  ('chairman', 'NOUN'),
  ('of', 'ADP'),
  ('Elsevier', 'NOUN'),
  ('N.V.', 'NOUN'),
  (',', '.'),
  ('the', 'DET'),
  ('Dutch', 'NOUN'),
  ('publishing', 'VERB'),
  ('group', 'NOUN'),
  ('.', '.')],
 [('Rudolph', 'NOUN'),
  ('Agnew', 'NOUN'),
  (',', '.'),
  ('55', 'NUM'),
  ('years', 'NOUN'),
  ('old', 'ADJ'),
  ('and', 'CONJ'),
  ('former', 'ADJ'),
  ('chairman', 'NOUN'),
  ('of', 'ADP'),
  ('Consolidated', 'NOUN'),
  ('Gold', 'NOUN'),
  ('Fields', 'NOUN'),
  ('PLC', 'NOUN'),
  (',', '.'),
  ('was', 'VERB'),
  ('named', 'VERB'),
  ('*-1', 'X'),
  ('a', 'DET'),
  ('nonexecutive', 'ADJ'),
 

### Splitting into Train and Test set (Validation set) (95:5)

In [5]:
# Splitting into train and test
#random.seed(1234)
train_set, test_set = train_test_split(nltk_data,train_size = 0.95)
print("Lenght of NTLK ('universal tagset'):",len(nltk_data))
print("Length of train dataset:",len(train_set))
print("Length of test dataset:",len(test_set))


Lenght of NTLK ('universal tagset'): 3914
Length of train dataset: 3718
Length of test dataset: 196


### View - train data for better understanding

In [6]:
train_set[:2]

[[('Arbitrage', 'NOUN'),
  ('simply', 'ADV'),
  ('transfers', 'VERB'),
  ('his', 'PRON'),
  ('selling', 'NOUN'),
  ('pressure', 'NOUN'),
  ('from', 'ADP'),
  ('Chicago', 'NOUN'),
  ('to', 'PRT'),
  ('New', 'NOUN'),
  ('York', 'NOUN'),
  (',', '.'),
  ('while', 'ADP'),
  ('*-1', 'X'),
  ('functioning', 'VERB'),
  ('as', 'ADP'),
  ('a', 'DET'),
  ('buyer', 'NOUN'),
  ('in', 'ADP'),
  ('Chicago', 'NOUN'),
  ('.', '.')],
 [('*-64', 'X'),
  ('Founded', 'VERB'),
  ('as', 'ADP'),
  ('the', 'DET'),
  ('Examiner', 'NOUN'),
  ('in', 'ADP'),
  ('1903', 'NUM'),
  ('by', 'ADP'),
  ('Mr.', 'NOUN'),
  ('Hearst', 'NOUN'),
  (',', '.'),
  ('the', 'DET'),
  ('Herald', 'NOUN'),
  ('was', 'VERB'),
  ('crippled', 'VERB'),
  ('*-64', 'X'),
  ('by', 'ADP'),
  ('a', 'DET'),
  ('bitter', 'ADJ'),
  (',', '.'),
  ('decade-long', 'ADJ'),
  ('strike', 'NOUN'),
  ('that', 'DET'),
  ('*T*-1', 'X'),
  ('began', 'VERB'),
  ('in', 'ADP'),
  ('1967', 'NUM'),
  ('and', 'CONJ'),
  ('cut', 'VERB'),
  ('circulation', 'NOUN'

In [7]:
# Getting list of tagged words
train_tagged_words_list = [tup for sent in train_set for tup in sent]
len(train_tagged_words_list)

95896

### View train_tagged_words_list for better understanding

In [8]:
train_tagged_words_list[:5]

[('Arbitrage', 'NOUN'),
 ('simply', 'ADV'),
 ('transfers', 'VERB'),
 ('his', 'PRON'),
 ('selling', 'NOUN')]

### Words from train_tagged_words_list

In [9]:
words = [ i[0] for i in train_tagged_words_list]
words = set(words)
print(words)



### Tags from train_tagged_words_list

In [10]:
tags = [ i[1] for i in train_tagged_words_list]
tags = set(tags)
print(tags)

{'PRON', 'PRT', '.', 'X', 'DET', 'ADV', 'NUM', 'NOUN', 'ADP', 'VERB', 'ADJ', 'CONJ'}


##### Hidden Markov Model( HMM)

**Emission Probability** of a word 'w' for tag 't':
P(w|t) = Number of times w has been tagged t /Number of times t appears <br>
**Transition Probability** of tag t1 followed by tag t2:
P(t2|t1) = Number of times t1 is followed by tag t2 / Number of times t1 appears 


#### Emission Probability

In [11]:
# compute word given tag: Emission Probability
# returns count_w_given_tag, count_tag
# count_w_given_tag - Number of times the word has been assigned to the tag
# count_tag - Number of times the tag is present in the corpus
def word_given_tag(word, tag, train_bag = train_tagged_words_list):
    tag_list = [pair for pair in train_bag if pair[1]==tag]
    count_tag = len(tag_list)
    w_given_tag_list = [pair[0] for pair in tag_list if pair[0]==word]
    count_w_given_tag = len(w_given_tag_list)    
    return (count_w_given_tag, count_tag)

#### Transition Probability

In [12]:
# compute tag given tag: tag2(t2) given tag1 (t1), i.e. Transition Probability
# count_t2_t1 - Number times tag1 followed by tag2
# count_t1 - Number of times tag1 appears in the corpus

def t2_given_t1(t2, t1, train_bag = train_tagged_words_list):
    tags = [pair[1] for pair in train_bag]
    count_t1 = len([t for t in tags if t==t1])
    count_t2_t1 = 0
    for index in range(len(tags)-1):
        if tags[index]==t1 and tags[index+1] == t2:
            count_t2_t1 += 1
    return (count_t2_t1, count_t1)

In [13]:
# creating t x t transition matrix of tags
# each column is t2, each row is t1
# thus M(i, j) represents P(tj given ti)

tags_matrix = np.zeros((len(tags), len(tags)), dtype='float32')
tags_matrix
for i, t1 in enumerate(list(tags)):
    for j, t2 in enumerate(list(tags)): 
        tags_matrix[i, j] = t2_given_t1(t2, t1)[0]/t2_given_t1(t2, t1)[1]
tags_matrix

array([[8.09560530e-03, 1.27216652e-02, 4.04780246e-02, 9.36777145e-02,
        1.00231301e-02, 3.39244418e-02, 7.71010015e-03, 2.07401693e-01,
        2.19737850e-02, 4.84579802e-01, 7.40169659e-02, 5.39707020e-03],
       [1.77165363e-02, 1.96850393e-03, 4.36351709e-02, 1.34514440e-02,
        1.00721784e-01, 9.51443613e-03, 5.83989508e-02, 2.46719167e-01,
        2.03412082e-02, 3.98293972e-01, 8.72703418e-02, 1.96850393e-03],
       [6.51375353e-02, 2.50873575e-03, 9.16584507e-02, 2.71481052e-02,
        1.74536332e-01, 5.21458648e-02, 8.22506920e-02, 2.21754327e-01,
        9.09416750e-02, 8.94185081e-02, 4.45300601e-02, 5.78801185e-02],
       [5.56083657e-02, 1.82034224e-01, 1.64448664e-01, 7.55703449e-02,
        5.54499365e-02, 2.48732567e-02, 2.85171112e-03, 6.27376437e-02,
        1.44486696e-01, 2.04214200e-01, 1.71102658e-02, 1.06147025e-02],
       [3.49819055e-03, 2.41254529e-04, 1.73703264e-02, 4.55971062e-02,
        5.54885389e-03, 1.27864899e-02, 2.18335353e-02, 6.39

### Converting the matrix of transition probability to a dataframe 

In [14]:
# convert the matrix to a df for better readability
transition_probability_df = pd.DataFrame(tags_matrix, columns = list(tags), index=list(tags))
transition_probability_df

Unnamed: 0,PRON,PRT,.,X,DET,ADV,NUM,NOUN,ADP,VERB,ADJ,CONJ
PRON,0.008096,0.012722,0.040478,0.093678,0.010023,0.033924,0.00771,0.207402,0.021974,0.48458,0.074017,0.005397
PRT,0.017717,0.001969,0.043635,0.013451,0.100722,0.009514,0.058399,0.246719,0.020341,0.398294,0.08727,0.001969
.,0.065138,0.002509,0.091658,0.027148,0.174536,0.052146,0.082251,0.221754,0.090942,0.089419,0.04453,0.05788
X,0.055608,0.182034,0.164449,0.07557,0.05545,0.024873,0.002852,0.062738,0.144487,0.204214,0.01711,0.010615
DET,0.003498,0.000241,0.01737,0.045597,0.005549,0.012786,0.021834,0.639807,0.00953,0.039445,0.20386,0.000483
ADV,0.014647,0.01498,0.134154,0.022304,0.068242,0.080559,0.031957,0.031957,0.117843,0.347537,0.129161,0.006658
NUM,0.001177,0.026773,0.116799,0.211239,0.002942,0.002942,0.186231,0.352457,0.03501,0.017652,0.033245,0.013533
NOUN,0.004721,0.043725,0.240449,0.028617,0.013219,0.016996,0.009406,0.264127,0.176678,0.147407,0.01213,0.042526
ADP,0.068562,0.00149,0.04003,0.035026,0.322368,0.013414,0.06228,0.323326,0.017247,0.008091,0.107314,0.000852
VERB,0.035587,0.031013,0.034812,0.218871,0.133509,0.081486,0.023259,0.11118,0.091254,0.167623,0.065979,0.005427


### Build the Vanilla Viterbi based POS tagger (no modifications)

In [15]:
# Viterbi Heuristic
def Viterbi(words, train_tagged_words_list = train_tagged_words_list):
    #state  - list used to hold the state probabilities
    state = []
    
    #T - list contains all the Tags from train_bag
    T = list(set([pair[1] for pair in train_tagged_words_list]))
    
    for key, word in enumerate(words):
        #print("key:",key)
        #print("word:",word)
        #initialise list of probability column for a given observation
        p = [] 
        for tag in T:
            #print("Tag:",tag)
            #if the key is 0, then it is the begining of the sentence.
            if key == 0:
                transition_p = transition_probability_df.loc['.', tag] #"." - represents the start
                #print("transition_p:",transition_p)
            else:
                #state[-1] - refers to the previous tag. tag - refers to the current tag
                transition_p = transition_probability_df.loc[state[-1], tag]
                #print("transition_p:",transition_p)
                
            # compute emission and state probabilities
            emission_p = word_given_tag(words[key], tag)[0]/word_given_tag(words[key], tag)[1]
            #print("emission_p:",emission_p)
            state_probability = emission_p * transition_p  
            
            p.append(state_probability)
            #print("list of state probabilty:",p)
            
        pmax = max(p)
        #print("Maximum State Probability:",pmax)
        # getting state for which probability is maximum
        state_max = T[p.index(pmax)]
        #print("T[p.index(pmax)]:",T[p.index(pmax)])
        #Appending the maximum state probability to the state[]. So, when referring to state[-1] - represents previous tag
        state.append(state_max)
        #print("state:",state)
    return list(zip(words, state))

### Test data

##### test_tagged_words - contains only the words from the test data. Using vannila Viterbi algorithm, we will tag these words.

In [16]:
# list of tagged words
test_run_base = [tup for sentences in test_set for tup in sentences]
test_tagged_words = [tup[0] for sentences in test_set for tup in sentences]
print(len(test_set))
print(len(test_tagged_words))

196
4780


### Applying the Vanilla Viterbi algorithm to get the tags for the words in test set.

In [17]:
# tagging the test sentences

tagged_seq = Viterbi(test_tagged_words)
tagged_seq 

[('The', 'DET'),
 ('offer', 'NOUN'),
 (',', '.'),
 ('which', 'DET'),
 ('*T*-2', 'X'),
 ('follows', 'VERB'),
 ('a', 'DET'),
 ('$', '.'),
 ('55-a-share', 'PRON'),
 ('*U*', 'X'),
 ('bid', 'VERB'),
 ('that', 'ADP'),
 ('*T*-3', 'X'),
 ('was', 'VERB'),
 ('rejected', 'VERB'),
 ('*-1', 'X'),
 ('in', 'ADP'),
 ('September', 'NOUN'),
 (',', '.'),
 ('steps', 'NOUN'),
 ('up', 'PRT'),
 ('pressure', 'NOUN'),
 ('on', 'ADP'),
 ('the', 'DET'),
 ('chemicals', 'NOUN'),
 ('concern', 'NOUN'),
 ('.', '.'),
 ('Already', 'PRON'),
 (',', '.'),
 ('10', 'NUM'),
 ('local', 'ADJ'),
 ('councils', 'NOUN'),
 ('have', 'VERB'),
 ('refused', 'VERB'),
 ('*-1', 'X'),
 ('to', 'PRT'),
 ('honor', 'VERB'),
 ('fees', 'NOUN'),
 ('and', 'CONJ'),
 ('payments', 'NOUN'),
 ('to', 'PRT'),
 ('banks', 'NOUN'),
 ('incurred', 'VERB'),
 ('*', 'X'),
 ('during', 'ADP'),
 ('various', 'ADJ'),
 ('swaps', 'PRON'),
 ('dealings', 'NOUN'),
 ('.', '.'),
 ('According', 'VERB'),
 ('to', 'PRT'),
 ('Ms.', 'NOUN'),
 ('Poore', 'NOUN'),
 (',', '.'),
 ('Old

#### Checking the accuracy between the tags assigned by humans and Viterbi algorithm

tagged_seq - refers to the tags given by the Virterbi algorithm <br>
test_run_base - refers to the tags given by humans

In [18]:
# accuracy for checking matched tags.
#If the tags generated by humans and Viterbi algorithm is same it will add to the list.
Matched_tags = [i for i, j in zip(tagged_seq, test_run_base) if i == j] 
Matched_tags

[('The', 'DET'),
 ('offer', 'NOUN'),
 (',', '.'),
 ('which', 'DET'),
 ('*T*-2', 'X'),
 ('follows', 'VERB'),
 ('a', 'DET'),
 ('$', '.'),
 ('*U*', 'X'),
 ('*T*-3', 'X'),
 ('was', 'VERB'),
 ('rejected', 'VERB'),
 ('*-1', 'X'),
 ('in', 'ADP'),
 ('September', 'NOUN'),
 (',', '.'),
 ('up', 'PRT'),
 ('pressure', 'NOUN'),
 ('on', 'ADP'),
 ('the', 'DET'),
 ('chemicals', 'NOUN'),
 ('concern', 'NOUN'),
 ('.', '.'),
 (',', '.'),
 ('10', 'NUM'),
 ('local', 'ADJ'),
 ('councils', 'NOUN'),
 ('have', 'VERB'),
 ('refused', 'VERB'),
 ('*-1', 'X'),
 ('to', 'PRT'),
 ('honor', 'VERB'),
 ('fees', 'NOUN'),
 ('and', 'CONJ'),
 ('payments', 'NOUN'),
 ('to', 'PRT'),
 ('banks', 'NOUN'),
 ('incurred', 'VERB'),
 ('*', 'X'),
 ('during', 'ADP'),
 ('various', 'ADJ'),
 ('dealings', 'NOUN'),
 ('.', '.'),
 ('According', 'VERB'),
 ('to', 'PRT'),
 ('Ms.', 'NOUN'),
 ('Poore', 'NOUN'),
 (',', '.'),
 ('Journal', 'NOUN'),
 ('Corp.', 'NOUN'),
 (',', '.'),
 ('her', 'PRON'),
 ('company', 'NOUN'),
 (',', '.'),
 ('printed', 'VERB'),

In [19]:
accuracy = len(Matched_tags)/len(tagged_seq)
accuracy

0.8958158995815899

**Observation** - The accuracy of the Vanilla Veterbi algorithm is 89.83%. We will modify the Veterbi algorithm and compare its performance

### Creating a dataframe for accuracy of different modifications of Viterbi

In [20]:
columns = ['Algorithm','Accuracy']
Accuracy_df = pd.DataFrame(columns = columns)
new_row = {'Algorithm':' Vannila Viterbi', 'Accuracy':accuracy}
Accuracy_df = Accuracy_df.append(new_row, ignore_index=True)
Accuracy_df

Unnamed: 0,Algorithm,Accuracy
0,Vannila Viterbi,0.895816


#### Incorrect tags
j - contains the tags assigned by Vanilla Viterbi Algorithm and by humans. 

In [21]:
incorrect_tagged_cases = [[test_run_base[i-1],j] for i, j in enumerate(zip(tagged_seq, test_run_base)) if j[0]!=j[1]]
incorrect_tagged_cases

[[('$', '.'), (('55-a-share', 'PRON'), ('55-a-share', 'ADJ'))],
 [('*U*', 'X'), (('bid', 'VERB'), ('bid', 'NOUN'))],
 [('bid', 'NOUN'), (('that', 'ADP'), ('that', 'DET'))],
 [(',', '.'), (('steps', 'NOUN'), ('steps', 'VERB'))],
 [('.', '.'), (('Already', 'PRON'), ('Already', 'ADV'))],
 [('various', 'ADJ'), (('swaps', 'PRON'), ('swaps', 'NOUN'))],
 [(',', '.'), (('Old-House', 'PRON'), ('Old-House', 'NOUN'))],
 [('her', 'PRON'), (('publishing', 'VERB'), ('publishing', 'NOUN'))],
 [('all', 'DET'), (('126,000', 'PRON'), ('126,000', 'NUM'))],
 [('a', 'DET'), (('worst-case', 'PRON'), ('worst-case', 'ADJ'))],
 [('bold', 'ADJ'), (('palace', 'PRON'), ('palace', 'NOUN'))],
 [('palace', 'NOUN'), (('revolt', 'PRON'), ('revolt', 'NOUN'))],
 [('to', 'PRT'), (('telephone', 'NOUN'), ('telephone', 'VERB'))],
 [('them', 'PRON'), (('pressure', 'NOUN'), ('pressure', 'VERB'))],
 [('to', 'PRT'), (('ban', 'NOUN'), ('ban', 'VERB'))],
 [('traders', 'NOUN'), (('answer', 'NOUN'), ('answer', 'VERB'))],
 [('reform

**Observation** 
('85.7', 'X'), ('85.7', 'NUM') - Shows that the Viterbi algorithm assigned the tag 'X' for 85.7 whereas 'NUM' has been assigned as '85.7'by humans.

### Solve the problem of unknown words

### Viterbi Modification - 1

For Unknown words, find the most commonly used tags from the train dataset and tag it.

***Finding the most commonly used tag from the 'train_tagged_words_list'***

In [22]:
tag_fd = nltk.FreqDist(tag for (word, tag) in train_tagged_words_list)
max(tag_fd.most_common(),key=itemgetter(1))[0]

'NOUN'

**Observation** - Most commonly used tag from 'train_tagged_words_list' is 'NOUN'. 

**Viterbi_modification_1** <br>
Step 1: Tag the words using Vanilla Veterbi Algorithm<br>
Step 2: From the taggged sequence, check whether the word is present in the training data set. If the word is not present tag it as 'NOUN' ('NOUN' is the most commonly found tag in the training data)

In [23]:
# Viterbi Heuristic
def Viterbi_modification_1(test_words, train_tagged_words_list = train_tagged_words_list):
    tagged_seq = Viterbi(test_words)
    
    V = list(set([i[0] for i in train_tagged_words_list]))
             
    viterbi_words = [i[0] for i in tagged_seq]
    viterbi_tags = [i[1] for i in tagged_seq]
            
    for key,word in enumerate(viterbi_words):
             if word not in V:
                 viterbi_tags[key] = 'NOUN'
    #print(list(zip(viterbi_words,viterbi_tags)))
    return list(zip(viterbi_words,viterbi_tags))


### Tagging the words using Viterbi_modification_1

In [24]:
#Calling our user-defined Viterbi_modification_1 to tag words for test data
modified_viterbi_1 = Viterbi_modification_1(test_tagged_words)

### Checking the accuracy of Viterbi_modification_1

In [25]:
# Finding the tags that has been matched with the tags given by humans.
Matched_tags_Viterbi_modified_1 = [i for i, j in zip(modified_viterbi_1, test_run_base) if i == j] 
Matched_tags_Viterbi_modified_1

[('The', 'DET'),
 ('offer', 'NOUN'),
 (',', '.'),
 ('which', 'DET'),
 ('*T*-2', 'X'),
 ('follows', 'VERB'),
 ('a', 'DET'),
 ('$', '.'),
 ('*U*', 'X'),
 ('*T*-3', 'X'),
 ('was', 'VERB'),
 ('rejected', 'VERB'),
 ('*-1', 'X'),
 ('in', 'ADP'),
 ('September', 'NOUN'),
 (',', '.'),
 ('up', 'PRT'),
 ('pressure', 'NOUN'),
 ('on', 'ADP'),
 ('the', 'DET'),
 ('chemicals', 'NOUN'),
 ('concern', 'NOUN'),
 ('.', '.'),
 (',', '.'),
 ('10', 'NUM'),
 ('local', 'ADJ'),
 ('councils', 'NOUN'),
 ('have', 'VERB'),
 ('refused', 'VERB'),
 ('*-1', 'X'),
 ('to', 'PRT'),
 ('honor', 'VERB'),
 ('fees', 'NOUN'),
 ('and', 'CONJ'),
 ('payments', 'NOUN'),
 ('to', 'PRT'),
 ('banks', 'NOUN'),
 ('incurred', 'VERB'),
 ('*', 'X'),
 ('during', 'ADP'),
 ('various', 'ADJ'),
 ('swaps', 'NOUN'),
 ('dealings', 'NOUN'),
 ('.', '.'),
 ('According', 'VERB'),
 ('to', 'PRT'),
 ('Ms.', 'NOUN'),
 ('Poore', 'NOUN'),
 (',', '.'),
 ('Old-House', 'NOUN'),
 ('Journal', 'NOUN'),
 ('Corp.', 'NOUN'),
 (',', '.'),
 ('her', 'PRON'),
 ('company',

In [26]:
accuracy = len(Matched_tags_Viterbi_modified_1)/len(modified_viterbi_1)
accuracy

0.9309623430962343

**Observation** - The accuracy of the Viterbi_modification_1 algorithm is 93.07%. After tagging it with most common tag ('Noun' in this case) the accuracy has improved from 89.83% to 93.07%. We will modify the Veterbi algorithm using rules and compare its performance.

### Updating the Accuracy_df with the accuracy of Viterbi_modification_1

In [27]:
new_row = {'Algorithm':' Viterbi_modification_1(commonly used tag)', 'Accuracy':accuracy}
Accuracy_df = Accuracy_df.append(new_row, ignore_index=True)
Accuracy_df

Unnamed: 0,Algorithm,Accuracy
0,Vannila Viterbi,0.895816
1,Viterbi_modification_1(commonly used tag),0.930962


### Words that have incorrect tags

In [28]:
incorrect_tagged_cases = [[test_run_base[i-1],j] for i, j in enumerate(zip(modified_viterbi_1, test_run_base)) if j[0]!=j[1]]
incorrect_tagged_cases

[[('$', '.'), (('55-a-share', 'NOUN'), ('55-a-share', 'ADJ'))],
 [('*U*', 'X'), (('bid', 'VERB'), ('bid', 'NOUN'))],
 [('bid', 'NOUN'), (('that', 'ADP'), ('that', 'DET'))],
 [(',', '.'), (('steps', 'NOUN'), ('steps', 'VERB'))],
 [('.', '.'), (('Already', 'NOUN'), ('Already', 'ADV'))],
 [('her', 'PRON'), (('publishing', 'VERB'), ('publishing', 'NOUN'))],
 [('all', 'DET'), (('126,000', 'NOUN'), ('126,000', 'NUM'))],
 [('a', 'DET'), (('worst-case', 'NOUN'), ('worst-case', 'ADJ'))],
 [('to', 'PRT'), (('telephone', 'NOUN'), ('telephone', 'VERB'))],
 [('them', 'PRON'), (('pressure', 'NOUN'), ('pressure', 'VERB'))],
 [('to', 'PRT'), (('ban', 'NOUN'), ('ban', 'VERB'))],
 [('traders', 'NOUN'), (('answer', 'NOUN'), ('answer', 'VERB'))],
 [('reform', 'NOUN'), (('instituted', 'NOUN'), ('instituted', 'VERB'))],
 [('1929', 'NUM'), (('that', 'ADP'), ('that', 'DET'))],
 [('*T*-1', 'X'), (('protects', 'NOUN'), ('protects', 'VERB'))],
 [('being', 'VERB'), (('relentlessly', 'NOUN'), ('relentlessly', 'ADV

### Viterbi Modification 2

Step 1: Tag the words using Vanilla Veterbi Algorithm<br>
Step 2: From the taggged sequence, check whether the word is present in the training data set. If the word is not present, then use some rules to tag it. Below are the rules defined for tagging <br>
        1. If the word is a number, tag it as 'NUM'
        2. If word has last three letters 'ous', tag it as 'ADJ'
        3. If the word ends with 'ing' or 'ed', tag it as 'VERB'
        4. If the word ends with 'ly', tag it as 'ADV'
        5. If the previous word is 'has' or 'have' or 'had', tag the current word as 'VERB'
        6. If the word is in uppercase, tag it as 'NOUN'

In [29]:
# Viterbi Heuristic
def Viterbi_modification_2(test_words, train_tagged_words_list = train_tagged_words_list):
    tagged_seq = Viterbi(test_words)
    
    V = list(set([i[0] for i in train_tagged_words_list]))             
    viterbi_words = [i[0] for i in tagged_seq]
    viterbi_tags = [i[1] for i in tagged_seq]
            
    for key,word in enumerate(viterbi_words):
        if word not in V:
            #if word.isdigit() == True or word[:-2].isdigit() == True or word.isnumeric()==True or word[-3] == '.':
            if word.isdigit() == True or word.isnumeric()==True or word[-3:] == '.':
                viterbi_tags[key] = 'NUM'                
            elif word[-3:] =='ous' or word[-3:] =='OUS':
                viterbi_tags[key] = 'ADJ'
            elif word[-2:] =='ing' or word[-2:] =='ING' or word[-2:] =='ed' or word[-2:] =='ED':
                viterbi_tags[key] = 'VERB'
            elif word[-2:] =='ly' or word[-2:] =='LY':
                viterbi_tags[key] = 'ADV'
            elif word[0].isupper() == True:
                viterbi_tags[key] = 'NOUN'
            elif viterbi_words[key-1] == 'has' or viterbi_words[key-1] == 'have' or viterbi_words[key-1] == 'had':
                viterbi_tags[key] = 'VERB'
            else:
                viterbi_tags[key] = 'NOUN' 
    #print(list(zip(viterbi_words,viterbi_tags)))
    return list(zip(viterbi_words,viterbi_tags))


### Tagging the words using Viterbi_modification_2

In [30]:
#Calling our user-defined Viterbi_modification_2 to tag words for test data
modified_viterbi_2 = Viterbi_modification_2(test_tagged_words)

### Checking the accuracy of Viterbi_modification_2

In [31]:
# Finding the tags that has been matched with the tags given by humans.
Matched_tags_Viterbi_modified_2 = [i for i, j in zip(modified_viterbi_2, test_run_base) if i == j] 
Matched_tags_Viterbi_modified_2

[('The', 'DET'),
 ('offer', 'NOUN'),
 (',', '.'),
 ('which', 'DET'),
 ('*T*-2', 'X'),
 ('follows', 'VERB'),
 ('a', 'DET'),
 ('$', '.'),
 ('*U*', 'X'),
 ('*T*-3', 'X'),
 ('was', 'VERB'),
 ('rejected', 'VERB'),
 ('*-1', 'X'),
 ('in', 'ADP'),
 ('September', 'NOUN'),
 (',', '.'),
 ('up', 'PRT'),
 ('pressure', 'NOUN'),
 ('on', 'ADP'),
 ('the', 'DET'),
 ('chemicals', 'NOUN'),
 ('concern', 'NOUN'),
 ('.', '.'),
 (',', '.'),
 ('10', 'NUM'),
 ('local', 'ADJ'),
 ('councils', 'NOUN'),
 ('have', 'VERB'),
 ('refused', 'VERB'),
 ('*-1', 'X'),
 ('to', 'PRT'),
 ('honor', 'VERB'),
 ('fees', 'NOUN'),
 ('and', 'CONJ'),
 ('payments', 'NOUN'),
 ('to', 'PRT'),
 ('banks', 'NOUN'),
 ('incurred', 'VERB'),
 ('*', 'X'),
 ('during', 'ADP'),
 ('various', 'ADJ'),
 ('swaps', 'NOUN'),
 ('dealings', 'NOUN'),
 ('.', '.'),
 ('According', 'VERB'),
 ('to', 'PRT'),
 ('Ms.', 'NOUN'),
 ('Poore', 'NOUN'),
 (',', '.'),
 ('Old-House', 'NOUN'),
 ('Journal', 'NOUN'),
 ('Corp.', 'NOUN'),
 (',', '.'),
 ('her', 'PRON'),
 ('company',

In [32]:
accuracy = len(Matched_tags_Viterbi_modified_2)/len(modified_viterbi_2)
accuracy

0.9389121338912134

**Observation** - The accuracy has improved from 93.07% to 93.76% in rule based tagger. Rule-based tagger shows a good performance compared to Vanilla Viterbi and Viterbi_modification_1 (most commonly used tag).


In [33]:
new_row = {'Algorithm':' Viterbi_modification_2(rule-based)', 'Accuracy':accuracy}
Accuracy_df = Accuracy_df.append(new_row, ignore_index=True)
Accuracy_df 

Unnamed: 0,Algorithm,Accuracy
0,Vannila Viterbi,0.895816
1,Viterbi_modification_1(commonly used tag),0.930962
2,Viterbi_modification_2(rule-based),0.938912


### Words that have incorrect tags

In [34]:
incorrect_tagged_cases = [[test_run_base[i-1],j] for i, j in enumerate(zip(modified_viterbi_2, test_run_base)) if j[0]!=j[1]]
incorrect_tagged_cases

[[('$', '.'), (('55-a-share', 'NOUN'), ('55-a-share', 'ADJ'))],
 [('*U*', 'X'), (('bid', 'VERB'), ('bid', 'NOUN'))],
 [('bid', 'NOUN'), (('that', 'ADP'), ('that', 'DET'))],
 [(',', '.'), (('steps', 'NOUN'), ('steps', 'VERB'))],
 [('.', '.'), (('Already', 'NOUN'), ('Already', 'ADV'))],
 [('her', 'PRON'), (('publishing', 'VERB'), ('publishing', 'NOUN'))],
 [('all', 'DET'), (('126,000', 'NOUN'), ('126,000', 'NUM'))],
 [('a', 'DET'), (('worst-case', 'NOUN'), ('worst-case', 'ADJ'))],
 [('to', 'PRT'), (('telephone', 'NOUN'), ('telephone', 'VERB'))],
 [('them', 'PRON'), (('pressure', 'NOUN'), ('pressure', 'VERB'))],
 [('to', 'PRT'), (('ban', 'NOUN'), ('ban', 'VERB'))],
 [('traders', 'NOUN'), (('answer', 'NOUN'), ('answer', 'VERB'))],
 [('1929', 'NUM'), (('that', 'ADP'), ('that', 'DET'))],
 [('*T*-1', 'X'), (('protects', 'NOUN'), ('protects', 'VERB'))],
 [('*-2', 'X'), (('downward', 'ADJ'), ('downward', 'ADV'))],
 [('to', 'PRT'), (('profit', 'NOUN'), ('profit', 'VERB'))],
 [('the', 'DET'), (('

### Viterbi Modification 3

**Viterbi_modification_3** <br>
State Probability = emission_p * transition_p <br>
If the emission probability is '0' the state probability will become '0'. To overcome it, multiply the emission probability with least constant, so that the state probability will not become '0'.

In [35]:
# Viterbi Heuristic
def Viterbi_modification_3(words, train_tagged_words_list = train_tagged_words_list):
    #state  - list used to hold the state probabilities
    state = []
    
    #T - list contains all the Tags from train_bag
    T = list(set([pair[1] for pair in train_tagged_words_list]))
    
    for key, word in enumerate(words):
        #initialise list of probability column for a given observation
        p = [] 
        for tag in T:
            if key == 0:
                transition_p = transition_probability_df.loc['.', tag] #"." - represents the start
            else:
                transition_p = transition_probability_df.loc[state[-1], tag]
                
            emission_p = word_given_tag(words[key], tag)[0]/word_given_tag(words[key], tag)[1]
            if (emission_p == 0):
                state_probability = (emission_p+0.00001) * transition_p
            else: 
                state_probability = emission_p * transition_p  
            
            p.append(state_probability)
            
            
        pmax = max(p)
        state_max = T[p.index(pmax)]
        state.append(state_max)
        
    return list(zip(words, state))


### Tagging the words using Viterbi_modification_3

In [36]:
#Calling our user-defined Viterbi_modification_3 to tag words for test data
modified_viterbi_3 = Viterbi_modification_3(test_tagged_words)

In [37]:
# Finding the tags that has been matched with the tags given by humans.
Matched_tags_Viterbi_modified_3 = [i for i, j in zip(modified_viterbi_3, test_run_base) if i == j] 
Matched_tags_Viterbi_modified_3

[('The', 'DET'),
 ('offer', 'NOUN'),
 (',', '.'),
 ('which', 'DET'),
 ('*T*-2', 'X'),
 ('follows', 'VERB'),
 ('a', 'DET'),
 ('$', '.'),
 ('*U*', 'X'),
 ('*T*-3', 'X'),
 ('was', 'VERB'),
 ('rejected', 'VERB'),
 ('*-1', 'X'),
 ('in', 'ADP'),
 ('September', 'NOUN'),
 (',', '.'),
 ('up', 'PRT'),
 ('pressure', 'NOUN'),
 ('on', 'ADP'),
 ('the', 'DET'),
 ('chemicals', 'NOUN'),
 ('concern', 'NOUN'),
 ('.', '.'),
 (',', '.'),
 ('10', 'NUM'),
 ('local', 'ADJ'),
 ('councils', 'NOUN'),
 ('have', 'VERB'),
 ('refused', 'VERB'),
 ('*-1', 'X'),
 ('to', 'PRT'),
 ('honor', 'VERB'),
 ('fees', 'NOUN'),
 ('and', 'CONJ'),
 ('payments', 'NOUN'),
 ('to', 'PRT'),
 ('banks', 'NOUN'),
 ('incurred', 'VERB'),
 ('*', 'X'),
 ('during', 'ADP'),
 ('various', 'ADJ'),
 ('swaps', 'NOUN'),
 ('dealings', 'NOUN'),
 ('.', '.'),
 ('According', 'VERB'),
 ('to', 'PRT'),
 ('Ms.', 'NOUN'),
 ('Poore', 'NOUN'),
 (',', '.'),
 ('Old-House', 'NOUN'),
 ('Journal', 'NOUN'),
 ('Corp.', 'NOUN'),
 (',', '.'),
 ('her', 'PRON'),
 ('company',

### Checking the accuracy of Viterbi_modification_3

In [38]:
accuracy = len(Matched_tags_Viterbi_modified_3)/len(modified_viterbi_3)
accuracy

0.9315899581589958

**Observation** - The accuracy has not improved much in Viterbi_modification_3. It is 92.78% which is less than Viterbi_modification_1 and Viterbi_modification_2 and high than vanilla viterbi.

In [39]:
new_row = {'Algorithm':' Viterbi_modification_3(handling emission probability)', 'Accuracy':accuracy}
Accuracy_df = Accuracy_df.append(new_row, ignore_index=True)
Accuracy_df

Unnamed: 0,Algorithm,Accuracy
0,Vannila Viterbi,0.895816
1,Viterbi_modification_1(commonly used tag),0.930962
2,Viterbi_modification_2(rule-based),0.938912
3,Viterbi_modification_3(handling emission prob...,0.93159


### Words that have incorrect tags

In [40]:
incorrect_tagged_cases = [[test_run_base[i-1],j] for i, j in enumerate(zip(modified_viterbi_3, test_run_base)) if j[0]!=j[1]]
incorrect_tagged_cases

[[('$', '.'), (('55-a-share', 'NOUN'), ('55-a-share', 'ADJ'))],
 [('*U*', 'X'), (('bid', 'VERB'), ('bid', 'NOUN'))],
 [('bid', 'NOUN'), (('that', 'ADP'), ('that', 'DET'))],
 [(',', '.'), (('steps', 'NOUN'), ('steps', 'VERB'))],
 [('.', '.'), (('Already', 'NOUN'), ('Already', 'ADV'))],
 [('her', 'PRON'), (('publishing', 'VERB'), ('publishing', 'NOUN'))],
 [('all', 'DET'), (('126,000', 'NOUN'), ('126,000', 'NUM'))],
 [('a', 'DET'), (('worst-case', 'NOUN'), ('worst-case', 'ADJ'))],
 [('to', 'PRT'), (('telephone', 'NOUN'), ('telephone', 'VERB'))],
 [('them', 'PRON'), (('pressure', 'NOUN'), ('pressure', 'VERB'))],
 [('to', 'PRT'), (('ban', 'NOUN'), ('ban', 'VERB'))],
 [('traders', 'NOUN'), (('answer', 'NOUN'), ('answer', 'VERB'))],
 [('reform', 'NOUN'), (('instituted', 'NOUN'), ('instituted', 'VERB'))],
 [('1929', 'NUM'), (('that', 'ADP'), ('that', 'DET'))],
 [('being', 'VERB'), (('relentlessly', 'X'), ('relentlessly', 'ADV'))],
 [('*-2', 'X'), (('downward', 'ADJ'), ('downward', 'ADV'))],
 

#### Evaluating tagging accuracy

In [41]:
columns = ['Words','Vanilla_Viterbi']
POS_df = pd.DataFrame(columns = columns)

for x in tagged_seq:
    new_row = {'Words':x[0], 'Vanilla_Viterbi':x[1]}
    POS_df = POS_df.append(new_row, ignore_index=True)
    
x = []
for i in modified_viterbi_1:
    x.append(i[1])
POS_df.insert(2, "Viterbi_modification_1", x, True)

x = []
for i in modified_viterbi_2:
    x.append(i[1])
POS_df.insert(3, "Viterbi_modification_2", x, True)

x = []
for i in modified_viterbi_3:
    x.append(i[1])
POS_df.insert(4, "Viterbi_modification_3", x, True)  
POS_df.head(100)

Unnamed: 0,Words,Vanilla_Viterbi,Viterbi_modification_1,Viterbi_modification_2,Viterbi_modification_3
0,The,DET,DET,DET,DET
1,offer,NOUN,NOUN,NOUN,NOUN
2,",",.,.,.,.
3,which,DET,DET,DET,DET
4,*T*-2,X,X,X,X
...,...,...,...,...,...
95,began,VERB,VERB,VERB,VERB
96,shortly,ADV,ADV,ADV,ADV
97,after,ADP,ADP,ADP,ADP
98,Oct.,NOUN,NOUN,NOUN,NOUN


In [42]:
Performance_df = POS_df[(POS_df['Viterbi_modification_1'] != POS_df['Viterbi_modification_2']) & (POS_df['Viterbi_modification_2'] != POS_df['Viterbi_modification_3']) & (POS_df['Viterbi_modification_1'] != POS_df['Viterbi_modification_3']) & (POS_df['Vanilla_Viterbi'] != POS_df['Viterbi_modification_1']) & (POS_df['Vanilla_Viterbi'] != POS_df['Viterbi_modification_2'])& (POS_df['Vanilla_Viterbi'] != POS_df['Viterbi_modification_3'])]
Performance_df

Unnamed: 0,Words,Vanilla_Viterbi,Viterbi_modification_1,Viterbi_modification_2,Viterbi_modification_3
163,relentlessly,PRON,NOUN,ADV,X
460,cleaned,PRON,NOUN,VERB,X
1106,escalated,PRON,NOUN,VERB,X
1302,successfully,PRON,NOUN,ADV,VERB
1319,repeatedly,PRON,NOUN,ADV,X
1486,instituted,PRON,NOUN,VERB,X
1636,shown,PRON,NOUN,VERB,X
1890,enhanced,PRON,NOUN,VERB,X
2318,granted,PRON,NOUN,VERB,X
3133,purely,PRON,NOUN,ADV,X


**Observation** -  The above dataframe 'Performance_df' shows how the words has been tagged differently by each modifications of Viterbi.<br> <br>
**'917'** tagged as **'NUM'** by rule-based(Viterbi_modification_2) whereas it as tagged incorrectly as ***'ADP'***,***NOUN*** and ***'DET'*** by others.<br> <br>
**'flirted', 'burned', 'echoed' and 'expunged'** tagged as **'VERB'** by rule-based(Viterbi_modification_2) whereas it as tagged incorrectly as **'ADP'**,**'NOUN'** and **'X'** by others.<br> <br>

### Compare the tagging accuracies of the modifications with the vanilla Viterbi algorithm

In [43]:
Accuracy_df

Unnamed: 0,Algorithm,Accuracy
0,Vannila Viterbi,0.895816
1,Viterbi_modification_1(commonly used tag),0.930962
2,Viterbi_modification_2(rule-based),0.938912
3,Viterbi_modification_3(handling emission prob...,0.93159


**Comments on tagging accuracy:** The tagging accuracy is high in  Viterbi_modification_2(rule-based) than Viterbi_modification_1(commonly used tag) and Viterbi_modification_3. <br> <br>
In case of  Viterbi_modification_1(commonly used tag) the unknown words has been tagged with 'NOUN'. The most common tag in training data is 'NOUN' hence the unknown words in the test data has been tagged as 'NOUN'. <br> <br>
In case of  Viterbi_modification_2(rule-based) the unknown words has been tagged with some basic grammar rules.<br> <br>
In case of  Viterbi_modification_3,if the emission probability is 0, it is added to a least constant and then multiplied with transition probability. <br> <br>

All these 3 Viterbi modifications, shows a good accuracy compared to the Vanilla Viterbi algorithm.

### List down cases which were incorrectly tagged by original POS tagger and got corrected by your modifications

Performance_df - This dataframe contains the words that has differnt tags assigned by Vanilla Viterbi, Viterbi_modification_1,Viterbi_modification_2 and Viterbi_modification_3.

In [44]:
Performance_df

Unnamed: 0,Words,Vanilla_Viterbi,Viterbi_modification_1,Viterbi_modification_2,Viterbi_modification_3
163,relentlessly,PRON,NOUN,ADV,X
460,cleaned,PRON,NOUN,VERB,X
1106,escalated,PRON,NOUN,VERB,X
1302,successfully,PRON,NOUN,ADV,VERB
1319,repeatedly,PRON,NOUN,ADV,X
1486,instituted,PRON,NOUN,VERB,X
1636,shown,PRON,NOUN,VERB,X
1890,enhanced,PRON,NOUN,VERB,X
2318,granted,PRON,NOUN,VERB,X
3133,purely,PRON,NOUN,ADV,X


**Observation** -  The above dataframe 'Performance_df' shows how the words has been tagged by each modifications of Viterbi.<br> <br>
**Case 1 :'917' and '890'** tagged as **'NUM'** by rule-based(Viterbi_modification_2) whereas it as tagged incorrectly as ***'ADP'***,***NOUN*** and ***'DET'*** by others.
**Case 2 :'flirted', 'burned', 'echoed' and 'expunged'** tagged as **'VERB'** by rule-based(Viterbi_modification_2) whereas it as tagged incorrectly as **'ADP'**,**'NOUN'** and **'X'** by others.<br> <br>
**Case 3 : 'stringently' and 'politically'** tagged as **'ADV'** by rule-based(Viterbi_modification_2) whereas it as tagged incorrectly as **'ADP'**,**'NOUN'** and **'X'**/**VERB** by others.<br> <br>

**Conclusion:** From the above dataframe we can conclude that rule-based taggers (Viterbi_modification_2) tags most of the words correctly and shows good performance too.

### Sample Sentences

Few sample sentences has been executed and words has been tagged using Vanilla Viterbi, Viterbi_modification_1,Viterbi_modification_2 and Viterbi_modification_3.

### Sample Sentence 1

In [45]:
from nltk.tokenize import word_tokenize
sample_sentences = 'Android is a mobile operating system developed by Google.'
sample_sentences = word_tokenize(sample_sentences)
Vanilla_Viterbi = Viterbi(sample_sentences)

modification_1 = Viterbi_modification_1(sample_sentences)
modification_2 = Viterbi_modification_2(sample_sentences)
modification_3 = Viterbi_modification_3(sample_sentences)

columns = ['Words','Vanilla_Viterbi']
sample_sentence_df_1 = pd.DataFrame(columns = columns)

for x in Vanilla_Viterbi:
    new_row = {'Words':x[0], 'Vanilla_Viterbi':x[1]}
    sample_sentence_df_1 = sample_sentence_df_1.append(new_row, ignore_index=True)
    
x = []
for i in modification_1:
    x.append(i[1])
sample_sentence_df_1.insert(2, "Viterbi_modification_1", x, True)

x = []
for i in modification_2:
    x.append(i[1])
sample_sentence_df_1.insert(3, "Viterbi_modification_2", x, True)

x = []
for i in modification_3:
    x.append(i[1])
sample_sentence_df_1.insert(4, "Viterbi_modification_3", x, True)  
sample_sentence_df_1.head(100)

Unnamed: 0,Words,Vanilla_Viterbi,Viterbi_modification_1,Viterbi_modification_2,Viterbi_modification_3
0,Android,PRON,NOUN,NOUN,NOUN
1,is,VERB,VERB,VERB,VERB
2,a,DET,DET,DET,DET
3,mobile,ADJ,ADJ,ADJ,ADJ
4,operating,NOUN,NOUN,NOUN,NOUN
5,system,NOUN,NOUN,NOUN,NOUN
6,developed,VERB,VERB,VERB,VERB
7,by,ADP,ADP,ADP,ADP
8,Google,PRON,NOUN,NOUN,NOUN
9,.,.,.,.,.


### Sample Sentence 2

In [46]:
from nltk.tokenize import word_tokenize
sample_sentences = 'Android has been the best-selling OS worldwide on smartphones since 2011 and on tablets since 2013.'
sample_sentences = word_tokenize(sample_sentences)
Vanilla_Viterbi = Viterbi(sample_sentences)
sample_sentences
modification_1 = Viterbi_modification_1(sample_sentences)
modification_2 = Viterbi_modification_2(sample_sentences)
modification_3 = Viterbi_modification_3(sample_sentences)

columns = ['Words','Vanilla_Viterbi']
sample_sentence_df_2 = pd.DataFrame(columns = columns)

for x in Vanilla_Viterbi:
    new_row = {'Words':x[0], 'Vanilla_Viterbi':x[1]}
    sample_sentence_df_2 = sample_sentence_df_2.append(new_row, ignore_index=True)
    
x = []
for i in modification_1:
    x.append(i[1])
sample_sentence_df_2.insert(2, "Viterbi_modification_1", x, True)

x = []
for i in modification_2:
    x.append(i[1])
sample_sentence_df_2.insert(3, "Viterbi_modification_2", x, True)

x = []
for i in modification_3:
    x.append(i[1])
sample_sentence_df_2.insert(4, "Viterbi_modification_3", x, True)  
sample_sentence_df_2.head(100)

Unnamed: 0,Words,Vanilla_Viterbi,Viterbi_modification_1,Viterbi_modification_2,Viterbi_modification_3
0,Android,PRON,NOUN,NOUN,NOUN
1,has,VERB,VERB,VERB,VERB
2,been,VERB,VERB,VERB,VERB
3,the,DET,DET,DET,DET
4,best-selling,ADJ,ADJ,ADJ,ADJ
5,OS,PRON,NOUN,NOUN,NOUN
6,worldwide,PRON,NOUN,NOUN,NOUN
7,on,ADP,ADP,ADP,ADP
8,smartphones,PRON,NOUN,NOUN,NOUN
9,since,ADP,ADP,ADP,ADP


### Sample Sentence 3

In [47]:
from nltk.tokenize import word_tokenize
sample_sentences = 'Twitter is an online news and social networking service on which users post and interact with messages known as tweets.'
sample_sentences = word_tokenize(sample_sentences)
Vanilla_Viterbi = Viterbi(sample_sentences)
sample_sentences
modification_1 = Viterbi_modification_1(sample_sentences)
modification_2 = Viterbi_modification_2(sample_sentences)
modification_3 = Viterbi_modification_3(sample_sentences)

columns = ['Words','Vanilla_Viterbi']
sample_sentence_df_3 = pd.DataFrame(columns = columns)

for x in Vanilla_Viterbi:
    new_row = {'Words':x[0], 'Vanilla_Viterbi':x[1]}
    sample_sentence_df_3 = sample_sentence_df_3.append(new_row, ignore_index=True)
    
x = []
for i in modification_1:
    x.append(i[1])
sample_sentence_df_3.insert(2, "Viterbi_modification_1", x, True)

x = []
for i in modification_2:
    x.append(i[1])
sample_sentence_df_3.insert(3, "Viterbi_modification_2", x, True)

x = []
for i in modification_3:
    x.append(i[1])
sample_sentence_df_3.insert(4, "Viterbi_modification_3", x, True)  
sample_sentence_df_3.head(100)

Unnamed: 0,Words,Vanilla_Viterbi,Viterbi_modification_1,Viterbi_modification_2,Viterbi_modification_3
0,Twitter,PRON,NOUN,NOUN,NOUN
1,is,VERB,VERB,VERB,VERB
2,an,DET,DET,DET,DET
3,online,PRON,NOUN,NOUN,NOUN
4,news,NOUN,NOUN,NOUN,NOUN
5,and,CONJ,CONJ,CONJ,CONJ
6,social,ADJ,ADJ,ADJ,ADJ
7,networking,NOUN,NOUN,NOUN,NOUN
8,service,NOUN,NOUN,NOUN,NOUN
9,on,ADP,ADP,ADP,ADP


### Sample Sentence 4

In [48]:
from nltk.tokenize import word_tokenize
sample_sentences = 'NASA invited social media users to experience the launch of ICESAT-2 Satellite.'
sample_sentences = word_tokenize(sample_sentences)
Vanilla_Viterbi = Viterbi(sample_sentences)
sample_sentences
modification_1 = Viterbi_modification_1(sample_sentences)
modification_2 = Viterbi_modification_2(sample_sentences)
modification_3 = Viterbi_modification_3(sample_sentences)

columns = ['Words','Vanilla_Viterbi']
sample_sentence_df_4 = pd.DataFrame(columns = columns)

for x in Vanilla_Viterbi:
    new_row = {'Words':x[0], 'Vanilla_Viterbi':x[1]}
    sample_sentence_df_4 = sample_sentence_df_4.append(new_row, ignore_index=True)
    
x = []
for i in modification_1:
    x.append(i[1])
sample_sentence_df_4.insert(2, "Viterbi_modification_1", x, True)

x = []
for i in modification_2:
    x.append(i[1])
sample_sentence_df_4.insert(3, "Viterbi_modification_2", x, True)

x = []
for i in modification_3:
    x.append(i[1])
sample_sentence_df_4.insert(4, "Viterbi_modification_3", x, True)  
sample_sentence_df_4.head(100)

Unnamed: 0,Words,Vanilla_Viterbi,Viterbi_modification_1,Viterbi_modification_2,Viterbi_modification_3
0,NASA,PRON,NOUN,NOUN,NOUN
1,invited,PRON,NOUN,VERB,NOUN
2,social,ADJ,ADJ,ADJ,ADJ
3,media,NOUN,NOUN,NOUN,NOUN
4,users,NOUN,NOUN,NOUN,NOUN
5,to,PRT,PRT,PRT,PRT
6,experience,NOUN,NOUN,NOUN,NOUN
7,the,DET,DET,DET,DET
8,launch,NOUN,NOUN,NOUN,NOUN
9,of,ADP,ADP,ADP,ADP


### Sample Sentence 5

In [49]:
from nltk.tokenize import word_tokenize
sample_sentences = 'Show me the price of the flights leaving Atlanta at about 3 in the afternoon and arriving in San Francisco.'
sample_sentences = word_tokenize(sample_sentences)
Vanilla_Viterbi = Viterbi(sample_sentences)
sample_sentences
modification_1 = Viterbi_modification_1(sample_sentences)
modification_2 = Viterbi_modification_2(sample_sentences)
modification_3 = Viterbi_modification_3(sample_sentences)

columns = ['Words','Vanilla_Viterbi']
sample_sentence_df_5 = pd.DataFrame(columns = columns)

for x in Vanilla_Viterbi:
    new_row = {'Words':x[0], 'Vanilla_Viterbi':x[1]}
    sample_sentence_df_5 = sample_sentence_df_5.append(new_row, ignore_index=True)
    
x = []
for i in modification_1:
    x.append(i[1])
sample_sentence_df_5.insert(2, "Viterbi_modification_1", x, True)

x = []
for i in modification_2:
    x.append(i[1])
sample_sentence_df_5.insert(3, "Viterbi_modification_2", x, True)

x = []
for i in modification_3:
    x.append(i[1])
sample_sentence_df_5.insert(4, "Viterbi_modification_3", x, True)  
sample_sentence_df_5.head(100)

Unnamed: 0,Words,Vanilla_Viterbi,Viterbi_modification_1,Viterbi_modification_2,Viterbi_modification_3
0,Show,NOUN,NOUN,NOUN,NOUN
1,me,PRON,PRON,PRON,PRON
2,the,DET,DET,DET,DET
3,price,NOUN,NOUN,NOUN,NOUN
4,of,ADP,ADP,ADP,ADP
5,the,DET,DET,DET,DET
6,flights,NOUN,NOUN,NOUN,NOUN
7,leaving,VERB,VERB,VERB,VERB
8,Atlanta,NOUN,NOUN,NOUN,NOUN
9,at,ADP,ADP,ADP,ADP


**Observation** - Rule-based tagger (Viterbi_modification_2) has tagged all the words correctly compared to Vanilla Viterbi,Viterbi_modification_1 and Viterbi_modification_3