# POS TAGGER

### Importing Necessary Libraries

In [240]:
import torch 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import conllu
from sklearn.metrics import classification_report

### Opening files and extracting relevant data

In [241]:
# Open the train input file
with open('en_atis-ud-train.conllu', 'r', encoding='utf-8') as f:
    data = f.read()
    
# Open the test input file
with open('en_atis-ud-test.conllu', 'r', encoding='utf-8') as f:
    test_data = f.read()
    
# Open the test input file
with open('en_atis-ud-dev.conllu', 'r', encoding='utf-8') as f:
    dev_data = f.read()
    
# Parse the data as a list of sentences
sentences = conllu.parse(data)
data = []

# Extract the words and their POS tags for each sentence
for sentence in sentences:
    # Extract the words and their POS tags as a list of tuples
    words_and_tags = [(token['form'], token['upos']) for token in sentence]
    data.append(words_and_tags)
    

# Parse the data as a list of sentences
sentences = conllu.parse(test_data)
test_data = []

# Extract the words and their POS tags for each sentence
for sentence in sentences:
    # Extract the words and their POS tags as a list of tuples
    words_and_tags = [(token['form'], token['upos']) for token in sentence]
    test_data.append(words_and_tags)
    

# Parse the data as a list of sentences
sentences = conllu.parse(dev_data)
dev_data = []

# Extract the words and their POS tags for each sentence
for sentence in sentences:
    # Extract the words and their POS tags as a list of tuples
    words_and_tags = [(token['form'], token['upos']) for token in sentence]
    dev_data.append(words_and_tags)
    

In [242]:
dev_data

[[('i', 'PRON'),
  ('would', 'AUX'),
  ('like', 'VERB'),
  ('the', 'DET'),
  ('cheapest', 'ADJ'),
  ('flight', 'NOUN'),
  ('from', 'ADP'),
  ('pittsburgh', 'PROPN'),
  ('to', 'ADP'),
  ('atlanta', 'PROPN'),
  ('leaving', 'VERB'),
  ('april', 'NOUN'),
  ('twenty', 'NUM'),
  ('fifth', 'ADJ'),
  ('and', 'CCONJ'),
  ('returning', 'VERB'),
  ('may', 'NOUN'),
  ('sixth', 'ADJ')],
 [('i', 'PRON'),
  ('want', 'VERB'),
  ('a', 'DET'),
  ('flight', 'NOUN'),
  ('from', 'ADP'),
  ('memphis', 'PROPN'),
  ('to', 'ADP'),
  ('seattle', 'PROPN'),
  ('that', 'ADP'),
  ('arrives', 'VERB'),
  ('no', 'DET'),
  ('later', 'ADV'),
  ('than', 'ADP'),
  ('3', 'NUM'),
  ('pm', 'NOUN')],
 [('show', 'VERB'),
  ('me', 'PRON'),
  ('round', 'NOUN'),
  ('trip', 'NOUN'),
  ('flights', 'NOUN'),
  ('from', 'ADP'),
  ('chicago', 'PROPN'),
  ('to', 'ADP'),
  ('detroit', 'PROPN'),
  ('leaving', 'VERB'),
  ('next', 'ADJ'),
  ('tuesday', 'NOUN'),
  ('and', 'CCONJ'),
  ('returning', 'VERB'),
  ('the', 'DET'),
  ('day', 'NOUN')

### Making a dictionary to make word to vec for computaions

In [243]:
word2idx = {}
tag2idx = {}
tags = []

for sen in data:
    for word,tag in sen:
        if word not in word2idx:
            word2idx[word] = len(word2idx)
        if tag not in tags:
            tag2idx[tag] = len(tags)
            tags.append(tag)

In [244]:
word2idx

{'what': 0,
 'is': 1,
 'the': 2,
 'cost': 3,
 'of': 4,
 'a': 5,
 'round': 6,
 'trip': 7,
 'flight': 8,
 'from': 9,
 'pittsburgh': 10,
 'to': 11,
 'atlanta': 12,
 'beginning': 13,
 'on': 14,
 'april': 15,
 'twenty': 16,
 'fifth': 17,
 'and': 18,
 'returning': 19,
 'may': 20,
 'sixth': 21,
 'now': 22,
 'i': 23,
 'need': 24,
 'leaving': 25,
 'fort': 26,
 'worth': 27,
 'arriving': 28,
 'in': 29,
 'denver': 30,
 'no': 31,
 'later': 32,
 'than': 33,
 '2': 34,
 'pm': 35,
 'next': 36,
 'monday': 37,
 'fly': 38,
 'kansas': 39,
 'city': 40,
 'chicago': 41,
 'wednesday': 42,
 'following': 43,
 'day': 44,
 'meaning': 45,
 'meal': 46,
 'code': 47,
 's': 48,
 'show': 49,
 'me': 50,
 'all': 51,
 'flights': 52,
 'which': 53,
 'serve': 54,
 'for': 55,
 'after': 56,
 'tomorrow': 57,
 'us': 58,
 'air': 59,
 'list': 60,
 'nonstop': 61,
 'early': 62,
 'tuesday': 63,
 'morning': 64,
 'dallas': 65,
 'st.': 66,
 'petersburg': 67,
 'toronto': 68,
 'that': 69,
 'arrive': 70,
 'listing': 71,
 'new': 72,
 'york':

In [245]:
tag2idx

{'PRON': 0,
 'AUX': 1,
 'DET': 2,
 'NOUN': 3,
 'ADP': 4,
 'PROPN': 5,
 'VERB': 6,
 'NUM': 7,
 'ADJ': 8,
 'CCONJ': 9,
 'ADV': 10,
 'PART': 11,
 'INTJ': 12}

In [246]:
idx2tag = {}
j = 0
for i in tag2idx:
    idx2tag[j] = i
    j += 1

In [247]:
idx2tag

{0: 'PRON',
 1: 'AUX',
 2: 'DET',
 3: 'NOUN',
 4: 'ADP',
 5: 'PROPN',
 6: 'VERB',
 7: 'NUM',
 8: 'ADJ',
 9: 'CCONJ',
 10: 'ADV',
 11: 'PART',
 12: 'INTJ'}

### A function to find index of words

In [248]:
def Seq_Out(words, w2i):
    i = []
    for w in words:
        if w in w2i:
            i.append(w2i[w])
        else:
            i.append(0)
    i = np.array(i)
    return i

#### Function check

In [249]:
input = ['what', 'is', 'the', 'cost', 'of', 'a', 'round', 'trip', 'flight', 'from', 'pittsburgh', 'to', 'atlanta', 'beginning', 'on', 'april', 'twenty', 'fifth', 'and', 'returning', 'on', 'may', 'sixth']
Seq_Out(input,word2idx)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 14, 20, 21])

### Creating Model

In [250]:

class LSTMTagger(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim

        # Embedding layer to turn words to vectors
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        
        # LSTM layer
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # Hidden to linear layer
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

        # Initialize the hidden state
        self.hidden = self.init_hidden()

    def init_hidden(self):
        # Hidden state is formed based on previously seen data 
        # So initialization step is required
        return (torch.zeros(1, 1, self.hidden_dim),
                torch.zeros(1, 1, self.hidden_dim))

    def forward(self, sentence):
        # Step 1: create word embeddings
        embeddings = self.word_embeddings(sentence)
        
        # Step 2: passing the embeddings, initial state and final state to LSTM
        lstm_out, self.hidden = self.lstm(embeddings.view(len(sentence), 1, -1), self.hidden)
        
        # Step 3: get tag scores
        tag_scores = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_scores, dim=1)
        return tag_scores

### Initialising the Model

In [251]:
EMBEDDING_DIM = 300
HIDDEN_DIM = 300
model = LSTMTagger(EMBEDDING_DIM,HIDDEN_DIM,len(word2idx),len(tag2idx))

### Difining the Loss and Optimizer 

In [252]:
Loss_Function = nn.NLLLoss()
Optimizer = optim.Adam(model.parameters())

### Checking the working of model

In [253]:
# test_sentence = "The Round Trip flight this monday ".lower().split()

# # see what the scores are before training
# # element [i,j] of the output is the *score* for tag j for word i.
# # to check the initial accuracy of our model, we don't need to train, so we use model.eval()
# inputs = Seq_Out(test_sentence, word2idx)
# tag_scores = model(torch.tensor(inputs))
# print(tag_scores)

# # tag_scores outputs a vector of tag scores for each word in an inpit sentence
# # to get the most likely tag index, we grab the index with the maximum score!
# # recall that these numbers correspond to tag2idx = {"DET": 0, "NN": 1, "V": 2}

# predicted_tags = torch.max(tag_scores, 1)
# print('\n')
# print('Predicted tags: \n',predicted_tags)

### Loading the Pre Trained Model

In [262]:
Path = './POS_Saved_Model.pt'
model.load_state_dict(torch.load(Path))

<All keys matched successfully>

### Training

In [263]:
# # No of iterations
# Epoch_count = 8

# for epoch in range(Epoch_count):
#     print(epoch)
#     epoch_loss = 0.0
    
#     for sen in data:
#             words = [tup[0] for tup in sen]
#             tags = [tup[1] for tup in sen]
#             # Setting gradient to zero (To prevent error)
#             model.zero_grad()

#             # Setting hidden state of sdm to zero
#             model.hidden = model.init_hidden()

#             # Finding ta
#             sentence = Seq_Out(words,word2idx)
#             targets = Seq_Out(tags,tag2idx)
            
#             # Forward to get scores
#             scores = model(torch.tensor(sentence))

#             # Finding loss and gradients
#             loss = Loss_Function(scores, torch.tensor(targets))
#             epoch_loss += loss.item()

#             # Backtracking
#             loss.backward()

#             # Optimizer step to substract gradient times learning rates from weights
#             Optimizer.step()
            
            

# # Printing Epochs for certain range
#     if (epoch % 1 == 0):
#         print(f'Epoch: {epoch} , loss: {epoch_loss/len(data)}')

### Testing 

#### Single Sentence Input Testing

In [264]:
test_sentence = "All the flights from here".lower().split()

# see what the scores are after training
inputs = Seq_Out(test_sentence, word2idx)
tag_scores = model(torch.tensor(inputs))

# print the most likely tag index, by grabbing the index with the maximum score!
# recall that these numbers correspond to tag2idx = {"DET": 0, "NN": 1, "V": 2}
_, predicted_tags = torch.max(tag_scores, 1)
print('Predicted tags: \n',predicted_tags)



Predicted tags: 
 tensor([2, 2, 3, 4, 5])


In [265]:
j = 0
for i in test_sentence:
    print(f'{i} : {idx2tag[int(predicted_tags[j])]}')
    j+=1

all : DET
the : DET
flights : NOUN
from : ADP
here : PROPN


### Using Given Test Data

In [266]:
test_words = [tup[0] for lst in test_data for tup in lst]
test_tags = [tup[1] for lst in test_data for tup in lst]

# see what the scores are after training
inputs = Seq_Out(test_words, word2idx)
tag_scores = model(torch.tensor(inputs))

# print the most likely tag index, the index with the maximum score!
_, predicted_tags = torch.max(tag_scores, 1)
print('\n')
print('Predicted tags: \n',predicted_tags.shape)
true_tags = Seq_Out(test_tags,tag2idx)
print(len(true_tags))



Predicted tags: 
 torch.Size([6580])
6580


In [267]:
j = 0
for i in test_words:
    print(f'{i} : {idx2tag[int(predicted_tags[j])]}')
    j+=1

what : DET
are : AUX
the : DET
coach : PROPN
flights : NOUN
between : ADP
dallas : PROPN
and : CCONJ
baltimore : PROPN
leaving : VERB
august : NOUN
tenth : ADJ
and : CCONJ
returning : VERB
august : NOUN
twelve : DET
i : PRON
want : VERB
a : DET
flight : NOUN
from : ADP
nashville : PROPN
to : ADP
seattle : PROPN
that : ADP
arrives : VERB
no : DET
later : ADJ
than : ADP
3 : NUM
pm : NOUN
i : PRON
need : VERB
a : DET
flight : NOUN
leaving : VERB
kansas : PROPN
city : PROPN
to : ADP
chicago : PROPN
leaving : VERB
next : ADJ
wednesday : NOUN
and : CCONJ
returning : VERB
the : DET
following : VERB
day : NOUN
explain : VERB
meal : NOUN
codes : NOUN
sd : PRON
d : PRON
show : VERB
me : PRON
all : DET
flights : NOUN
from : ADP
atlanta : PROPN
to : ADP
san : PROPN
francisco : PROPN
which : DET
leave : VERB
the : DET
day : NOUN
after : ADP
tomorrow : NOUN
after : ADP
5 : NUM
o'clock : ADV
pm : NOUN
i : PRON
need : VERB
a : DET
flight : NOUN
from : ADP
toronto : PROPN
to : ADP
montreal : PROPN
reac

### Finding Accuracy of Test

In [268]:
correct_count = 0
wrong_count = 0

j = 0
for i in test_tags:
    if (i == idx2tag[int(predicted_tags[j])]):
        print(f'{i} and {idx2tag[int(predicted_tags[j])]}')
        j += 1
        correct_count += 1
    else:
        wrong_count += 1
        j+=1

AUX and AUX
DET and DET
NOUN and NOUN
ADP and ADP
PROPN and PROPN
CCONJ and CCONJ
PROPN and PROPN
VERB and VERB
NOUN and NOUN
ADJ and ADJ
CCONJ and CCONJ
VERB and VERB
NOUN and NOUN
PRON and PRON
VERB and VERB
DET and DET
NOUN and NOUN
ADP and ADP
PROPN and PROPN
ADP and ADP
PROPN and PROPN
ADP and ADP
VERB and VERB
DET and DET
ADJ and ADJ
ADP and ADP
NUM and NUM
NOUN and NOUN
PRON and PRON
VERB and VERB
DET and DET
NOUN and NOUN
VERB and VERB
PROPN and PROPN
PROPN and PROPN
ADP and ADP
PROPN and PROPN
VERB and VERB
ADJ and ADJ
NOUN and NOUN
CCONJ and CCONJ
VERB and VERB
DET and DET
VERB and VERB
NOUN and NOUN
VERB and VERB
NOUN and NOUN
NOUN and NOUN
VERB and VERB
PRON and PRON
DET and DET
NOUN and NOUN
ADP and ADP
PROPN and PROPN
ADP and ADP
PROPN and PROPN
PROPN and PROPN
VERB and VERB
DET and DET
NOUN and NOUN
ADP and ADP
NOUN and NOUN
ADP and ADP
NUM and NUM
ADV and ADV
NOUN and NOUN
PRON and PRON
VERB and VERB
DET and DET
NOUN and NOUN
ADP and ADP
PROPN and PROPN
ADP and ADP
PROP

In [269]:
print(correct_count/(correct_count+wrong_count))

0.9534954407294833


In [270]:
target_names = ['PRON', 'AUX', 'DET', 'NOUN', 'ADP', 'PROPN', 'VERB', 'NUM', 'ADJ', 'CCONJ', 'ADV', 'PART', 'INTJ']
print(classification_report(true_tags.tolist(), predicted_tags.tolist(), target_names=target_names))

              precision    recall  f1-score   support

        PRON       0.92      0.75      0.83       392
         AUX       0.95      0.95      0.95       256
         DET       0.79      0.98      0.88       512
        NOUN       0.99      0.98      0.98      1166
         ADP       0.96      0.99      0.98      1434
       PROPN       0.98      0.99      0.99      1567
        VERB       0.98      0.86      0.92       629
         NUM       0.97      0.83      0.89       127
         ADJ       0.95      0.96      0.96       220
       CCONJ       1.00      1.00      1.00       109
         ADV       0.87      0.80      0.84        76
        PART       0.96      0.98      0.97        56
        INTJ       1.00      1.00      1.00        36

    accuracy                           0.95      6580
   macro avg       0.95      0.93      0.94      6580
weighted avg       0.96      0.95      0.95      6580



### Saving Tained Model

In [271]:
# Path = './POS_Saved_Model.pt'
# torch.save(model.state_dict(),Path)