## import libraries

In [45]:
import torch
import pickle
import unicodedata
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

## import dataset

In [3]:
path = "../data/NepaliTaggedCorpus/new_submissions_parallel_corpus_project_Nepal/"

In [4]:
dbfile = open(path+'/'+'processed_tag', 'rb')

In [5]:
data_df = pickle.load(dbfile)

In [6]:
data_df.head()

Unnamed: 0,text,tag
0,"[६१, वर्षीय, पियरे, भिन्केन, नोभेम्बर, २९...","[<CD>, <JJ>, <NNP>, <NNP>, <NNP>, <CD>, <POP>,..."
1,"[श्री, भिन्केन, डच, प्रकाशन, समूह, एल्सेभ...","[<NN>, <NNP>, <NNP>, <NN>, <NN>, <NNP>, <FB>, ..."
2,"[कन्सोलिडेटिड, गोल्ड, फिल्ड्स, पीएलसी, का, ...","[<NNP>, <NN>, <NN>, <NNP>, <PKO>, <JJ>, <NN>, ..."
3,"[एकताका, केन्ट, चुरोट, को, फिल्टर, बनाउन, ...","[<RBO>, <NNP>, <NN>, <PKO>, <NN>, <VBI>, <NN>,..."
4,"[यस, सँग, को, छोटो, सम्पर्क, बाट, मात्र, प...","[<DUM>, <POP>, <PKO>, <JJM>, <NN>, <POP>, <RP>..."


In [7]:
data_df['len_txt'] =data_df['text'].map(len)

In [8]:
data_df['len_tag'] =data_df['tag'].map(len)

In [9]:
data_df[data_df['len_txt']!=data_df['len_tag']]

Unnamed: 0,text,tag,len_txt,len_tag


## hence from the operation, we figure out that there is not mismatch between tags and text tokens 

## Dataset Preparation

In [10]:
data_df.head()

Unnamed: 0,text,tag,len_txt,len_tag
0,"[६१, वर्षीय, पियरे, भिन्केन, नोभेम्बर, २९...","[<CD>, <JJ>, <NNP>, <NNP>, <NNP>, <CD>, <POP>,...",16,16
1,"[श्री, भिन्केन, डच, प्रकाशन, समूह, एल्सेभ...","[<NN>, <NNP>, <NNP>, <NN>, <NN>, <NNP>, <FB>, ...",11,11
2,"[कन्सोलिडेटिड, गोल्ड, फिल्ड्स, पीएलसी, का, ...","[<NNP>, <NN>, <NN>, <NNP>, <PKO>, <JJ>, <NN>, ...",25,25
3,"[एकताका, केन्ट, चुरोट, को, फिल्टर, बनाउन, ...","[<RBO>, <NNP>, <NN>, <PKO>, <NN>, <VBI>, <NN>,...",43,43
4,"[यस, सँग, को, छोटो, सम्पर्क, बाट, मात्र, प...","[<DUM>, <POP>, <PKO>, <JJM>, <NN>, <POP>, <RP>...",38,38


In [11]:
data_df['text'] = data_df['text'].apply(lambda x : ["".join(item.split(" ")) for item in x])

## Converting Unicode value into ASCII

In [12]:
def unicode_to_ascii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn')

In [13]:
data_df['ascii_text'] = data_df['text'].apply(lambda x : [unicode_to_ascii(item) for item in x])

In [14]:
data_df.drop(columns=['text','len_txt','len_tag'],inplace=True)

In [15]:
data_df = data_df[['ascii_text','tag']]

In [16]:
data_df.head()

Unnamed: 0,ascii_text,tag
0,"[६१, वरषीय, पियर, भिनकन, नोभमबर, २९, बाट, सलला...","[<CD>, <JJ>, <NNP>, <NNP>, <NNP>, <CD>, <POP>,..."
1,"[शरी, भिनकन, डच, परकाशन, समह, एलसभियर, एन.भी.,...","[<NN>, <NNP>, <NNP>, <NN>, <NN>, <NNP>, <FB>, ..."
2,"[कनसोलिडटिड, गोलड, फिलडस, पीएलसी, का, परव, सभा...","[<NNP>, <NN>, <NN>, <NNP>, <PKO>, <JJ>, <NN>, ..."
3,"[एकताका, कनट, चरोट, को, फिलटर, बनाउन, परयोग, भ...","[<RBO>, <NNP>, <NN>, <PKO>, <NN>, <VBI>, <NN>,..."
4,"[यस, सग, को, छोटो, समपरक, बाट, मातर, पनि, दशकौ...","[<DUM>, <POP>, <PKO>, <JJM>, <NN>, <POP>, <RP>..."


## word2vec and vec2word conversion

In [17]:
word_dictonary = list(set(sum(data_df['ascii_text'].tolist(),[])))

In [18]:
word_dictonary[:8]

['', 'परमपरा', 'ओगटन', 'रोव', 'परशासनिक', 'लखका', '३५', 'तयही']

In [19]:
tag_dictionary = list(set(sum(data_df['tag'].tolist(),[])))

In [20]:
tag_dictionary[:8]

['<ALPH>', '<NNP>', '<VBX>', '<DM>', '<UNW>', '<VBNE>', '<FB>', '<PLE>']

In [21]:
int2word = dict(enumerate(word_dictonary))

In [22]:
word2int = {int2word[idx]:idx for idx in int2word.keys() }

In [23]:
int2tag = dict(enumerate(tag_dictionary))

In [24]:
tag2int = {int2tag[idx]:idx for idx in int2tag.keys()}

# Train Test Split

In [25]:
SPLIT_RATIO = 0.2

In [26]:
split_num = round((1-SPLIT_RATIO)*data_df.shape[0])

In [27]:
data_df.isnull().sum()

ascii_text    0
tag           0
dtype: int64

In [28]:
train_dataset = data_df[:split_num]

In [29]:
valid_dataset = data_df[split_num:]

In [30]:
train_dataset.shape,valid_dataset.shape

((3434, 2), (858, 2))

# Model architecture

In [31]:
class POS_Tagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, target_size):
        super().__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, target_size)
        self.log_softmax = nn.LogSoftmax(dim=1)
        self.dropout = nn.Dropout(p=0.6,inplace=True)
        
        

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, _ = self.lstm(embeds.view(len(sentence), 1, -1))
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        drop_out_tag = self.dropout(tag_space)
        tag_scores = self.log_softmax(drop_out_tag)
        return tag_scores

In [32]:
INPUT_DIM = len(word_dictonary)
OUTPUT_DIM = len(tag_dictionary)

In [33]:
INPUT_DIM,OUTPUT_DIM

(11993, 39)

In [34]:
EMBEDDING_DIM = 300
HIDDEN_DIM = 150

In [35]:
model = POS_Tagger(EMBEDDING_DIM, HIDDEN_DIM, INPUT_DIM, OUTPUT_DIM)

In [36]:
model

POS_Tagger(
  (word_embeddings): Embedding(11993, 300)
  (lstm): LSTM(300, 150)
  (hidden2tag): Linear(in_features=150, out_features=39, bias=True)
  (log_softmax): LogSoftmax()
  (dropout): Dropout(p=0.6, inplace)
)

# Initailization

In [37]:
# Define hyperparameters
EPOCHS = 20
LR = 0.01

In [38]:
# Define Loss, Optimizer
criterion = nn.NLLLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LR)

# Training

In [39]:
def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)

In [40]:
train_dataset = list(train_dataset.itertuples(index=False))

In [41]:
valid_dataset = list(valid_dataset.itertuples(index=False))

In [None]:
epoch_loss_train=[]
epoch_loss_valid=[]
acc_score_list=[]
for epoch in range(EPOCHS):
    net_loss_train = 0
    train_len=0.001
    for sentence, tags in train_dataset:
        # Get our inputs ready for the network, that is, turn them into
        sent_int = prepare_sequence(sentence, word2int)
        tag_int = prepare_sequence(tags, tag2int)
        # Run our forward pass.
        try:
            tag_scores = model(sent_int)
            # Compute the loss, gradients, and update the parameters 
            optimizer.zero_grad()
            loss = criterion(tag_scores, tag_int)
            loss.backward()
            optimizer.step()
            net_loss_train+=loss.item()
            train_len+=1
            
        except:
            continue
    else:
        # cross validation
        with torch.no_grad():
            net_loss_valid=0
            valid_len=0.001
            acc_score=0
            for sentence,tags in valid_dataset:
                sent_int = prepare_sequence(sentence,word2int)
                tag_int = prepare_sequence(tags,tag2int)
                #Step 3. Run our forward pass.
                try:
                    tag_scores = model(sent_int)

                    # Step 4. Compute the loss 
                    net_loss_valid+=criterion(tag_scores, tag_int).item()
                    valid_len+=1
                    true_value = tag_int.tolist()
                    predicted_value=[]
                    for item in score:
                        predicted_value.append(torch.argmax(item))
                    predicted_value = torch.IntTensor(predicted_value).tolist()
                    acc_score+=acc_scaccuracy_score(true_val,predicted_value)
                    

                except:
                    continue
    epoch_loss_train.append(net_loss_train/train_len)
    epoch_loss_valid.append(net_loss_valid/valid_len)
    acc_score_list.append(acc_score/valid_len)
    torch.save(model.state_dict(),'../model/POS_Tagger_model_epoch'+str(epoch))
    if (epoch+1)%2 == 0:
        print('Epoch: {}/{}.............'.format(epoch, EPOCHS), end=' ')
        print("Training Loss: {:.2f}".format(net_loss_train/train_len),end=' ')
        print("Validation Loss: {:.2f}".format(net_loss_valid/valid_len),end=' ')
        print("Accuracy Score: {:.2f}".format(acc_score/valid_len),end=' ')

Epoch: 1/20............. Training Loss: 0.00 Validation Loss: 0.00
