## import libraries

In [1]:
import torch
import pickle
import unicodedata
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from sklearn.model_selection import train_test_split

## import dataset

In [2]:
path = "../data/NepaliTaggedCorpus/new_submissions_parallel_corpus_project_Nepal/"

In [3]:
dbfile = open(path+'/'+'processed_tag', 'rb')

In [4]:
data_df = pickle.load(dbfile)

In [5]:
data_df.head()

Unnamed: 0,text,tag
0,"[६१, वर्षीय, पियरे, भिन्केन, नोभेम्बर, २९...","[<CD>, <JJ>, <NNP>, <NNP>, <NNP>, <CD>, <POP>,..."
1,"[श्री, भिन्केन, डच, प्रकाशन, समूह, एल्सेभ...","[<NN>, <NNP>, <NNP>, <NN>, <NN>, <NNP>, <FB>, ..."
2,"[कन्सोलिडेटिड, गोल्ड, फिल्ड्स, पीएलसी, का, ...","[<NNP>, <NN>, <NN>, <NNP>, <PKO>, <JJ>, <NN>, ..."
3,"[एकताका, केन्ट, चुरोट, को, फिल्टर, बनाउन, ...","[<RBO>, <NNP>, <NN>, <PKO>, <NN>, <VBI>, <NN>,..."
4,"[यस, सँग, को, छोटो, सम्पर्क, बाट, मात्र, प...","[<DUM>, <POP>, <PKO>, <JJM>, <NN>, <POP>, <RP>..."


In [6]:
data_df['len_txt'] =data_df['text'].map(len)

In [7]:
data_df['len_tag'] =data_df['tag'].map(len)

In [8]:
data_df[data_df['len_txt']!=data_df['len_tag']]

Unnamed: 0,text,tag,len_txt,len_tag


## hence from the operation, we figure out that there is not mismatch between tags and text tokens 

## Dataset Preparation

In [9]:
data_df.head()

Unnamed: 0,text,tag,len_txt,len_tag
0,"[६१, वर्षीय, पियरे, भिन्केन, नोभेम्बर, २९...","[<CD>, <JJ>, <NNP>, <NNP>, <NNP>, <CD>, <POP>,...",16,16
1,"[श्री, भिन्केन, डच, प्रकाशन, समूह, एल्सेभ...","[<NN>, <NNP>, <NNP>, <NN>, <NN>, <NNP>, <FB>, ...",11,11
2,"[कन्सोलिडेटिड, गोल्ड, फिल्ड्स, पीएलसी, का, ...","[<NNP>, <NN>, <NN>, <NNP>, <PKO>, <JJ>, <NN>, ...",25,25
3,"[एकताका, केन्ट, चुरोट, को, फिल्टर, बनाउन, ...","[<RBO>, <NNP>, <NN>, <PKO>, <NN>, <VBI>, <NN>,...",43,43
4,"[यस, सँग, को, छोटो, सम्पर्क, बाट, मात्र, प...","[<DUM>, <POP>, <PKO>, <JJM>, <NN>, <POP>, <RP>...",38,38


In [10]:
data_df['text'] = data_df['text'].apply(lambda x : ["".join(item.split(" ")) for item in x])

## Converting Unicode value into ASCII

In [11]:
def unicode_to_ascii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn')

In [12]:
data_df['ascii_text'] = data_df['text'].apply(lambda x : [unicode_to_ascii(item) for item in x])

In [13]:
data_df.drop(columns=['text','len_txt','len_tag'],inplace=True)

In [14]:
data_df = data_df[['ascii_text','tag']]

In [15]:
data_df.head()

Unnamed: 0,ascii_text,tag
0,"[६१, वरषीय, पियर, भिनकन, नोभमबर, २९, बाट, सलला...","[<CD>, <JJ>, <NNP>, <NNP>, <NNP>, <CD>, <POP>,..."
1,"[शरी, भिनकन, डच, परकाशन, समह, एलसभियर, एन.भी.,...","[<NN>, <NNP>, <NNP>, <NN>, <NN>, <NNP>, <FB>, ..."
2,"[कनसोलिडटिड, गोलड, फिलडस, पीएलसी, का, परव, सभा...","[<NNP>, <NN>, <NN>, <NNP>, <PKO>, <JJ>, <NN>, ..."
3,"[एकताका, कनट, चरोट, को, फिलटर, बनाउन, परयोग, भ...","[<RBO>, <NNP>, <NN>, <PKO>, <NN>, <VBI>, <NN>,..."
4,"[यस, सग, को, छोटो, समपरक, बाट, मातर, पनि, दशकौ...","[<DUM>, <POP>, <PKO>, <JJM>, <NN>, <POP>, <RP>..."


# Padding Sequence for batches

## word2vec and vec2word conversion

In [16]:
word_dictonary = list(set(sum(data_df['ascii_text'].tolist(),[])))

In [17]:
word_dictonary[:8]

['',
 'अपचछदन',
 'कोजनरसन-पलानट',
 'असवीकार',
 'पयालइसटाइनी',
 'निषकासित',
 'पहिचान',
 'एभन']

In [18]:
tag_dictionary = list(set(sum(data_df['tag'].tolist(),[])))

In [19]:
tag_dictionary[:8]

['<RP>', '<DM>', '<CS>', '<FB>', '<CC>', '<QW>', '<VBF>', '<JJ>']

In [20]:
int2word = dict(enumerate(word_dictonary))

In [21]:
word2int = {int2word[idx]:idx for idx in int2word.keys() }

In [22]:
int2tag = dict(enumerate(tag_dictionary))

In [23]:
tag2int = {int2tag[idx]:idx for idx in int2tag.keys()}

# Train Test Split

In [24]:
SPLIT_RATIO = 0.2

In [25]:
split_num = round((1-SPLIT_RATIO)*data_df.shape[0])

In [26]:
train_dataset = data_df[:split_num]

In [27]:
valid_dataset = data_df[split_num:]

In [28]:
train_dataset.shape,valid_dataset.shape

((3434, 2), (858, 2))

# Model architecture

In [77]:
class POS_Tagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, target_size):
        super().__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, target_size)
        self.log_softmax = nn.LogSoftmax(dim=1)
        
        

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, _ = self.lstm(embeds.view(len(sentence), 1, -1))
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = self.log_softmax(tag_space)
        return tag_scores

In [30]:
INPUT_DIM = len(word_dictonary)
OUTPUT_DIM = len(tag_dictionary)

In [31]:
INPUT_DIM,OUTPUT_DIM

(11993, 39)

In [32]:
EMBEDDING_DIM = 300
HIDDEN_DIM = 150

In [78]:
sample_model = POS_Tagger(EMBEDDING_DIM, HIDDEN_DIM, INPUT_DIM, OUTPUT_DIM)

In [79]:
sample_model

POS_Tagger(
  (word_embeddings): Embedding(11993, 300)
  (lstm): LSTM(300, 150)
  (hidden2tag): Linear(in_features=150, out_features=39, bias=True)
  (log_softmax): LogSoftmax()
)

# Initailization

In [80]:
# Define Loss, Optimizer
criterion = nn.NLLLoss()
optimizer = torch.optim.Adam(sample_model.parameters(), lr=0.01)

# Training

In [36]:
def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)

In [37]:
train_dataset = list(train_dataset.itertuples(index=False))

In [38]:
valid_dataset = list(valid_dataset.itertuples(index=False))

In [39]:
sample = train_dataset[:100]

In [40]:
valid_sample = valid_dataset[:10]

In [41]:
optimizer.zero_grad()

In [47]:
sent_int = prepare_sequence(sample[0][0], word2int)
tag_int = prepare_sequence(sample[0][1], tag2int)

In [82]:
score = sample_model(sent_int)

In [86]:
optimizer.zero_grad()

In [87]:
loss = criterion(score,tag_int)

In [88]:
loss.backward()

In [89]:
optimizer.step()

In [90]:
loss.item()

3.6500182151794434

In [97]:
with torch.no_grad():
    log_ps = sample_model(sent_int)
    test_loss = criterion(log_ps, tag_int).item()
    ps = torch.exp(log_ps)
    top_p, top_class = ps.topk(1, dim=1)

In [102]:
tag_int

tensor([38,  7, 11, 11, 11, 38, 20, 19, 24, 19, 20, 19, 19, 20,  8, 18])

In [101]:
top_class

tensor([[38],
        [ 7],
        [11],
        [11],
        [11],
        [38],
        [20],
        [19],
        [24],
        [19],
        [20],
        [19],
        [19],
        [20],
        [ 8],
        [18]])

In [70]:
epoch_loss_train=[]
epoch_loss_valid=[]
for epoch in range(EPOCHS):
    net_loss_train = 0
    for sentence, tags in sample:
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        # Step 2. Get our inputs ready for the network, that is, turn them into
        sent_int = prepare_sequence(sentence, word2int)
        tag_int = prepare_sequence(tags, tag2int)
        

        #Step 3. Run our forward pass.
        try:
            tag_scores = model(sent_int)
            break
            # Step 4. Compute the loss, gradients, and update the parameters 
            optimizer.zero_grad()
            loss = criterion(tag_scores, tag_int)
            loss.backward()
            optimizer.step()
            net_loss_train+=loss.item()
            
        except:
            continue
    # cross validation
    with torch.no_grad():
        net_loss_valid=0
        accuracy=0
        for sentence,tags in valid_sample:
            sent_int = prepare_sequence(sentence,word2int)
            tag_int = prepare_sequence(tags,tag2int)
            #Step 3. Run our forward pass.
            try:
                tag_scores = model(sent_int)

                # Step 4. Compute the loss 
                net_loss_valid+=criterion(tag_scores, tag_int)
            except:
                continue
                
    epoch_loss_train.append(net_loss_train/len(train_dataset))
    epoch_loss_valid.append(net_loss_valid/len(valid_dataset))
#     torch.save(model.state_dict(),'../model/POS_Tagger_model_epoch'+str(epoch))
    if epoch%1 == 0:
        print('Epoch: {}/{}.............'.format(epoch+1, EPOCHS), end=' ')
        print("Training Loss: {:.2f}".format(net_loss_train/len(train_dataset)),end=' ')
        print("Validation Loss: {:.2f}".format(net_loss_valid/len(valid_dataset)))

NameError: name 'EPOCHS' is not defined

In [None]:
# epoch_loss_train=[]
# epoch_loss_valid=[]
# for epoch in range(EPOCHS):
#     net_loss_train = 0
#     for sentence, tags in train_dataset:
#         # Step 1. Remember that Pytorch accumulates gradients.
#         # We need to clear them out before each instance
#         model.zero_grad()
#         # Step 2. Get our inputs ready for the network, that is, turn them into
#         sent_int = prepare_sequence(sentence, word2int)
#         tag_int = prepare_sequence(tags, tag2int)
        

#         #Step 3. Run our forward pass.
#         try:
#             tag_scores = model(sent_int)

#             # Step 4. Compute the loss, gradients, and update the parameters 
#             loss = criterion(tag_scores, tag_int)
#             loss.backward()
#             optimizer.step()
#             net_loss_train+=loss.item()
            
#         except:
#             continue
#     # cross validation
#     with torch.no_grad():
#         net_loss_valid=0
#         accuracy=0
#         for sentence,tags in valid_dataset:
#             sent_int = prepare_sequence(sentence,word2int)
#             tag_int = prepare_sequence(tags,tag2int)
#             #Step 3. Run our forward pass.
#             try:
#                 tag_scores = model(sent_int)

#                 # Step 4. Compute the loss 
#                 net_loss_valid+=criterion(tag_scores, tag_int)
#             except:
#                 continue
                
#     epoch_loss_train.append(net_loss_train/len(train_dataset))
#     epoch_loss_valid.append(net_loss_valid/len(valid_dataset))
#     torch.save(model.state_dict(),'../model/POS_Tagger_model_epoch'+str(epoch))
#     if epoch%5 == 0:
#         print('Epoch: {}/{}.............'.format(epoch+1, EPOCHS), end=' ')
#         print("Training Loss: {:.2f}".format(net_loss_train/len(train_dataset)),end=' ')
#         print("Validation Loss: {:.2f}".format(net_loss_valid/len(valid_dataset)))

In [None]:
epoch_loss_valid = torch.FloatTensor(epoch_loss_valid).tolist()

In [None]:
import matplotlib.pyplot as plt

In [None]:
plt.plot(epoch_loss_train)
plt.plot(epoch_loss_valid)

In [None]:
valid_dataset[0][1]

In [None]:
with torch.no_grad():
    print(valid_dataset[0][0])
    print('True value')
    print('---------------------------------------')
    print(valid_dataset[0][1])
    score = model(prepare_sequence(valid_dataset[0][0], word2int))
    print('----------------------------------------')
    value=[]
    for item in score:
        value.append(torch.argmax(item))
    value = torch.IntTensor(value).tolist()
    tag=[]
    for item in value:
            tag.append(int2tag[item])
    print('predicted_value')        
    print(tag)

In [None]:
with torch.no_grad():
    sent = ['म','खाना','घर']
    aasa = prepare_sequence(sent, word2int)
    
    score = model(aasa)
    print('----------------------------------------')
    value=[]
    for item in score:
        value.append(torch.argmax(item))
    value = torch.IntTensor(value).tolist()
    tag=[]
    for item in value:
            tag.append(int2tag[item])
    print('predicted_value')        
    print(tag)

In [None]:
score