## import libraries

In [1]:
import torch
import pickle
import unicodedata
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from sklearn.model_selection import train_test_split

## import dataset

In [2]:
path = "../data/NepaliTaggedCorpus/new_submissions_parallel_corpus_project_Nepal/"

In [3]:
dbfile = open(path+'/'+'processed_tag', 'rb')

In [4]:
data_df = pickle.load(dbfile)

In [5]:
data_df.head()

Unnamed: 0,text,tag
0,"[६१, वर्षीय, पियरे, भिन्केन, नोभेम्बर, २९...","[<CD>, <JJ>, <NNP>, <NNP>, <NNP>, <CD>, <POP>,..."
1,"[श्री, भिन्केन, डच, प्रकाशन, समूह, एल्सेभ...","[<NN>, <NNP>, <NNP>, <NN>, <NN>, <NNP>, <FB>, ..."
2,"[कन्सोलिडेटिड, गोल्ड, फिल्ड्स, पीएलसी, का, ...","[<NNP>, <NN>, <NN>, <NNP>, <PKO>, <JJ>, <NN>, ..."
3,"[एकताका, केन्ट, चुरोट, को, फिल्टर, बनाउन, ...","[<RBO>, <NNP>, <NN>, <PKO>, <NN>, <VBI>, <NN>,..."
4,"[यस, सँग, को, छोटो, सम्पर्क, बाट, मात्र, प...","[<DUM>, <POP>, <PKO>, <JJM>, <NN>, <POP>, <RP>..."


In [6]:
data_df['len_txt'] =data_df['text'].map(len)

In [7]:
data_df['len_tag'] =data_df['tag'].map(len)

In [8]:
data_df[data_df['len_txt']!=data_df['len_tag']]

Unnamed: 0,text,tag,len_txt,len_tag


## hence from the operation, we figure out that there is not mismatch between tags and text tokens 

## Dataset Preparation

In [9]:
data_df.head()

Unnamed: 0,text,tag,len_txt,len_tag
0,"[६१, वर्षीय, पियरे, भिन्केन, नोभेम्बर, २९...","[<CD>, <JJ>, <NNP>, <NNP>, <NNP>, <CD>, <POP>,...",16,16
1,"[श्री, भिन्केन, डच, प्रकाशन, समूह, एल्सेभ...","[<NN>, <NNP>, <NNP>, <NN>, <NN>, <NNP>, <FB>, ...",11,11
2,"[कन्सोलिडेटिड, गोल्ड, फिल्ड्स, पीएलसी, का, ...","[<NNP>, <NN>, <NN>, <NNP>, <PKO>, <JJ>, <NN>, ...",25,25
3,"[एकताका, केन्ट, चुरोट, को, फिल्टर, बनाउन, ...","[<RBO>, <NNP>, <NN>, <PKO>, <NN>, <VBI>, <NN>,...",43,43
4,"[यस, सँग, को, छोटो, सम्पर्क, बाट, मात्र, प...","[<DUM>, <POP>, <PKO>, <JJM>, <NN>, <POP>, <RP>...",38,38


In [10]:
data_df['text'] = data_df['text'].apply(lambda x : ["".join(item.split(" ")) for item in x])

## Converting Unicode value into ASCII

In [11]:
def unicode_to_ascii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn')

In [12]:
data_df['ascii_text'] = data_df['text'].apply(lambda x : [unicode_to_ascii(item) for item in x])

In [13]:
data_df.drop(columns=['text','len_txt','len_tag'],inplace=True)

In [14]:
data_df = data_df[['ascii_text','tag']]

In [15]:
data_df.head()

Unnamed: 0,ascii_text,tag
0,"[६१, वरषीय, पियर, भिनकन, नोभमबर, २९, बाट, सलला...","[<CD>, <JJ>, <NNP>, <NNP>, <NNP>, <CD>, <POP>,..."
1,"[शरी, भिनकन, डच, परकाशन, समह, एलसभियर, एन.भी.,...","[<NN>, <NNP>, <NNP>, <NN>, <NN>, <NNP>, <FB>, ..."
2,"[कनसोलिडटिड, गोलड, फिलडस, पीएलसी, का, परव, सभा...","[<NNP>, <NN>, <NN>, <NNP>, <PKO>, <JJ>, <NN>, ..."
3,"[एकताका, कनट, चरोट, को, फिलटर, बनाउन, परयोग, भ...","[<RBO>, <NNP>, <NN>, <PKO>, <NN>, <VBI>, <NN>,..."
4,"[यस, सग, को, छोटो, समपरक, बाट, मातर, पनि, दशकौ...","[<DUM>, <POP>, <PKO>, <JJM>, <NN>, <POP>, <RP>..."


# Padding Sequence for batches

In [16]:
# maximum_len = data_df['ascii_text'].map(len).max()

In [17]:
# def padding_sequence(token_list,max_len=maximum_len):
#     len_token = len(token_list)
#     for  idx in range(len_token,maximum_len):
#         token_list.append(' ')
#     return token_list

In [18]:
# data_df['ascii_text'] = data_df['ascii_text'].map(padding_sequence)

In [19]:
# data_df['tag'] = data_df['tag'].map(padding_sequence)

## word2vec and vec2word conversion

In [20]:
word_dictonary = list(set(sum(data_df['ascii_text'].tolist(),[])))

In [21]:
word_dictonary[:8]

['', 'कोण', 'सयय', 'पपवानयगिनी', 'पकडिन', 'माननहनन', 'टाइमस', 'अमल']

In [22]:
tag_dictionary = list(set(sum(data_df['tag'].tolist(),[])))

In [23]:
tag_dictionary[:8]

['<CD>', '<OD>', '<CC>', '<VBKO>', '<ALPH>', '<RBO>', '<HRU>', '<JJD>']

In [24]:
int2word = dict(enumerate(word_dictonary))

In [25]:
word2int = {int2word[idx]:idx for idx in int2word.keys() }

In [26]:
int2tag = dict(enumerate(tag_dictionary))

In [27]:
tag2int = {int2tag[idx]:idx for idx in int2tag.keys()}

# Train Test Split

In [28]:
SPLIT_RATIO = 0.2

In [29]:
split_num = round((1-SPLIT_RATIO)*data_df.shape[0])

In [30]:
train_dataset = data_df[:split_num]

In [31]:
valid_dataset = data_df[split_num:]

In [32]:
train_dataset.shape,valid_dataset.shape

((3434, 2), (858, 2))

# Model architecture

# Evaluation

In [33]:
def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)

In [34]:
train_dataset = list(train_dataset.itertuples(index=False))

In [35]:
valid_dataset = list(valid_dataset.itertuples(index=False))

#### loading model

In [36]:
class POS_Tagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, target_size):
        super().__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, target_size)

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, _ = self.lstm(embeds.view(len(sentence), 1, -1))
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

In [37]:
INPUT_DIM = len(word_dictonary)
OUTPUT_DIM = len(tag_dictionary)

In [38]:
EMBEDDING_DIM = 300
HIDDEN_DIM = 150

In [39]:
model = POS_Tagger(EMBEDDING_DIM, HIDDEN_DIM, INPUT_DIM, OUTPUT_DIM)

In [40]:
model

POS_Tagger(
  (word_embeddings): Embedding(11993, 300)
  (lstm): LSTM(300, 150)
  (hidden2tag): Linear(in_features=150, out_features=39, bias=True)
)

In [41]:
model.load_state_dict(torch.load('../model/POS_Tagger_model_epoch49'), strict=False)

IncompatibleKeys(missing_keys=[], unexpected_keys=[])

In [42]:
# # ps = torch.exp(score)
# top_p, top_class = score.topk(1, dim=1)

In [43]:
dataset = valid_dataset[0]

In [44]:
with torch.no_grad():
    print(valid_dataset[0][0])
    print('True value')
    print('---------------------------------------')
    print(valid_dataset[0][1])
    score = model(prepare_sequence(valid_dataset[0][0], word2int))
    print('----------------------------------------')
    value=[]
    for item in score:
        value.append(torch.argmax(item))
    value = torch.IntTensor(value).tolist()
    tag=[]
    for item in value:
            tag.append(int2tag[item])
    print('predicted_value')        
    print(tag)

['गत', 'वरष', 'को', 'आकडा', 'मा', 'पनरसरचना', 'र', 'असामनय', 'शीरषक', 'मा', '१२', 'मिलियन', 'डलर', 'को', 'एक', 'पटक', 'मातर', 'हन', 'घाटा', 'समाविषट', 'छ', '।']
True value
---------------------------------------
['<JJ>', '<NN>', '<PKO>', '<NN>', '<POP>', '<NN>', '<CC>', '<JJ>', '<NN>', '<POP>', '<CD>', '<CD>', '<NNP>', '<PKO>', '<CD>', '<RBO>', '<RP>', '<VBNE>', '<NN>', '<JJ>', '<VBX>', '<YF>']
----------------------------------------
predicted_value
['<ALPH>', '<PPR>', '<CS>', '<FB>', '<PPR>', '<CD>', '<PPR>', '<CD>', '<CD>', '<PPR>', '<CD>', '<CS>', '<PPR>', '<CS>', '<ALPH>', '<CD>', '<CD>', '<CD>', '<CD>', '<CS>', '<JJD>', '<CS>']


In [45]:
token2int = prepare_sequence(dataset[0], word2int)
tagging2int = prepare_sequence(dataset[1],tag2int)
print(tagging2int)
score = model(token2int)
value=[]
for item in score:
    value.append(torch.argmax(item))
value = torch.IntTensor(value).tolist()
tag=[]
for item in value:
        tag.append(int2tag[item])
print("=========================================")
print('predicted_value')        
print(value)

tensor([23, 22, 26, 22, 10, 22,  2, 23, 22, 10,  0,  0, 35, 26,  0,  5, 29, 32,
        22, 23, 37, 21])
predicted_value
[4, 14, 16, 28, 14, 0, 14, 0, 0, 14, 0, 16, 14, 16, 4, 0, 0, 0, 0, 16, 7, 16]
