In [None]:
# Mount Google Drive inside the Colab runtime; the dataset and checkpoint
# paths used later in this notebook all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import csv
import os
import random

import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim


class samsungNLPchallange():
    """Load the tagging dataset and convert tokens/labels to integer indices.

    Typical call order: ``prepareTrainData`` -> ``prepareLabels`` ->
    ``prepareWordBag`` -> ``mapping``; each step reads the state the previous
    one stored on the instance.
    """

    def __init__(self):
        # Directory whose regular files are all read as training data.
        self.file_path_train = '/content/drive/MyDrive/models/prometeo/train/'
        # Directory that contains ``test_public.csv``.
        self.file_path_test = '/content/drive/MyDrive/models/prometeo/'
        # Parsed training examples: one list of whitespace-split fields per line.
        self.train_x_y = []
        # Index-encoded training examples (one LongTensor per example).
        self.train_x_y_f = []

        # Sorted unique labels and sorted unique tokens seen in the training data.
        self.classes = []
        self.words = []

    @staticmethod
    def _first_index_lookup(items):
        """Return ``{item: position}`` keeping the FIRST position of duplicates.

        This mirrors ``list.index`` semantics while giving O(1) lookups.
        """
        lookup = {}
        for position, item in enumerate(items):
            lookup.setdefault(item, position)
        return lookup

    def prepareTestData(self):
        """Print how many whitespace-separated tokens ``test_public.csv`` holds.

        The first CSV row is treated as a header and skipped; the text is read
        from column 1 of every following row. Rows with fewer than two columns
        are ignored. Returns ``None``.
        """
        csv_path = os.path.join(self.file_path_test, 'test_public.csv')
        # ``with`` guarantees the handle is closed; newline='' is what the csv
        # module expects for files it parses.
        with open(csv_path, newline='') as f:
            csvreader = csv.reader(f)
            next(csvreader, None)  # skip the header row (if any)
            token_count = sum(
                len(row[1].split()) for row in csvreader if len(row) > 1
            )

        print(token_count)
        return None

    def prepareTrainData(self):
        """Parse every training file into a list of examples.

        Files are visited in sorted name order. Each non-blank line is split on
        tabs and every tab-separated field is split on whitespace, so one line
        becomes ``[field0_tokens, field1_tokens, ...]``. The other methods read
        field 0 as the sentence tokens and field 1 as the per-token labels.

        Directory entries that are not regular files (for example a
        ``.ipynb_checkpoints`` folder) and whitespace-only lines are skipped.

        Returns:
            The list of parsed examples, also stored in ``self.train_x_y``.
        """
        examples = []
        for name in sorted(os.listdir(self.file_path_train)):
            path = os.path.join(self.file_path_train, name)
            if not os.path.isfile(path):
                continue
            with open(path, 'r') as f:
                for line in f:
                    if not line.strip():
                        # A blank line has no token/label fields to parse.
                        continue
                    examples.append([field.split() for field in line.split('\t')])

        self.train_x_y = examples
        return examples

    def prepareLabels(self):
        """Collect the sorted unique labels (field 1) of the training examples.

        Returns:
            The sorted list of labels, also stored in ``self.classes``.
        """
        unique = sorted({label for example in self.train_x_y for label in example[1]})
        self.classes = unique
        return unique

    def prepareWordBag(self):
        """Collect the sorted unique tokens (field 0) of the training examples.

        Returns:
            The sorted list of tokens, also stored in ``self.words``.
        """
        unique = sorted({token for example in self.train_x_y for token in example[0]})
        self.words = unique
        return unique

    def mapping(self):
        """Encode every training example as a ``2 x n`` ``torch.LongTensor``.

        Row 0 holds the positions of the tokens in ``self.words`` and row 1 the
        positions of the labels in ``self.classes``.

        Returns:
            The list of tensors, also stored in ``self.train_x_y_f``.

        Raises:
            ValueError: if a token or label is missing from ``self.words`` /
                ``self.classes``. torch also rejects an example whose token
                and label counts differ, because the two rows must have the
                same length.
        """
        word_to_index = self._first_index_lookup(self.words)
        class_to_index = self._first_index_lookup(self.classes)

        encoded = []
        for example in self.train_x_y:
            try:
                w_index = [word_to_index[token] for token in example[0]]
                c_index = [class_to_index[label] for label in example[1]]
            except KeyError as missing:
                # Keep the exception type ``list.index`` used to raise.
                raise ValueError(f"{missing.args[0]!r} is not in list") from None
            encoded.append(torch.LongTensor([w_index, c_index]))

        self.train_x_y_f = encoded
        return encoded

    def mappingTest(self, test, unk_index=32):
        """Encode tokenised test sentences as 1-D ``torch.LongTensor`` objects.

        Args:
            test: iterable of sentences, each an iterable of token strings.
            unk_index: index substituted for tokens that are not in
                ``self.words``. The default (32) is the placeholder this code
                has always used; it is an arbitrary vocabulary position, so
                callers should check membership in ``self.words`` themselves
                before trusting a prediction for such a token.

        Returns:
            A list with one tensor of word indices per input sentence.
        """
        word_to_index = self._first_index_lookup(self.words)
        return [
            torch.LongTensor([word_to_index.get(token, unk_index) for token in sentence])
            for sentence in test
        ]


class LSTMTagger(nn.Module):
    """Single-layer LSTM sequence tagger that processes one sentence at a time.

    The recurrent state is kept on the instance as ``self.hidden`` and is fed
    into (and overwritten by) every ``forward`` call. Callers that want each
    sentence to start from a clean state must assign
    ``model.hidden = model.init_hidden()`` before calling the model.
    """

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        """
        Args:
            embedding_dim: size of each word-embedding vector.
            hidden_dim: size of the LSTM hidden and cell state.
            vocab_size: number of distinct word indices.
            tagset_size: number of output tags.
        """
        super().__init__()

        self.hidden_dim = hidden_dim
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
        self.hidden = self.init_hidden()

    def init_hidden(self):
        """Return a fresh all-zero ``(h_0, c_0)`` pair of shape ``(1, 1, hidden_dim)``.

        The tensors take the dtype and device of the model parameters, so the
        state stays compatible with the LSTM after ``model.double()`` or
        ``model.to(device)``.
        """
        reference = self.hidden2tag.weight
        return (torch.zeros(1, 1, self.hidden_dim,
                            dtype=reference.dtype, device=reference.device),
                torch.zeros(1, 1, self.hidden_dim,
                            dtype=reference.dtype, device=reference.device))

    def forward(self, sentence):
        """Score every token of ``sentence`` against every tag.

        Args:
            sentence: 1-D tensor of word indices.

        Returns:
            Tensor of shape ``(len(sentence), tagset_size)`` holding
            log-probabilities (log-softmax over the tag dimension).
        """
        embeds = self.word_embeddings(sentence)

        # The LSTM expects (seq_len, batch, features); the batch size is 1 here.
        # The returned state replaces self.hidden, so it carries over to the
        # next call unless the caller resets it.
        lstm_out, self.hidden = self.lstm(
            embeds.view(len(sentence), 1, -1), self.hidden)

        tag_outputs = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_outputs, dim=1)

        return tag_scores

if __name__ == "__main__":
    # Run the data-preparation pipeline. Order matters: every step reads the
    # state that the previous step stored on OBJ.
    OBJ = samsungNLPchallange()
    train = OBJ.prepareTrainData()                             # parsed (tokens, labels) examples
    classes, words = OBJ.prepareLabels(), OBJ.prepareWordBag() # sorted label list, sorted vocabulary
    final = OBJ.mapping()                                      # index-encoded examples (LongTensors)

In [None]:
# Peek at the first mapped training example: row 0 holds its word indices.
final[0][0]

tensor([ 798, 1145, 1453,  128,  359,  120, 1596,  359,  761,  233])

In [None]:
# Size of each word-embedding vector and of the LSTM hidden state.
EMBEDDING_DIM = 10
HIDDEN_DIM = 10

# Seed PyTorch before building the model so the random weight initialisation
# is the same after every kernel restart.
SEED = 0
torch.manual_seed(SEED)

# One embedding row per vocabulary word, one output score per label.
model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(words), len(classes))

# NLLLoss pairs with the log-softmax scores that LSTMTagger.forward returns.
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.03)

In [None]:
# Sanity check before training: tag the first training sentence with the
# untrained model. Row 0 of a mapped example holds its word indices.
inputs = final[0][0]

# Start from a zero hidden state and skip autograd bookkeeping; this is an
# inspection only, so no gradients are needed.
model.hidden = model.init_hidden()
with torch.no_grad():
    tag_scores = model(inputs)

# Index of the highest-scoring tag for every token.
predicted_tags = torch.max(tag_scores, 1).indices
print('Predicted tags: \n',predicted_tags)

Predicted tags: 
 tensor([14,  1, 24, 16, 16,  2, 16, 16, 16, 16])


In [None]:
# Number of full passes over the training set.
n_epochs = 3000

# Dedicated, seeded generator: the per-epoch ordering is reproducible and the
# global `random` state is left untouched.
SHUFFLE_SEED = 0
shuffle_rng = random.Random(SHUFFLE_SEED)

n_examples = len(final)
model.train()

for epoch in range(n_epochs):

    epoch_loss = 0.0
    # Draw a shuffled copy rather than shuffling `final` in place, so that
    # indexing `final` in other cells keeps referring to the same examples.
    epoch_examples = shuffle_rng.sample(final, n_examples)

    # Each mapped example unpacks into its word-index row and its tag-index row.
    for sentence, tags in epoch_examples:
        # zero the gradients accumulated by the previous step
        model.zero_grad()

        # reset the LSTM state so this sentence starts from zeros and is
        # detached from the previous sentence's graph
        model.hidden = model.init_hidden()

        # forward pass to get tag scores
        tag_scores = model(sentence)

        # compute the loss and its gradients
        loss = loss_function(tag_scores, tags)
        epoch_loss += loss.item()
        loss.backward()

        # update the model parameters
        optimizer.step()

    # report the average per-sentence loss every 20 epochs
    if epoch % 20 == 19:
        # max(..., 1) avoids a ZeroDivisionError when there is no training data
        print("Epoch: %d, loss: %1.5f" % (epoch+1, epoch_loss/max(n_examples, 1)))

Epoch: 20, loss: 0.44717
Epoch: 40, loss: 0.38031
Epoch: 60, loss: 0.32876
Epoch: 80, loss: 0.28921
Epoch: 100, loss: 0.26758
Epoch: 120, loss: 0.23081
Epoch: 140, loss: 0.21123
Epoch: 160, loss: 0.19382
Epoch: 180, loss: 0.19116
Epoch: 200, loss: 0.19560
Epoch: 220, loss: 0.16917
Epoch: 240, loss: 0.14597
Epoch: 260, loss: 0.14059
Epoch: 280, loss: 0.13064
Epoch: 300, loss: 0.12438
Epoch: 320, loss: 0.12445
Epoch: 340, loss: 0.11346
Epoch: 360, loss: 0.34973
Epoch: 380, loss: 0.15002
Epoch: 400, loss: 0.13043
Epoch: 420, loss: 0.12818
Epoch: 440, loss: 0.11172
Epoch: 460, loss: 0.10402
Epoch: 480, loss: 0.10085
Epoch: 500, loss: 0.09632
Epoch: 520, loss: 0.09378
Epoch: 540, loss: 0.09005
Epoch: 560, loss: 0.11133
Epoch: 580, loss: 0.08533
Epoch: 600, loss: 0.08199
Epoch: 620, loss: 0.14384
Epoch: 640, loss: 0.08453
Epoch: 660, loss: 0.08053
Epoch: 680, loss: 0.09985
Epoch: 700, loss: 0.07510
Epoch: 720, loss: 0.07280
Epoch: 740, loss: 0.09865
Epoch: 760, loss: 0.07804
Epoch: 780, loss

In [None]:
# Tag-index row (row 1) of the mapped example currently stored at position 7 of `final`.
final[7][1]

tensor([ 0, 17, 32, 32, 32, 32])

In [None]:
# Persist only the model's state_dict (not the module object itself) to Drive.
torch.save(model.state_dict(), '/content/drive/MyDrive/models/prometeo/model_LSTM_03.pt')

In [None]:
# Rebuild a tagger with the same dimensions, then restore the saved weights.
# The architecture arguments must match the ones used when the checkpoint was
# written, otherwise the state-dict keys/shapes will not line up.
model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(words), len(classes))
saved_weights = torch.load('/content/drive/MyDrive/models/prometeo/model_LSTM_03.pt')
model.load_state_dict(saved_weights)

<All keys matched successfully>

In [None]:
# Read the public test set. The first row is the header; column 1 of every
# following row holds the utterance text.
# `with` closes the handle, and newline='' is what the csv module expects.
with open('/content/drive/MyDrive/models/prometeo/test_public.csv', newline='') as file:
    csvreader = csv.reader(file)
    header = next(csvreader)
    rows = list(csvreader)

# Whitespace-tokenise each utterance.
input_text = [row[1].split() for row in rows]

# Word-index tensors, one per utterance (unknown words get the fallback index).
test_final = OBJ.mappingTest(input_text)



In [None]:
# Show the tokenised test utterances (one list of tokens per CSV row).
input_text

[['sakshi', 'ki', 'landline', 'mein', 'call', 'karo'],
 ['call', 'karo', 'chacha', 'ko', 'mobile', 'pe'],
 ['dadaji', 'ke', 'mobile', 'pe', 'call', 'karo'],
 ['vinuth', 'ko', 'redial', 'karo'],
 ['mere',
  'sasurji',
  'ke',
  'ghar',
  'wale',
  'number',
  'par',
  'abhi',
  'ke',
  'abhi',
  'phone',
  'ghumao'],
 ['gudiya', 'ki', 'mobile', 'pe', 'dial', 'karo'],
 ['meri',
  'baat',
  'karao',
  'abhijeet',
  'se',
  'sim',
  '2',
  'card',
  'use',
  'karna'],
 ['mausa', 'ji', 'ko', 'jaldi', 'se', 'ring', 'back', 'karien'],
 ['mujhe', 'sim', '2', 'se', 'mom', 'ko', 'call', 'karna', 'hain'],
 ['mujhe', 'abhi', 'nani', 'se', 'baat', 'karni', 'hain'],
 ['dadi', 'ke', 'landline', 'number', 'par', 'phone', 'karo'],
 ['phuphi', 'ji', 'ko', 'call', 'karo'],
 ['please', 'varun', 'ko', 'phone', 'lagiyen'],
 ['rasika', 'ke', 'office', 'ke', 'number', 'par', 'call', 'please'],
 ['mujhe', 'angad', 'se', 'baat', 'karni', 'hai'],
 ['baat', 'karao', 'zara', 'sabina', 'se'],
 ['tarun', 'ki', 'durv

In [None]:
# Recompute the sorted label list and vocabulary from the training data held
# by OBJ; the prediction cell uses them to turn tag indices into label names
# and to detect out-of-vocabulary tokens.
classes = OBJ.prepareLabels()
words   = OBJ.prepareWordBag()

In [None]:
# Tag every test utterance and collect the predicted label names.
known_words = set(words)  # O(1) membership test instead of scanning the list per token
result = []

model.eval()
with torch.no_grad():  # inference only: do not build or retain autograd graphs
    for tokens, sentence_ids in zip(input_text, test_final):
        print(tokens)

        if len(tokens) == 0:
            # forward() reshapes with a -1 dimension, which PyTorch rejects
            # for a zero-length sequence, so record an empty prediction.
            cl = []
        else:
            # Start every utterance from a zero LSTM state; otherwise the
            # state left by the previous utterance would leak into this one.
            model.hidden = model.init_hidden()
            tag_scores = model(sentence_ids)

            # index of the highest-scoring tag for every token
            predicted_tags = torch.max(tag_scores, 1).indices

            # Out-of-vocabulary tokens were encoded with a placeholder index,
            # so their prediction is not meaningful: emit '' for them.
            cl = [
                classes[tag_id] if token in known_words else ''
                for token, tag_id in zip(tokens, predicted_tags.tolist())
            ]

        print(cl)
        result.append(cl)

['sakshi', 'ki', 'landline', 'mein', 'call', 'karo']
['B_Contact_Name', 'o', 'B_Number_Type', 'I_Number_Type', 'o', 'o']
['call', 'karo', 'chacha', 'ko', 'mobile', 'pe']
['o', 'o', 'B_Catchall_Phrase', 'o', 'B_Number_Type', 'o']
['dadaji', 'ke', 'mobile', 'pe', 'call', 'karo']
['', 'o', 'B_Number_Type', 'o', 'o', 'o']
['vinuth', 'ko', 'redial', 'karo']
['B_Contact_Name', 'o', 'B_App_Name', 'o']
['mere', 'sasurji', 'ke', 'ghar', 'wale', 'number', 'par', 'abhi', 'ke', 'abhi', 'phone', 'ghumao']
['o', 'I_Relation', 'o', 'B_Location', 'I_Location', 'I_Location', 'o', 'o', 'o', 'o', 'o', 'o']
['gudiya', 'ki', 'mobile', 'pe', 'dial', 'karo']
['o', 'o', 'B_Number_Type', 'I_Number_Type', 'o', 'o']
['meri', 'baat', 'karao', 'abhijeet', 'se', 'sim', '2', 'card', 'use', 'karna']
['o', 'o', 'o', 'B_Contact_Name', 'o', 'I_Component', 'I_Component', 'I_Component', 'o', 'o']
['mausa', 'ji', 'ko', 'jaldi', 'se', 'ring', 'back', 'karien']
['B_Catchall_Phrase', 'I_Catchall_Phrase', 'o', 'o', 'o', 'o', '

In [None]:
# Predicted label names, one inner list per test utterance
# ('' marks tokens that are not in the training vocabulary).
result

[['B_Contact_Name', 'o', 'B_Number_Type', 'I_Number_Type', 'o', 'o'],
 ['o', 'o', 'B_Catchall_Phrase', 'o', 'B_Number_Type', 'o'],
 ['', 'o', 'B_Number_Type', 'o', 'o', 'o'],
 ['B_Contact_Name', 'o', 'B_App_Name', 'o'],
 ['o',
  'I_Relation',
  'o',
  'B_Location',
  'I_Location',
  'I_Location',
  'o',
  'o',
  'o',
  'o',
  'o',
  'o'],
 ['o', 'o', 'B_Number_Type', 'I_Number_Type', 'o', 'o'],
 ['o',
  'o',
  'o',
  'B_Contact_Name',
  'o',
  'I_Component',
  'I_Component',
  'I_Component',
  'o',
  'o'],
 ['B_Catchall_Phrase', 'I_Catchall_Phrase', 'o', 'o', 'o', 'o', 'o', 'o'],
 ['o',
  'I_Component',
  'I_Component',
  'o',
  'B_Contact_Name',
  'o',
  'o',
  'o',
  'o'],
 ['o', 'o', 'o', 'o', 'o', 'o', 'o'],
 ['I_Date', 'o', 'B_Number_Type', 'I_Location', 'o', 'o', 'o'],
 ['B_Relation', 'B_Contact_Name', 'o', 'o', 'o'],
 ['o', 'B_Contact_Name', 'o', 'o', 'o'],
 ['', 'o', 'o', 'o', 'o', 'o', 'o', 'o'],
 ['o', 'B_Contact_Name', 'o', 'o', 'o', 'o'],
 ['o', 'o', 'o', '', 'o'],
 ['I_App

In [None]:
# Write the submission file: one row per token, holding a running token id
# (counted across all utterances, starting at 0) and its predicted label.
# `csv` is already imported in the notebook's import cell.
with open('/content/drive/MyDrive/models/prometeo/test_public_res.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['id', 'Predicted'])
    # Flatten the per-utterance lists into a single stream of labels.
    all_tags = (tag for sentence_tags in result for tag in sentence_tags)
    writer.writerows([token_id, tag] for token_id, tag in enumerate(all_tags))