In [219]:
import pickle
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import numpy as np
import torchtext
import bcolz
import unicodedata
import torch
import torch.nn as nn
import torch.nn.functional as F
import time
import torch.optim as optim
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np
import random

# Preprocessing the text data

In [5]:
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )


In [6]:
def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = s.replace("'","")
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s

In [259]:
def preprocess(df):
    nrows = len(df)
    real_preprocess = []
    df['Content_Parsed_1'] = df['transcription']
    for row in range(0, nrows):

        # Create an empty list containing preprocessed words
        real_preprocess = []

        # Save the text and its words into an object
        text = df.loc[row]['transcription']
        text = normalizeString(text)


        df.loc[row]['Content_Parsed_1'] = text

    df['action'] = df['action'].str.lower()
    df['object'] = df['object'].str.lower()
    df['location'] = df['location'].str.lower()

In [89]:
nltk.download('wordnet')


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [271]:
def lemmatize(df):
    wordnet_lemmatizer = WordNetLemmatizer()
    # Lemmatizing the content
    nrows = len(df)
    lemmatized_text_list = []
    for row in range(0, nrows):

        # Create an empty list containing lemmatized words
        lemmatized_list = []

        # Save the text and its words into an object
        text = df.loc[row]['Content_Parsed_1']
        text_words = text.split(" ")

        # Iterate through every word to lemmatize
        for word in text_words:
            lemmatized_list.append(wordnet_lemmatizer.lemmatize(word, pos="v"))

        # Join the list
        lemmatized_text = " ".join(lemmatized_list)

        # Append to the list containing the texts
        lemmatized_text_list.append(lemmatized_text)
    df['Content_Parsed_2'] = lemmatized_text_list


In [274]:
path_df = "E:/saarthi/task_data/train_data.csv"
with open(path_df, 'rb') as data:
    df = pd.read_csv(data)

In [275]:
path_df_val = "E:/saarthi/task_data/valid_data.csv"
with open(path_df, 'rb') as data:
    df_val = pd.read_csv(data)

In [276]:
preprocess(df_val)
lemmatize(df_val)

In [277]:
preprocess(df)
lemmatize(df)

# Getting Glove Word embeddings 

In [96]:
glove_path = "E:"

In [98]:
vectors = bcolz.open(f'{glove_path}/6B.50.dat')[:]
words = pickle.load(open(f'{glove_path}/6B.50_words.pkl', 'rb'))
word2idx = pickle.load(open(f'{glove_path}/6B.50_idx.pkl', 'rb'))
glove = {w: vectors[word2idx[w]] for w in words}

In [278]:
target_vocab = []
nrows = len(df)
for row in range(0, nrows):
    text = df.loc[row]['Content_Parsed_2']
    text_words = text.split(" ")
    for word in text_words:
        if word not in target_vocab:
            target_vocab.append(word)

In [279]:
target_vocab = []
nrows = len(df_val)
for row in range(0, nrows):
    text = df.loc[row]['Content_Parsed_2']
    text_words = text.split(" ")
    for word in text_words:
        if word not in target_vocab:
            target_vocab.append(word)

In [280]:
nrows = len(df)
for row in range(0, nrows):
    text = df.loc[row]['action']
    text_words = text.split(" ")
    for word in text_words:
        if word not in target_vocab:
            target_vocab.append(word)

In [281]:
nrows = len(df_val)
for row in range(0, nrows):
    text = df.loc[row]['action']
    text_words = text.split(" ")
    for word in text_words:
        if word not in target_vocab:
            target_vocab.append(word)

In [282]:
nrows = len(df)
for row in range(0, nrows):
    text = df.loc[row]['object']
    text_words = text.split(" ")
    for word in text_words:
        if word not in target_vocab:
            target_vocab.append(word)

In [283]:
nrows = len(df_val)
for row in range(0, nrows):
    text = df.loc[row]['object']
    text_words = text.split(" ")
    for word in text_words:
        if word not in target_vocab:
            target_vocab.append(word)

In [284]:
nrows = len(df)
for row in range(0, nrows):
    text = df.loc[row]['location']
    text_words = text.split(" ")
    for word in text_words:
        if word not in target_vocab:
            target_vocab.append(word)

In [285]:
nrows = len(df_val)
for row in range(0, nrows):
    text = df.loc[row]['location']
    text_words = text.split(" ")
    for word in text_words:
        if word not in target_vocab:
            target_vocab.append(word)

# Creating an embedding matrix

In [105]:
vocab_size = len(target_vocab)
input_size = 50

embedding_matrix = torch.zeros((vocab_size, input_size))
for w in target_vocab:
    i = word_to_idx(w)
    
    embedding_matrix[i, :] = torch.from_numpy(glove[w]).float()

# Defining utility functions

In [104]:
def word_to_idx(word):
    for i, w in enumerate(target_vocab):
        if w == word:
            return i
    return -1

In [140]:
def sentence_to_matrix(sentence):
    words = sentence.split(" ")
    n = len(words)
    m = torch.zeros((n, input_size))
    for i, w in enumerate(words):
        m[i] = embedding_matrix[word_to_idx(w)]
    return m

In [200]:
def sentence_to_index(sentence):
    w = sentence.split(" ")
    l = []
    for word in w:
        l.append(word_to_idx(word))
    t = torch.tensor(l, dtype=torch.float32)
    return t

In [288]:
output_size = len(target_vocab)
input_size = 50
hidden_size = 50

In [214]:
def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

In [234]:
import time
import math


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

# Creating the Networks

In [152]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.gru = nn.GRU(input_size, hidden_size)

    def forward(self, x, hidden):
        x = x.unsqueeze(0)
        output, hidden = self.gru(x, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [160]:
s = "turn down the bathroom temperature"
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
matrix = sentence_to_matrix(s)
print(matrix[0].unsqueeze(0).shape)
encoder = EncoderRNN(input_size, hidden_size)
hidden = encoder.initHidden()
for i in range(matrix.shape[0]):
    out, hidden = encoder(matrix[i].unsqueeze(0), hidden)
print(out.shape)
print(hidden.shape)

torch.Size([1, 50])
5
torch.Size([1, 1, 50])
torch.Size([1, 1, 50])


In [180]:
class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, x, hidden):
        output = F.relu(x)
        output, hidden = self.gru(output, hidden)
        output_softmax = self.softmax(self.out(output[0]))
        return output, hidden, output_softmax

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [237]:
decoder_hidden = hidden
decoder_input = torch.ones((1,1,50))
decoder = DecoderRNN(hidden_size, output_size)
output_sentence = df.loc[3]["action"] + " "+ df.loc[3]["object"] + " " + df.loc[3]["location"]
print(output_sentence)
target_tensor = sentence_to_index(output_sentence)
criterion = nn.NLLLoss()
loss = 0
for i in range(target_tensor.shape[0]):
    decoder_input, decoder_hidden, decoder_output_softmax = decoder(decoder_input, decoder_hidden)
    loss += criterion(decoder_output_softmax, target_tensor[i].unsqueeze(0).long())
    print(torch.argmax(decoder_output_softmax, dim=1))

decrease heat washroom
tensor([25])
tensor([25])
tensor([22])


# Training the networks

In [215]:
def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion):
    
    encoder_hidden = encoder.initHidden()
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    loss = 0

    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(input_tensor[ei].unsqueeze(0), encoder_hidden)

    decoder_input = torch.ones((1,1,50))

    decoder_hidden = encoder_hidden

   
    for i in range(target_tensor.shape[0]):
        decoder_input, decoder_hidden, decoder_output_softmax = decoder(decoder_input, decoder_hidden)
        loss += criterion(decoder_output_softmax, target_tensor[i].unsqueeze(0).long())

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

In [232]:
def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01, df):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    criterion = nn.NLLLoss()
    
    nrows = len(df)

    for iter in range(1, n_iters + 1):
        i = random.randint(0, n_iters)
        i = (i % nrows)
        
        s = df.loc[i]["Content_Parsed_2"]

        
        input_tensor = sentence_to_matrix(s)
        
        output_sentence = df.loc[i]["action"] + " "+ df.loc[i]["object"] + " " + df.loc[i]["location"]
        target_tensor = sentence_to_index(output_sentence)

        loss = train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if iter % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),
                                         iter, iter / n_iters * 100, print_loss_avg))

        if iter % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    showPlot(plot_losses)

In [245]:
def predict(encoder, decoder, input_sentence):
    encoder_hidden = encoder.initHidden()
    input_tensor = sentence_to_matrix(input_sentence)
    decoder_input = torch.ones((1,1,50))
    input_length = input_tensor.size(0)
    
    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(input_tensor[ei].unsqueeze(0), encoder_hidden)
    
    decoder_hidden = encoder_hidden
    
    for i in range(3):
        decoder_input, decoder_hidden, decoder_output_softmax = decoder(decoder_input, decoder_hidden)
        idx = torch.argmax(decoder_output_softmax)
        print(target_vocab[idx])

In [289]:
def evaluate(encoder, decoder, input_sentence, target_tensor):
    encoder_hidden = encoder.initHidden()
    input_tensor = sentence_to_matrix(input_sentence)
    decoder_input = torch.ones((1,1,50))
    input_length = input_tensor.size(0)
    
    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(input_tensor[ei].unsqueeze(0), encoder_hidden)
    
    decoder_hidden = encoder_hidden
    correct = 0
    for i in range(3):
        decoder_input, decoder_hidden, decoder_output_softmax = decoder(decoder_input, decoder_hidden)
        idx = torch.argmax(decoder_output_softmax)
        if(idx == target_tensor[i]):
            correct += 1
    if(correct == 3):
        return 1
    else: 
        return 0

In [247]:
encoder = EncoderRNN(input_size, hidden_size).to(device)
decoder = DecoderRNN(hidden_size, output_size)

trainIters(encoder, decoder, 150000, df)

0m 3s (- 9m 23s) (1000 0%) 2.1604
0m 7s (- 9m 40s) (2000 1%) 1.0210
0m 11s (- 9m 27s) (3000 2%) 0.5684
0m 16s (- 9m 47s) (4000 2%) 0.4079
0m 20s (- 9m 48s) (5000 3%) 0.3162
0m 24s (- 9m 44s) (6000 4%) 0.2225
0m 28s (- 9m 43s) (7000 4%) 0.1346
0m 32s (- 9m 35s) (8000 5%) 0.0895
0m 36s (- 9m 36s) (9000 6%) 0.0623
0m 40s (- 9m 33s) (10000 6%) 0.0533
0m 45s (- 9m 36s) (11000 7%) 0.0448
0m 49s (- 9m 31s) (12000 8%) 0.0309
0m 53s (- 9m 23s) (13000 8%) 0.0360
0m 58s (- 9m 24s) (14000 9%) 0.0314
1m 2s (- 9m 21s) (15000 10%) 0.0271
1m 6s (- 9m 15s) (16000 10%) 0.0204
1m 10s (- 9m 13s) (17000 11%) 0.0163
1m 14s (- 9m 8s) (18000 12%) 0.0150
1m 19s (- 9m 7s) (19000 12%) 0.0162
1m 24s (- 9m 8s) (20000 13%) 0.0127
1m 29s (- 9m 10s) (21000 14%) 0.0138
1m 35s (- 9m 14s) (22000 14%) 0.0132
1m 40s (- 9m 14s) (23000 15%) 0.0171
1m 46s (- 9m 18s) (24000 16%) 0.0113
1m 52s (- 9m 23s) (25000 16%) 0.0107
1m 57s (- 9m 18s) (26000 17%) 0.0094
2m 2s (- 9m 17s) (27000 18%) 0.0114
2m 7s (- 9m 16s) (28000 18%) 0.0

# Evaluating the model

In [291]:
n = len(df_val)
total = 0
correct = 0
for i in range(n):
    output_sentence = df_val.loc[i]["action"] + " "+ df_val.loc[i]["object"] + " " + df_val.loc[i]["location"]
    target_tensor = sentence_to_index(output_sentence)
    
    input_sentence = df_val.loc[i]["Content_Parsed_2"]
    correct += evaluate(encoder, decoder, input_sentence, target_tensor)
    total += 1
print(correct)
print(total)
print(f"Accuracy on Val test : {(float(correct)/total)*100}")

11566
11566
Accuracy on Val test : 100.0
