Generative Chatbot
==================
> CSCK507 Group Project: Group A
> 
This is a generative chatbot that uses a seq2seq model.

# Importing Libraries

In [1]:
import codecs
import csv
import io
import itertools
import json
import time
import math
import os
import random
import re
import tarfile
import unicodedata
import zipfile
from io import open
from collections import Counter, OrderedDict
from typing import Tuple

import numpy as np
import pandas as pd
import polars as pl
import requests
import spacy
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, random_split
from torch.utils.tensorboard import SummaryWriter
from torch import optim
from torch.jit import script, trace
import torchtext as tt

In [2]:
# Load in GPU
spacy.prefer_gpu()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
try:
    spacy.load('en_core_web_sm')
except LookupError:
    print('Run: python -m spacy download en_core_web_sm')

# Data Preprocessing

## Importing the dataset

In [3]:
def download_file(url, dir):
    """
    Download file from url
    :param url: url of file
    :param filename: name of file
    :return: None
    """
    r = requests.get(url)
    if url.endswith('.tar.gz'):
        z = tarfile.open(fileobj=io.BytesIO(r.content), mode="r:gz")
        z.extractall(dir)
        z.close()
    elif url.endswith('.zip'):
        z = zipfile.ZipFile(io.BytesIO(r.content))
        z.extractall(dir)
    else:
        print('Unknown file type')
    return None

def extract_zip(filename, dir):
    """
    Extract zip file
    :param filename: name of file
    :return: None
    """
    z = zipfile.ZipFile(filename)
    z.extractall(dir)
    return None

In [4]:
datasets = {#'WikiQACorpus': 'https://download.microsoft.com/download/E/5/F/E5FCFCEE-7005-4814-853D-DAA7C66507E0/WikiQACorpus.zip',
            #'Question_Answer_Dataset_v1.2': 'https://www.cs.cmu.edu/~ark/QA-data/data/Question_Answer_Dataset_v1.2.tar.gz',
            'ubuntu-dialogue': 'data/ubuntu dialogue.zip'}

In [5]:
# Create directory
if not os.path.exists('data'):
    os.makedirs('data')

# Check if data is already downloaded
for dataset, source in datasets.items():
    if os.path.exists('data/' + dataset):
        print(dataset + ' already exists')
    elif dataset == 'ubuntu-dialogue':
        ubuntu = 'data/ubuntu dialogue'
        if os.path.exists(source):
            os.makedirs(ubuntu)
            extract_zip(source, ubuntu)
            os.remove(source)
            print(dataset + ' extracted')
        elif os.path.exists(ubuntu):
            print(dataset + ' already exists')
        else:
            kag = 'https://www.kaggle.com/datasets/rtatman/ubuntu-dialogue-corpus/download?datasetVersionNumber=2'
            print(f'Manually download ubuntu dialogue dataset from {kag} and place in data folder')
    else:
        download_file(source, 'data')
        print(dataset + ' downloaded')

ubuntu-dialogue already exists


## Preparing the dataset

### Read Data

In [6]:
variants = {'small':'',
            'medium':'_196',
            'large':'_301'}
ubuntufile = f'data/ubuntu dialogue/ubuntu-dialogue-corpus/dialogueText{variants["small"]}.csv'
text_df = pd.read_csv(ubuntufile)
text_df['dialogueID'] = text_df['dialogueID'].apply(lambda x: int(x.split('.')[0]))
print(text_df.shape)


(1038324, 6)


In [7]:
# preview text from ubuntu dialogue dataset
text_df.head()

Unnamed: 0,folder,dialogueID,date,from,to,text
0,3,126125,2008-04-23T14:55:00.000Z,bad_image,,"Hello folks, please help me a bit with the fol..."
1,3,126125,2008-04-23T14:56:00.000Z,bad_image,,Did I choose a bad channel? I ask because you ...
2,3,126125,2008-04-23T14:57:00.000Z,lordleemo,bad_image,the second sentence is better english and we...
3,3,64545,2009-08-01T06:22:00.000Z,mechtech,,Sock Puppe?t
4,3,64545,2009-08-01T06:22:00.000Z,mechtech,,WTF?


In [8]:
# reduce df size
text_df = text_df[:100]
text_df.head()

Unnamed: 0,folder,dialogueID,date,from,to,text
0,3,126125,2008-04-23T14:55:00.000Z,bad_image,,"Hello folks, please help me a bit with the fol..."
1,3,126125,2008-04-23T14:56:00.000Z,bad_image,,Did I choose a bad channel? I ask because you ...
2,3,126125,2008-04-23T14:57:00.000Z,lordleemo,bad_image,the second sentence is better english and we...
3,3,64545,2009-08-01T06:22:00.000Z,mechtech,,Sock Puppe?t
4,3,64545,2009-08-01T06:22:00.000Z,mechtech,,WTF?


### Preprocess Data and structure it

In [9]:
nlp = spacy.load('en_core_web_sm') # load spacy model

In [10]:
def unicodetoascii(text):
    """
    Turn a Unicode string to plain ASCII

    :param text: text to be converted
    :return: text in ascii format
    """
    normalized_text = unicodedata.normalize('NFKD', str(text))
    ascii_text = ''.join(char for char in normalized_text if unicodedata.category(char) != 'Mn')
    return ascii_text

def preprocess_text(text, fn=unicodetoascii):

    text = fn(text)
    text = text.lower()
    text = re.sub(r'[^\x00-\x7F]+', "", text) # Remove non-ASCII characters
    text = re.sub(r"(\w)[!?]+(\w)", r'\1\2', text) # Remove !? between words
    text = re.sub(r"\s\s+", r" ", text).strip() # Remove extra spaces
    return text

def parse_dialogue(data):
    dialogues = {}
    df = data.copy()
    df.reset_index(inplace=True)
    # Group by dialogueID
    for dialogue_id, group in df.groupby('dialogueID'):
        sentence_pairs = {}
        context = ''
        previous_direction = (None, None)
        for i, row in group.iterrows():
            idx = row['index']
            sender = row['from']
            recipient = row['to']
            response = str(row['text'])
            direction = (sender, recipient)

            if direction == previous_direction:
                # add to the response to the previous message if the current message is consecutive
                prev_idx = idx - 1
                while prev_idx not in sentence_pairs:
                    prev_idx -= 1
                response = context + ' ' + response
                sentence_pairs[prev_idx] = (sentence_pairs[prev_idx][0], response)
                # sentence_pairs[-1] = (sentence_pairs[-1][0], response)
            elif (direction == previous_direction[::-1]) or (previous_direction[1] == None) and (direction[1] == previous_direction[0]):
                # if the current message is from the previous recipient to the previous sender
                # if the previous message did not have a recipient, but the current message is to the previous sender
                sentence_pairs[idx]=(context, response)
            else:
                sentence_pairs[idx]=(context, response)
            
            previous_direction = tuple(direction)
            context = str(response) # response is the context for the next message
        # remove the sentence pairs that does not have context but only responses
        sentence_pairs = {k: v for k, v in sentence_pairs.items() if v[0] != ''}
        dialogues[dialogue_id] = sentence_pairs

    return dialogues

In [11]:
text_df['text'] = text_df['text'].apply(preprocess_text)
text_df.head()

Unnamed: 0,folder,dialogueID,date,from,to,text
0,3,126125,2008-04-23T14:55:00.000Z,bad_image,,"hello folks, please help me a bit with the fol..."
1,3,126125,2008-04-23T14:56:00.000Z,bad_image,,did i choose a bad channel? i ask because you ...
2,3,126125,2008-04-23T14:57:00.000Z,lordleemo,bad_image,the second sentence is better english and we a...
3,3,64545,2009-08-01T06:22:00.000Z,mechtech,,sock puppet
4,3,64545,2009-08-01T06:22:00.000Z,mechtech,,wtf?


In [12]:
dialogues = parse_dialogue(text_df)

In [13]:
# convert nested dictionary to dataframe
def dict_to_df(data):
    rows = []
    for dialogue_id, sentence_pairs in data.items():
        for idx, pair in sentence_pairs.items():
            rows.append([dialogue_id, idx, pair[0], pair[1]])
    df = pd.DataFrame(rows, columns=['dialogueID', 'index', 'context', 'response'])
    return df

dialogue_df = dict_to_df(dialogues)
dialogue_df.head()

Unnamed: 0,dialogueID,index,context,response
0,16039,43,is there a way to tell ubuntu not to show icon...,can use a udev rule for it
1,16039,44,can use a udev rule for it,thanks :)
2,27998,82,"when i upgraded from 10.04 to 10.10, the upgra...",against the upgrade tool
3,27998,83,against the upgrade tool,thanks
4,34410,88,hints on how to get nvidia driver to work?,https://help.ubuntu.com/community/binarydriver...


### Load the structured data into dataframe and index it

- As we will be using word vectors, lemmatization is not required. (words that are similar will have vectors that are close to each other)
- As one of the models will be using attention, removing stop words is not required. (attention will learn to ignore them)

In [14]:
# use torch text to create vocabulary
def tokenize(text):
    """
    Tokenize text
    :param text: text to be tokenized
    :return: list of tokens
    """
    return [tok.text for tok in nlp.tokenizer(text)]

def create_mapping(df, tokenize=tokenize):
    """
    Create vocabulary mapping from context and response dataframes
    :param df_context: context dataframe
    :param df_response: response dataframe
    :param tokenize: tokenization function
    :return: vocabulary mapping
    """
    # Create vocabulary mapping
    vocab = set()
    default_tokens = ['<pad>', '<bos>', '<eos>']
    start_index = len(default_tokens)
    for context, response in zip(df['context'], df['response']):
        vocab.update(tokenize(context))
        vocab.update(tokenize(response))
    word2idx = {word: start_index+idx for idx, word in enumerate(vocab)}
    idx2word = {start_index+idx: word for idx, word in enumerate(vocab)}
    for idx, token in enumerate(default_tokens):
        word2idx[token] = idx
        idx2word[idx] = token
    return word2idx, idx2word

def lookup_words(idx2word, indices):
    """
    Lookup words from indices
    :param idx2word: index to word mapping
    :param indices: indices to be converted
    :return: list of words
    """
    return [idx2word[idx] for idx in indices]

In [15]:
word2idx, idx2word = create_mapping(dialogue_df)
word2idx['<pad>']

0

In [16]:
# Map words to indices
dialogue_df['context_idx'] = dialogue_df['context'].apply(lambda x: [word2idx[word] for word in tokenize(x)])
dialogue_df['response_idx'] = dialogue_df['response'].apply(lambda x: [word2idx[word] for word in tokenize(x)])

In [17]:
# add bos and eos tokens to context and response
bos = word2idx['<bos>']
eos = word2idx['<eos>']
dialogue_df['context_idx'] = dialogue_df['context_idx'].apply(lambda x: [bos] + x + [eos])
dialogue_df['response_idx'] = dialogue_df['response_idx'].apply(lambda x: [bos] + x + [eos])

In [18]:
dialogue_df.head()

Unnamed: 0,dialogueID,index,context,response,context_idx,response_idx
0,16039,43,is there a way to tell ubuntu not to show icon...,can use a udev rule for it,"[1, 353, 164, 405, 156, 249, 496, 385, 402, 24...","[1, 290, 42, 405, 240, 87, 49, 434, 2]"
1,16039,44,can use a udev rule for it,thanks :),"[1, 290, 42, 405, 240, 87, 49, 434, 2]","[1, 354, 495, 2]"
2,27998,82,"when i upgraded from 10.04 to 10.10, the upgra...",against the upgrade tool,"[1, 368, 433, 264, 98, 223, 249, 200, 463, 344...","[1, 77, 344, 69, 7, 2]"
3,27998,83,against the upgrade tool,thanks,"[1, 77, 344, 69, 7, 2]","[1, 354, 2]"
4,34410,88,hints on how to get nvidia driver to work?,https://help.ubuntu.com/community/binarydriver...,"[1, 11, 449, 477, 249, 357, 97, 384, 249, 48, ...","[1, 268, 2]"


### Create tensors

In [19]:
# Create tensors with sos, pad, eos tokens
def create_tensors(df, max_len=20):
    """
    Create tensors with sos, pad, eos tokens
    :param df: dataframe with context and response
    :param max_len: maximum length of sequence
    :return: tensors with sos, pad, eos tokens
    """
    # Create tensors
    context_tensor = torch.zeros((len(df), max_len), dtype=torch.long)
    response_tensor = torch.zeros((len(df), max_len), dtype=torch.long)
    for i, (context, response) in enumerate(zip(df['context_idx'], df['response_idx'])):
        # Trim context and response
        if len(context) > max_len:
            context = context[:max_len]
        if len(response) > max_len:
            response = response[:max_len]
        # Add to tensor
        context_tensor[i, :len(context)] = torch.tensor(context, dtype=torch.long)
        response_tensor[i, :len(response)] = torch.tensor(response, dtype=torch.long)
    return context_tensor, response_tensor

In [20]:
# get max length of context and response
max_len_context = max(dialogue_df['context_idx'].apply(len))
max_len_response = max(dialogue_df['response_idx'].apply(len))
max_len = max(max_len_context, max_len_response)
max_len = 128
print(f'Maximum length of sequence: {max_len}')

Maximum length of sequence: 128


In [21]:
context_tensor, response_tensor = create_tensors(dialogue_df, max_len=max_len)
print(context_tensor.shape, response_tensor.shape)

torch.Size([42, 128]) torch.Size([42, 128])


### Split and batch the data

In [22]:
class ContextReponseBatch:
    def __init__(self, data):
        transposed_data = list(zip(*data))
        self.input = torch.stack(transposed_data[0], 0)
        self.input_mask = (self.input != 0)
        self.target = torch.stack(transposed_data[1], 0)
        self.target_mask = (self.target != 0)

    def pin_memory(self):
        """
        Pin memory for faster data transfer to GPU
        :return: self
        """
        self.input = self.input.pin_memory()
        self.input_mask = self.input_mask.pin_memory()
        self.target = self.target.pin_memory()
        self.target_mask = self.target_mask.pin_memory()
        return self

def collate_wrapper(batch):
    """
    Wrapper for collate function
    :param batch: batch of data
    :return: ContextReponseBatch object
    """
    return ContextReponseBatch(batch)

In [23]:
# Split data into train, validation, and test sets
def split_data(context_tensor, response_tensor, train_ratio=0.8, val_ratio=0.1, test_ratio=0.1):
    """
    Split data into train, validation, and test sets
    :param context_tensor: context tensor
    :param response_tensor: response tensor
    :param train_ratio: ratio of train set
    :param val_ratio: ratio of validation set
    :param test_ratio: ratio of test set
    :return: train, validation, and test sets
    """
    # Split data into train, validation, and test sets
    dataset = TensorDataset(context_tensor, response_tensor)
    train_size = int(train_ratio * len(dataset))
    val_size = int(val_ratio * len(dataset))
    test_size = len(dataset) - train_size - val_size
    train_set, val_set, test_set = random_split(dataset, [train_size, val_size, test_size])
    return train_set, val_set, test_set

# Batch data
def batch_data(train_set, val_set, test_set, batch_size=64, fn=collate_wrapper):
    """
    Batch data
    :param train_set: train set
    :param val_set: validation set
    :param test_set: test set
    :param batch_size: batch size
    :return: train, validation, and test loaders
    """
    # Batch data
    train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True, collate_fn=collate_wrapper)
    val_loader = DataLoader(val_set, batch_size=batch_size, shuffle=True, collate_fn=collate_wrapper)
    test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=True, collate_fn=collate_wrapper)
    return train_loader, val_loader, test_loader

In [24]:
# Create train, validation, and test sets
train_set, val_set, test_set = split_data(context_tensor, response_tensor)

# Batch data
train_loader, val_loader, test_loader = batch_data(train_set, val_set, test_set)

In [25]:
# preview shape of batch
next(iter(train_loader)).input.shape # (batch_size, max_len)

torch.Size([33, 128])

In [26]:
next(iter(train_loader)).input.transpose(0, 1).shape # (max_len, batch_size)

torch.Size([128, 33])

In [27]:
next(iter(train_loader)).input_mask

tensor([[ True,  True,  True,  ..., False, False, False],
        [ True,  True,  True,  ..., False, False, False],
        [ True,  True,  True,  ..., False, False, False],
        ...,
        [ True,  True,  True,  ..., False, False, False],
        [ True,  True,  True,  ..., False, False, False],
        [ True,  True,  True,  ..., False, False, False]])

## Building the seq2seq model

In [28]:
def init_weights(m):
    """
    Initialize weights
    :param m: model
    :return: None
    """
    for name, param in m.named_parameters():
        if 'weight' in name:
            nn.init.xavier_uniform_(param.data)
        elif 'bias' in name:
            nn.init.constant_(param.data, 0)
    return None

In [29]:
class Encoder(nn.Module):
    def __init__(self,
                 input_dim: int,
                 emb_dim: int,
                 enc_hid_dim: int,
                 dec_hid_dim: int,
                 dropout: float = 0):
        super(Encoder, self).__init__()

        self.input_dim = input_dim
        self.emb_dim = emb_dim
        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim

        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.GRU(emb_dim, enc_hid_dim, bidirectional=True, batch_first=False, num_layers=1)
        self.fc = nn.Linear(enc_hid_dim * 2, dec_hid_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        embedded = self.dropout(self.embedding(src))
        outputs, hidden = self.rnn(embedded)
        hidden = torch.tanh(self.fc(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)))
        return outputs, hidden


class Attention(nn.Module):
    def __init__(self,
                 enc_hid_dim: int,
                 dec_hid_dim: int,
                 attn_dim: int):
        super(Attention, self).__init__()

        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim
        self.attn_in = (enc_hid_dim * 2) + dec_hid_dim

        self.attn = nn.Linear(self.attn_in, attn_dim)

    def forward(self,
                decoder_hidden: torch.Tensor,
                encoder_outputs: torch.Tensor) -> torch.Tensor:

        src_len = encoder_outputs.shape[0]
        repeated_decoder_hidden = decoder_hidden.unsqueeze(1).repeat(1, src_len, 1)
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        # Luong attention
        energy = torch.tanh(self.attn(torch.cat((repeated_decoder_hidden, encoder_outputs), dim=2)))
        attention = torch.sum(energy, dim=2)

        return F.softmax(attention, dim=1)


class AttnDecoder(nn.Module):
    def __init__(self,
                 output_dim: int,
                 emb_dim: int,
                 enc_hid_dim: int,
                 dec_hid_dim: int,
                 attention: nn.Module,
                 dropout: float = 0):
        super(AttnDecoder, self).__init__()

        self.output_dim = output_dim
        self.emb_dim = emb_dim
        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim
        self.dropout = dropout
        self.attention = attention

        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.GRU((enc_hid_dim * 2) + emb_dim, dec_hid_dim, batch_first=False, num_layers=1)
        self.out = nn.Linear(self.attention.attn_in + emb_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def encode_attention(self,
                              decoder_hidden: torch.Tensor,
                              encoder_outputs: torch.Tensor) -> torch.Tensor:

        a = self.attention(decoder_hidden, encoder_outputs)
        a = a.unsqueeze(1)
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        weighted_encoder_rep = torch.bmm(a, encoder_outputs)
        weighted_encoder_rep = weighted_encoder_rep.permute(1, 0, 2)
        return weighted_encoder_rep

    def forward(self,
                input: torch.Tensor,
                decoder_hidden: torch.Tensor,
                encoder_outputs: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:

        input = input.unsqueeze(0)
        embedded = self.dropout(self.embedding(input))
        weighted_encoder = self.encode_attention(decoder_hidden, encoder_outputs)
        
        rnn_input = torch.cat((embedded, weighted_encoder), dim=2)
        output, decoder_hidden = self.rnn(rnn_input, decoder_hidden.unsqueeze(0))

        embedded = embedded.squeeze(0)
        output = output.squeeze(0)
        weighted_encoder = weighted_encoder.squeeze(0)
        output = self.out(torch.cat((output, weighted_encoder, embedded), dim=1))
        return output, decoder_hidden.squeeze(0)

class Decoder(nn.Module):
    def __init__(self,
                 output_dim: int,
                 emb_dim: int,
                 enc_hid_dim: int,
                 dec_hid_dim: int,
                 dropout: float = 0):
        super(Decoder, self).__init__()

        self.output_dim = output_dim
        self.emb_dim = emb_dim
        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim
        self.dropout = dropout

        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.GRU((enc_hid_dim * 2) + emb_dim, dec_hid_dim, batch_first=False, num_layers=1)
        self.out = nn.Linear((enc_hid_dim * 2) + dec_hid_dim + emb_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self,
                input: torch.Tensor,
                decoder_hidden: torch.Tensor,
                encoder_outputs: torch.Tensor) -> Tuple[torch.Tensor
                                                        , torch.Tensor]:
        
        input = input.unsqueeze(0)
        embedded = self.dropout(self.embedding(input))
        context = encoder_outputs[-1,:,:]
        context = context.repeat(embedded.shape[0], 1, 1)
        embs_and_context = torch.cat((embedded, context), -1)
        output, decoder_hidden = self.rnn(embs_and_context, decoder_hidden.unsqueeze(0))
        embedded = embedded.squeeze(0)
        output = output.squeeze(0)
        context = context.squeeze(0)
        output = self.out(torch.cat((output, embedded, context), -1))
        return output, decoder_hidden.squeeze(0)

class Seq2Seq(nn.Module):
    def __init__(self,
                 encoder: nn.Module,
                 decoder: nn.Module,
                 device: torch.device):
        super(Seq2Seq, self).__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self,
                src: torch.Tensor,
                trg: torch.Tensor,
                teacher_forcing_ratio: float = 0.5) -> torch.Tensor:
        src = src.transpose(0, 1) # (max_len, batch_size)
        trg = trg.transpose(0, 1) # (max_len, batch_size)
        batch_size = src.shape[1]
        max_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim

        outputs = torch.zeros(max_len, batch_size, trg_vocab_size).to(self.device)
        encoder_outputs, hidden = self.encoder(src)

        # first input to the decoder is the <sos> token
        output = trg[0,:]

        for t in range(1, max_len):
            output, hidden = self.decoder(output, hidden, encoder_outputs)
            outputs[t] = output
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.max(1)[1]
            output = trg[t] if teacher_force else top1

        return outputs

In [30]:
test_batch = next(iter(train_loader))
test_batch.input.shape, test_batch.target.shape

(torch.Size([33, 128]), torch.Size([33, 128]))

In [31]:
# enc = Encoder(input_dim=len(word2idx), emb_dim=256, enc_hid_dim=512, dec_hid_dim=512)
# attn = Attention(enc_hid_dim=512, dec_hid_dim=512, attn_dim=64)
# dec = AttnDecoder(output_dim=len(word2idx), emb_dim=256, enc_hid_dim=512, dec_hid_dim=512, attention=attn)
# model = Seq2Seq(encoder=enc, decoder=dec, device=device)
# model.apply(init_weights)
# model.to(device)
# model.train()
# optimizer = optim.Adam(model.parameters(), lr=0.001)
# optimizer.zero_grad()
# model(test_batch.input.to(device), test_batch.target.to(device), teacher_forcing_ratio=0.5).shape

In [32]:
print(len(word2idx))

497


In [33]:
# enc = Encoder(input_dim=len(word2idx), emb_dim=256, enc_hid_dim=512, dec_hid_dim=512)
# dec = Decoder(output_dim=len(word2idx), emb_dim=256, enc_hid_dim=512, dec_hid_dim=512)
# model = Seq2Seq(encoder=enc, decoder=dec, device=device)
# model.apply(init_weights)
# model.to(device)
# model.train()
# optimizer = optim.Adam(model.parameters(), lr=0.001)
# optimizer.zero_grad()
# model(test_batch.input.to(device), test_batch.target.to(device), teacher_forcing_ratio=0.5).shape

## Training the seq2seq model

In [52]:
PAD_INDEX = word2idx['<pad>']
criterion = nn.CrossEntropyLoss(ignore_index=PAD_INDEX)

def train_model(model, train_loader, val_loader, optimizer,
                run_name = 'seq2seq', init_weights=init_weights, device=device,
                n_epochs=10, clip=1, criterion=criterion,
                teacher_forcing_ratio=0.5):
    """
    Train model
    :param model: model
    :param train_loader: train loader
    :param val_loader: validation loader
    :param optimizer: optimizer
    :param n_epochs: number of epochs
    :param clip: clip
    :param criterion: loss function
    :param PAD_INDEX: index for pad token
    :param teacher_forcing_ratio: teacher forcing ratio
    :return: model, train loss, validation loss
    """
    model = model.to(device)
    model.apply(init_weights)
    writer = SummaryWriter(f'runs/{run_name}')
    train_loss = []
    val_loss = []
    for epoch in range(n_epochs):
        model.train()
        start_time = time.time()
        epoch_loss = 1
        val_epoch_loss = 1
        for i, batch in enumerate(train_loader):
            
            src = batch.input.to(device)
            trg = batch.target.to(device)

            optimizer.zero_grad()
            output = model(src, trg, teacher_forcing_ratio)
            output_dim = output.shape[-1]
            output = output[1:].view(-1, output_dim)
            trg = trg.transpose(0, 1)
            trg = trg[1:].reshape(-1)
            loss = criterion(output, trg)
            loss.to(device).backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), clip) # clip gradients
            optimizer.step() # update parameters
            epoch_loss += loss.item() # update epoch loss
            writer.add_scalar('Train Loss', loss.item(), epoch * len(train_loader) + i)
        train_loss.append(epoch_loss / len(train_loader))
        model.eval()
        with torch.no_grad():
            for i, batch in enumerate(val_loader):
                src = batch.input.to(device)
                trg = batch.target.to(device)
                output = model(src, trg, teacher_forcing_ratio)
                output_dim = output.shape[-1]
                output = output[1:].view(-1, output_dim)
                trg = trg.transpose(0, 1)
                trg = trg[1:].reshape(-1)
                loss = criterion(output, trg)
                val_epoch_loss += loss.item()
        val_loss.append(val_epoch_loss / len(val_loader))
        writer.add_scalar('Validation Loss', val_epoch_loss / len(val_loader), epoch)
        print(f'Epoch: {epoch+1:02} | Time: {time.time()-start_time:.3f}s | Train Loss: {epoch_loss/len(train_loader):.3f} | Val Loss: {val_epoch_loss/len(val_loader):.3f}')
    writer.close()
    return model, train_loss, val_loss


In [55]:
enc = Encoder(input_dim=len(word2idx), emb_dim=256, enc_hid_dim=512, dec_hid_dim=512)
attn = Attention(enc_hid_dim=512, dec_hid_dim=512, attn_dim=64)
dec = AttnDecoder(output_dim=len(word2idx), emb_dim=256, enc_hid_dim=512, dec_hid_dim=512, attention=attn)
model = Seq2Seq(encoder=enc, decoder=dec, device=device)
optimizer = optim.Adam(model.parameters(), lr=0.0001)

model, train_loss, val_loss = train_model(model, train_loader, val_loader, optimizer, run_name='AttnSeq2Seq', n_epochs=10, teacher_forcing_ratio=0.5)

Epoch: 01 | Time: 0.468s | Train Loss: 7.214 | Val Loss: 7.203
Epoch: 02 | Time: 0.423s | Train Loss: 7.200 | Val Loss: 7.194
Epoch: 03 | Time: 0.445s | Train Loss: 7.186 | Val Loss: 7.188
Epoch: 04 | Time: 0.376s | Train Loss: 7.173 | Val Loss: 7.178
Epoch: 05 | Time: 0.447s | Train Loss: 7.160 | Val Loss: 7.167
Epoch: 06 | Time: 0.394s | Train Loss: 7.142 | Val Loss: 7.157
Epoch: 07 | Time: 0.445s | Train Loss: 7.125 | Val Loss: 7.140
Epoch: 08 | Time: 0.447s | Train Loss: 7.104 | Val Loss: 7.124
Epoch: 09 | Time: 0.418s | Train Loss: 7.083 | Val Loss: 7.110
Epoch: 10 | Time: 0.413s | Train Loss: 7.057 | Val Loss: 7.090


## Evaluating the seq2seq model

In [None]:
def evaluate_single_input(model, sentence):
    model.eval()
    tokenized = tokenize(sentence)
    indexed = [word2idx[t] for t in tokenized]
    length = [len(indexed)]
    tensor = torch.LongTensor(indexed).unsqueeze(0).to(device)
    prediction = model(tensor, tensor)

    predicted_idx = torch.argmax(prediction.squeeze(), dim=-1).cpu().numpy()
    predicted_sentence = lookup_words(idx2word, predicted_idx)
    return ' '.join(predicted_sentence)

# Load the best saved model
model.load_state_dict(torch.load('best_model.pt'))
test_loss = evaluate_model(model, test_loader)
print(f'Test Loss: {test_loss:.3f}')

# Test on a single sentence
test_sentence = 'Hello, how are you today?'
predicted_sentence = evaluate_single_input(model, test_sentence)
print(f'Input: {test_sentence} \nOutput: {predicted_sentence}')

In [None]:
# Define batch size
batch_size = 64

# Define training and validation split
train_split = 0.8

# Define training and validation dataset size
train_size = int(len(dialogue_df) * train_split)

# Define training and validation dataset
train_df = dialogue_df[:train_size]

# Define training and validation dataset
val_df = dialogue_df[train_size:]

# Define embedding size
embedding_size = 300

# Define hidden size
hidden_size = 512

# Define number of layers
num_layers = 2

# Define dropout
dropout = 0.5

# Define learning rate
learning_rate = 0.001

# Define number of epochs
num_epochs = 10

# Define gradient clipping
clip = 50.0

# Define teacher forcing ratio
teacher_forcing_ratio = 0.5

# Define decoder learning rate
decoder_learning_ratio = 5.0