# Sequence to sequence model:  training a chatbot

In [83]:
import torch
from torch.jit import script, trace
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import csv
import random
import re
import os
import unicodedata
import codecs
from io import open
from pathlib import Path
import itertools
import math
import numpy as np

## Download data
This corpus contains a collection of fictional conversations extracted from raw movie scripts.
https://www.cs.cornell.edu/~cristian/Cornell_Movie-Dialogs_Corpus.html

In [5]:
def unpack_dataset():
    """Download the Cornell Movie-Dialogs Corpus archive and unzip it into ./data.

    The body consists of shell commands written with IPython's `!` syntax, so
    this function only works inside an IPython/Jupyter session on a machine
    that has `wget`, `mkdir` and `unzip` available. It takes no arguments and
    returns nothing; its only effect is on the file system.
    """
    # Fetch the zip archive into the current working directory
    ! wget http://www.cs.cornell.edu/~cristian/data/cornell_movie_dialogs_corpus.zip
    # -p: do not fail if the data directory already exists
    ! mkdir -p data
    # Extract the archive contents under data/
    ! unzip cornell_movie_dialogs_corpus.zip -d data

In [6]:
unpack_dataset()

--2019-06-03 14:13:26--  http://www.cs.cornell.edu/~cristian/data/cornell_movie_dialogs_corpus.zip
Resolving www.cs.cornell.edu (www.cs.cornell.edu)... 132.236.207.20
Connecting to www.cs.cornell.edu (www.cs.cornell.edu)|132.236.207.20|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 9916637 (9.5M) [application/zip]
Saving to: ‘cornell_movie_dialogs_corpus.zip’


2019-06-03 14:13:29 (3.17 MB/s) - ‘cornell_movie_dialogs_corpus.zip’ saved [9916637/9916637]

Archive:  cornell_movie_dialogs_corpus.zip
   creating: data/cornell movie-dialogs corpus/
  inflating: data/cornell movie-dialogs corpus/.DS_Store  
   creating: data/__MACOSX/
   creating: data/__MACOSX/cornell movie-dialogs corpus/
  inflating: data/__MACOSX/cornell movie-dialogs corpus/._.DS_Store  
  inflating: data/cornell movie-dialogs corpus/chameleons.pdf  
  inflating: data/__MACOSX/cornell movie-dialogs corpus/._chameleons.pdf  
  inflating: data/cornell movie-dialogs corpus/movie_characters_metadata

In [11]:
PATH = Path("data/cornell movie-dialogs corpus")
list(PATH.iterdir())

[PosixPath('data/cornell movie-dialogs corpus/.DS_Store'),
 PosixPath('data/cornell movie-dialogs corpus/raw_script_urls.txt'),
 PosixPath('data/cornell movie-dialogs corpus/README.txt'),
 PosixPath('data/cornell movie-dialogs corpus/movie_titles_metadata.txt'),
 PosixPath('data/cornell movie-dialogs corpus/movie_characters_metadata.txt'),
 PosixPath('data/cornell movie-dialogs corpus/movie_lines.txt'),
 PosixPath('data/cornell movie-dialogs corpus/chameleons.pdf'),
 PosixPath('data/cornell movie-dialogs corpus/movie_conversations.txt')]

In [17]:
path = PATH/"movie_lines.txt"
!head "$path"

L1045 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ They do not!
L1044 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ They do to!
L985 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ I hope so.
L984 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ She okay?
L925 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Let's go.
L924 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ Wow
L872 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Okay -- you're gonna need to learn how to lie.
L871 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ No
L870 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ I'm kidding.  You know how sometimes you just become this "persona"?  And you don't know how to quit?
L869 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Like my fear of wearing pastels?


In [19]:
path = PATH/"movie_conversations.txt"
!head "$path"

u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L194', 'L195', 'L196', 'L197']
u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L198', 'L199']
u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L200', 'L201', 'L202', 'L203']
u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L204', 'L205', 'L206']
u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L207', 'L208']
u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L271', 'L272', 'L273', 'L274', 'L275']
u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L276', 'L277']
u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L280', 'L281']
u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L363', 'L364']
u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L365', 'L366']


## Pre-processing data
We’ll create a data file in which each line contains a tab-separated "query sentence" and a "response sentence" pair.

In [22]:
def ExtractMovieLines(fileName):
    """
    Build a lookup table from line id to utterance text.

    Given: "L1045 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ They do not!"
    Returns: lines[L1045] = "They do not!"

    Args:
        fileName: path of the movie-lines file; each record has five fields
            separated by " +++$+++ " (id, user, movie, character, text).

    Returns:
        dict mapping the first field (line id) to the fifth field (text),
        without the trailing line terminator. Records with fewer than five
        fields (e.g. blank lines) are skipped.
    """
    separator = " +++$+++ "
    lines = {}
    with open(fileName, 'r', encoding='iso-8859-1') as f:
        for line in f:
            # Strip only the line terminator: a record with empty text ends in
            # the separator's trailing space, which a full strip() would eat.
            # maxsplit=4 keeps the text intact if it contains the separator.
            values = line.rstrip("\r\n").split(separator, 4)
            if len(values) < 5:
                continue
            lines[values[0]] = values[4]
    return lines

In [24]:
movie_lines = ExtractMovieLines(PATH/"movie_lines.txt")

In [26]:
#movie_lines

In [23]:
def ExtractConversations(fileName):
    """
    Read the conversations file and return the line ids of every conversation.

    Given: "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L194', 'L195', 'L196', 'L197']"
    Returns: conversations[k] = ['L194', 'L195', 'L196', 'L197']

    Args:
        fileName: path of the conversations file; each record has four fields
            separated by " +++$+++ ", the fourth being a Python list literal.

    Returns:
        list with one entry per record, each the parsed list of line ids.
        Records with fewer than four fields (e.g. blank lines) are skipped.

    Raises:
        ValueError / SyntaxError: if the fourth field is not a valid literal.
    """
    # Local import keeps this cell self-contained
    import ast

    conversations = []
    with open(fileName, 'r', encoding='iso-8859-1') as f:
        for line in f:
            values = line.split(" +++$+++ ")
            if len(values) < 4:
                continue
            # literal_eval only accepts Python literals, so file content
            # cannot execute arbitrary code the way eval() would allow
            lineIds = ast.literal_eval(values[3].strip())
            conversations.append(lineIds)
    return conversations

In [27]:
conversations = ExtractConversations(PATH/"movie_conversations.txt")

In [28]:
conversations[:10]

[['L194', 'L195', 'L196', 'L197'],
 ['L198', 'L199'],
 ['L200', 'L201', 'L202', 'L203'],
 ['L204', 'L205', 'L206'],
 ['L207', 'L208'],
 ['L271', 'L272', 'L273', 'L274', 'L275'],
 ['L276', 'L277'],
 ['L280', 'L281'],
 ['L363', 'L364'],
 ['L365', 'L366']]

In [29]:
def ExtractSentencePairs(conversations, movie_lines):
    """Turn conversations into [query, response] pairs of consecutive lines.

    Args:
        conversations: list of conversations, each a list of line ids.
        movie_lines: dict mapping a line id to its text.

    Returns:
        list of two-element lists [query, response], both stripped of
        surrounding whitespace. A pair is dropped when either side is empty
        after stripping. The last line of a conversation never appears as a
        query because nothing answers it.
    """
    qa_pairs = []
    for line_ids in conversations:
        # Walk over each line together with the line that follows it
        for query_id, reply_id in zip(line_ids, line_ids[1:]):
            query = movie_lines[query_id].strip()
            reply = movie_lines[reply_id].strip()
            if not (query and reply):
                continue
            qa_pairs.append([query, reply])
    return qa_pairs

In [30]:
qa_pairs = ExtractSentencePairs(conversations, movie_lines)

In [31]:
len(qa_pairs)

221282

In [32]:
qa_pairs[:5]

[['Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.',
  "Well, I thought we'd start with pronunciation, if that's okay with you."],
 ["Well, I thought we'd start with pronunciation, if that's okay with you.",
  'Not the hacking and gagging and spitting part.  Please.'],
 ['Not the hacking and gagging and spitting part.  Please.',
  "Okay... then how 'bout we try out some French cuisine.  Saturday?  Night?"],
 ["You're asking me out.  That's so cute. What's your name again?",
  'Forget it.'],
 ["No, no, it's my fault -- we didn't have a proper introduction ---",
  'Cameron.']]

## Filter sentences
Keep only the pairs in which both the query and the response have fewer than 15 space-separated words.

In [35]:
# Upper bound (exclusive) on the number of space-separated pieces per sentence
MAX_LENGTH = 15
def filterPair(p):
    """Return True when both sentences of the pair p are shorter than MAX_LENGTH.

    Length is the number of pieces produced by splitting on a single space
    character, so consecutive spaces contribute empty pieces to the count.
    p[0] is the query and p[1] the response; the response is only inspected
    when the query passes.
    """
    # Staying below MAX_LENGTH leaves room for the EOS token
    if len(p[0].split(' ')) >= MAX_LENGTH:
        return False
    return len(p[1].split(' ')) < MAX_LENGTH

In [37]:
pairs = [pair for pair in qa_pairs if filterPair(pair)]
len(pairs)

133705

## Split into train and validation

In [48]:
def split_data(pairs, train_frac=0.8, seed=None):
    """Randomly split pairs into a training and a validation set.

    Each pair is assigned independently to the training set with probability
    train_frac, so the resulting sizes are only approximately
    train_frac / (1 - train_frac) of the input.

    Args:
        pairs: sequence of [query, response] pairs.
        train_frac: probability that a pair lands in the training set.
        seed: optional integer seed for a reproducible split. When None the
            global NumPy random state is used, as before.

    Returns:
        (train, valid): two NumPy arrays built from np.array(pairs).
    """
    n = len(pairs)
    pairs = np.array(pairs)
    # A private RandomState makes the split reproducible without touching
    # the global NumPy random state
    rng = np.random if seed is None else np.random.RandomState(seed)
    mask = rng.choice([True, False], size=n, p=[train_frac, 1.0 - train_frac])
    return pairs[mask], pairs[~mask]

In [49]:
train_pairs, valid_pairs = split_data(pairs)

## Creating a Vocabulary

In [50]:
from collections import defaultdict

In [58]:
def unicodeToAscii(s):
    """Remove combining marks from s.

    The string is decomposed with NFD normalization and every character in
    Unicode category 'Mn' is dropped, so an accented letter is reduced to its
    base letter.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

def normalizeString(s):
    """Lowercase, trim, and remove non-letter characters.

    '.', '!' and '?' are kept and separated from the preceding text by a
    space; every other run of non-letter characters becomes a single space.
    """
    text = unicodeToAscii(s.lower().strip())
    # Put a space in front of sentence punctuation so it becomes its own token
    text = re.sub(r"([.!?])", r" \1", text)
    # Anything that is neither a letter nor kept punctuation turns into a space
    text = re.sub(r"[^a-zA-Z.!?]+", r" ", text)
    # Collapse whitespace runs and trim the ends
    return re.sub(r"\s+", r" ", text).strip()

In [59]:
def get_vocab(content):
    """Count, for every word, the number of pairs in which it occurs.

    Each element of content is a (query, response) pair. Both sentences are
    normalized with normalizeString, and a word is counted once per pair no
    matter how often it appears inside that pair.

    Returns:
        defaultdict(float) mapping word -> number of pairs containing it.
    """
    pair_counts = defaultdict(float)
    for query, response in content:
        query, response = normalizeString(query), normalizeString(response)
        tokens = query.split() + response.split()
        # set() removes repeats so each pair contributes at most 1 per word
        for token in set(tokens):
            pair_counts[token] += 1
    return pair_counts

In [60]:
word_count = get_vocab(train_pairs)
len(word_count)

28705

In [61]:
# Drop every word that occurs in fewer than 5 pairs; the keys to delete are
# collected first because a dict cannot be resized while it is iterated
rare_words = [w for w, count in word_count.items() if count < 5]
for w in rare_words:
    del word_count[w]
len(word_count)

9122

In [62]:
## Finally we need an index for each word in the vocab
# Reserved entries occupy ids 0-3 (padding, unknown, start, end); the
# remaining words follow in the iteration order of word_count
words = ["<PAD>", "UNK", "<START>", "<END>"] + list(word_count)
vocab2index = {token: position for position, token in enumerate(words)}

## Sentence encoding

In [94]:
def encode_sentence(s, N=17):
    """Encode a sentence as a fixed-length array of vocabulary ids.

    The sentence is normalized with normalizeString, split on whitespace and
    wrapped as <START> w1 ... wk <END>. Words missing from vocab2index map to
    the "UNK" id, and unused trailing positions keep the value 0 (<PAD>).

    Sentences with more than N - 2 words are cut to N - 2 words before
    <START>/<END> are added, so the <END> token is never truncated away.

    Args:
        s: raw sentence.
        N: length of the returned array.

    Returns:
        (enc, l): enc is an int32 array of length N, l is the number of
        positions actually used (including <START> and <END>).
    """
    start_id, end_id = vocab2index["<START>"], vocab2index["<END>"]
    unk_id = vocab2index["UNK"]
    # Reserve two slots for the <START>/<END> markers
    tokens = normalizeString(s).split()[:max(N - 2, 0)]
    ids = [start_id] + [vocab2index.get(w, unk_id) for w in tokens] + [end_id]
    # Only smaller than len(ids) in the degenerate case N < 2
    l = min(N, len(ids))
    enc = np.zeros(N, dtype=np.int32)
    enc[:l] = ids[:l]
    return enc, l

In [95]:
s = train_pairs[0][0]
s

"Well, I thought we'd start with pronunciation, if that's okay with you."

In [96]:
normalizeString(s)

'well i thought we d start with pronunciation if that s okay with you .'

In [97]:
encode_sentence(s)

(array([ 2, 15, 14, 18, 19,  8,  7, 20,  1, 13, 10,  6, 12, 20,  5, 21,  3],
       dtype=int32), 17)

## Dataset

In [98]:
class ChatBotDataset(Dataset):
    """Dataset of (query, response) sentence pairs, encoded on access.

    Args:
        X: indexable collection of pairs; X[i] unpacks into two sentences.
    """
    def __init__(self, X):
        self.x = X

    def __len__(self):
        # Number of pairs; the pairs are the only data this dataset stores
        return len(self.x)

    def __getitem__(self, idx):
        """Return (encoded query, encoded response, query length) for pair idx.

        Both sentences go through encode_sentence. The response length is
        computed but not returned.
        """
        x, y = self.x[idx]
        x, n_x = encode_sentence(x)
        y, n_y = encode_sentence(y)
        return x, y, n_x
    
train_ds = ChatBotDataset(train_pairs)
valid_ds = ChatBotDataset(valid_pairs)

In [99]:
train_ds[0]

(array([ 2, 15, 14, 18, 19,  8,  7, 20,  1, 13, 10,  6, 12, 20,  5, 21,  3],
       dtype=int32),
 array([ 2, 16, 11,  4, 23, 22, 23, 17, 24, 21,  9, 21,  3,  0,  0,  0,  0],
       dtype=int32),
 17)

In [100]:
# Both loaders yield batches of batch_size examples and reshuffle their
# dataset on every pass (shuffle=True)
batch_size=5
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
valid_dl = DataLoader(valid_ds, batch_size=batch_size, shuffle=True)

## Seq2seq model

![Encoder-decoder architecture](images/encoder-decoder.png)


https://github.com/Arturus/kaggle-web-traffic/blob/master/images/encoder-decoder.png

## Encoder

In [102]:
class EncoderRNN(nn.Module):
    """Bidirectional GRU encoder over embedded, padded input sequences.

    Args:
        hidden_size: size of the GRU hidden state. The GRU input size is also
            hidden_size, so `embedding` must produce hidden_size-dim vectors.
        embedding: module mapping token ids to vectors.
        n_layers: number of stacked GRU layers.
    """
    def __init__(self, hidden_size, embedding, n_layers=1):
        super(EncoderRNN, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.embedding = embedding
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers, bidirectional=True)

    def forward(self, x, n_x):
        """Encode a batch of padded sequences.

        Args:
            x: token ids laid out time-major, (max_len, batch), because the
                packing call below uses the default batch_first=False.
            n_x: true length of every sequence in the batch (list or tensor),
                in any order.

        Returns:
            outputs: (longest length in batch, batch, hidden_size), the sum of
                the forward and backward GRU outputs, zero at padded steps.
            hidden: final GRU hidden state, (n_layers * 2, batch, hidden_size).
        """
        x = self.embedding(x)
        # pack_padded_sequence needs the lengths on the CPU
        lengths = torch.as_tensor(n_x, dtype=torch.int64).cpu()
        # enforce_sorted=False: shuffled batches are not ordered by length;
        # PyTorch sorts internally and restores the original batch order
        packed = nn.utils.rnn.pack_padded_sequence(x, lengths, enforce_sorted=False)
        outputs, hidden = self.gru(packed)
        outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs)
        # Sum bidirectional GRU outputs
        outputs = outputs[:, :, :self.hidden_size] + outputs[:, :, self.hidden_size:]
        # Return output and final hidden state
        return outputs, hidden

## Decoder

In [None]:
class DecoderRNN(nn.Module):
    """GRU decoder without attention, run one step (word) at a time.

    NOTE(review): the previous body referenced `dropout`, `attn_model` and
    `Attn`, none of which were defined, so the class could not be built.
    Because this class sits before the attention section and takes no
    attention argument, it is implemented as a plain decoder; confirm this
    matches the intended design.

    Args:
        embedding: module mapping token ids to hidden_size-dim vectors.
        hidden_size: size of the GRU input and hidden state.
        output_size: number of output classes (vocabulary size).
        n_layers: number of stacked GRU layers.
        dropout: dropout probability applied to the embedded input.
    """
    def __init__(self, embedding, hidden_size, output_size, n_layers=1, dropout=0.1):
        super(DecoderRNN, self).__init__()

        # Keep for reference
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout = dropout

        # Define layers
        self.embedding = embedding
        self.embedding_dropout = nn.Dropout(dropout)
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, input_step, last_hidden, encoder_outputs=None):
        """Decode a single time step.

        Args:
            input_step: token ids of the current step, (1, batch).
            last_hidden: previous GRU hidden state, (n_layers, batch, hidden_size).
            encoder_outputs: accepted and ignored, so this decoder can be
                called with the same arguments as the attention decoder.

        Returns:
            output: (batch, output_size) probabilities over the vocabulary.
            hidden: updated GRU hidden state.
        """
        # Get embedding of current input word
        embedded = self.embedding(input_step)
        embedded = self.embedding_dropout(embedded)
        # Forward through unidirectional GRU
        rnn_output, hidden = self.gru(embedded, last_hidden)
        # Drop the length-1 time dimension: (1, batch, hidden) -> (batch, hidden)
        rnn_output = rnn_output.squeeze(0)
        # Predict next word
        output = self.out(rnn_output)
        output = F.softmax(output, dim=1)
        # Return output and final hidden state
        return output, hidden

# Model with attention

## Dot attention 

In [104]:
class DotAttn(nn.Module):
    """Dot-product attention between a decoder state and the encoder outputs.

    Args:
        hidden_size: kept for reference only; this module has no parameters.
    """
    def __init__(self, hidden_size):
        super(DotAttn, self).__init__()
        self.hidden_size = hidden_size

    def forward(self, hidden, encoder_outputs):
        """Return attention weights over the encoder time steps.

        Args:
            hidden: current decoder output, (1, batch, hidden_size).
            encoder_outputs: (max_length, batch, hidden_size).

        Returns:
            (batch, 1, max_length) weights that sum to 1 over max_length.
        """
        # Calculate the attention energies as the dot product of the decoder
        # state with every encoder output: (max_length, batch)
        attn_energies = torch.sum(hidden * encoder_outputs, dim=2)

        # Transpose max_length and batch_size dimensions
        attn_energies = attn_energies.t()

        # Return the softmax normalized probability scores (with added dimension)
        return F.softmax(attn_energies, dim=1).unsqueeze(1)

## Decoder with attention

In [105]:
class AttnDecoderRNN(nn.Module):
    """GRU decoder with attention over the encoder outputs, one step at a time.

    Args:
        attn_model: the attention module to use. Either an nn.Module instance,
            or a callable (typically an attention class) that is invoked as
            attn_model(hidden_size) to build one. The module is called as
            attn(rnn_output, encoder_outputs) and must return weights of shape
            (batch, 1, max_length).
        embedding: module mapping token ids to hidden_size-dim vectors.
        hidden_size: size of the GRU input and hidden state.
        output_size: number of output classes (vocabulary size).
        n_layers: number of stacked GRU layers.
        dropout: dropout probability applied to the embedded input.

    Raises:
        TypeError: if attn_model is neither an nn.Module nor callable.
    """
    def __init__(self, attn_model, embedding, hidden_size, output_size, n_layers=1, dropout=0.1):
        super(AttnDecoderRNN, self).__init__()

        # nn.Module instances are callable too, so test for them first
        if isinstance(attn_model, nn.Module):
            attn = attn_model
        elif callable(attn_model):
            attn = attn_model(hidden_size)
        else:
            raise TypeError("attn_model must be an nn.Module or a callable taking hidden_size")

        # Keep for reference; the class name is stored rather than the module
        # so the attention module is registered only once (as self.attn)
        self.attn_model = type(attn).__name__
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout = dropout

        # Define layers
        self.embedding = embedding
        self.embedding_dropout = nn.Dropout(dropout)
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers)
        self.concat = nn.Linear(hidden_size * 2, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)

        self.attn = attn

    def forward(self, input_step, last_hidden, encoder_outputs):
        """Decode a single time step.

        Args:
            input_step: token ids of the current step, (1, batch).
            last_hidden: previous GRU hidden state, (n_layers, batch, hidden_size).
            encoder_outputs: (max_length, batch, hidden_size).

        Returns:
            output: (batch, output_size) probabilities over the vocabulary.
            hidden: updated GRU hidden state.
        """
        # Note: we run this one step (word) at a time
        # Get embedding of current input word
        embedded = self.embedding(input_step)
        embedded = self.embedding_dropout(embedded)
        # Forward through unidirectional GRU
        rnn_output, hidden = self.gru(embedded, last_hidden)
        # Calculate attention weights from the current GRU output
        attn_weights = self.attn(rnn_output, encoder_outputs)
        # Multiply attention weights to encoder outputs to get new "weighted sum" context vector
        context = attn_weights.bmm(encoder_outputs.transpose(0, 1))
        # Concatenate weighted context vector and GRU output using Luong eq. 5
        rnn_output = rnn_output.squeeze(0)
        context = context.squeeze(1)
        concat_input = torch.cat((rnn_output, context), 1)
        concat_output = torch.tanh(self.concat(concat_input))
        # Predict next word using Luong eq. 6
        output = self.out(concat_output)
        output = F.softmax(output, dim=1)
        # Return output and final hidden state
        return output, hidden

## References
Based on
* https://pytorch.org/tutorials/beginner/chatbot_tutorial.html