In [1]:
# Imports for data handling and manipulation
import os
import re
import unicodedata
import json
from sklearn.model_selection import train_test_split
import random

# PyTorch imports for building and training neural networks
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

# Natural Language Toolkit for preprocessing and tokenization
import nltk
from nltk.tokenize import word_tokenize

# Ensure that NLTK resources are downloaded (e.g., punkt tokenizer).
# This runs at import time of the notebook and may hit the network on first use;
# when the data is already present it only logs an "up-to-date" message.
# NOTE(review): word_tokenize is imported above but is not called in the cells
# visible here — confirm the punkt data is still needed.
nltk.download('punkt')

# Convokit for downloading and processing the Cornell Movie-Dialogs Corpus
from convokit import Corpus, download

# Check if CUDA is available for GPU acceleration, else use CPU.
# `device` is a notebook-wide global: the inference helpers further down read it
# when creating tensors and when moving input batches.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction

from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence



[nltk_data] Downloading package punkt to
[nltk_data]     /Users/vladandreichuk/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [50]:
# Data loading and preprocessing
# download("movie-corpus") fetches the dataset (network access on first run) and
# its result is handed to Corpus as the location to load from.
# `corpus` is read by the pair-extraction loop below.
corpus = Corpus(filename=download("movie-corpus"))
def unicode_to_ascii(s):
    """Remove combining marks (accents) from ``s``.

    The string is decomposed with NFD normalisation, then every character whose
    Unicode category is ``Mn`` (non-spacing mark) is dropped. Characters that
    have no decomposition are returned unchanged, so the result is not
    guaranteed to be pure ASCII.

    Args:
        s: Text to process.

    Returns:
        The decomposed text without non-spacing marks.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

def normalize_string(s):
    """Lower-case, de-accent and simplify ``s`` into space-separated tokens.

    Steps, in order:
      1. lower-case, trim, and strip accents via ``unicode_to_ascii``;
      2. delete every ``.``;
      3. put a space in front of each ``!`` / ``?`` so they become their own
         tokens ("hello?" -> "hello ?");
      4. replace every run of characters other than letters, ``!`` and ``?``
         with a single space;
      5. trim the result.

    Step 5 matters because step 4 can leave a space at either end (for example
    when the text starts with a digit or ends with a quote). Callers tokenize
    with ``split(' ')``, so such a space would otherwise produce an empty-string
    token.

    Args:
        s: Raw utterance text.

    Returns:
        The normalized string; may be empty if nothing survives the cleaning.
    """
    # Convert to ASCII
    s = unicode_to_ascii(s.lower().strip())
    # Replace dots with nothing
    s = s.replace('.', '')
    # Space + punctuation to ensure tokens like "hello?" are treated as "hello ?"
    s = re.sub(r"([!?])", r" \1", s)
    # Collapse every run of characters that are neither letters nor the retained
    # punctuation marks into one space
    s = re.sub(r"[^a-zA-Z!?]+", r" ", s)
    # Runs are already collapsed above, so only the two ends can carry a stray space
    return s.strip()
# Extract sentence pairs
# Walk every conversation and pair each utterance id with the next one in the
# list returned by get_utterance_ids(). Each pair is stored as
# [normalized text at position i + 1, normalized text at position i].
# NOTE(review): the later list position is stored first; presumably
# get_utterance_ids() yields replies before the lines they answer — confirm.
all_pairs = []
for conv in corpus.get_conversation_ids():
    conversation = corpus.get_conversation(conv)
    utterance_ids = conversation.get_utterance_ids()
    for i in range(len(utterance_ids) - 1):
        utt1 = corpus.get_utterance(utterance_ids[i])
        utt2 = corpus.get_utterance(utterance_ids[i + 1])
        if utt1 and utt2:  # Ensure both utterances are not None
            all_pairs.append([normalize_string(utt2.text), normalize_string(utt1.text)])

# Keep only pairs in which both sentences have between MIN_WORDS and MAX_WORDS
# space-separated tokens (same tokenization as Voc.add_sentence).
MIN_WORDS, MAX_WORDS = 2, 10
all_pairs = [
    pair
    for pair in all_pairs
    if all(MIN_WORDS <= len(sentence.split(" ")) <= MAX_WORDS for sentence in pair)
]

# Shuffle and (optionally) reduce the dataset size. A fixed seed keeps the
# train/validation split — and therefore the vocabulary built from the training
# part — identical across kernel restarts.
SEED = 42
SAMPLE_FRACTION = 1.0  # lower this to train on a subset
sample_size = int(len(all_pairs) * SAMPLE_FRACTION)
# A private Random instance avoids touching the global random state
sample_pairs = random.Random(SEED).sample(all_pairs, sample_size)
train_pairs, val_pairs = train_test_split(sample_pairs, test_size=0.2, random_state=SEED)
# Default word tokens
# These three ids are reserved: Voc pre-populates index2word with them and
# starts numbering real words at 3, so they must stay 0, 1 and 2.
PAD_token = 0  # Used for padding short sentences
SOS_token = 1  # Start-of-sentence token
EOS_token = 2  # End-of-sentence token
# Vocabulary class
class Voc:
    """Word <-> index vocabulary with per-word occurrence counts.

    Attributes:
        name: Label given at construction time.
        trimmed: Flag initialised to False (not modified by this class).
        word2index: Maps each seen word to its integer id.
        word2count: Maps each seen word to how many times it was added.
        index2word: Maps ids back to words; pre-filled with the PAD/SOS/EOS ids.
        num_words: Next free id; starts at 3 to account for the default tokens.
    """

    def __init__(self, name):
        self.name = name
        self.trimmed = False
        self.word2index = {}
        self.word2count = {}
        # The special tokens only live in index2word; they are never counted
        self.index2word = {
            PAD_token: "PAD",
            SOS_token: "SOS",
            EOS_token: "EOS",
        }
        self.num_words = 3  # Count default tokens

    def add_sentence(self, sentence):
        """Register every token of ``sentence``, splitting on single spaces."""
        tokens = sentence.split(' ')
        for token in tokens:
            self.add_word(token)

    def add_word(self, word):
        """Count ``word``, assigning it the next free id on first sight."""
        if word in self.word2index:
            self.word2count[word] += 1
            return

        new_index = self.num_words
        self.word2index[word] = new_index
        self.word2count[word] = 1
        self.index2word[new_index] = word
        self.num_words = new_index + 1
voc = Voc('movie-corpus')

# Build the vocabulary from the training pairs only, so validation data never
# contributes words.
for pair in train_pairs:
    voc.add_sentence(pair[0])
    voc.add_sentence(pair[1])

# Keep only validation pairs whose tokens are all in the vocabulary.
# Tokens are produced with split(' ') — the same tokenization used by
# Voc.add_sentence and indexes_from_sentence — so a pair that passes this filter
# is guaranteed not to raise KeyError when it is converted to indexes.
filtered_val_pairs = [
    pair
    for pair in val_pairs
    if all(word in voc.word2index for sentence in pair for word in sentence.split(' '))
]
print(f"Vocabulary size: {voc.num_words}; "
      f"validation pairs kept: {len(filtered_val_pairs)} of {len(val_pairs)}")
val_pairs = filtered_val_pairs

Downloading movie-corpus to /Users/vladandreichuk/.convokit/downloads/movie-corpus
Downloading movie-corpus from http://zissou.infosci.cornell.edu/convokit/datasets/movie-corpus/movie-corpus.zip (40.9MB)... Done


In [56]:
def indexes_from_sentence(voc, sentence):
    """Convert ``sentence`` into a list of word ids terminated by EOS_token.

    Args:
        voc: Object exposing a ``word2index`` mapping.
        sentence: Text whose tokens are separated by single spaces.

    Returns:
        A list with one id per token followed by ``EOS_token``.

    Raises:
        KeyError: If any token is missing from ``voc.word2index``.
    """
    token_ids = []
    for token in sentence.split(' '):
        token_ids.append(voc.word2index[token])
    token_ids.append(EOS_token)
    return token_ids

In [57]:
# Model hyperparameters
hidden_size = 500  # used both as embedding dimension and as GRU hidden size
encoder_n_layers = 2  # number of stacked GRU layers in the encoder
decoder_n_layers = 2  # number of stacked GRU layers in the decoder
dropout = 0.1  # passed to both the encoder and the decoder constructors
# NOTE(review): the settings below are not referenced by any cell visible here;
# presumably they are consumed by a training loop elsewhere — confirm.
batch_size = 64
decoder_learning_ratio = 5.0
clip = 50.0
learning_rate = 0.0001
n_iteration = 4000
print_every = 1
save_every = 100
validate_every = 1

In [58]:
# Model building
class EncoderRNN(nn.Module):
    """Bidirectional GRU encoder over embedded, length-packed input sequences.

    Args:
        hidden_size: Size of the GRU input and hidden state; the embedding is
            expected to produce vectors of this size.
        embedding: Embedding module applied to the input ids.
        n_layers: Number of stacked GRU layers.
        dropout: Dropout between GRU layers; forced to 0 when ``n_layers`` is 1.
    """

    def __init__(self, hidden_size, embedding, n_layers=1, dropout=0):
        super().__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.embedding = embedding

        # Inter-layer dropout has no effect with a single layer, so disable it
        inter_layer_dropout = dropout if n_layers != 1 else 0
        self.gru = nn.GRU(
            hidden_size,
            hidden_size,
            n_layers,
            dropout=inter_layer_dropout,
            bidirectional=True,
        )

    def forward(self, input_seq, input_lengths, hidden=None):
        """Encode a padded batch.

        Args:
            input_seq: Padded id tensor laid out time-first (it is packed with
                the default ``batch_first=False``).
            input_lengths: True length of every sequence in the batch.
            hidden: Optional initial GRU hidden state.

        Returns:
            ``(outputs, hidden)``: ``outputs`` is the padded GRU output with the
            forward and backward halves summed; ``hidden`` is the final GRU
            hidden state.
        """
        packed_input = nn.utils.rnn.pack_padded_sequence(
            self.embedding(input_seq), input_lengths
        )
        packed_output, hidden = self.gru(packed_input, hidden)
        padded_output, _ = nn.utils.rnn.pad_packed_sequence(packed_output)

        # The last dimension holds [forward | backward]; add the two halves
        width = self.hidden_size
        forward_half = padded_output[:, :, :width]
        backward_half = padded_output[:, :, width:]
        return forward_half + backward_half, hidden
class Attn(nn.Module):
    """Attention over encoder outputs with 'dot', 'general' or 'concat' scoring.

    The scoring function is selected by ``method``:
      * ``'dot'``     — sum(hidden * encoder_output)
      * ``'general'`` — sum(hidden * W(encoder_output))
      * ``'concat'``  — sum(v * tanh(W([hidden ; encoder_output])))

    Args:
        method: One of ``'dot'``, ``'general'``, ``'concat'``.
        hidden_size: Feature size of the decoder state and encoder outputs.

    Raises:
        ValueError: If ``method`` is not one of the supported names.
    """

    def __init__(self, method, hidden_size):
        super(Attn, self).__init__()
        self.method = method
        if self.method not in ['dot', 'general', 'concat']:
            raise ValueError(f"{self.method!r} is not an appropriate attention method.")
        self.hidden_size = hidden_size

        if self.method == 'general':
            self.attn = nn.Linear(self.hidden_size, hidden_size)
        elif self.method == 'concat':
            self.attn = nn.Linear(self.hidden_size * 2, hidden_size)
            # Initialise v explicitly; an uninitialised tensor may hold
            # arbitrary values, including NaN or inf.
            bound = 1.0 / max(hidden_size, 1) ** 0.5
            self.v = nn.Parameter(torch.empty(hidden_size).uniform_(-bound, bound))

    def dot_score(self, hidden, encoder_output):
        """Score by the element-wise product summed over the feature axis."""
        return torch.sum(hidden * encoder_output, dim=2)

    def general_score(self, hidden, encoder_output):
        """Score against a linear projection of the encoder outputs."""
        energy = self.attn(encoder_output)
        return torch.sum(hidden * energy, dim=2)

    def concat_score(self, hidden, encoder_output):
        """Score via tanh(W[hidden ; encoder_output]) projected onto ``v``."""
        # Repeat the decoder state along the first axis so it can be concatenated
        expanded = hidden.expand(encoder_output.size(0), -1, -1)
        energy = torch.tanh(self.attn(torch.cat((expanded, encoder_output), 2)))
        return torch.sum(self.v * energy, dim=2)

    def forward(self, hidden, encoder_outputs):
        """Return attention weights for one decoder step.

        Args:
            hidden: Decoder state, 3-D and broadcastable against
                ``encoder_outputs`` (LuongAttnDecoderRNN passes the GRU output
                of a single step).
            encoder_outputs: 3-D encoder outputs; features are on axis 2.

        Returns:
            Weights with the first two axes of the score matrix swapped, softmax
            applied over the last axis, and a singleton axis inserted at
            position 1.
        """
        # Dispatch on the configured method
        if self.method == 'general':
            attn_energies = self.general_score(hidden, encoder_outputs)
        elif self.method == 'concat':
            attn_energies = self.concat_score(hidden, encoder_outputs)
        else:
            attn_energies = self.dot_score(hidden, encoder_outputs)

        # Transpose so the softmax runs over encoder positions
        attn_energies = attn_energies.t()
        return F.softmax(attn_energies, dim=1).unsqueeze(1)
class LuongAttnDecoderRNN(nn.Module):
    """Single-step GRU decoder with attention over the encoder outputs.

    Args:
        attn_model: Attention method name forwarded to ``Attn``.
        embedding: Embedding module applied to the input ids.
        hidden_size: Size of the GRU input and hidden state.
        output_size: Number of output classes (vocabulary size).
        n_layers: Number of stacked GRU layers.
        dropout: Dropout applied to the embeddings and, when ``n_layers`` > 1,
            between GRU layers.
    """

    def __init__(self, attn_model, embedding, hidden_size, output_size, n_layers=1, dropout=0.1):
        super().__init__()
        # Keep the configuration around for later inspection
        self.attn_model = attn_model
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout = dropout

        # Layers
        self.embedding = embedding
        self.embedding_dropout = nn.Dropout(dropout)
        gru_dropout = dropout if n_layers != 1 else 0
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout=gru_dropout)
        self.concat = nn.Linear(hidden_size * 2, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)

        self.attn = Attn(attn_model, hidden_size)

    def forward(self, input_step, last_hidden, encoder_outputs):
        """Run one decoding step.

        Args:
            input_step: Ids of the current input token(s); one time step.
            last_hidden: Previous GRU hidden state.
            encoder_outputs: Encoder outputs attended over.

        Returns:
            ``(output, hidden)``: ``output`` holds softmax probabilities over
            the output classes; ``hidden`` is the new GRU hidden state.
        """
        step_embedding = self.embedding_dropout(self.embedding(input_step))
        gru_out, hidden = self.gru(step_embedding, last_hidden)

        # Weighted sum of encoder outputs (the "context" vector)
        weights = self.attn(gru_out, encoder_outputs)
        context = torch.bmm(weights, encoder_outputs.transpose(0, 1)).squeeze(1)

        # Combine the GRU output with the context, then project to the vocabulary
        combined = torch.cat((gru_out.squeeze(0), context), 1)
        attentional = torch.tanh(self.concat(combined))
        output = F.softmax(self.out(attentional), dim=1)
        return output, hidden

In [59]:
# One embedding table, shared by the encoder and the decoder
embedding = nn.Embedding(num_embeddings=voc.num_words, embedding_dim=hidden_size)

encoder = EncoderRNN(
    hidden_size=hidden_size,
    embedding=embedding,
    n_layers=encoder_n_layers,
    dropout=dropout,
)

# The decoder predicts over the whole vocabulary, hence output_size=voc.num_words
decoder = LuongAttnDecoderRNN(
    attn_model='dot',
    embedding=embedding,
    hidden_size=hidden_size,
    output_size=voc.num_words,
    n_layers=decoder_n_layers,
    dropout=dropout,
)

In [61]:
class GreedySearchDecoder(nn.Module):
    """Greedy decoding: at every step feed back the most probable token.

    Args:
        encoder: Module called as ``encoder(input_seq, input_length)`` and
            returning ``(encoder_outputs, encoder_hidden)``.
        decoder: Module exposing ``n_layers`` and called as
            ``decoder(decoder_input, decoder_hidden, encoder_outputs)``,
            returning ``(scores, hidden)``.
    """

    def __init__(self, encoder, decoder):
        super(GreedySearchDecoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, input_seq, input_length, max_length):
        """Decode exactly ``max_length`` tokens for a single input sequence.

        Decoding does not stop at EOS; callers decide how to treat tokens
        produced after it.

        Args:
            input_seq: Input id tensor passed to the encoder.
            input_length: Sequence lengths passed to the encoder.
            max_length: Number of decoding steps to run.

        Returns:
            ``(all_tokens, all_scores)``: 1-D tensors with the chosen token id
            and its score for every step.
        """
        # Create new tensors on the same device as the input rather than
        # relying on a notebook-level global.
        run_device = input_seq.device
        with torch.no_grad():
            encoder_outputs, encoder_hidden = self.encoder(input_seq, input_length)
            # Use this module's own decoder (not a global of the same name) to
            # decide how many encoder hidden layers seed the decoder state.
            decoder_hidden = encoder_hidden[:self.decoder.n_layers]

            # Start decoding from SOS
            decoder_input = torch.ones(1, 1, device=run_device, dtype=torch.long) * SOS_token
            all_tokens = torch.zeros([0], device=run_device, dtype=torch.long)
            all_scores = torch.zeros([0], device=run_device)

            for _ in range(max_length):
                decoder_output, decoder_hidden = self.decoder(decoder_input, decoder_hidden, encoder_outputs)
                # Most likely token and its score
                decoder_scores, decoder_input = torch.max(decoder_output, dim=1)
                all_tokens = torch.cat((all_tokens, decoder_input), dim=0)
                all_scores = torch.cat((all_scores, decoder_scores), dim=0)
                # The chosen token becomes the next input (restore the step axis)
                decoder_input = torch.unsqueeze(decoder_input, 0)

            return all_tokens, all_scores


def evaluate(encoder, decoder, searcher, voc, sentence, max_length=10):
    """Decode a reply for one already-normalized sentence.

    Args:
        encoder: Unused here; kept for interface compatibility.
        decoder: Unused here; kept for interface compatibility.
        searcher: Callable invoked as ``searcher(input_batch, lengths,
            max_length)`` and returning ``(tokens, scores)``.
        voc: Vocabulary providing ``word2index`` and ``index2word``.
        sentence: Space-separated, normalized input text.
        max_length: Number of tokens to decode.

    Returns:
        The decoded words, one per token returned by ``searcher``.

    Raises:
        KeyError: If ``sentence`` contains a word missing from ``voc``.
    """
    # Batch of one sentence, as word ids
    indexes_batch = [indexes_from_sentence(voc, sentence)]
    # Lengths stay on the CPU: pack_padded_sequence rejects lengths tensors that
    # live on another device.
    lengths = torch.tensor([len(indexes) for indexes in indexes_batch])
    # Transpose the batch to put the time axis first, then move it to the device
    input_batch = torch.LongTensor(indexes_batch).transpose(0, 1).to(device)
    tokens, scores = searcher(input_batch, lengths, max_length)
    decoded_words = [voc.index2word[token.item()] for token in tokens]
    return decoded_words

def evaluateInput(encoder, decoder, searcher, voc):
    """Interactive chat loop reading from standard input.

    Reads a line at the ``> `` prompt, normalizes it, decodes a reply with
    ``evaluate`` and prints it without ``EOS`` / ``PAD`` words. Typing ``q`` or
    ``quit`` ends the loop. A ``KeyError`` (raised when a word is not in the
    vocabulary) is reported and the loop continues.

    Args:
        encoder: Forwarded to ``evaluate``.
        decoder: Forwarded to ``evaluate``.
        searcher: Forwarded to ``evaluate``.
        voc: Vocabulary forwarded to ``evaluate``.
    """
    hidden_words = ('EOS', 'PAD')
    while True:
        try:
            raw_text = input('> ')
            # Exit commands are matched against the raw, un-normalized input
            if raw_text == 'q' or raw_text == 'quit':
                return
            cleaned = normalize_string(raw_text)
            reply = evaluate(encoder, decoder, searcher, voc, cleaned)
            visible = [word for word in reply if word not in hidden_words]
            print('Bot:', ' '.join(visible))
        except KeyError:
            print("Error: Encountered unknown word.")


# Load a trained checkpoint.
# map_location remaps stored tensors onto the current device, so a checkpoint
# saved on a GPU machine also loads on a CPU-only one.
# NOTE: torch.load unpickles the file — only load checkpoints you trust.
checkpoint = torch.load('checkpoints/1600_checkpoint.tar', map_location=device)

encoder_sd = checkpoint['en']
decoder_sd = checkpoint['de']
encoder_optimizer_sd = checkpoint['en_opt']
decoder_optimizer_sd = checkpoint['de_opt']
embedding_sd = checkpoint['embedding']
# Replace the freshly built vocabulary with the one the model was trained on
voc.__dict__ = checkpoint['voc_dict']


print('Building encoder and decoder ...')

# Initialize word embeddings.
# The modules are rebuilt here, after the vocabulary swap, because their sizes
# depend on voc.num_words: modules created from the vocabulary of a different
# train/validation split would not match the checkpoint's tensor shapes.
embedding = nn.Embedding(voc.num_words, hidden_size)
embedding.load_state_dict(embedding_sd)

# Initialize encoder & decoder models (same configuration as used above)
encoder = EncoderRNN(hidden_size, embedding, encoder_n_layers, dropout)
decoder = LuongAttnDecoderRNN('dot', embedding, hidden_size, voc.num_words,
                              decoder_n_layers, dropout)
encoder.load_state_dict(encoder_sd)
decoder.load_state_dict(decoder_sd)

# Use appropriate device: evaluate() moves its input batch to `device`, so the
# models have to live there too.
encoder = encoder.to(device)
decoder = decoder.to(device)

print('Models built and ready to go!')
# Set dropout layers to eval mode
encoder.eval()
decoder.eval()

# Initialize search module
searcher = GreedySearchDecoder(encoder, decoder)

# Begin chatting
evaluateInput(encoder, decoder, searcher, voc)

Building encoder and decoder ...
Models built and ready to go!
Bot: hello ?
Bot: i m
Bot: i m
