In [1]:
# Imports for data handling and manipulation
import os
import re
import unicodedata
import json
from sklearn.model_selection import train_test_split
import random

# PyTorch imports for building and training neural networks
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

# Natural Language Toolkit for preprocessing and tokenization
import nltk
from nltk.tokenize import word_tokenize

# Ensure that NLTK resources are downloaded (e.g., punkt tokenizer)
nltk.download('punkt')

# Convokit for downloading and processing the Cornell Movie-Dialogs Corpus
from convokit import Corpus, download

# Check if CUDA is available for GPU acceleration, else use CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction

from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence



[nltk_data] Downloading package punkt to
[nltk_data]     /Users/vladandreichuk/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Data Prep

In [2]:
class Vocabulary:
    def __init__(self):
        self.word2index = {"SOS": 0, "EOS": 1, "PAD": 2}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS", 2: "PAD"}
        self.num_words = 3  # Count SOS, EOS, and PAD

    def add_sentence(self, sentence):
        for word in nltk.word_tokenize(sentence):
            self.add_word(word)

    def add_word(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.num_words
            self.word2count[word] = 1
            self.index2word[self.num_words] = word
            self.num_words += 1
        else:
            self.word2count[word] += 1

In [3]:
def unicode_to_ascii(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')

def normalize_string(s):
    s = unicode_to_ascii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s

In [4]:
# Load and preprocess data
def load_and_preprocess_data():
    corpus = Corpus(filename=download("movie-corpus"))
    qa_pairs = []
    for convo in corpus.iter_conversations():
        for i in range(len(convo.get_utterance_ids()) - 1):
            input_line = convo.get_utterance(convo.get_utterance_ids()[i]).text
            target_line = convo.get_utterance(convo.get_utterance_ids()[i + 1]).text
            input_line = normalize_string(input_line)
            target_line = normalize_string(target_line)
            qa_pairs.append([input_line, target_line])
    return qa_pairs


In [5]:
# Building the vocabulary
def build_vocab(qa_pairs):
    vocab = Vocabulary()
    for pair in qa_pairs:
        vocab.add_sentence(pair[0])
        vocab.add_sentence(pair[1])
    return vocab

In [6]:
qa_pairs = load_and_preprocess_data()
vocab = build_vocab(qa_pairs)

Downloading movie-corpus to /Users/vladandreichuk/.convokit/downloads/movie-corpus
Downloading movie-corpus from http://zissou.infosci.cornell.edu/convokit/datasets/movie-corpus/movie-corpus.zip (40.9MB)... Done


In [7]:
# Split the dataset into training, validation, and test sets
train_val_pairs, test_pairs = train_test_split(qa_pairs, test_size=0.1)
train_pairs, val_pairs = train_test_split(train_val_pairs, test_size=0.11)  # 0.11 * 0.9 ≈ 0.1

In [8]:
MAX_LENGTH = 10  # Maximum sentence length for filtering

In [9]:
class DialogueDataset(Dataset):
    def __init__(self, pairs, vocab):
        self.pairs = pairs
        self.vocab = vocab

    def indexes_from_sentence(self, sentence):
        return [self.vocab.word2index[word] for word in nltk.word_tokenize(sentence)] + [self.vocab.word2index["EOS"]]

    def pad_sequence(self, sequence, max_length):
        padded_sequence = sequence + [self.vocab.word2index["PAD"]] * (max_length - len(sequence))
        return padded_sequence

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        input_sequence = self.indexes_from_sentence(self.pairs[idx][0])
        target_sequence = self.indexes_from_sentence(self.pairs[idx][1])
        max_length = max(len(input_sequence), len(target_sequence))
        input_padded = self.pad_sequence(input_sequence, max_length)
        target_padded = self.pad_sequence(target_sequence, max_length)
        return {
            'input_tensor': torch.tensor(input_padded, dtype=torch.long),
            'target_tensor': torch.tensor(target_padded, dtype=torch.long),
            'input_length': len(input_sequence),
            'target_length': len(target_sequence)
        }


In [10]:
def collate_fn(batch):
    input_tensors = [item['input_tensor'] for item in batch]
    target_tensors = [item['target_tensor'] for item in batch]
    input_lengths = [len(input_tensor) for input_tensor in input_tensors]
    target_lengths = [len(target_tensor) for target_tensor in target_tensors]

    # Padding sequences
    input_tensors = torch.nn.utils.rnn.pad_sequence(input_tensors, padding_value=vocab.word2index["PAD"])
    target_tensors = torch.nn.utils.rnn.pad_sequence(target_tensors, padding_value=vocab.word2index["PAD"])

    return {
        'input_tensor': input_tensors,
        'target_tensor': target_tensors,
        'input_length': torch.tensor(input_lengths),
        'target_length': torch.tensor(target_lengths)
    }

# Model

In [11]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers=1, dropout=0.1):
        super(EncoderRNN, self).__init__()
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, num_layers, dropout=(0 if num_layers == 1 else dropout), bidirectional=True)

    def forward(self, input_seq, input_lengths, hidden=None):
        embedded = self.embedding(input_seq)
        packed = pack_padded_sequence(embedded, input_lengths, enforce_sorted=False)
        outputs, hidden = self.gru(packed, hidden)
        outputs, _ = pad_packed_sequence(outputs)
        outputs = outputs[:, :, :self.hidden_size] + outputs[:, :, self.hidden_size:]
        return outputs, hidden

In [12]:
class Attn(nn.Module):
    def __init__(self, method, hidden_size):
        super(Attn, self).__init__()
        self.method = method
        if self.method not in ['dot', 'general', 'concat']:
            raise ValueError(self.method, "is not an appropriate attention method.")
        self.hidden_size = hidden_size

        if self.method == 'general':
            self.attn = nn.Linear(self.hidden_size, hidden_size)
        elif self.method == 'concat':
            self.attn = nn.Linear(self.hidden_size * 2, hidden_size)
            self.v = nn.Parameter(torch.FloatTensor(hidden_size))

    def dot_score(self, hidden, encoder_output):
        return torch.sum(hidden * encoder_output, dim=2)

    def general_score(self, hidden, encoder_output):
        energy = self.attn(encoder_output)
        return torch.sum(hidden * energy, dim=2)

    def forward(self, hidden, encoder_outputs):
        if self.method == 'dot':
            attn_energies = self.dot_score(hidden, encoder_outputs)
        elif self.method == 'general':
            attn_energies = self.general_score(hidden, encoder_outputs)

        attn_energies = attn_energies.t()
        return F.softmax(attn_energies, dim=1).unsqueeze(1)



In [13]:
class DecoderRNN(nn.Module):
    def __init__(self, attn_model, embedding_size, hidden_size, output_size, num_layers=1, dropout=0.1):
        super(DecoderRNN, self).__init__()
        self.attn_model = attn_model
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.num_layers = num_layers
        self.dropout = dropout

        self.embedding = nn.Embedding(output_size, embedding_size)
        self.embedding_dropout = nn.Dropout(dropout)
        self.gru = nn.GRU(embedding_size + hidden_size, hidden_size, num_layers, dropout=(0 if num_layers == 1 else dropout))
        self.concat = nn.Linear(hidden_size * 2, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)

        self.attn = Attn(attn_model, hidden_size)

    def forward(self, input_step, last_hidden, encoder_outputs):
        embedded = self.embedding(input_step)
        embedded = self.embedding_dropout(embedded)
        attn_weights = self.attn(last_hidden[-1], encoder_outputs)
        context = attn_weights.bmm(encoder_outputs.transpose(0, 1))
        context = context.transpose(0, 1)
        rnn_output, hidden = self.gru(torch.cat((embedded, context), 2), last_hidden)
        rnn_output = rnn_output.squeeze(0)
        context = context.squeeze()
        concat_input = torch.cat((rnn_output, context), 1)
        concat_output = torch.tanh(self.concat(concat_input))
        output = self.out(concat_output)
        output = F.softmax(output, dim=1)
        return output, hidden, attn_weights


In [14]:
# Model Hyperparameters
hidden_size = 500
encoder_n_layers = 2
decoder_n_layers = 2
dropout = 0.1
attn_model = 'dot'
embedding_size = hidden_size
learning_rate = 0.0001
decoder_learning_ratio = 5.0
teacher_forcing_ratio = 1.0
clip = 1.0
batch_size = 64
n_iteration = 4000  # Adjust the number of iterations as needed
print_every = 1
save_every = 500

In [15]:
# Initialize models
encoder = EncoderRNN(vocab.num_words, hidden_size, encoder_n_layers, dropout).to(device)
decoder = DecoderRNN(attn_model, embedding_size, hidden_size, vocab.num_words, decoder_n_layers, dropout).to(device)


# Training

In [25]:
# Create datasets
train_dataset = DialogueDataset(train_pairs, vocab)
val_dataset = DialogueDataset(val_pairs, vocab)
test_dataset = DialogueDataset(test_pairs, vocab)

# Data loaders for batching
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True, drop_last=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True, drop_last=True, collate_fn=collate_fn)

In [26]:
SOS_token = 0  # Start-of-sentence token
EOS_token = 1  # End-of-sentence token

In [27]:
# Ensure dropout layers are in train mode
encoder.train()
decoder.train()

DecoderRNN(
  (embedding): Embedding(49892, 500)
  (embedding_dropout): Dropout(p=0.1, inplace=False)
  (gru): GRU(1000, 500, num_layers=2, dropout=0.1)
  (concat): Linear(in_features=1000, out_features=500, bias=True)
  (out): Linear(in_features=500, out_features=49892, bias=True)
  (attn): Attn()
)

In [34]:
# Initialize optimizers
print('Building optimizers...')
encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate * decoder_learning_ratio)

Building optimizers...


In [29]:
def maskNLLLoss(inp, target, mask):
    nTotal = mask.sum()
    crossEntropy = -torch.log(torch.gather(inp, 1, target.view(-1, 1)).squeeze(1))
    loss = crossEntropy.masked_select(mask).mean()
    loss = loss.to(device)
    return loss, nTotal.item()

In [46]:
def train(input_tensor, input_length, target_tensor, target_length, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length, clip):
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_tensor = input_tensor.to(device)
    target_tensor = target_tensor.to(device)

    loss = 0

    encoder_outputs, encoder_hidden = encoder(input_tensor, input_length, None)

    decoder_input = torch.tensor([[vocab.word2index['SOS']] * target_tensor.size(1)], device=device)
    decoder_hidden = encoder_hidden[:decoder.num_layers]

    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False
    batch_size = target_tensor.size(1)
    print("Batch size: ", batch_size)
    print("Target length: ", target_length)
    max_target_len = max(target_length).item()  # Change here
    print("Max target length: ", max_target_len)

    if use_teacher_forcing:
        for di in range(max_target_len):  # Adjust loop to use max_target_len
            decoder_output, decoder_hidden, _ = decoder(decoder_input, decoder_hidden, encoder_outputs)
            current_targets = target_tensor[di]
            decoder_input = target_tensor[di].view(1, -1)  # Adjust indexing for teacher forcing
            # Calculate loss for each item in the batch and sum
            print("Decorder output and current targets shapes:", decoder_output.shape, current_targets.shape)
            loss_i = criterion(decoder_output, current_targets)
            print("Loss: ", loss_i)
            raise TypeError("Error")
            loss += loss_i
    else:
        for di in range(max_length):  # Loop up to max_length as before
            decoder_output, decoder_hidden, _ = decoder(decoder_input, decoder_hidden, encoder_outputs)
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach().unsqueeze(0)  # Detach from history as input
            # Calculate loss for each item in the batch and sum
            loss += criterion(decoder_output, target_tensor[:, di])
            if decoder_input.item() == vocab.word2index['EOS']:
                break

    # Adjust loss calculation to average over only the non-PAD tokens
    loss = loss / batch_size
    loss.backward()

    torch.nn.utils.clip_grad_norm_(encoder.parameters(), clip)
    torch.nn.utils.clip_grad_norm_(decoder.parameters(), clip)

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / max_target_len  # Return average loss per token

In [47]:
def trainIters(model_name, train_loader, encoder, decoder, encoder_optimizer, decoder_optimizer, n_iteration, print_every, save_every, clip, max_length):
    print("Starting Training!")
    print_loss_total = 0
    criterion = nn.CrossEntropyLoss(ignore_index=vocab.word2index['PAD'])

    for iteration in range(1, n_iteration + 1):
        training_batch = next(iter(train_loader))
        input_tensor = training_batch['input_tensor']
        target_tensor = training_batch['target_tensor']
        input_length = training_batch['input_length']
        target_length = training_batch['target_length']
        
        loss = train(input_tensor, input_length, target_tensor, target_length, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length, clip)

        print_loss_total += loss
        # plot_loss_total += loss

        if iteration % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print(f'Iteration: {iteration}; Average Loss: {print_loss_avg:.4f}')

        if iteration % save_every == 0:
            directory = os.path.join('saved_models', model_name)
            if not os.path.exists(directory):
                os.makedirs(directory)
            torch.save({
                'iteration': iteration,
                'en': encoder.state_dict(),
                'de': decoder.state_dict(),
                'en_opt': encoder_optimizer.state_dict(),
                'de_opt': decoder_optimizer.state_dict(),
                'loss': loss,
                'vocab_dict': vocab.__dict__,
            }, os.path.join(directory, f'{model_name}_{iteration}.pt'))

In [48]:
# Define a path to save the model
model_name = "seq2seq_chatbot"
save_dir = "./"  # Adjust as needed

In [49]:
# Start the training
trainIters(model_name, train_loader, encoder, decoder, encoder_optimizer, decoder_optimizer, n_iteration, print_every, save_every, clip,max_length=10)

Starting Training!
Batch size:  64
Target length:  tensor([11, 10, 29,  8, 16, 19,  8, 24, 32, 26,  8, 22,  7, 60,  5, 39, 16, 16,
        16, 10, 14,  6, 28, 12, 21, 74, 30,  8, 35, 14,  4, 12, 19, 17, 27,  8,
        26, 26, 38, 16, 16, 21,  6, 16, 10, 34, 10, 18, 19, 12, 48, 32, 26, 44,
         8,  5, 29, 12, 26, 15, 35, 15, 16,  3])
Max target length:  74
Decorder output and current targets shapes: torch.Size([64, 49892]) torch.Size([64])
Loss:  tensor(10.8176, grad_fn=<NllLossBackward0>)


TypeError: Error

# Inference

In [None]:
def evaluate_sentence(encoder, decoder, sentence, vocab, device, max_length=10):
    with torch.no_grad():
        input_tensor = tensorFromSentence(vocab, sentence, device)
        input_length = input_tensor.size(0)
        encoder_hidden = encoder.initHidden(device)

        encoder_outputs, encoder_hidden = encoder(input_tensor, torch.tensor([input_length], device=device))

        decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS

        decoder_hidden = encoder_hidden[:decoder.n_layers]

        decoded_words = []
        for di in range(max_length):
            decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden, encoder_outputs)
            topv, topi = decoder_output.topk(1)
            if topi.item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            else:
                decoded_words.append(index2word[topi.item()])
            decoder_input = topi.squeeze().detach()

        return decoded_words

In [None]:
def evaluate_bleu(encoder, decoder, pairs, vocab, device):
    smoothing_function = SmoothingFunction().method1
    references = []
    candidates = []
    for input_sentence, reference_sentence in pairs:
        reference = word_tokenize(reference_sentence)
        candidate = evaluate_sentence(encoder, decoder, input_sentence, vocab, device)
        if '<EOS>' in candidate:
            candidate.remove('<EOS>')
        references.append([reference])
        candidates.append(candidate)
    score = corpus_bleu(references, candidates, smoothing_function=smoothing_function)
    return score

In [None]:
def create_pairs(conversations):
    """Creates input-target pairs from conversations"""
    pairs = []
    for conv in conversations:
        for i in range(len(conv) - 1):
            input_sentence = ' '.join(conv[i])  # Join tokens into a single string
            target_sentence = ' '.join(conv[i + 1])
            pairs.append((input_sentence, target_sentence))
    return pairs

test_pairs = create_pairs(test_conversations)

In [None]:
# Assuming test_pairs is a list of (input_sentence, reference_sentence) pairs
bleu_score = evaluate_bleu(encoder, decoder, test_pairs, vocab, device)
print(f'BLEU score: {bleu_score:.4f}')
