Chatbot Development Using Sequence-to-Sequence Model

In this guide, we will develop a chatbot using the Sequence-to-Sequence (Seq2Seq) model,
leveraging the Cornell Movie Dialogs Corpus.
The objective is to create a chatbot that can understand and respond to human queries by learning from movie dialogues.

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


1. Environment Setup

In [1]:
# !pip install tensorflow==2.11 keras==2.11.0 numpy pandas
import tensorflow as tf
print(tf.__version__)
print(tf.keras.__version__)

2.11.0
2.11.0


2. Import Libraries

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import json

3. Load the Dataset and Convert the JSON data to a pandas DataFrame

In [4]:
with open('archive/movie-dialog-corpus-metadata.json', 'r') as file:
    data = json.load(file)


4. Preprocess the Data

In [10]:
# Extract the relevant columns from the 'field' entries
columns = [field['source']['extract']['column'] for field in data['recordSet'][0]['field']]

# Load the TSV file
tsv_path = 'archive/movie_characters_metadata.tsv'
df = pd.read_csv(tsv_path, sep='\t', usecols=columns)

# Combine the columns into dialogues (if needed)
# Here, we assume each column represents a line of dialogue
dialogues = df.apply(lambda row: ' '.join(row.values.astype(str)), axis=1)

# Convert to DataFrame
dialogues_df = pd.DataFrame(dialogues, columns=['text'])

# Extracting dialogues
dialogues = dialogues_df['text'].apply(lambda x: x.lower())

# Tokenization
tokenizer = Tokenizer()
tokenizer.fit_on_texts(dialogues)
tokenized_dialogues = tokenizer.texts_to_sequences(dialogues)

# Padding sequences
max_sequence_len = max([len(x) for x in tokenized_dialogues])
input_sequences = pad_sequences(tokenized_dialogues, maxlen=max_sequence_len, padding='post')

# Add special tokens to the tokenizer after fitting
start_token = 'startseq'
end_token = 'endseq'
tokenizer.word_index[start_token] = len(tokenizer.word_index) + 1
tokenizer.word_index[end_token] = len(tokenizer.word_index) + 1
tokenizer.index_word[len(tokenizer.word_index)] = start_token
tokenizer.index_word[len(tokenizer.word_index) + 1] = end_token

# Create target sequences by shifting input sequences by one position
target_sequences = np.zeros_like(input_sequences)
target_sequences[:, :-1] = input_sequences[:, 1:]
target_sequences[:, -1] = tokenizer.word_index[end_token]

# Check for any out-of-bound indices
vocab_size = len(tokenizer.word_index) + 1
print("Vocabulary size:", vocab_size)
print("Max index in target_sequences:", np.max(target_sequences))
if np.any(target_sequences >= vocab_size):
    print("Out of bound indices found")
    out_of_bounds_indices = np.where(target_sequences >= vocab_size)
    print("Out of bound indices:", out_of_bounds_indices)
    print("Target sequences at out of bound indices:", target_sequences[out_of_bounds_indices])
    raise ValueError("Some indices in target_sequences are out of bounds.")

#

Vocabulary size: 15244
Max index in target_sequences: 15243


5. Create the Seq2Seq Model

In [11]:
# Define the Seq2Seq model
latent_dim = 256  # Latent dimensionality of the encoding space
num_tokens = vocab_size

# Encoder
encoder_inputs = Input(shape=(None,))
enc_emb = Embedding(num_tokens, latent_dim, mask_zero=True)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)
encoder_states = [state_h, state_c]

# Decoder
decoder_inputs = Input(shape=(None,))
dec_emb_layer = Embedding(num_tokens, latent_dim, mask_zero=True)
dec_emb = dec_emb_layer(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)
decoder_dense = Dense(num_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)


6. Train the Model

In [12]:
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Assuming you have target_sequences ready
model.fit([input_sequences, target_sequences], target_sequences, batch_size=64, epochs=10, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x784992502d40>

7. Set up the inference models

In [13]:
# Encoder inference model
encoder_model = Model(encoder_inputs, encoder_states)

# Decoder inference model
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
dec_emb2 = dec_emb_layer(decoder_inputs)
decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=decoder_states_inputs)
decoder_states2 = [state_h2, state_c2]
decoder_outputs2 = decoder_dense(decoder_outputs2)
decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs2] + decoder_states2)

8. Generate Responses

In [16]:
# Function to preprocess input text
def preprocess_input(text):
    tokenized_input = tokenizer.texts_to_sequences([text.lower()])
    padded_input = pad_sequences(tokenized_input, maxlen=max_sequence_len, padding='post')
    return padded_input

# Function to generate chatbot responses
def decode_sequence(input_seq):
    # Encode the input as state vectors
    states_value = encoder_model.predict(input_seq)

    # Generate empty target sequence of length 1
    target_seq = np.zeros((1, 1))

    # Populate the first character of target sequence with the start character
    target_seq[0, 0] = tokenizer.word_index['startseq']

    # Sampling loop for a batch of sequences
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = tokenizer.index_word[sampled_token_index]
        decoded_sentence += ' ' + sampled_char

        # Exit condition: either hit max length or find stop character
        if (sampled_char == 'endseq' or len(decoded_sentence) > 50):
            stop_condition = True

        # Update the target sequence (of length 1)
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

        # Update states
        states_value = [h, c]

    return decoded_sentence

# Function to generate response from input text
def generate_response(input_text):
    input_seq = preprocess_input(input_text)
    response = decode_sequence(input_seq)
    return response

# Test the chatbot
test_inputs = [
    "Hi there!",
    "How are you?",
    "Tell me a joke.",
    "What is your name?"
]

for input_text in test_inputs:
    print("Input:", input_text)
    response = generate_response(input_text)
    print("Response:", response)
    print()

Input: Hi there!
Response:  startseq startseq startseq startseq startseq startseq

Input: How are you?
Response:  man startseq startseq startseq startseq startseq startseq

Input: Tell me a joke.
Response:  man startseq startseq startseq startseq startseq startseq

Input: What is your name?
Response:  man m89 startseq startseq startseq startseq startseq

