In [2]:
# load libraries
import pandas as pd
import numpy as np
import re
import os

import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, LSTM, Bidirectional, Dense, Concatenate, TimeDistributed
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tqdm import tqdm


from sklearn.model_selection import train_test_split

# load movie line from local directory (remember to change to YOUR file location)
with open('movie_lines.txt', 'r', encoding='utf-8', errors='replace') as file:
    lines = pd.read_table(file, sep='\t', header=None, on_bad_lines='skip')

# load conversation file from github (no need to change anything here for data to load)
convolines = pd.read_table('https://raw.githubusercontent.com/wkencel/Generative-Chatbot-Project/refs/heads/main/movie_conversations.txt', sep='\t', header=None, encoding='utf-8', on_bad_lines='skip')

# view lines
lines[:10]

convolines[:10]

# Create dictionary to map each line's id with its text
id2line = {}

# Iterate over each row in the dataframe and access the text data
for line in lines[0]:  # Access the first column which contains the movie lines
    _line = line.split(' +++$+++ ')
    if len(_line) == 5:
        id2line[_line[0]] = _line[4]

# Print the first 10 entries
for i, (key, value) in enumerate(id2line.items()):
    if i < 10:  # Change this number to see more or fewer entries
        print(f"{key}: {value}")
    else:
        break

# Create a list of all of the conversations' lines' ids
convs = []
for index, row in convolines.iterrows():
    line = row[0]  # Access the first column of the row
    _line = line.split(' +++$+++ ')[-1][1:-1].replace("'", "").replace(" ", "")
    convs.append(_line.split(','))

# Print the first 10 entries
convs[:10]

# Sort the sentences: inputs (questions) and targets (answers)
questions = []
answers = []

for conv in convs:
    for i in range(len(conv) - 1):
        if conv[i] in id2line and conv[i + 1] in id2line:
            questions.append(id2line[conv[i]])
            answers.append(id2line[conv[i + 1]])

print("Number of questions:", len(questions))
print("Number of answers:", len(answers))

# Check if data is loaded correctly
limit = 0
for i in range(limit, limit+5):
    print(questions[i])
    print(answers[i])
    print()

# Create a DataFrame from questions and answers
data = {'Questions': questions, 'Answers': answers}
data = pd.DataFrame(data)

data.head()

data.shape

# Remove duplicates
data.drop_duplicates(inplace  = True)

data.shape

# Function for cleaning the text: lowercase, remove punctuations, and replace certain words

def clean_text(text):
    '''Clean text by removing unnecessary characters and altering the format of words.'''

    text = text.lower()

    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r"he's", "he is", text)
    text = re.sub(r"she's", "she is", text)
    text = re.sub(r"it's", "it is", text)
    text = re.sub(r"that's", "that is", text)
    text = re.sub(r"what's", "what is", text)
    text = re.sub(r"where's", "where is", text)
    text = re.sub(r"there's", "there is", text)
    text = re.sub(r"how's", "how is", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"can't", "cannot", text)
    text = re.sub(r"n't", " not", text)
    text = re.sub(r"n'", "ng", text)
    text = re.sub(r"'bout", "about", text)
    text = re.sub(r"'til", "until", text)
    text = re.sub(r"[-()\"#/@;:<>{}`+=~|.!?,]", "", text)

    return text

# Apply the function to the DataFrame
data['Questions'] = data['Questions'].apply(clean_text)
data['Answers'] = data['Answers'].apply(clean_text)

# Display the cleaned DataFrame
data.head()

# More text pre-processing
import string

exclude = set(string.punctuation)
remove_digits = str.maketrans('', '', string.digits)

# More text pre-processing
def preprocess_questions_sentences(sent):
    '''Function to preprocess English Sentence'''
    sent = sent.lower()
    sent = sent.replace("'", '')
    sent = ''.join(ch for ch in sent if ch not in exclude)
    sent = sent.translate(remove_digits)

    sent = sent.strip()
    sent = re.sub(" +", " ", sent)
    return sent


# include SOS (start of sent.) & EOS (end of sent.) tokens
def preprocess_answer_sentence(sent):
    if isinstance(sent, str):
        sent = sent.lower()
        sent = sent.replace("'", '')
        sent = ''.join(ch for ch in sent if ch not in exclude)
        sent = sent.translate(remove_digits)
        sent = sent.strip()
        sent = re.sub(" +", " ", sent)
        sent = "startseq " + sent + " endseq"
        return sent
    else:

        return sent

# Apply preprocess function on data
data['Questions'] = data['Questions'].apply(preprocess_questions_sentences)
data['Answers'] = data['Answers'].apply(preprocess_answer_sentence)

# Display the cleaned DataFrame
data.head()

# Remove questions and answers shorter than 1 word and longer than 20 words
min_line_length = 1
max_line_length = 20

# Create a function to count the number of words in a text
def count_words(text):
    return len(text.split())

# Filter the DataFrame
filtered_data = data[
    (data['Questions'].apply(count_words).between(min_line_length, max_line_length)) &
    (data['Answers'].apply(count_words).between(min_line_length, max_line_length))
]

# Update the original DataFrame
data = filtered_data

data.head()

# Sort Qs and As by length of questions to reduce amount of padding during training
# Hope to speed up training and reduce the loss

# Convert questions and answers to their respective lengths
data['Question_Length'] = data['Questions'].apply(lambda x: len(x.split()))
data['Answer_Length'] = data['Answers'].apply(lambda x: len(x.split()))

# Sort Qs and As by length of questions
sorted_questions = []
sorted_answers = []

for length in range(1, max_line_length + 1):
    for index, row in data.iterrows():
        if row['Question_Length'] == length:
            sorted_questions.append(row['Questions'])
            sorted_answers.append(row['Answers'])

# Output the results
print(len(sorted_questions))
print(len(sorted_answers))
print()
for i in range(min(3, len(sorted_questions))):  # Use min to avoid index errors
    print(f"Question {i + 1}: {sorted_questions[i]}")
    print(f"Answer {i + 1}: {sorted_answers[i]}")
    print()

# Sort the DataFrame by question length
data = data.sort_values(by='Question_Length')

# Reset index if needed
data.reset_index(drop=True, inplace=True)

# Output the sorted DataFrame
data[['Questions', 'Answers', 'Question_Length']].head()

data.shape

# Convert DataFrame columns to lists
q_sentences = data['Questions'].tolist()
a_sentences = data['Answers'].tolist()

# Define the split ratios
train_ratio = 0.80  # 80% for training
val_ratio = 0.10    # 10% for validation
test_ratio = 0.10   # 10% for testing

# Ensure the sum of ratios equals 1
assert train_ratio + val_ratio + test_ratio == 1.0, "Split ratios must sum to 1."

# Split into training and temporary sets (which will later be split into validation and test)
train_q_sents, temp_q_sents, train_a_sents, temp_a_sents = train_test_split(
    q_sentences, a_sentences, test_size=(1 - train_ratio), random_state=42, shuffle=True)

# Now split the temporary set into validation and test sets
val_size = val_ratio / (val_ratio + test_ratio)  # Calculate validation size relative to temp set
val_q_sents, test_q_sents, val_a_sents, test_a_sents = train_test_split(
    temp_q_sents, temp_a_sents, test_size=val_size, random_state=42, shuffle=True)

# VOCABULARY
# Filter out non-string elements from training sets
train_q_sents = [str(sent) for sent in train_q_sents]
train_a_sents = [str(sent) for sent in train_a_sents]

# Tokenize question sentences
ques_tokenizer = Tokenizer(oov_token='<OOV>')
ques_tokenizer.fit_on_texts(train_q_sents)
ques_vocab_size = len(ques_tokenizer.word_index) + 1

# Tokenize answer sentences
ans_tokenizer = Tokenizer()
ans_tokenizer.fit_on_texts(train_a_sents)
ans_vocab_size = len(ans_tokenizer.word_index) + 1

print(f"Question Vocabulary Size: {ques_vocab_size}\nAnswer Vocabulary Size: {ans_vocab_size}")

max_length = 20 #Updated from 30 to 20

# Convert text to sequences
ques_sequences = ques_tokenizer.texts_to_sequences(train_q_sents)
ans_sequences = ans_tokenizer.texts_to_sequences(train_a_sents)

# Pad sequences
source_seqs = pad_sequences(ques_sequences, maxlen=max_length, padding='post')
target_seqs = pad_sequences(ans_sequences, maxlen=max_length, padding='post')

# Create training dataset
train_dataset = tf.data.Dataset.from_tensor_slices((source_seqs, target_seqs))
train_dataset = train_dataset.shuffle(buffer_size=len(source_seqs)).batch(128, drop_remainder=True)

# Create validation dataset
val_sequences = ques_tokenizer.texts_to_sequences(val_q_sents)
val_sequences = pad_sequences(val_sequences, maxlen=max_length, padding='post')
val_target_sequences = ans_tokenizer.texts_to_sequences(val_a_sents)
val_target_sequences = pad_sequences(val_target_sequences, maxlen=max_length, padding='post')
val_dataset = tf.data.Dataset.from_tensor_slices((val_sequences, val_target_sequences))
val_dataset = val_dataset.batch(128, drop_remainder=True)

# Create test dataset
test_sequences = ques_tokenizer.texts_to_sequences(test_q_sents)
test_sequences = pad_sequences(test_sequences, maxlen=max_length, padding='post')
test_target_sequences = ans_tokenizer.texts_to_sequences(test_a_sents)
test_target_sequences = pad_sequences(test_target_sequences, maxlen=max_length, padding='post')
test_dataset = tf.data.Dataset.from_tensor_slices((test_sequences, test_target_sequences))
test_dataset = test_dataset.batch(128, drop_remainder=True)

# Print sizes of the datasets
print(f"Training set size: {len(train_q_sents)}")
print(f"Validation set size: {len(val_q_sents)}")
print(f"Test set size: {len(test_q_sents)}")

L1045: They do not!
L1044: They do to!
L985: I hope so.
L984: She okay?
L925: Let's go.
L924: Wow
L872: Okay -- you're gonna need to learn how to lie.
L871: No
L870: I'm kidding.  You know how sometimes you just become this "persona"?  And you don't know how to quit?
L869: Like my fear of wearing pastels?
Number of questions: 221416
Number of answers: 221416
Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.
Well, I thought we'd start with pronunciation, if that's okay with you.

Well, I thought we'd start with pronunciation, if that's okay with you.
Not the hacking and gagging and spitting part.  Please.

Not the hacking and gagging and spitting part.  Please.
Okay... then how 'bout we try out some French cuisine.  Saturday?  Night?

You're asking me out.  That's so cute. What's your name again?
Forget it.

No, no, it's my fault -- we didn't have a proper introduction ---
Cameron.

160580
160580

Questi

In [8]:
# Define the Seq2Seq Model with Attention Mechanism

data = data.sample(frac=0.5, random_state=42)  # Reduce to 50% of the original data

# Import necessary modules
from tensorflow.keras.metrics import SparseCategoricalAccuracy

# Define the accuracy metric
accuracy_metric = SparseCategoricalAccuracy()

class AttentionLayer(tf.keras.layers.Layer):
    def __init__(self, units):
        super(AttentionLayer, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, encoder_output, hidden_state):
        hidden_with_time_axis = tf.expand_dims(hidden_state, 1)
        score = tf.nn.tanh(self.W1(encoder_output) + self.W2(hidden_with_time_axis))

        # Calculate attention weights
        attention_weights = tf.nn.softmax(self.V(score), axis=1)

        # Weighted sum of the encoder output using the attention weights
        context_vector = attention_weights * encoder_output
        context_vector = tf.reduce_sum(context_vector, axis=1)

        return context_vector, attention_weights

# Encoder class definition
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.lstm = tf.keras.layers.Bidirectional(
            tf.keras.layers.LSTM(self.enc_units,
                                 return_sequences=True,
                                 return_state=True,
                                 recurrent_initializer='glorot_uniform'))

    def call(self, x):
        x = self.embedding(x)
        output, forward_h, forward_c, backward_h, backward_c = self.lstm(x)
        state_h = tf.keras.layers.Concatenate()([forward_h, backward_h])
        state_c = tf.keras.layers.Concatenate()([forward_c, backward_c])
        return output, state_h, state_c

# Decoder class definition
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.lstm = tf.keras.layers.LSTM(self.dec_units * 2,
                                         return_sequences=True,
                                         return_state=True,
                                         recurrent_initializer='glorot_uniform')
        self.fc = tf.keras.layers.Dense(vocab_size)
        self.attention = AttentionLayer(self.dec_units)

    def call(self, x, enc_output, hidden, enc_state_c):
        context_vector, attention_weights = self.attention(enc_output, hidden)
        x = self.embedding(x)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
        output, state_h, state_c = self.lstm(x, initial_state=[hidden, enc_state_c])
        x = self.fc(output)
        x = tf.reshape(x, (-1, x.shape[2]))
        return x, state_h, state_c, attention_weights

# Training process
optimizer = tf.keras.optimizers.Adam()

# Loss function
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask
    return tf.reduce_mean(loss_)

# Hyperparameters
embedding_dim = 512
units = 256
BATCH_SIZE = 128

# Define the encoder and decoder
encoder = Encoder(ques_vocab_size, embedding_dim, units, BATCH_SIZE)
decoder = Decoder(ans_vocab_size, embedding_dim, units, BATCH_SIZE)

# Checkpoint saving
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer, encoder=encoder, decoder=decoder)

# Training step function with accuracy tracking
@tf.function
def train_step(inp, targ, enc_hidden, enc_state_c):
    loss = 0
    accuracy_metric.reset_state()  # Reset accuracy metric for each batch

    with tf.GradientTape() as tape:
        enc_output, enc_hidden, enc_state_c = encoder(inp)

        dec_hidden = enc_hidden
        dec_state_c = enc_state_c
        dec_input = tf.expand_dims([ans_tokenizer.word_index['startseq']] * BATCH_SIZE, 1)

        for t in range(1, targ.shape[1]):
            predictions, dec_hidden, dec_state_c, _ = decoder(dec_input, enc_output, dec_hidden, dec_state_c)

            loss += loss_function(targ[:, t], predictions)

            # Update accuracy metric
            accuracy_metric.update_state(targ[:, t], predictions)

            dec_input = tf.expand_dims(targ[:, t], 1)

    batch_loss = (loss / int(targ.shape[1]))
    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))

    return batch_loss, accuracy_metric.result()

# Training loop with accuracy displayed after each epoch
EPOCHS = 15

for epoch in range(EPOCHS):
    enc_hidden = tf.zeros((BATCH_SIZE, units * 2))
    enc_state_c = tf.zeros((BATCH_SIZE, units * 2))

    total_loss = 0
    total_accuracy = 0

    for (batch, (inp, targ)) in enumerate(train_dataset.take(len(train_q_sents)//BATCH_SIZE)):
        batch_loss, batch_accuracy = train_step(inp, targ, enc_hidden, enc_state_c)
        total_loss += batch_loss

        if batch % 100 == 0:
            print(f'Epoch {epoch+1} Batch {batch} Loss {batch_loss.numpy()} Accuracy {batch_accuracy.numpy()}')

    # Saving the model after every epoch
    checkpoint.save(file_prefix=checkpoint_prefix)

    print(f'Epoch {epoch+1} Loss {total_loss / (len(train_q_sents)//BATCH_SIZE)} Accuracy {batch_accuracy.numpy()}')

# Validation and Testing (example code can be added for evaluation if needed)

# Save paths using the TensorFlow format
encoder_save_path = './saved_models_15epocs/encoder_model'
decoder_save_path = './saved_models_15epocs/decoder_model'

# Make directories if they don't exist
os.makedirs(os.path.dirname(encoder_save_path), exist_ok=True)

# Save the encoder and decoder models using TensorFlow SavedModel format
encoder.save(encoder_save_path, save_format="tf")
decoder.save(decoder_save_path, save_format="tf")

print(f"Encoder and Decoder models saved to {encoder_save_path} and {decoder_save_path} in SavedModel format.")

import shutil
import os


# Create zip archives of the SavedModel directories
encoder_zip_path = shutil.make_archive(encoder_save_path, 'zip', encoder_save_path)
decoder_zip_path = shutil.make_archive(decoder_save_path, 'zip', decoder_save_path)




Epoch 1 Batch 0 Loss 4.225269317626953 Accuracy 0.0
Epoch 1 Batch 100 Loss 2.3524129390716553 Accuracy 0.06044407933950424
Epoch 1 Batch 200 Loss 2.5202603340148926 Accuracy 0.060855261981487274
Epoch 1 Batch 300 Loss 2.256880283355713 Accuracy 0.06455592066049576
Epoch 1 Batch 400 Loss 1.973192572593689 Accuracy 0.07319078594446182
Epoch 1 Batch 500 Loss 2.3228752613067627 Accuracy 0.08799342066049576
Epoch 1 Batch 600 Loss 1.9807785749435425 Accuracy 0.09457236528396606
Epoch 1 Batch 700 Loss 2.103070020675659 Accuracy 0.09292763471603394
Epoch 1 Batch 800 Loss 2.033123731613159 Accuracy 0.10115131735801697
Epoch 1 Batch 900 Loss 1.8812826871871948 Accuracy 0.09046052396297455
Epoch 1 Batch 1000 Loss 2.061699628829956 Accuracy 0.09457236528396606
Epoch 1 Loss 2.1843485832214355 Accuracy 0.10608552396297455
Epoch 2 Batch 0 Loss 1.8094481229782104 Accuracy 0.09621710330247879
Epoch 2 Batch 100 Loss 1.876987099647522 Accuracy 0.09333881735801697
Epoch 2 Batch 200 Loss 1.8601092100143433