<a href="https://colab.research.google.com/github/tikendraw/chatbot-with-attention/blob/main/training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import sys
import os

if 'google.colab' in sys.modules:
    
    # Mount Google drive
    from google.colab import drive
    drive.mount('/content/drive')
    
    ! git clone https://github.com/tikendraw/chatbot-with-attention.git 
    os.chdir('chatbot-with-attention') 
    print(os.getcwd())

    ! pip install tensorflow==2.11 -q
    ! pip install tensorflow-text -q


Mounted at /content/drive
Cloning into 'chatbot-with-attention'...
remote: Enumerating objects: 48, done.[K
remote: Counting objects: 100% (48/48), done.[K
remote: Compressing objects: 100% (35/35), done.[K
remote: Total 48 (delta 7), reused 39 (delta 4), pack-reused 0[K
Unpacking objects: 100% (48/48), 35.30 MiB | 7.25 MiB/s, done.
/content/chatbot-with-attention
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.8/5.8 MB[0m [31m53.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import (
    TextVectorization, 
    Embedding, 
    LSTM, 
    GRU, 
    Bidirectional, 
    TimeDistributed, 
    Dense, 
    Attention, 
    MultiHeadAttention
)

import tensorflow_text as tf_text
import pickle
from datetime import datetime
from tensorflow.keras.callbacks import CSVLogger

# Number of GPUs TensorFlow can see; 0 means everything runs on the CPU.
gpu = len(tf.config.list_physical_devices('GPU'))
print('GPU Avaliable: ', gpu)


GPU Avaliable:  1


# Hyperparameters

In [3]:
# NOTE(review): MAX_OUTPUT_LENGTH and BATCH_SIZE are not referenced by any
# later cell in this notebook — presumably they mirror the settings used when
# the saved datasets were built; confirm against the preprocessing code.
MAX_OUTPUT_LENGTH = 102
BATCH_SIZE = 32
# Passed to ChatBot as `units`: width of the encoder/decoder GRUs and the
# attention key_dim.
UNITS = 64
# Passed to ChatBot as `embed_dims`: output size of both Embedding layers.
EMBEDDING_DIMS = 128

# Vectorizer

In [4]:
# preprocessing text
def tf_lower_and_split_punct_en(text):
    """Standardize raw text into a lower-case, punctuation-separated string.

    Steps: NFKD-normalize, lower-case, drop every character other than
    space / a-z / ``. ? ! , ¿``, pad punctuation with spaces, strip the
    ends, and wrap the result in ``[START]`` ... ``[END]`` markers.

    Args:
        text: string tensor (or value convertible to one) to standardize.

    Returns:
        String tensor with the standardized text.
    """
    # NFKD splits accented characters into base letter + combining mark;
    # the filter below then discards the marks.
    lowered = tf.strings.lower(tf_text.normalize_utf8(text, 'NFKD'))

    # Whitelist filter: anything outside the listed characters is removed.
    filtered = tf.strings.regex_replace(lowered, '[^ a-z.?!,¿]', '')

    # Put a space on both sides of each punctuation mark.
    spaced = tf.strings.regex_replace(filtered, '[.?!,¿|]', r' \0 ')

    trimmed = tf.strings.strip(spaced)
    return tf.strings.join(['[START]', trimmed, '[END]'], separator=' ')



In [5]:
# Loading vectorizer
# SECURITY NOTE: unpickling can execute arbitrary code, so only load a
# vectorizer file that comes from a source you trust.
# The context manager closes the file handle once the payload is read
# (the previous bare `open(...)` left it open).
with open("./components/vectorizer.pkl", "rb") as vectorizer_file:
    from_disk = pickle.load(vectorizer_file)

# Rebuild the layer from its saved config, then restore its saved weights.
vectorizer = TextVectorization.from_config(from_disk['config'])
# You have to call `adapt` with some dummy data (BUG in Keras)
vectorizer.adapt(tf.data.Dataset.from_tensor_slices(["xyz"]))
vectorizer.set_weights(from_disk['weights'])

# Let's see the token ids the vectorizer produces for a sample sentence
# print (vectorizer("who am i"))

# Dataset

In [6]:
# Directories holding the previously saved tf.data datasets.
save_train_data_path = './dataset/train/'
save_test_data_path = './dataset/test/'

# Read both splits back from disk; each is loaded as GZIP-compressed.
train_data, test_data = (
    tf.data.Dataset.load(dataset_dir, compression='GZIP')
    for dataset_dir in (save_train_data_path, save_test_data_path)
)

In [7]:
# Show the first 20 token ids of the first example in one training batch.
# Each dataset element is ((encoder input, decoder input), decoder target).
# `take(1)` yields exactly one batch, so no explicit `break` is needed; the
# loop variables stay bound afterwards for ad-hoc inspection.
for (enc_input, dec_input), dec_output in train_data.take(1):
    print('encoder input')
    print(enc_input[0, :20].numpy())
    print('-'*44)
    print('decoder input')
    print(dec_input[0, :20].numpy())
    print('-'*44)
    # This is the decoder target; it was previously mislabelled "encoder output".
    print('decoder output')
    print(dec_output[0, :20].numpy())

encoder input
[    3    20 10942   120     5  1994    11  6126    14  9357  5660   154
     9  7211     2   443   532   105    61 11786]
--------------------------------------------
decoder input
[   3 1361  174   13 3319 4872   69   39   11 1415  764  922    2    4
    0    0    0    0    0    0]
--------------------------------------------
encoder output
[1361  174   13 3319 4872   69   39   11 1415  764  922    2    4    0
    0    0    0    0    0    0]


# Attention

In [8]:
class CrossAttention(tf.keras.layers.Layer):
    """Single-head attention followed by a residual add and layer norm.

    The query sequence attends over `context`; the attention result is added
    back onto the query and normalized.

    Args:
        units: `key_dim` of the underlying MultiHeadAttention layer.
        **kwargs: forwarded to `tf.keras.layers.MultiHeadAttention`
            (not to the base Layer constructor).
    """

    def __init__(self, units, **kwargs):
        super().__init__()
        self.mha = tf.keras.layers.MultiHeadAttention(
            num_heads=1,
            key_dim=units,
            **kwargs,
        )
        self.layernorm = tf.keras.layers.LayerNormalization()
        self.add = tf.keras.layers.Add()

    def call(self, x, context):
        """Attend from `x` over `context`.

        Args:
            x: query sequence.
            context: sequence used as the attention value.

        Returns:
            layernorm(x + attention(x, context)).
        """
        attended, scores = self.mha(
            query=x,
            value=context,
            return_attention_scores=True,
        )

        # Keep the scores around for later plotting, averaged over axis 1
        # (the heads axis of MultiHeadAttention's score tensor).
        self.last_attention_weights = tf.reduce_mean(scores, axis=1)

        residual = self.add([x, attended])
        return self.layernorm(residual)

# Encoder

In [54]:
class Encoder(tf.keras.layers.Layer):
    """Embeds token ids and runs them through a bidirectional GRU.

    Args:
        text_vectorizer: vectorizer layer; supplies the vocabulary size and
            is used by `convert_input` to tokenize raw strings.
        units: width of each GRU direction.
        embed_dims: embedding output size.
    """

    def __init__(self, text_vectorizer, units, embed_dims):
        super().__init__()
        self.text_vectorizer = text_vectorizer
        self.units = units
        self.embed_dims = embed_dims
        self.vocab_size = text_vectorizer.vocabulary_size()
        # mask_zero=True: token id 0 is treated as padding.
        self.embedding = Embedding(
            input_dim=self.vocab_size,
            output_dim=self.embed_dims,
            mask_zero=True,
        )
        self.rnn = Bidirectional(
            layer=GRU(self.units, return_sequences=True, return_state=True),
            merge_mode='concat',
        )

    def call(self, x, y=None, return_state=False):
        """Encode a batch of token ids.

        Args:
            x: token ids to encode.
            y: unused.
            return_state: when True, also return the forward GRU's final state.

        Returns:
            The per-step encoder output, or `(output, forward_state)` when
            `return_state` is True.
        """
        embedded = self.embedding(x)
        encoder_output, forward_state, _backward_state = self.rnn(embedded)

        if not return_state:
            return encoder_output

        # GRU variant: only the forward direction's final state is handed on.
        # (An LSTM variant would pass [forward_state, backward_state] instead.)
        return encoder_output, forward_state

    def convert_input(self, texts, return_state=False):
        """Tokenize raw text and encode it.

        Args:
            texts: a single string or a batch of strings.
            return_state: forwarded to `call`.

        Returns:
            Whatever `call` returns for the vectorized text.
        """
        texts = tf.convert_to_tensor(texts)
        # Promote a scalar string to a batch of one.
        if len(texts.shape) == 0:
            texts = texts[tf.newaxis]

        token_ids = self.text_vectorizer(texts)
        return self(token_ids, return_state=return_state)

# Decoder

In [77]:
class Decoder(keras.layers.Layer):
    """GRU decoder with cross-attention over the encoder output.

    Args:
        text_vectorizer: vectorizer layer; supplies the vocabulary used for
            the embedding, the output projection and the id/word lookups.
        units: width of the decoder GRU and key_dim of the attention layer.
        embed_dims: embedding output size.
    """

    def __init__(self, text_vectorizer, units,  embed_dims) :
        super(Decoder, self).__init__()
        self.text_vectorizer =  text_vectorizer
        self.units = units
        self.embed_dims = embed_dims
        self.vocab_size = text_vectorizer.vocabulary_size()

        # mask_zero=True: token id 0 is treated as padding.
        self.embedding = Embedding(input_dim=self.vocab_size, output_dim=self.embed_dims, mask_zero=True, )
        self.rnn = GRU(self.units, return_sequences=True, return_state=True)

        self.attention = CrossAttention(units)

        # Projects each decoder step onto vocabulary logits.
        self.output_dense = Dense(self.vocab_size)

        # Lookups between vocabulary strings and token ids, in both directions.
        self.word_to_id = tf.keras.layers.StringLookup(vocabulary=text_vectorizer.get_vocabulary(), mask_token='', oov_token='[UNK]')
        self.id_to_word = tf.keras.layers.StringLookup(vocabulary=text_vectorizer.get_vocabulary(), mask_token='', oov_token='[UNK]', invert=True)

        self.start_token = self.word_to_id('[START]')
        self.end_token = self.word_to_id('[END]')

    def call(self, x, context, state=None, return_state = False):
        """Run the decoder over a batch of target token ids.

        Args:
            x: decoder input token ids.
            context: encoder output to attend over.
            state: optional initial GRU state.
            return_state: when True, also return the final GRU state.

        Returns:
            Vocabulary logits, or `(logits, decoder_state)` when
            `return_state` is True.
        """
        x = self.embedding(x)
        decoder_output, decoder_state = self.rnn(x, initial_state=state)

        # CrossAttention.call takes (x, context) as two arguments. The
        # earlier extra call that passed a single [decoder_output, context]
        # list (the tf.keras.layers.Attention convention) was removed: it
        # does not match that signature.
        x = self.attention(decoder_output, context)
        self.last_attention_weights = self.attention.last_attention_weights

        logits = self.output_dense(x)

        if return_state:
            return logits, decoder_state
        else:
            return logits

    def get_initial_state(self, context):
        """Build the inputs for the first generation step.

        Args:
            context: encoder output; only its batch size is used.

        Returns:
            `(start_tokens, done, rnn_state)`: a column of `[START]` ids, an
            all-False `done` column, and the GRU's own initial state.
        """
        batch_size = tf.shape(context)[0]
        start_tokens = tf.fill([batch_size, 1], self.start_token)
        done = tf.zeros([batch_size, 1], dtype=tf.bool)
        embedded = self.embedding(start_tokens)
        return start_tokens, done, self.rnn.get_initial_state(embedded)[0]


    def tokens_to_text(self, tokens):
        """Convert token ids back into space-joined text.

        Args:
            tokens: token ids; words are joined along the last axis.

        Returns:
            String tensor with a leading `[START]` and trailing `[END]`
            marker (and adjacent spaces) removed.
        """
        words = self.id_to_word(tokens)
        result = tf.strings.reduce_join(words, axis=-1, separator=' ')
        # Raw strings keep the regex backslashes without relying on Python's
        # deprecated handling of unknown escape sequences such as "\[".
        result = tf.strings.regex_replace(result, r'^ *\[START\] *', '')
        result = tf.strings.regex_replace(result, r' *\[END\] *$', '')
        return result

    def get_next_token(self, next_token, context,  done, state, temperature = 0.0):
        """Generate one more token for every sequence in the batch.

        Args:
            next_token: the previously generated token ids.
            context: encoder output to attend over.
            done: boolean flags marking sequences that already finished.
            state: GRU state from the previous step.
            temperature: 0.0 picks the arg-max token; any other value samples
                from the last step's logits divided by `temperature`.

        Returns:
            `(next_token, done, state)` for the following step.
        """
        logits, state = self(next_token, context, state = state, return_state=True)

        if temperature == 0.0:
            next_token = tf.argmax(logits, axis=-1)
        else:
            logits = logits[:, -1, :]/temperature
            next_token = tf.random.categorical(logits, num_samples=1)

        # If a sequence produces an `end_token`, set it `done`
        done = done | (next_token == self.end_token)
        # Once a sequence is done it only produces 0-padding.
        next_token = tf.where(done, tf.constant(0, dtype=tf.int64), next_token)

        return next_token, done, state

In [78]:
# rnn = GRU(1)
# rnn1 = LSTM(1)

# ee = decoder.embedding(enc_input)
# print(ee.shape)

# ree = rnn1(ee)
# print(ree.shape)

# encoder = Encoder(vectorizer, UNITS, EMBEDDING_DIMS)
# enc_context, enc_state = encoder(enc_input, return_state = True)

# print('enc_input:', enc_input.shape)
# print('enc_context:', enc_context.shape)
# print('enc_state:', enc_state.shape)

# decoder = Decoder(vectorizer, UNITS, EMBEDDING_DIMS)
# dec_out, dec_state = decoder(dec_input, enc_context, return_state = True )

# print('dec_input:', dec_input.shape)
# print('dec_out:', dec_out.shape)
# print('dec_state:', dec_state.shape)

In [96]:
# NOTE(review): leftover scratch experiment. `ree` is only assigned in the
# commented-out cell above, so this cell cannot run on a fresh kernel, and
# the output recorded for it is a ValueError. Nothing later in the notebook
# uses `re` or `reee` — consider deleting this cell.
re = tf.expand_dims(ree, -1)
reee = keras.layers.RepeatVector(4)(re)
reee.shape, ree.shape, re.shape

ValueError: ignored

# Model

In [81]:
class ChatBot(tf.keras.Model):
    """Encoder/decoder model that maps an input sequence to reply logits.

    Args:
        text_processor: vectorizer shared by the encoder and the decoder.
        units: recurrent width passed to both sub-layers.
        embed_dims: embedding size passed to both sub-layers.
    """

    @classmethod
    def add_method(cls, fun):
        """Decorator that attaches `fun` to the class under `fun.__name__`."""
        setattr(cls, fun.__name__, fun)
        return fun

    def __init__(self, text_processor, units, embed_dims):
        super().__init__()
        self.text_processor = text_processor
        self.units = units
        self.embed_dims = embed_dims

        # Encoder and decoder share the vectorizer and the size settings.
        self.encoder = Encoder(text_processor, units, embed_dims)
        self.decoder = Decoder(text_processor, units, embed_dims)

    def call(self, inputs):
        """Compute logits for a `(encoder tokens, decoder tokens)` pair.

        Args:
            inputs: 2-tuple of encoder input ids and decoder input ids.

        Returns:
            The decoder's vocabulary logits.
        """
        source_tokens, target_tokens = inputs
        context, state = self.encoder(source_tokens, return_state=True)
        logits = self.decoder(target_tokens, context, state)

        #TODO(b/250038731): remove this
        # Drop the Keras mask, if one is attached, so Keras does not rescale
        # the loss and accuracy.
        try:
            del logits._keras_mask
        except AttributeError:
            pass

        return logits


In [82]:
def masked_loss(y_true, y_pred):
    """Sparse categorical cross-entropy averaged over non-padding targets.

    Args:
        y_true: target token ids; id 0 is treated as padding and ignored.
        y_pred: unnormalized logits over the vocabulary.

    Returns:
        Scalar: summed loss of the non-padding positions divided by their
        count, or 0 when there are no non-padding positions.
    """
    # Per-position loss (no reduction) so padding can be masked out below.
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction='none')
    loss = loss_fn(y_true, y_pred)

    # Zero out the losses on padding.
    mask = tf.cast(y_true != 0, loss.dtype)
    loss *= mask

    # divide_no_nan returns 0 instead of NaN when every target is padding,
    # so a degenerate batch cannot poison training with NaN.
    return tf.math.divide_no_nan(tf.reduce_sum(loss), tf.reduce_sum(mask))

In [83]:
def masked_acc(y_true, y_pred):
    """Token-level accuracy computed over non-padding targets only.

    Args:
        y_true: target token ids; id 0 is treated as padding and ignored.
        y_pred: logits over the vocabulary.

    Returns:
        Scalar: fraction of non-padding positions whose arg-max prediction
        equals the target, or 0 when there are no non-padding positions.
    """
    # Predicted token id per position, in the targets' dtype for comparison.
    y_pred = tf.argmax(y_pred, axis=-1)
    y_pred = tf.cast(y_pred, y_true.dtype)

    mask = tf.cast(y_true != 0, tf.float32)
    # Count matches on real tokens only. Without the mask, padded positions
    # where the model also predicts 0 were counted as hits while the
    # denominator excluded them, inflating the metric (it could exceed 1).
    match = tf.cast(y_true == y_pred, tf.float32) * mask

    return tf.math.divide_no_nan(tf.reduce_sum(match), tf.reduce_sum(mask))

# Compile and train

In [84]:
# Build the chatbot around the loaded vectorizer and the configured sizes.
model = ChatBot(
    text_processor=vectorizer,
    units=UNITS,
    embed_dims=EMBEDDING_DIMS,
)

In [85]:
# Adam optimizer with the padding-aware loss; masked_loss is listed again
# under metrics next to masked_acc.
model.compile(
    loss=masked_loss,
    optimizer='adam',
    metrics=[masked_acc, masked_loss],
)

In [86]:
EPOCHS = 10

CKPT_DIR = './model_checkpoint'
# CKPT_DIR = '/content/drive/MyDrive/tf_model/chatbot'
os.makedirs(CKPT_DIR, exist_ok = True)

# One checkpoint name per run, derived from the start time. The stamp uses
# only digits, '-' and '_': the previous '%m:%d:%Y, %H:%M:%S' format put
# ':', ',' and a space into the file name, which is invalid on Windows and
# awkward in shells. Year-first order also makes names sort chronologically.
run_stamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')

# Save weights only, at the end of an epoch, and only when the monitored
# training metric `masked_acc` improves.
model_ckpt = tf.keras.callbacks.ModelCheckpoint(
    os.path.join(CKPT_DIR, run_stamp),
    monitor= 'masked_acc',
    verbose= 0,
    save_best_only = True,
    save_weights_only = True,
    mode= 'auto',
    save_freq='epoch'
)

# Per-epoch metrics are also written to a CSV log file.
os.makedirs('log', exist_ok = True)
csv_logger = CSVLogger('./log/training.log')


In [87]:
# Train
# Callbacks: early stopping (patience 5), best-weights checkpointing, CSV log.
training_callbacks = [
    tf.keras.callbacks.EarlyStopping(patience=5),
    model_ckpt,
    csv_logger,
]

# The training set is repeated indefinitely, so `steps_per_epoch` is what
# defines the length of an epoch; validation uses 3 batches of the test set.
history = model.fit(
    train_data.repeat(),
    epochs=EPOCHS,
    steps_per_epoch=80,
    validation_data=test_data,
    validation_steps=3,
    callbacks=training_callbacks,
)

Epoch 1/10
embedding shape:  (None, None, 128)
rnn shape:  (None, None, 64)
embedding shape:  (None, None, 128)
rnn shape:  (None, None, 64)
rnn shape:  (None, None, 64)
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# Translate

In [88]:
@ChatBot.add_method
def translate(self,
              texts, *,
              max_length=50,
              temperature=0.0):
    """Generate a reply for each input text, one token at a time.

    Args:
        texts: a single string or a batch of strings.
        max_length: maximum number of generation steps.
        temperature: 0.0 for arg-max decoding; any other value samples from
            the temperature-scaled logits.

    Returns:
        `(result, tokens)`: the decoded reply strings and the generated
        token ids concatenated along the last axis.
    """
    # Tokenize and encode the input texts. convert_input also accepts a
    # scalar string; the unused `tf.shape(texts)[0]` lookup that used to sit
    # here would fail on such scalar input and has been removed.
    context, enc_state = self.encoder.convert_input(texts, return_state = True)

    # Loop inputs: [START] tokens and all-False done flags. The decoder's own
    # initial RNN state is discarded; generation starts from the encoder's
    # returned state instead.
    next_token, done, _ = self.decoder.get_initial_state(context)
    state = enc_state

    tokens = []
    for _step in range(max_length):
        # Generate the next token
        next_token, done, state = self.decoder.get_next_token(
                next_token, context, done,  state, temperature)

        # Collect the generated tokens
        tokens.append(next_token)

        # Stop early once every sequence has finished (eager mode only).
        if tf.executing_eagerly() and tf.reduce_all(done):
            break

    # Stack the per-step tokens: t*[(batch 1)] -> (batch, t)
    tokens = tf.concat(tokens, axis=-1)
    # Attention weights are not collected here; the most recent step's
    # weights remain available on self.decoder.last_attention_weights.

    result = self.decoder.tokens_to_text(tokens)
    return result, tokens

In [90]:
# Arg-max decoding (temperature 0) of a single prompt; display the reply bytes.
result = model.translate(['How long you have been there?'], temperature=0)
reply_text, reply_tokens = result
reply_text.numpy()[0]

embedding shape:  (1, 1, 128)
rnn shape:  (1, 1, 64)
embedding shape:  (1, 1, 128)
rnn shape:  (1, 1, 64)
embedding shape:  (1, 1, 128)
rnn shape:  (1, 1, 64)
embedding shape:  (1, 1, 128)
rnn shape:  (1, 1, 64)
embedding shape:  (1, 1, 128)
rnn shape:  (1, 1, 64)


b'i dont know . '

In [27]:
# model1 = model  # model with attention

In [21]:
# Full return value of translate(): (decoded text tensor, generated token ids).
result

(<tf.Tensor: shape=(1,), dtype=string, numpy=array([b'i dont know . '], dtype=object)>,
 <tf.Tensor: shape=(1, 5), dtype=int64, numpy=array([[ 8, 23, 27,  2,  0]])>)