## Install dependencies

In [1]:
!pip install tf-nightly-gpu-2.0-preview==2.0.0.dev20190413

import tensorflow as tf
print(tf.__version__)


Collecting tf-nightly-gpu-2.0-preview==2.0.0.dev20190413
[?25l  Downloading https://files.pythonhosted.org/packages/dc/65/2e0abcabd4da641096dad92b4f2ab9d27de508d66efe5c4742ffb6ae4744/tf_nightly_gpu_2.0_preview-2.0.0.dev20190413-cp36-cp36m-manylinux1_x86_64.whl (345.3MB)
[K    100% |████████████████████████████████| 345.3MB 46kB/s 
[?25hCollecting google-pasta>=0.1.2 (from tf-nightly-gpu-2.0-preview==2.0.0.dev20190413)
[?25l  Downloading https://files.pythonhosted.org/packages/64/bb/f1bbc131d6294baa6085a222d29abadd012696b73dcbf8cf1bf56b9f082a/google_pasta-0.1.5-py3-none-any.whl (51kB)
[K    100% |████████████████████████████████| 61kB 26.6MB/s 
Collecting tensorflow-estimator-2.0-preview (from tf-nightly-gpu-2.0-preview==2.0.0.dev20190413)
[?25l  Downloading https://files.pythonhosted.org/packages/81/47/1a7a31baa3e34b33bc241014a6295588019e2922e3546a891e65f7671b8d/tensorflow_estimator_2.0_preview-1.14.0.dev2019042600-py2.py3-none-any.whl (421kB)
[K    100% |███████████████████████

## Integrate Google Drive

In [2]:
from google.colab import drive
drive.mount('/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /gdrive


## Update Git Repository 

In [0]:
%cd /gdrive/Team\ Drives/umayux/Research/NLP/chatbot/transformer/
!git pull



/gdrive/Team Drives/umayux/Research/NLP/chatbot/transformer
Already up to date.


## Train the model

In [0]:
import tensorflow_datasets as tfds
import tensorflow as tf

from src.optimizer import CustomSchedule, loss_function
from src.dataset import Dataset
from src.model import Transformer
import time
from src.masking import create_masks
import pickle


MAX_LENGTH=40
BUFFER_SIZE=20000
BATCH_SIZE=64
EPOCHS=100
num_heads=8
num_layers=4
d_model=128
dff=512
dropout_rate=0.1
test_partition=0.2
dataset_file="./data/banco/bancobot.tsv"
checkpoint_path="./data/banco/"
retrain=True

# Build the dataset for training validation
dataset = Dataset(filename=dataset_file)
dataset.build_train_test(test=test_partition)
train_examples, val_examples = dataset.format_train_test()

if retrain:
    
    # loading tokenizers for future predictions
    with open(checkpoint_path + "/tokenizer_source.pickle", "rb") as handle:
        tokenizer_source = pickle.load(handle)

    with open(checkpoint_path + "/tokenizer_target.pickle", "rb") as handle:
        tokenizer_target = pickle.load(handle)
    
    # update dataset class with previous data
    dataset.tokenizer_source = tokenizer_source
    dataset.tokenizer_target = tokenizer_target
    
else:        
    tokenizer_source, tokenizer_target = dataset.tokenizer(train_examples)


train_dataset = train_examples.map(dataset.tf_encode)
train_dataset = train_dataset.filter(dataset.filter_max_length)
train_dataset = train_dataset.cache()
train_dataset = train_dataset.shuffle(BUFFER_SIZE).padded_batch(
    BATCH_SIZE, padded_shapes=([-1], [-1])
)
train_dataset = train_dataset.prefetch(tf.data.experimental.AUTOTUNE)

val_dataset = val_examples.map(dataset.tf_encode)
val_dataset = val_dataset.filter(dataset.filter_max_length).padded_batch(
    BATCH_SIZE, padded_shapes=([-1], [-1])
)

input_vocab_size = tokenizer_source.vocab_size + 2
target_vocab_size = tokenizer_target.vocab_size + 2

# Setup the learning rate and optimizer
learning_rate = CustomSchedule(d_model)
optimizer = tf.keras.optimizers.Adam(
    learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9
)

train_loss = tf.keras.metrics.Mean(name="train_loss")
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
    name="train_accuracy"
)

# setup Transformer Model
transformer = Transformer(
    num_layers,
    d_model,
    num_heads,
    dff,
    input_vocab_size,
    target_vocab_size,
    dropout_rate,
)

# setup checkpoints
ckpt = tf.train.Checkpoint(transformer=transformer, optimizer=optimizer)
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

# if a checkpoint exists, restore the latest checkpoint.
if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print("Latest checkpoint restored!!")
else:
    print("Initializing from scratch.")

# saving tokenizers
with open(checkpoint_path + "/tokenizer_source.pickle", "wb") as handle:
    pickle.dump(tokenizer_source, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open(checkpoint_path + "/tokenizer_target.pickle", "wb") as handle:
    pickle.dump(tokenizer_target, handle, protocol=pickle.HIGHEST_PROTOCOL)

# define training function step
@tf.function
def train_step(inp, tar):
    tar_inp = tar[:, :-1]
    tar_real = tar[:, 1:]

    enc_padding_mask, combined_mask, dec_padding_mask = create_masks(
        inp, tar_inp
    )

    with tf.GradientTape() as tape:
        predictions, _ = transformer(
            inp,
            tar_inp,
            True,
            enc_padding_mask,
            combined_mask,
            dec_padding_mask,
        )
        loss = loss_function(tar_real, predictions)

    gradients = tape.gradient(loss, transformer.trainable_variables)
    optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))

    train_loss(loss)
    train_accuracy(tar_real, predictions)

# training loop
for epoch in range(EPOCHS):
    start = time.time()

    train_loss.reset_states()
    train_accuracy.reset_states()

    # inp -> portuguese, tar -> english
    for (batch, (inp, tar)) in enumerate(train_dataset):
        train_step(inp, tar)
        if batch % 500 == 0:
            print(
                "Epoch {} Batch {} Loss {:.4f} Accuracy {:.4f}".format(
                    epoch + 1,
                    batch,
                    train_loss.result(),
                    train_accuracy.result(),
                )
            )

    if (epoch + 1) % 5 == 0:
        ckpt_save_path = ckpt_manager.save()
        print(
            "Saving checkpoint for epoch {} at {}".format(
                epoch + 1, ckpt_save_path
            )
        )

    print(
        "Epoch {} Loss {:.4f} Accuracy {:.4f}".format(
            epoch + 1, train_loss.result(), train_accuracy.result()
        )
    )

    print("Time taken for 1 epoch: {} secs\n".format(time.time() - start))


  return filenames.flat_map(read_one_file)
  lambda filename: (tf.data.TextLineDataset(filename))
  lambda filename: (tf.data.TextLineDataset(filename))


ValueError: ignored

## Test the model

In [3]:
%cd /gdrive/Team\ Drives/umayux/Research/NLP/chatbot/transformer/

import tensorflow_datasets as tfds
import tensorflow as tf
import utensor.dataset as dt
from utensor.optimizer import CustomSchedule, loss_function
from utensor.model import Transformer
import time
from utensor.masking import create_masks
import pickle
import matplotlib.pyplot as plt



checkpoint_path="./data/banco/"
d_model = 128
MAX_LENGTH=60
BUFFER_SIZE=20000
BATCH_SIZE=64
num_heads=8
num_layers=4
d_model=128
dff=512
dropout_rate=0.1



def restore():
    
    # loading tokenizers for future predictions
    tokenizer_source = pickle.load(open(checkpoint_path + './tokenizer_source.pickle', 'rb'))
    tokenizer_target = pickle.load(open(checkpoint_path + './tokenizer_target.pickle', 'rb'))

    input_vocab_size = tokenizer_source.vocab_size + 2
    target_vocab_size = tokenizer_target.vocab_size + 2

    learning_rate = CustomSchedule(d_model)
    optimizer = tf.keras.optimizers.Adam(
        learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9
    )

    transformer = Transformer(
        num_layers,
        d_model,
        num_heads,
        dff,
        input_vocab_size,
        target_vocab_size,
        dropout_rate,
    )


    ckpt = tf.train.Checkpoint(transformer=transformer, optimizer=optimizer)
    ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=1)

    # if a checkpoint exists, restore the latest checkpoint.
    if ckpt_manager.latest_checkpoint:
        ckpt.restore(ckpt_manager.latest_checkpoint)
        print("Latest checkpoint restored!!")
    else:
        print("Initializing from scratch.")
        
    return transformer, tokenizer_source, tokenizer_target


           
    
def evaluate(inp_sentence):
    start_token = [tokenizer_source.vocab_size]
    end_token = [tokenizer_source.vocab_size + 1]

    # inp sentence is portuguese, hence adding the start and end token
    inp_sentence = start_token + tokenizer_source.encode(inp_sentence) + end_token
    encoder_input = tf.expand_dims(inp_sentence, 0)

    # as the target is english, the first word to the transformer should be the
    # english start token.
    decoder_input = [tokenizer_target.vocab_size]
    output = tf.expand_dims(decoder_input, 0)

    for i in range(MAX_LENGTH):
        enc_padding_mask, combined_mask, dec_padding_mask = create_masks(
            encoder_input, output)

        # predictions.shape == (batch_size, seq_len, vocab_size)
        predictions, attention_weights = transformer(encoder_input, 
                                                     output,
                                                     False,
                                                     enc_padding_mask,
                                                     combined_mask,
                                                     dec_padding_mask)

        # select the last word from the seq_len dimension
        predictions = predictions[: ,-1:, :]  # (batch_size, 1, vocab_size)

        predicted_id = tf.cast(tf.argmax(predictions, axis=-1), tf.int32)

        # return the result if the predicted_id is equal to the end token
        if tf.equal(predicted_id, tokenizer_target.vocab_size+1):
            return tf.squeeze(output, axis=0), attention_weights

        # concatentate the predicted_id to the output which is given to the decoder
        # as its input.
        output = tf.concat([output, predicted_id], axis=-1)

    return tf.squeeze(output, axis=0), attention_weights



def translate(sentence):
    result, attention_weights = evaluate(sentence)

    predicted_sentence = tokenizer_target.decode([i for i in result 
                                            if i < tokenizer_target.vocab_size])  

    print('Pregunta: {}'.format(sentence))
    print('Respuesta UmyBot: {}'.format(predicted_sentence))



transformer, tokenizer_source, tokenizer_target = restore()


/gdrive/Team Drives/umayux/Research/NLP/chatbot/transformer
Latest checkpoint restored!!


In [4]:
translate('banco_falabella dicen que no están operativo')

Pregunta: banco_falabella dicen que no están operativo
Respuesta UmyBot: hola @fefith, hemos realizado pruebas de transferencia y no presentamos inconvenientes.


In [0]:
translate('@bancolombia #bancoeterno se cagaron literalmente el servicio. ahora son menos filas pero más demoras con el servicio.')

Pregunta: @bancolombia #bancoeterno se cagaron literalmente el servicio. ahora son menos filas pero más demoras con el servicio.
Respuesta UmyBot: ¡hola! cuéntanos por favor si haces referencia a un poco más de tu comentario, para poder ayudarte. saludos. ana


## Predict examples

In [0]:
%cd /gdrive/Team\ Drives/umayux/Research/NLP/chatbot/transformer/

import tensorflow_datasets as tfds
import tensorflow as tf
import utensor.dataset as dt
from utensor.optimizer import CustomSchedule, loss_function
from utensor.model import Transformer
import time
from utensor.masking import create_masks
import pickle
import matplotlib.pyplot as plt

checkpoint_path="./data/banco/"
d_model = 128
MAX_LENGTH=40
BUFFER_SIZE=20000
BATCH_SIZE=64
num_heads=8
num_layers=4
dff=512
dropout_rate=0.1

def restore():

    # loading tokenizers for future predictions
    tokenizer_source = pickle.load(open(checkpoint_path+'/tokenizer_source.pickle', 'rb'))
    tokenizer_target = pickle.load(open(checkpoint_path+'/tokenizer_target.pickle', 'rb'))

    input_vocab_size = tokenizer_source.vocab_size + 2
    target_vocab_size = tokenizer_target.vocab_size + 2

    learning_rate = CustomSchedule(d_model)
    optimizer = tf.keras.optimizers.Adam(
        learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9
    )

    transformer = Transformer(
        num_layers,
        d_model,
        num_heads,
        dff,
        input_vocab_size,
        target_vocab_size,
        dropout_rate,
    )

    ckpt = tf.train.Checkpoint(transformer=transformer, optimizer=optimizer)
    ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=1)

    # if a checkpoint exists, restore the latest checkpoint.
    if ckpt_manager.latest_checkpoint:
        ckpt.restore(ckpt_manager.latest_checkpoint)
        print("Latest checkpoint restored!!")
    else:
        print("Initializing from scratch.")

    return transformer, tokenizer_source, tokenizer_target
        

        
def evaluate(inp_sentence):
    start_token = [tokenizer_source.vocab_size]
    end_token = [tokenizer_source.vocab_size + 1]
    
    
    # inp sentence is portuguese, hence adding the start and end token
    inp_sentence = start_token + tokenizer_source.encode(inp_sentence) + end_token
    encoder_input = tf.expand_dims(inp_sentence, 0)

    # as the target is english, the first word to the transformer should be the
    # english start token.
    decoder_input = [tokenizer_target.vocab_size]
    output = tf.expand_dims(decoder_input, 0)

    for i in range(40):
        enc_padding_mask, combined_mask, dec_padding_mask = create_masks(
        encoder_input, output)

    # predictions.shape == (batch_size, seq_len, vocab_size)
    predictions, attention_weights = transformer(encoder_input, 
                                                 output,
                                                 False,
                                                 enc_padding_mask,
                                                 combined_mask,
                                                 dec_padding_mask)

    # select the last word from the seq_len dimension
    predictions = predictions[: ,-1:, :]  # (batch_size, 1, vocab_size)

    predicted_id = tf.cast(tf.argmax(predictions, axis=-1), tf.int32)

    # return the result if the predicted_id is equal to the end token
    if tf.equal(predicted_id, tokenizer_target.vocab_size+1):
        return tf.squeeze(output, axis=0), attention_weights

    # concatentate the predicted_id to the output which is given to the decoder
    # as its input.
    output = tf.concat([output, predicted_id], axis=-1)

    return tf.squeeze(output, axis=0), attention_weights




def translate(sentence, plot=''):
    
    result, attention_weights = evaluate(sentence)

    predicted_sentence = tokenizer_target.decode([i for i in result 
                                            if i < tokenizer_target.vocab_size])  

    print('Pregunta: {}'.format(sentence))
    print('Respuesta UmyBot: {}'.format(predicted_sentence))

    #   if plot:
    #     plot_attention_weights(attention_weights, sentence, result, plot)

    
    
transformer, tokenizer_source, tokenizer_target = restore()


/gdrive/Team Drives/umayux/Research/NLP/chatbot/transformer
Latest checkpoint restored!!


In [0]:
sentence = '@bancolombia #bancoeterno se cagaron literalmente el servicio. ahora son menos filas pero más demoras con el servicio.'
translate(sentence, plot='')

Pregunta: @bancolombia #bancoeterno se cagaron literalmente el servicio. ahora son menos filas pero más demoras con el servicio.
Respuesta UmyBot: ¡


In [0]:
import pandas as pd
data = pd.read_csv('./data/banco/bancobot.tsv.test', sep='\t', names=['source', 'target'])



In [0]:

for ix,i in data.iterrows():
    translate(
        i['source']
    )
    print("Respuesta Humano: {}".format(i['target']))
    print('\n\n')
    


In [0]:
def plot_attention_weights(attention, sentence, result, layer, tokenizer_source, tokenizer_target):
  fig = plt.figure(figsize=(30, 38))
  
  sentence = tokenizer_source.encode(sentence)
  
  attention = tf.squeeze(attention[layer], axis=0)
  
  for head in range(attention.shape[0]):
    ax = fig.add_subplot(8, 1, head+1)
    
    # plot the attention weights
    ax.matshow(attention[head][:-1, :], cmap='viridis')

    fontdict = {'fontsize': 10}
    
    ax.set_xticks(range(len(sentence)+2))
    ax.set_yticks(range(len(result)))
    
    ax.set_ylim(len(result)-1.5, -0.5)
        
    ax.set_xticklabels(
        ['<start>']+[tokenizer_source.decode([i]) for i in sentence]+['<end>'], 
        fontdict=fontdict, rotation=90)
    
    ax.set_yticklabels([tokenizer_target.decode([i]) for i in result 
                        if i < tokenizer_target.vocab_size], 
                       fontdict=fontdict)
    
    ax.set_xlabel('Head {}'.format(head+1))
  
  plt.tight_layout()
  plt.show()