<a href="https://colab.research.google.com/github/tvaditya/intro_ds_and_ml/blob/main/%5BDL7%5D_Gera%C3%A7%C3%A3o_de_textos.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Projeto 16: Geração de textos

Codificação adaptada da documentação do TensorFlow: https://www.tensorflow.org/beta/tutorials/text/text_generation




- Usaremos uma base de dados extraída de textos escritos por Shakespeare
- Link: http://karpathy.github.io/2015/05/21/rnn-effectiveness/
- O objetivo é treinar a LSTM para prever o próximo caractere em uma sequência de texto


# Etapa 1: Importação das bibliotecas

In [None]:
%tensorflow_version 2.x
import tensorflow as tf
import numpy as np
import os
import time
tf.__version__

# Etapa 2: Carregamento e exploração da base de dados

In [None]:
data_url = tf.keras.utils.get_file('shakespeare.txt', 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')

In [None]:
# Read the downloaded corpus as raw bytes and decode it as UTF-8. The context
# manager closes the file handle as soon as the read finishes.
with open(data_url, 'rb') as corpus_file:
  dataset_text = corpus_file.read().decode(encoding='utf-8')

In [None]:
print(dataset_text)

In [None]:
len(dataset_text)

In [None]:
vocab = sorted(set(dataset_text))

In [None]:
print('{} unique characters'.format(len(vocab)))

In [None]:
vocab

# Etapa 3: Mapeamento de texto para números

In [None]:
# Lookup table from each vocabulary character to its position in `vocab`.
char2idx = dict(zip(vocab, range(len(vocab))))

In [None]:
char2idx

In [None]:
# Inverse lookup: indexing this array with an id (or an array of ids) yields
# the corresponding character(s) of `vocab`.
idx2char = np.array(vocab)

In [None]:
idx2char

In [None]:
idx2char[10]

In [None]:
char2idx[':']

In [None]:
# Encode the whole corpus as an array of vocabulary ids, one per character.
text_as_int = np.array([char2idx[char] for char in dataset_text])

In [None]:
text_as_int

In [None]:
text_as_int.shape

In [None]:
print('{} characters mapped to int ---> {}'.format(repr(dataset_text[:13]), text_as_int[:13]))

# Etapa 4: Criação dos exemplos de treinamento e batches


- Dividiremos a base de dados em sequências de caracteres de tamanho `seq_length`
- A saída (dados reais) será igual à entrada, porém deslocada em um caractere
- Exemplo com o texto "Hello" e seq_length = 4
    - Entrada: "Hell"
    - Saída: "ello" 


In [None]:
len(dataset_text)

In [None]:
# Length of the input sequences fed to the network, in characters.
seq_length = 100
# Each training chunk holds seq_length + 1 characters (the input plus its
# one-character-shifted target), so that is the divisor for the example count.
examples_per_epoch = len(dataset_text) // (seq_length + 1)
examples_per_epoch

In [None]:
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

In [None]:
char_dataset

In [None]:
# Group the stream of character ids into chunks of seq_length + 1 ids;
# drop_remainder discards a final chunk that would be shorter than that.
sequences = char_dataset.batch(seq_length + 1, drop_remainder=True)

In [None]:
sequences

In [None]:
# Decode the first 50 chunks back to text and print them for a visual check.
for item in sequences.take(50):
  print(repr(''.join(idx2char[item.numpy()])))

In [None]:
def split_input_target(chunk):
  """Split a chunk into an (input, target) pair offset by one position.

  The input is every element except the last and the target is every element
  except the first, so target[i] is the element that follows input[i].
  """
  return chunk[:-1], chunk[1:]

In [None]:
dataset = sequences.map(split_input_target)

In [None]:
# Print the first 10 (input, target) pairs as text to confirm the target is
# the input shifted by one character.
for input_example, target_example in dataset.take(10):
  print('Input data:', repr(''.join(idx2char[input_example.numpy()])))
  print('Target data:', repr(''.join(idx2char[target_example.numpy()])))

In [None]:
batch_size = 64
buffer_size = 10000

In [None]:
# Shuffle the (input, target) pairs, then group them into batches of
# batch_size; drop_remainder discards a final batch smaller than batch_size.
dataset = dataset.shuffle(buffer_size).batch(batch_size, drop_remainder = True)

In [None]:
dataset

# Etapa 5: Construção do modelo

In [None]:
len(vocab)

In [None]:
vocab_size = len(vocab)

In [None]:
embedding_dim = 256

In [None]:
rnn_units = 1024

In [None]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
  """Assemble the character-level network: Embedding -> stateful LSTM -> Dense.

  Args:
    vocab_size: Number of distinct characters; sizes both the embedding table
      and the output layer.
    embedding_dim: Width of the embedding vectors.
    rnn_units: Number of LSTM units.
    batch_size: Fixed batch dimension declared on the input; the sequence
      length is left as None.

  Returns:
    An uncompiled tf.keras.Sequential. The final Dense layer is created
    without an activation argument.
  """
  layers = tf.keras.layers
  embedding = layers.Embedding(vocab_size, embedding_dim, batch_input_shape=[batch_size, None])
  # return_sequences=True yields an output for every timestep; stateful=True
  # keeps the recurrent state between successive calls.
  recurrent = layers.LSTM(rnn_units, return_sequences=True, stateful=True, recurrent_initializer='glorot_uniform')
  head = layers.Dense(vocab_size)
  return tf.keras.Sequential([embedding, recurrent, head])

In [None]:
model = build_model(vocab_size = len(vocab), embedding_dim=embedding_dim, rnn_units=rnn_units, batch_size=batch_size)

In [None]:
# Run the untrained model on 10 batches and print each output shape. After the
# loop the three variables hold the values from the last batch, which the
# following cells reuse.
for input_example_batch, target_example_batch in dataset.take(10):
  example_batch_predictions = model(input_example_batch)
  print(example_batch_predictions.shape)

In [None]:
# Sample one character id per timestep from the model outputs of the first
# sequence in the batch (the outputs are treated as unnormalized logits).
sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)

In [None]:
sampled_indices

In [None]:
sampled_indices = tf.squeeze(sampled_indices, axis = -1).numpy()

In [None]:
sampled_indices

In [None]:
print('Input: \n', repr(''.join(idx2char[input_example_batch[0]])))
print()
print('Next char predictions: \n', repr(''.join(idx2char[sampled_indices])))

# Etapa 6: Treinamento do modelo

### Otimizador e loss function

In [None]:
def loss(labels, logits):
  """Return the sparse categorical cross-entropy of `logits` against `labels`.

  from_logits=True tells Keras that `logits` are raw scores rather than
  probabilities.
  """
  cross_entropy = tf.keras.losses.sparse_categorical_crossentropy(
      y_true=labels, y_pred=logits, from_logits=True)
  return cross_entropy

In [None]:
example_batch_loss = loss(target_example_batch, example_batch_predictions)

In [None]:
example_batch_loss.numpy().mean()

In [None]:
model.compile(optimizer='Adam', loss=loss)

### Checkpoints

In [None]:
# Directory where the training checkpoints are written.
checkpoint_dir = './training_checkpoints'
# File name template; Keras substitutes {epoch} with the epoch number.
checkpoint_prefix = os.path.join(checkpoint_dir, 'ckpt_{epoch}')
# Callback that saves only the model weights (not the full model) to the
# templated path during training.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_prefix, save_weights_only=True)

### Execução do treinamento

In [None]:
# Number of full passes over the dataset.
epochs = 10
# Train the model; the checkpoint callback writes weights while training runs
# and `history` receives the object returned by fit().
history = model.fit(dataset, epochs = epochs, callbacks=[checkpoint_callback])

# Etapa 7: Geração de textos

### Restauração do último checkpoint

In [None]:
tf.train.latest_checkpoint(checkpoint_dir)

In [None]:
# Rebuild the same architecture with a batch size of 1 for text generation,
# then load the weights from the most recent checkpoint in checkpoint_dir.
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size = 1)
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
# Build with an input shape of one sequence of unspecified length.
model.build(tf.TensorShape([1, None]))

In [None]:
model.summary()

### Loop de previsão

In [None]:
def generate_text(model, start_string, num_generate=1000, temperature=1.0):
  """Generate text one character at a time, seeded with `start_string`.

  Args:
    model: Character model to sample from. Expected to be the stateful,
      batch-size-1 model produced by `build_model`; it must provide
      `reset_states()`.
    start_string: Seed text. Every character must be a key of `char2idx`.
    num_generate: Number of characters to generate after the seed.
    temperature: Divisor applied to the logits before sampling. Values below
      1 make the sampling more conservative, values above 1 make it more
      random; the best value has to be found by experiment. Must be > 0.

  Returns:
    `start_string` followed by the `num_generate` generated characters.

  Raises:
    ValueError: If `start_string` is empty or `temperature` is not positive.
    KeyError: If `start_string` contains a character missing from `char2idx`.
  """
  if not start_string:
    raise ValueError('start_string must contain at least one character')
  if temperature <= 0:
    raise ValueError('temperature must be positive')

  # Convert the seed characters to ids and add a leading batch dimension.
  input_eval = tf.expand_dims([char2idx[s] for s in start_string], 0)

  # Characters produced by the network, in order.
  text_generated = []

  # The recurrent state persists between calls on a stateful model; clear it
  # so this generation does not continue from a previous one.
  model.reset_states()

  for _ in range(num_generate):
    predictions = model(input_eval)

    # Drop the batch dimension, then scale the logits by the temperature.
    predictions = tf.squeeze(predictions, 0)
    predictions = predictions / temperature
    # Sample an id for every timestep and keep the one for the last timestep.
    predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()

    # Feed only the sampled character back in; the preceding context is
    # carried by the model's recurrent state.
    input_eval = tf.expand_dims([predicted_id], 0)

    text_generated.append(idx2char[predicted_id])

  return start_string + ''.join(text_generated)

In [None]:
print(generate_text(model, start_string='ROMEO: '))