# Source of information

This notebook is based on the following article:

https://lejdiprifti.com/2023/10/14/creating-a-text-generation-neural-network-with-tensorflow/

In [None]:
import os

# Select the asynchronous CUDA allocator for TensorFlow. This runs in the first
# cell so the variable is already in the environment when TensorFlow is
# imported by a later cell.
os.environ.update({"TF_GPU_ALLOCATOR": "cuda_malloc_async"})

# Input directory

In [2]:

# Root folder for this example's artifacts.
BASE_DIR = '../../outputs/example2_word_n_to_word'

# Each input (tokenizer, training set, validation set) sits in its own
# sub-folder directly under BASE_DIR.
TOKENIZER_DIR, TRAIN_DATASET, VAL_DATASET = (
    os.path.join(BASE_DIR, folder)
    for folder in ('tokenizer', 'train_dataset', 'val_dataset')
)

# Output directory

In [3]:
# The trained model goes under BASE_DIR/model, and its logs under BASE_DIR/model/log.
MODEL_DIR = os.path.join(BASE_DIR, 'model')
LOG_DIR = os.path.join(MODEL_DIR, 'log')

# Create both folders up front; exist_ok=True keeps a re-run from failing when
# they are already there.
for _output_dir in (MODEL_DIR, LOG_DIR):
    os.makedirs(_output_dir, exist_ok=True)


# Loading tokenizer

In [None]:
from tensorflow.keras.preprocessing.text import tokenizer_from_json

# Read the serialized tokenizer from disk and rebuild the tokenizer object from it.
with open(os.path.join(TOKENIZER_DIR, 'tokenizer.json'), 'r', encoding='utf-8') as f:
    tokenizer_json = f.read()

tokenizer = tokenizer_from_json(tokenizer_json)

# Vocabulary size handed to the model further down.
if tokenizer.num_words is None:
    # No limit configured: use every available word.
    # NOTE(review): the +1 presumably accounts for index 0 being reserved — confirm.
    total_words = len(tokenizer.word_index) + 1
else:
    # Use the configured limit.
    total_words = tokenizer.num_words

# Print only a short preview of the vocabulary: dumping the whole word_index
# writes one entry per known word into the cell output and buries the rest.
VOCAB_PREVIEW_SIZE = 10
vocab_preview = dict(list(tokenizer.word_index.items())[:VOCAB_PREVIEW_SIZE])
print(f'Tokenizer (first {len(vocab_preview)} of {len(tokenizer.word_index)} entries):', vocab_preview)
print('Total words:', total_words)

2025-03-21 11:31:57.547343: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1742567517.580743   13323 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1742567517.591882   13323 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1742567517.612820   13323 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1742567517.612844   13323 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1742567517.612846   13323 computation_placer.cc:177] computation placer alr

Total words: 10000


# Loading dataset

In [None]:
import tensorflow as tf
import numpy as np
import json

# Load the training and validation datasets from disk.
train_dataset = tf.data.Dataset.load(TRAIN_DATASET)
val_dataset   = tf.data.Dataset.load(VAL_DATASET)

# Bind the first batch explicitly. `sample` is read again by a later cell to
# derive the model's input/output sizes, so it must not be a loop variable that
# stays undefined when the dataset yields nothing.
sample = next(iter(train_dataset.take(1)), None)
if sample is None:
    raise ValueError(f"Training dataset at {TRAIN_DATASET!r} is empty; cannot inspect a batch.")

# Quick sanity check of the first batch: input shape, label shape, largest label id.
print("    xs batch shape:", sample[0].shape)
print("labels batch shape:", sample[1].shape)
print("  max labels batch:", np.max(sample[1]))

    xs batch shape: (32, 64)
labels batch shape: (32, 3)
  max labels batch: 9263


I0000 00:00:1742567523.472076   13323 gpu_process_state.cc:208] Using CUDA malloc Async allocator for GPU: 0
I0000 00:00:1742567523.472980   13323 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 1730 MB memory:  -> device: 0, name: NVIDIA GeForce MX150, pci bus id: 0000:01:00.0, compute capability: 6.1
2025-03-21 11:32:03.553478: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


# Saving model input and output sizes

In [None]:
# Both sizes come from the second axis of the sample batch: element 0 holds the
# inputs and element 1 the labels.
xs_batch, labels_batch = sample[0], sample[1]
input_size = xs_batch.shape[1]
output_size = labels_batch.shape[1]

# Store the sizes as JSON in the model folder.
size_info = {"input_size": input_size, "output_size": output_size}
size_path = os.path.join(MODEL_DIR, 'input_output_size.json')
with open(size_path, 'w') as size_file:
    json.dump(size_info, size_file)

print(' input_size:', input_size)
print('output_size:', output_size)

 Input shape: 64
Output shape: 3


# Model


In [7]:
import mymodules.model as mmm

# Build the network with the project's model factory.
model_1 = mmm.get_model(total_words, input_size, output_size)

# Sparse categorical cross-entropy lets the model take an integer as the label,
# with no need to convert it to a one-hot encoding.
loss_fn = tf.losses.SparseCategoricalCrossentropy()
adam = tf.keras.optimizers.Adam(learning_rate=0.01)

model_1.compile(
    loss=loss_fn,
    optimizer=adam,
    metrics=['sparse_categorical_accuracy'],
)

model_1.summary()


Como o modelo "entende" que são sequências?

* 1️⃣ Formato dos dados de entrada:
Seu xs tem dimensão (N, 64), ou seja, cada amostra tem 64 números (índices de palavras).
Como passamos todas as palavras de uma vez para o modelo, ele vê cada amostra como um grupo de palavras relacionadas.

* 2️⃣ A camada Embedding:
Converte cada número (índice de palavra) em um vetor denso de tamanho 128.
Resultado: A entrada, que antes era (64,), vira um tensor (64, 128).
Agora, temos uma sequência de vetores e não só números.

* 3️⃣ A camada LSTM:
O LSTM lê os vetores da sequência um por um, na ordem, e processa a relação entre eles.
Como a camada mantém um estado interno (memória) ao longo do tempo, ela entende a sequência como algo conectado, e não apenas como números soltos.

* 4️⃣ A saída da segunda LSTM (32 neurônios):
Retorna um único vetor de tamanho 32, que contém a "memória" da sequência inteira.

* 5️⃣ A camada Dense(softmax):
Usa a saída da LSTM para prever a próxima palavra na sequência.

# Training



In [8]:
# Files written inside MODEL_DIR: the best weights seen during training and the
# full model in .keras format.
MODEL_WEIGHTS_PATH = os.path.join(MODEL_DIR, "best_model.weights.h5")
MODEL_KERAS_PATH = os.path.join(MODEL_DIR, "best_model.keras")

# Checkpoint callback: watches val_loss and writes weights only (not the whole
# model), keeping just the best result; verbose=1 reports each save.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=MODEL_WEIGHTS_PATH,
    monitor="val_loss",
    verbose=1,
    save_best_only=True,
    save_weights_only=True,
)

In [None]:
import json

# Fit on the training set, validating each epoch; the checkpoint callback
# writes weights to MODEL_WEIGHTS_PATH during the run.
history = model_1.fit(
    train_dataset,
    epochs=3,
    validation_data=val_dataset,
    callbacks=[checkpoint_callback],
)

# Reload the checkpointed weights so the model saved below uses them rather
# than the weights left by the final epoch.
model_1.load_weights(MODEL_WEIGHTS_PATH)

# Write the per-epoch metrics as JSON in the log folder.
history_path = os.path.join(LOG_DIR, "history.json")
with open(history_path, "w") as history_file:
    json.dump(history.history, history_file, indent=4)

# Save the full model in .keras format.
model_1.save(MODEL_KERAS_PATH)


Epoch 1/3


I0000 00:00:1742567529.503021   13390 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m8275/8276[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 57ms/step - loss: 7.5297 - sparse_categorical_accuracy: 0.0588
Epoch 1: val_loss improved from inf to 7.46238, saving model to ../../outputs/example2_word_n_to_word/model/best_model.weights.h5
[1m8276/8276[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m525s[0m 63ms/step - loss: 7.5296 - sparse_categorical_accuracy: 0.0588 - val_loss: 7.4624 - val_sparse_categorical_accuracy: 0.0534
Epoch 2/3
[1m8275/8276[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 57ms/step - loss: 7.4054 - sparse_categorical_accuracy: 0.0591
Epoch 2: val_loss improved from 7.46238 to 7.36931, saving model to ../../outputs/example2_word_n_to_word/model/best_model.weights.h5
[1m8276/8276[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m519s[0m 63ms/step - loss: 7.4054 - sparse_categorical_accuracy: 0.0591 - val_loss: 7.3693 - val_sparse_categorical_accuracy: 0.0534
Epoch 3/3
[1m8275/8276[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0