# Source of information

This notebook is based on the following article:

https://lejdiprifti.com/2023/10/14/creating-a-text-generation-neural-network-with-tensorflow/

# Input directory

In [None]:
import os

# Root folder of this example's outputs (relative to the notebook location).
BASE_DIR = '../../outputs/example1_word_n_to_word'

# Inputs read by this notebook; each one sits directly under BASE_DIR.
TOKENIZER_DIR, TRAIN_DATASET, VAL_DATASET = (
    os.path.join(BASE_DIR, entry)
    for entry in ('tokenizer', 'train_dataset', 'val_dataset')
)

# Output directory

In [2]:
# Everything this notebook writes goes under BASE_DIR/model; logs get a
# dedicated 'log' subfolder inside it.
MODEL_DIR = os.path.join(BASE_DIR, 'model')
LOG_DIR = os.path.join(MODEL_DIR, 'log')

# Create both folders up front; exist_ok=True keeps the cell safe to re-run.
for _output_dir in (MODEL_DIR, LOG_DIR):
    os.makedirs(_output_dir, exist_ok=True)


# Loading tokenizer

In [3]:
from tensorflow.keras.preprocessing.text import tokenizer_from_json

# Rebuild the tokenizer from the JSON serialization stored in TOKENIZER_DIR.
with open(os.path.join(TOKENIZER_DIR,'tokenizer.json'), 'r', encoding='utf-8') as f:
    tokenizer_json = f.read()

tokenizer = tokenizer_from_json(tokenizer_json)

# Vocabulary size handed to the model: one slot per known word plus one extra,
# since the Keras Tokenizer keeps index 0 reserved and never assigns it to a word.
total_words = len(tokenizer.word_index) + 1

# Show only a short preview of the vocabulary: printing the whole word_index
# (tens of thousands of entries) floods the cell output and bloats the notebook.
vocab_preview = dict(list(tokenizer.word_index.items())[:10])
print(f'Tokenizer (first {len(vocab_preview)} entries):', vocab_preview)
print('Total words:', total_words)

2025-03-20 17:38:28.262278: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1742503108.331743   25014 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1742503108.353720   25014 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1742503108.567616   25014 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1742503108.567656   25014 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1742503108.567659   25014 computation_placer.cc:177] computation placer alr

Total words: 49249


# Loading dataset

In [4]:
import tensorflow as tf
import json

# Load the training and validation datasets from their saved folders.
train_dataset, val_dataset = (
    tf.data.Dataset.load(dataset_path)
    for dataset_path in (TRAIN_DATASET, VAL_DATASET)
)

# Peek at a single training batch to check the tensor shapes.
# NOTE: `sample` is deliberately left in scope; a later cell reads the
# sequence length from it.
for sample in train_dataset.take(1):
    print("    xs batch shape:", sample[0].shape)
    print("labels batch shape:", sample[1].shape)



    xs batch shape: (512, 107)
labels batch shape: (512,)


I0000 00:00:1742503118.384315   25014 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 1730 MB memory:  -> device: 0, name: NVIDIA GeForce MX150, pci bus id: 0000:01:00.0, compute capability: 6.1
2025-03-20 17:38:38.590331: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


# Saving the model input shape

In [5]:
# Second dimension of the feature batch taken from the training dataset.
input_shape = sample[0].shape[1]

# Persist the value as JSON inside the model folder.
input_shape_path = os.path.join(MODEL_DIR, 'input_shape.json')
with open(input_shape_path, 'w') as shape_file:
    json.dump({"input_shape": input_shape}, shape_file)

print('Input shape:', input_shape)

Input shape: 107


# Model


In [None]:
import mymodules.model as mmm

# Build the network from the project helper, sized by vocabulary and input length.
model_1 = mmm.get_model(total_words, input_shape)

# Sparse categorical cross-entropy lets the model use an integer as the label,
# so the targets do not have to be converted to one-hot encoding.
model_1.compile(
    loss=tf.losses.SparseCategoricalCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.01),
    metrics=[
        'sparse_categorical_accuracy',
        'sparse_categorical_crossentropy',
    ],
)

model_1.summary()


Como o modelo "entende" que são sequências?

* 1️⃣ Formato dos dados de entrada:
Seu xs tem dimensão (N, 107), ou seja, cada amostra tem 107 números (índices de palavras).
Como passamos todas as palavras de uma vez para o modelo, ele vê cada amostra como um grupo de palavras relacionadas.

* 2️⃣ A camada Embedding:
Converte cada número (índice de palavra) em um vetor denso de tamanho 128.
Resultado: A entrada, que antes era (107,), vira um tensor (107, 128).
Agora, temos uma sequência de vetores e não só números.

* 3️⃣ A camada LSTM:
O LSTM lê os vetores da sequência um por um, na ordem, e processa a relação entre eles.
Como a camada mantém um estado interno (memória) ao longo do tempo, ela entende a sequência como algo conectado, e não apenas como números soltos.

* 4️⃣ A saída da segunda LSTM (32 neurônios):
Retorna um único vetor de tamanho 32, que contém a "memória" da sequência inteira.

* 5️⃣ A camada Dense(softmax):
Usa a saída da LSTM para prever a próxima palavra na sequência.

# Training



In [13]:
# Destination files: best weights seen during training, and the exported model.
MODEL_WEIGHTS_PATH = os.path.join(MODEL_DIR, "best_model.weights.h5")
MODEL_KERAS_PATH = os.path.join(MODEL_DIR, "best_model.keras")

# Checkpoint that writes weights only (not the full model), and only when the
# monitored validation loss reaches a new minimum.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=MODEL_WEIGHTS_PATH,
    monitor="val_loss",
    mode="min",
    save_best_only=True,
    save_weights_only=True,
)

In [14]:
import json

# Fit the model; the checkpoint callback stores the best weights along the way.
history = model_1.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=3,
    callbacks=[checkpoint_callback],
)

# Swap the final-epoch weights for the best checkpointed ones before exporting.
model_1.load_weights(MODEL_WEIGHTS_PATH)

# Dump the per-epoch metrics recorded by fit() as JSON.
history_path = os.path.join(MODEL_DIR, "history.json")
with open(history_path, "w") as history_file:
    json.dump(history.history, history_file)

# Export the model (now holding the best weights) to the .keras file.
model_1.save(MODEL_KERAS_PATH)


Epoch 1/3


2025-03-20 18:09:46.209205: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 25215488 exceeds 10% of free system memory.


[1m1237/1237[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m706s[0m 565ms/step - loss: 7.8732 - sparse_categorical_accuracy: 0.0744 - val_loss: 6.7912 - val_sparse_categorical_accuracy: 0.1331
Epoch 2/3
[1m1237/1237[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m700s[0m 566ms/step - loss: 6.5611 - sparse_categorical_accuracy: 0.1382 - val_loss: 6.6291 - val_sparse_categorical_accuracy: 0.1482
Epoch 3/3
[1m1237/1237[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m701s[0m 567ms/step - loss: 6.2191 - sparse_categorical_accuracy: 0.1539 - val_loss: 6.6005 - val_sparse_categorical_accuracy: 0.1557
