Utilizando el dataset construido, el objetivo es construir modelos de generación de texto utilizando redes neuronales que puedan generar noticias ficticias. Consultar el siguiente lab como referencia para la implementación:

In [74]:
import tensorflow as tf
import numpy as np
import os
import time
import tensorflow_datasets as tfds
from tensorflow.keras.layers import Embedding, LSTM, Dense, Input
from tensorflow.keras.models import Sequential
from typing import Any, List, Tuple
from collections import Counter


### Modelo de generacion de texto: Predecir letra siguiente

In [75]:
# Configurar para que TensorFlow utilice la GPU por defecto
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Configurar para que TensorFlow asigne memoria dinámicamente
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        # Especificar la GPU por defecto
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Manejar error
        print(e)

In [76]:
dataset, info = tfds.load('ag_news_subset', with_info=True,
                          as_supervised=True)
dataset_train, dataset_test = dataset['train'], dataset['test']

Preprocesamiento para entrenamiento

In [77]:
def create_vocab(train_dataset: Any) -> Tuple[List[str], str]:
    train_texts = []

    for text, _ in train_dataset:
        train_texts.append(text.numpy().decode('utf-8'))

    result_text = " ".join(train_texts)

    vocab = sorted(set(result_text))
    print(len(vocab))

    return vocab, result_text

vocab,result_text = create_vocab(dataset_train)


82


In [78]:
ids_from_chars = tf.keras.layers.StringLookup(
    vocabulary=list(vocab), mask_token=None)

In [79]:
chars_from_ids = tf.keras.layers.StringLookup(
    vocabulary=ids_from_chars.get_vocabulary(), invert=True, mask_token=None)


In [80]:
def split_input_target(sequence: List[str]) -> Tuple[List[str],List[str]]:
    input_text = sequence[:-1]
    target_text = sequence[1:]
    return input_text, target_text

In [81]:
ids = ids_from_chars(vocab[:5])
ids

<tf.Tensor: shape=(5,), dtype=int64, numpy=array([1, 2, 3, 4, 5], dtype=int64)>

In [82]:
chars=chars_from_ids(ids)

In [83]:
all_ids = ids_from_chars(tf.strings.unicode_split(result_text, 'UTF-8'))
all_ids

<tf.Tensor: shape=(23328241,), dtype=int64, numpy=array([29, 41, 32, ..., 57, 68,  1], dtype=int64)>

In [84]:
ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)

In [85]:
for ids in ids_dataset.take(10):
    print(chars_from_ids(ids).numpy().decode('utf-8'))

A
M
D
 
#
3
9
;
s
 


In [86]:
seq_length = 100

In [87]:
sequences = ids_dataset.batch(seq_length+1, drop_remainder=True)

for seq in sequences.take(1):
  print(chars_from_ids(seq))

tf.Tensor(
[b'A' b'M' b'D' b' ' b'#' b'3' b'9' b';' b's' b' ' b'n' b'e' b'w' b' '
 b'd' b'u' b'a' b'l' b'-' b'c' b'o' b'r' b'e' b' ' b'O' b'p' b't' b'e'
 b'r' b'o' b'n' b' ' b'c' b'h' b'i' b'p' b' ' b'i' b's' b' ' b'd' b'e'
 b's' b'i' b'g' b'n' b'e' b'd' b' ' b'm' b'a' b'i' b'n' b'l' b'y' b' '
 b'f' b'o' b'r' b' ' b'c' b'o' b'r' b'p' b'o' b'r' b'a' b't' b'e' b' '
 b'c' b'o' b'm' b'p' b'u' b't' b'i' b'n' b'g' b' ' b'a' b'p' b'p' b'l'
 b'i' b'c' b'a' b't' b'i' b'o' b'n' b's' b',' b' ' b'i' b'n' b'c' b'l'
 b'u' b'd' b'i'], shape=(101,), dtype=string)


In [88]:
split_input_target(list("Tensorflow"))

(['T', 'e', 'n', 's', 'o', 'r', 'f', 'l', 'o'],
 ['e', 'n', 's', 'o', 'r', 'f', 'l', 'o', 'w'])

In [89]:
dataset = sequences.map(split_input_target)

In [90]:
BATCH_SIZE = 64
BUFFER_SIZE = 10000

dataset = (
    dataset
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
    .prefetch(tf.data.experimental.AUTOTUNE))

dataset

<_PrefetchDataset element_spec=(TensorSpec(shape=(64, 100), dtype=tf.int64, name=None), TensorSpec(shape=(64, 100), dtype=tf.int64, name=None))>

Creacion del modelo de deep learning

In [91]:
# Length of the vocabulary in StringLookup Layer
vocab_size = len(ids_from_chars.get_vocabulary())
# The embedding dimension
embedding_dim = 256
# Number of RNN units
rnn_units = 1024
sequence_length = 100


In [92]:
def create_model(vocab_size: int, embedding_dim: int, sequence_length: int) -> tf.keras.Model:
    model = Sequential([
        Embedding(vocab_size, embedding_dim, input_length=sequence_length),
        LSTM(rnn_units, return_sequences=True, stateful=True),
        Dense(vocab_size)
    ])
    return model

In [130]:
#model = create_model(vocab_size=vocab_size,embedding_dim=embedding_dim,sequence_length=sequence_length)
model = tf.keras.models.load_model('letter_model.h5', compile=False)



In [94]:
for input_example_batch, target_example_batch in dataset.take(1):
    print("Input shape:", input_example_batch.shape)  # Verifica la forma de entrada
    example_batch_predictions = model(input_example_batch)
    print("Predictions shape:", example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

Input shape: (64, 100)
Predictions shape: (64, 100, 83) # (batch_size, sequence_length, vocab_size)


In [95]:
model.summary()

In [96]:
sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)
sampled_indices = tf.squeeze(sampled_indices, axis=-1).numpy()
sampled_indices

array([ 1, 60, 70, 30, 59, 67, 57, 74, 60, 75, 38, 65, 59, 63, 70, 60, 77,
       61,  1,  1, 61, 63, 77, 71, 61, 60,  1, 65, 74, 65, 60, 57, 81,  1,
        1, 42, 70, 59, 65, 70, 63,  1, 74,  1, 76, 71, 74,  1, 61, 74,  1,
       64, 57, 74, 65,  1, 76,  1, 57, 81,  1, 76, 77, 75,  1, 59, 65, 74,
       13, 81, 61,  1, 63, 61, 74, 70, 76, 57, 65, 70, 13, 73, 57, 77, 69,
       58, 61,  1, 57, 81, 76,  1, 62, 65, 70, 68, 76, 61, 60,  1],
      dtype=int64)

In [97]:
#loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)

In [98]:
#model.compile(optimizer='adam', loss=loss)

In [99]:
early_stopping_callback = tf.keras.callbacks.EarlyStopping(
    monitor='loss',
    patience=3)

reduce_lr_callback = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='loss',
    factor=0.2,
    patience=2,
    min_lr=0.0001)


In [100]:
EPOCHS = 60

In [101]:
#history = model.fit(dataset, epochs=EPOCHS, callbacks=[early_stopping_callback,reduce_lr_callback])

Resultado del modelo y test

In [102]:
def text_to_ids(text: str) -> Any:
    return ids_from_chars(tf.strings.unicode_split(text, 'UTF-8'))

# Función para convertir IDs en texto
def ids_to_text(ids: int) -> Any:
    return tf.strings.reduce_join(chars_from_ids(ids), axis=-1)


def generate_text(start_string: str, num_generate:int = 1000, temperature: float = 1.0) -> tf.Tensor:
    input_eval = text_to_ids(start_string)
    input_eval = tf.expand_dims(input_eval, 0)
    text_generated = []

    for _ in range(num_generate):
        predictions = model(input_eval)
        predictions = predictions[:, -1, :] / temperature  # Tomar la última predicción y aplicar temperatura
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()

        input_eval = tf.expand_dims([predicted_id], 0)
        text_generated.append(predicted_id)

    return start_string + ids_to_text(text_generated)


In [103]:
print(generate_text("Breaking News ", num_generate=1000, temperature=0.1))

tf.Tensor(b'Breaking News Corp. (NWS) said it will buy a 4.5 billion pound (\\$1. The story of the Baltimore Ravens had a season-high 35 points and the New York Yankees clinched their seventh Chinese Grand Prix on Sunday with a strained right hamstring injury, the first step in a state visit to the country to the polls in the country #39;s first direct presidential election. The state of America has announced that it has been sentenced to a four-year contract with the Securities and Exchange Commission and the United States and the United States to continue their production of a bankruptcy filing. The company said it will sell its stake in the company #39;s stock to avoid a report that the company said it would buy a new service that will allow the company to make it easier for a special edition of the internet service provider that it says is set to remain in the past 12 months as their number two priority. AP - The United States and the United States said it would seek a controversia

In [104]:
print(generate_text("Boston Celtics ", num_generate=250, temperature=0.1))

tf.Tensor(b'Boston Celtics to a 101-98 victory over the Minnesota Twins at the Madrid Masters Series on Sunday. AP - The United States and the United States said it would seek a controversial proposal to sell its stake in the company #39;s proposed merger with Apple Computer I', shape=(), dtype=string)


In [105]:
print(generate_text("United States ", num_generate=250, temperature=0.6))

tf.Tensor(b'United States against Saudi Arabia and said it will continue to inspire concerns about the U.S. economy and sending its strongest possible   third-quarter earnings, but the company could start to the previous year, and it #39;s scheduled to discontinue their compu', shape=(), dtype=string)


In [106]:
print(generate_text("Argentina ", num_generate=500, temperature=0.8))

tf.Tensor(b'Argentina is the biggest ring with the same time.  VIENNA (Reuters) - Floodgators - The US Supreme Court is making clients swirling into as many years of ways that he felt the prospect of integrating the US team that begins in one of the worst losing streak to a touchdown by Dallas  kingom Sunday and signed an agreement to acquire a 2005 model of its controversial competitors in the music business. A lacklustre war with the US auto insurance business software maker, more than expected in its first-half pr', shape=(), dtype=string)


In [107]:
print(generate_text("Lionel Messi ", num_generate=100, temperature=0.05))

tf.Tensor(b'Lionel Messi the second time in five years. A senior U.S. official said on Tuesday it would support the U.S. econ', shape=(), dtype=string)


In [108]:
#model.save("model_2.h5")

### Modelo de generacion de texto: Predecir palabra siguiente

In [109]:
def create_vocab_2(train_dataset: Any, vocab_size: int = 10000) -> Tuple[List[str], List[str]]:
    train_texts = []

    for text, _ in train_dataset:
        train_texts.append(text.numpy().decode('utf-8'))

    result_text = " ".join(train_texts)
    words = result_text.split()

    # Crear el vocabulario basado en la frecuencia de las palabras
    vocab_counter = Counter(words)
    vocab = sorted(vocab_counter, key=vocab_counter.get, reverse=True)[:vocab_size]

    return vocab, words

vocab_2, words_2 = create_vocab_2(dataset_train, vocab_size=10000)

In [110]:
print(f"Tamaño del vocabulario: {len(vocab_2)}")
print(f"Algunas palabras del vocabulario: {vocab_2[:10]}")

Tamaño del vocabulario: 10000
Algunas palabras del vocabulario: ['the', 'to', 'a', 'of', 'in', 'and', 'on', '-', 'for', 'that']


In [111]:
# Funciones de transformación de texto a IDs y viceversa
word_to_id = {u:i for i, u in enumerate(vocab_2)}
id_to_word = np.array(vocab_2)


In [112]:
def text_to_ids_2(text: str) -> List[int]:
    return [word_to_id.get(word, 0) for word in text.split()]  # Usar 0 para palabras fuera de vocabulario

def ids_to_text_2(ids: List[int]) -> str:
    return " ".join(id_to_word[ids])

# Crear secuencias de entrada y objetivo
def split_input_target(sequence: List[int]) -> Tuple[List[int], List[int]]:
    input_text = sequence[:-1]
    target_text = sequence[1:]
    return input_text, target_text

In [113]:
# Convertir las palabras en IDs
all_ids = text_to_ids_2(" ".join(words_2))
ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)


In [114]:
# Crear secuencias de longitud fija
seq_length = 100
sequences = ids_dataset.batch(seq_length + 1, drop_remainder=True)


In [115]:
# Dividir en entradas y objetivos
dataset = sequences.map(split_input_target)

# Configuración de entrenamiento
BATCH_SIZE = 64
BUFFER_SIZE = 10000
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True).prefetch(tf.data.experimental.AUTOTUNE)


In [116]:
# Longitud del vocabulario y dimensiones del embedding
vocab_size = len(vocab_2)
embedding_dim = 256
rnn_units = 1024
sequence_length = 100


In [117]:
def create_model_2(vocab_size: int, embedding_dim: int, sequence_length: int, batch_size: int) -> tf.keras.Model:
    model = Sequential([
        Input(batch_shape=(batch_size, sequence_length)),
        Embedding(vocab_size, embedding_dim),
        LSTM(rnn_units, return_sequences=True, stateful=True, recurrent_initializer='glorot_uniform'),
        Dense(vocab_size)
    ])
    return model

In [118]:
#model = create_model_2(vocab_size=vocab_size, embedding_dim=embedding_dim, sequence_length=sequence_length, batch_size=BATCH_SIZE)
#model.reset_states()  
model_2 = tf.keras.models.load_model('word_model_3.h5', compile=False)



In [119]:
# Compilar y entrenar el modelo
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)

model_2.compile(optimizer='adam', loss=loss)

early_stopping_callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=3)
reduce_lr_callback = tf.keras.callbacks.ReduceLROnPlateau(monitor='loss', factor=0.2, patience=2, min_lr=0.0001)


In [120]:
EPOCHS = 100
#history = model.fit(dataset, epochs=EPOCHS, callbacks=[early_stopping_callback, reduce_lr_callback])


In [122]:
gen_model = create_model_2(vocab_size=vocab_size, embedding_dim=embedding_dim, sequence_length=sequence_length, batch_size=1)
gen_model.set_weights(model_2.get_weights())


In [123]:
# Función para resetear los estados del RNN
def reset_lstm_states(model):
    for layer in model.layers:
        if isinstance(layer, LSTM):
            layer.reset_states()

In [124]:
# Función para generar texto
def generate_text_2(start_string: str, num_generate: int = 1000, temperature: float = 1.0) -> str:
    input_eval = text_to_ids_2(start_string)
    input_eval = tf.expand_dims(input_eval, 0)
    
    # Asegurar que la secuencia de entrada tenga la longitud correcta
    input_eval = tf.keras.preprocessing.sequence.pad_sequences(input_eval, maxlen=sequence_length, padding='pre')
    
    text_generated = []

    # Resetear los estados del RNN
    reset_lstm_states(gen_model)

    for _ in range(num_generate):
        predictions = gen_model(input_eval)
        predictions = predictions[:, -1, :] / temperature
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()

        # Añadir el id predicho a la secuencia de entrada
        input_eval = tf.concat([input_eval[:, 1:], tf.expand_dims([predicted_id], 0)], axis=-1)

        text_generated.append(predicted_id)

    return start_string + ' ' + ids_to_text_2(text_generated)



In [125]:
print(generate_text_2("Breaking News", num_generate=50, temperature=1))

Breaking News the the AP - Jets lost the Athens Games, and the is among the most New Zealand captain of the NHL season. By the the DALLAS (AP) -- the its relief the over China and home the the Corp., a global supplier of Global the Thursday night. One other the


In [126]:
print(generate_text_2("Boston Celtics", num_generate=50, temperature=0.9))

Boston Celtics manager at Madison Square Garden the : London. Mike the the quarterback chosen for the New York the managerial the the manager Jim the said that the only works the been room. The Iraqi National the offers stronger information about its security the The update includes a new format to


In [127]:
print(generate_text_2("United States", num_generate=50, temperature=1))

United States President George Bush signed an the law AP - The District of Columbia Council approved legislation to the the million dollars on Wednesday to build and build a robot from solar wind the By the the from Wired magazine. Storage vendors the and release of the today told the the


In [128]:
print(generate_text_2("Argentina", num_generate=50, temperature=0.8))

Argentina (Reuters) - Russian prosecutors have told the BBC that they hope to release French citizens and other ethnic groups in a the assault on the city of Fallujah after a school siege near the Afghan border. Astronomers say the arrest the the apparent of its the had been the the


In [129]:
print(generate_text_2("Lionel Messi", num_generate=50, temperature=1))

Lionel Messi Clinton the Chief of the Nigeria and Libya signed Friday the Israeli army the Air Force Virgin the flights and space astronauts on board this week will meet again today the the spacecraft the point guard in front of the players union at Oakland County #39;s the The trial came
