In [11]:
import re
import sys
assert sys.version_info >= (3, 5)

import sklearn
import tensorflow as tf
from tensorflow import keras

# Common imports
import numpy as np
import os
import pandas as pd


def _major_minor(version):
    """Return the leading ``(major, minor)`` of a version string as ints.

    Raises:
        ValueError: If ``version`` does not start with ``<digits>.<digits>``.
    """
    match = re.match(r"(\d+)\.(\d+)", version)
    if match is None:
        raise ValueError("unrecognised version string: {!r}".format(version))
    return int(match.group(1)), int(match.group(2))


# Compare versions numerically. Comparing the raw strings is lexicographic,
# so "0.9" >= "0.20" would pass and "10.0" >= "2.0" would fail.
assert _major_minor(sklearn.__version__) >= (0, 20)
assert _major_minor(tf.__version__) >= (2, 0)

# Seed both libraries so that runs are repeatable.
np.random.seed(42)
tf.random.set_seed(42)


def train_rnn_model(texto, n_steps=100, batch_size=32, epochs=4, verbose=1):
    """Train a character-level GRU model that predicts the next character.

    The text is encoded with the notebook-level ``tokenizer`` (which must
    already be fitted) and one-hot encoded with depth ``max_id`` (also a
    notebook-level name). The first 90% of the encoded characters are cut
    into overlapping windows of ``n_steps + 1`` characters; within each
    window the target is the input shifted one character ahead.

    Args:
        texto: Text to train on, as a single string.
        n_steps: Number of input characters per training window.
        batch_size: Number of windows per batch.
        epochs: Number of passes over the training windows.
        verbose: Forwarded to ``model.fit``. Pass ``2`` to print one line per
            epoch instead of a progress bar, which can exceed Jupyter's
            IOPub message rate on long epochs.

    Returns:
        The trained Keras model.

    Raises:
        ValueError: If the training split is shorter than a single window.
    """
    # --- Encode text to integer sequence and shift to start from 0 ---
    [encoded] = np.array(tokenizer.texts_to_sequences([texto])) - 1

    # --- Prepare training dataset (use 90% of the characters of *this* text) ---
    # Derive the size from the encoded text itself rather than from a global,
    # so the split is still 90% when `texto` is not the tokenizer's corpus.
    train_size = len(encoded) * 90 // 100
    window_length = n_steps + 1                  # Input length + 1-step-ahead target
    if train_size < window_length:
        raise ValueError(
            "texto is too short: the training split has {} characters but one "
            "window needs {}".format(train_size, window_length))
    dataset = tf.data.Dataset.from_tensor_slices(encoded[:train_size])

    # --- Create overlapping windows (n_steps inputs + 1 target) ---
    dataset = dataset.window(window_length, shift=1, drop_remainder=True)
    # window() yields nested datasets; batching each one flattens it to a tensor.
    dataset = dataset.flat_map(lambda window: window.batch(window_length))

    # --- Shuffle and batch the dataset ---
    np.random.seed(42)
    tf.random.set_seed(42)
    dataset = dataset.shuffle(10000).batch(batch_size)

    # --- Split into (X, Y) where Y is X shifted one character ahead ---
    dataset = dataset.map(lambda windows: (windows[:, :-1], windows[:, 1:]))

    # --- One-hot encode the input; targets stay as integer indices ---
    dataset = dataset.map(
        lambda X_batch, Y_batch: (tf.one_hot(X_batch, depth=max_id), Y_batch))

    # --- Prefetch for performance (asynchronous loading) ---
    dataset = dataset.prefetch(1)

    # --- Print shape of one batch to verify format ---
    for X_batch, Y_batch in dataset.take(1):
        print(X_batch.shape, Y_batch.shape)

    # --- Define a simple RNN model using two GRU layers ---
    model = keras.models.Sequential([
        keras.layers.GRU(128, return_sequences=True, input_shape=[None, max_id],
                         dropout=0.2),  # Optional: recurrent_dropout can be added
        keras.layers.GRU(128, return_sequences=True,
                         dropout=0.2),  # Optional: recurrent_dropout can be added
        keras.layers.TimeDistributed(
            keras.layers.Dense(max_id, activation="softmax"))
    ])

    # --- Compile and train the model ---
    # Sparse loss because the targets are integer ids, not one-hot vectors.
    model.compile(loss="sparse_categorical_crossentropy", optimizer="adam")
    model.fit(dataset, epochs=epochs, verbose=verbose)

    return model

def preprocess(texts):
    """One-hot encode a list of texts for the character-level model.

    Each text is converted to character ids with the notebook-level
    ``tokenizer``, the ids are shifted down by one, and the result is
    one-hot encoded with depth ``max_id``.
    """
    char_ids = tokenizer.texts_to_sequences(texts)
    shifted_ids = np.array(char_ids) - 1
    return tf.one_hot(shifted_ids, depth=max_id)

def next_char(text, temperature=1):
    """Sample the character that follows ``text`` from the trained model.

    Uses the notebook-level ``model`` and ``tokenizer``. The model's
    probabilities for the last time step are turned into log-probabilities,
    divided by ``temperature`` and sampled from.

    Args:
        text: Seed text whose next character is sampled.
        temperature: Divisor applied to the log-probabilities before
            sampling. Must be strictly positive.

    Returns:
        The sampled character, decoded by the tokenizer.

    Raises:
        ValueError: If ``temperature`` is not strictly positive.
    """
    # `not (t > 0)` also rejects NaN. Dividing by zero or a negative number
    # would produce inf/NaN or inverted logits and make the sampling undefined.
    if not temperature > 0:
        raise ValueError(
            "temperature must be > 0, got {!r}".format(temperature))
    X_new = preprocess([text])
    # Keep only the prediction for the last time step (shape [1, max_id]).
    y_proba = model(X_new)[0, -1:, :]
    rescaled_logits = tf.math.log(y_proba) / temperature
    # +1 undoes the "- 1" shift applied when the text was encoded.
    char_id = tf.random.categorical(rescaled_logits, num_samples=1) + 1
    return tokenizer.sequences_to_texts(char_id.numpy())[0]

def complete_text(text, n_chars=50, temperature=1):
    """Extend ``text`` by ``n_chars`` characters sampled one at a time.

    Every sampled character is appended before the next one is drawn, so
    each call to ``next_char`` sees everything generated so far.
    """
    generated = text
    for _step in range(n_chars):
        generated = generated + next_char(generated, temperature)
    return generated

In [2]:
# --- Load each news category and tag it with an integer class label ---
df_educacion = pd.read_csv("noticias_educacion_sample.csv")
df_educacion['clase'] = 0
df_politica = pd.read_csv("noticias_politica_sample.csv")
df_politica['clase'] = 1
df_deportes = pd.read_csv("noticias_deportes_sample.csv")
df_deportes['clase'] = 2
df_economia = pd.read_csv("noticias_economia_sample.csv")
df_economia['clase'] = 3

# --- Combine the categories and drop rows with missing values ---
# drop=True discards the old per-file row numbers; without it they would be
# kept in the combined frame as an extra "index" column.
df = (
    pd.concat([df_educacion, df_politica, df_deportes, df_economia])
    .dropna()
    .reset_index(drop=True)
)

# --- Build one corpus string from every article body ---
todas_noticias = " ".join(df['content'])
len(todas_noticias)

8255754

In [12]:
# --- Character-level tokenizer fitted on the whole corpus ---
# The corpus is passed as a plain string (not a list of texts), and
# document_count is then read back as the character count.
tokenizer = keras.preprocessing.text.Tokenizer(char_level=True)
tokenizer.fit_on_texts(todas_noticias)

dataset_size = tokenizer.document_count      # Total number of characters
max_id = len(tokenizer.word_index)           # Number of distinct characters



In [75]:
# Sanity check: total number of characters seen by the tokenizer.
dataset_size

8255754

In [74]:
# Sanity check: number of distinct characters (the one-hot depth).
max_id

146

In [5]:
# Train on the full corpus. The result must be bound to the global name
# `model`, because next_char() reads it from the notebook namespace.
model = train_rnn_model(todas_noticias)

I0000 00:00:1747223349.518683  364696 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 699 MB memory:  -> device: 0, name: Tesla T4, pci bus id: 0000:00:1e.0, compute capability: 7.5


(32, 100, 146) (32, 100)


2025-05-14 11:49:11.846399: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Epoch 1/4


I0000 00:00:1747223356.963656  366135 cuda_dnn.cc:529] Loaded cuDNN version 90501
I0000 00:00:1747223357.892085  366132 service.cc:148] XLA service 0x7f8e51c05e10 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1747223357.892117  366132 service.cc:156]   StreamExecutor device (0): Tesla T4, Compute Capability 7.5
2025-05-14 11:49:17.898190: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1747223357.988756  366132 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


 193925/Unknown - 2006s 10ms/step - loss: 1.4393

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)





In [76]:
# Generate the default 50 extra characters from a short seed at temperature 1.
print(complete_text("el tiempo", temperature=1))

el tiempo que serán enviser que 47.024 unidades que pasa un


In [36]:
# Same sampling settings with a longer seed.
print(complete_text("mi nombre es gustavo", temperature=1))

mi nombre es gustavo petro, goupo régliza. la inversión. además, se de


In [86]:
# Temperature 0.1 divides the log-probabilities by a small number before
# sampling; the output below is visibly more repetitive.
print(complete_text("gustavo bolivar", temperature=0.1))

gustavo bolivar de la compañía de la compañía de la compañía de l
