In [1]:
import pandas as pd

df_educacion = pd.read_csv("noticias_educacion_sample.csv")
df_educacion['clase'] = 0
df_politica = pd.read_csv("noticias_politica_sample.csv")
df_politica['clase'] = 1
df_deportes = pd.read_csv("noticias_deportes_sample.csv")
df_deportes['clase'] = 2
df_economia = pd.read_csv("noticias_economia_sample.csv")
df_economia['clase'] = 3

In [2]:
df = pd.concat([df_educacion, df_politica, df_deportes, df_economia]).dropna().reset_index()
df

Unnamed: 0,index,content,date,headline,description,clase
0,0,Como parte de la política de puertas abiertas ...,2022-02-08T19:12:01.737Z,La CAN abre convocatorias para pasantías en Co...,La Comunidad Andina de Naciones abrió la posib...,0
1,1,"El programa, que cumple 30 años desde su prime...",2022-05-14T18:02:23.629Z,Colfuturo apoyará a 1.526 profesionales colomb...,"Los beneficiarios, en su mayoría, realizaron e...",0
2,2,Estudiar una carrera universitaria en Colombia...,2022-10-19T09:45:01.712Z,¿Cómo estudiar becado en la mejor universidad ...,"Según el ranking de Times Higher Education, la...",0
3,3,Escuche aquí el episodio número 27 de Finanzas...,2021-04-07T17:56:34.238Z,Consejos para financiar con inteligencia sus e...,Si estudiar es uno de sus principales objetivo...,0
4,4,Durante el último año de la carrera universita...,2022-04-02T18:08:22.865Z,Pruebas Saber Pro: el listado de universidades...,Las universidades públicas presentaron preocup...,0
...,...,...,...,...,...,...
1928,495,Colombia sigue aumentando su endeudamiento ext...,2023-02-10T23:08:47.922Z,"Deuda externa de Colombia representó el 52,8% ...",Así lo deja en evidencia el más reciente repor...,3
1929,496,La Agencia de Estados Unidos para el Desarroll...,2022-09-28T17:00:15.603Z,Lanzan convocatoria para apoyar a más de mil o...,La Usaid estará al frente de este proceso que ...,3
1930,497,La inflación es uno de los mayores retos que e...,2023-02-25T03:41:20.639Z,Controlar la inflación no será tan fácil como ...,El aumento en los precios será una constante e...,3
1931,498,23 lugares icónicos de Cúcuta fueron decorados...,2022-12-07T17:16:46.317Z,Reapertura económica en la frontera: artesanas...,Cúcuta prepara la Ruta Navideña luego de haber...,3


# BERT:

In [7]:
import pandas as pd
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
import tensorflow as tf
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score

# 1. Load and prepare your dataset
df = df[['content', 'clase']].dropna().reset_index(drop=True)

# 2. Split into train/val/test
train_df, temp_df = train_test_split(df, test_size=0.3, stratify=df['clase'], random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, stratify=temp_df['clase'], random_state=42)

# 3. Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# 4. Tokenization function
def encode_texts(texts, labels):
    tokens = tokenizer(
        list(texts),
        padding="max_length",
        truncation=True,
        max_length=128,
        return_tensors="tf"
    )
    return tokens, tf.convert_to_tensor(labels)

# 5. Encode splits
train_tokens, train_labels = encode_texts(train_df['content'], train_df['clase'])
val_tokens, val_labels = encode_texts(val_df['content'], val_df['clase'])
test_tokens, test_labels = encode_texts(test_df['content'], test_df['clase'])

# 6. Create tf.data.Dataset
def make_tf_dataset(tokens, labels, batch_size=16, shuffle=False):
    dataset = tf.data.Dataset.from_tensor_slices((dict(tokens), labels))
    if shuffle:
        dataset = dataset.shuffle(1000)
    return dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)

tf_train = make_tf_dataset(train_tokens, train_labels, shuffle=True)
tf_val = make_tf_dataset(val_tokens, val_labels)
tf_test = make_tf_dataset(test_tokens, test_labels)

# 7. Load BERT model
model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=4)

# 8. Compile the model
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=[tf.keras.metrics.SparseCategoricalAccuracy()]
)

# 9. Train
model.fit(tf_train, validation_data=tf_val, epochs=4)

# 10. Evaluate
results = model.evaluate(tf_test)
print(f"\nTest loss: {results[0]:.4f}, Test accuracy: {results[1]:.4f}")

# 11. Predict on test set
y_test = test_labels.numpy()
logits = model.predict(tf_test).logits
y_pred = np.argmax(logits, axis=1)

# 12. Compute precision, recall, F1 score for each class
num_classes = len(set(y_test))
for i in range(num_classes):
    class_predicted = [1 if x == i else 0 for x in y_pred]
    class_real = [1 if x == i else 0 for x in y_test]
    precision = precision_score(class_real, class_predicted, zero_division=0)
    recall = recall_score(class_real, class_predicted, zero_division=0)
    f1 = f1_score(class_real, class_predicted, zero_division=0)
    print(f"Class {i}: Precision: {precision:.2f}, Recall: {recall:.2f}, F1 score: {f1:.2f}")

# 13. Optional: Predict on a new sentence
test_text = ["La universidad abrió una nueva convocatoria para becas internacionales."]
encoded = tokenizer(test_text, truncation=True, padding=True, max_length=128, return_tensors="tf")
logits = model(encoded).logits
probs = tf.nn.softmax(logits, axis=-1)
predicted_class = tf.argmax(probs, axis=1).numpy()[0]
print(f"\nPredicted class for example text: {predicted_class}")


All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4

Test loss: 0.5328, Test accuracy: 0.8241
Class 0: Precision: 0.83, Recall: 0.89, F1 score: 0.86
Class 1: Precision: 0.90, Recall: 0.59, F1 score: 0.72
Class 2: Precision: 0.94, Recall: 1.00, F1 score: 0.97
Class 3: Precision: 0.67, Recall: 0.81, F1 score: 0.73

Predicted class for example text: 0


# GPT-2:

In [3]:
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    TrainingArguments,
    Trainer
)
import torch

# 1. Load your dataset
df = df[['content']].dropna().reset_index(drop=True)
dataset = Dataset.from_pandas(df)

# 2. Load tokenizer and prepare text
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    return tokenizer(examples["content"], truncation=True, padding="max_length", max_length=128)

tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])

# 3. Load model
model = AutoModelForCausalLM.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer))

# 4. Data collator for causal LM
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

# 5. Training arguments
training_args = TrainingArguments(
    output_dir="./gpt2-finetuned",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=500,
    save_total_limit=2,
    prediction_loss_only=True,
    logging_steps=100,
    fp16=torch.cuda.is_available(),
    report_to="none"
)

# 6. Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# 7. Train the model
trainer.train()

# 8. Save the model
trainer.save_model("./gpt2-finetuned")
tokenizer.save_pretrained("./gpt2-finetuned")

# 9. Generate new text (inference)
def generate_text(prompt, max_length=100, temperature=0.9):
    input_ids = tokenizer.encode(prompt, return_tensors="pt")
    input_ids = input_ids.to(model.device)
    output = model.generate(
        input_ids,
        max_length=max_length,
        do_sample=True,
        temperature=temperature,
        top_k=50,
        top_p=0.95,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id
    )
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Example generation
print(generate_text("Como parte de la política"))


2025-05-22 14:46:20.916509: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-05-22 14:46:20.932744: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747925180.952720   88939 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747925180.958945   88939 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1747925180.974985   88939 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

Map:   0%|          | 0/1933 [00:00<?, ? examples/s]

  trainer = Trainer(
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
100,4.3408
200,3.9419
300,3.8218
400,3.6817
500,3.5964
600,3.4018
700,3.3751
800,3.3597
900,3.3245
1000,3.2946


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.



📝 Generated Text Example:
Como parte de la política, las oportunidades especialmente para el pasado 15 de julio en la primera final de la selección del lugar, conocupación y un trino de los grandes grandes que se habilidades. El trino de los grandes se han varios afectados para el año de los parte de los colombianos de los est


In [7]:
print(generate_text("Gustavo Petro y el Pacto"))

Gustavo Petro y el Pacto Histórico más de SEMANA por el poco de nada por el Gobierno. Al tiempo con una luz de las últimas horas, el sesión se el acompañadores de la historia sobre los equipos de la empresa de la Universidad Nacional del Estado, Londres-Cámara. Poco no est
