# **1. Deep Learning**


<!-- ## **1. Imports** -->

In [1]:
import pandas as pd
import numpy as np
import torch

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

from transformers import (
    DistilBertTokenizerFast,
    DistilBertForSequenceClassification,
    Trainer,
    TrainingArguments
)


  from .autonotebook import tqdm as notebook_tqdm


## **2.  Cargar dataset**

In [2]:
df = pd.read_csv("data/data_clean/train_clean.csv")

df = df[["content_clean", "bias"]].dropna()
df.head()


Unnamed: 0,content_clean,bias
0,besides his most recent trip to quetta mr raha...,0
1,poll prestigious colleges wo nt make you happi...,0
2,house speaker paul ryan at a private dinner ea...,2
3,cnn president donald trump has reason to hope ...,0
4,the controversial immigrationreform bill that ...,2


## **3. División**

In [3]:
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df["content_clean"].tolist(),
    df["bias"].tolist(),
    test_size=0.2,
    random_state=42,
    stratify=df["bias"]
)

train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts,
    train_labels,
    test_size=0.1,
    random_state=42,
    stratify=train_labels
)

print(f"Train: {len(train_texts)}")
print(f"Validation: {len(val_texts)}")
print(f"Test: {len(test_texts)}")
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df["content_clean"].tolist(),
    df["bias"].tolist(),
    test_size=0.2,
    random_state=42,
    stratify=df["bias"]
)

train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts,
    train_labels,
    test_size=0.1,
    random_state=42,
    stratify=train_labels
)

print(f"Train: {len(train_texts)}")
print(f"Validation: {len(val_texts)}")
print(f"Test: {len(test_texts)}")


Train: 20143
Validation: 2239
Test: 5596
Train: 20143
Validation: 2239
Test: 5596


## **4. Tokenization con DistilBERT**

In [4]:
tokenizer = DistilBertTokenizerFast.from_pretrained(
    "distilbert-base-uncased"
)

def tokenize(texts):
    return tokenizer(
        texts,
        truncation=True,
        padding=True,
        max_length=256
    )

train_encodings = tokenize(train_texts)
val_encodings   = tokenize(val_texts)
test_encodings  = tokenize(test_texts)


## **5. Dataset PyTorch**

In [5]:
class NewsDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        # Para cada ejemplo devolvemos los tensores de input_ids, attention_mask y la etiqueta
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

# Crear datasets de entrenamiento, validación y test
train_dataset = NewsDataset(train_encodings, train_labels)
val_dataset   = NewsDataset(val_encodings, val_labels)
test_dataset  = NewsDataset(test_encodings, test_labels)


## **6. Transformer**

In [6]:
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=3
)



Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## **7. Evaluación**

In [7]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred                # 1. Recibe la salida del modelo y las etiquetas reales
    preds = np.argmax(logits, axis=1)        # 2. Convierte los logits en la clase predicha
    
    acc = accuracy_score(labels, preds)      # 3. Calcula Accuracy
    f1  = f1_score(labels, preds, average="macro")  # 4. Calcula Macro-F1 (útil para clases desbalanceadas)
    
    return {
        "accuracy": acc,
        "macro_f1": f1
    }


## **8. Entrenamiento**

In [None]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    warmup_steps=20,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=1000,
    # evaluation_strategy="steps",   # <-- quitar
    save_steps=1000,
    eval_steps=1000,
    save_total_limit=1,
    no_cuda=True,                 
)


In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset.shuffle().select(range(500)),  # subset pequeño
    eval_dataset=val_dataset.shuffle().select(range(200)),     # subset pequeño
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


In [None]:
trainer.train()




Step,Training Loss


## **9. Evaluación final**