# Librerías

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
from transformers import pipeline
from huggingface_hub import Repository


# Carga del dataset

In [None]:

# Sarcastic vs. non-sarcastic news headlines, fetched through `load_dataset`.
# The original note quoted roughly 49,000 examples — TODO confirm against the
# actual split sizes.
DATASET_NAME = "raquiba/Sarcasm_News_Headline"
dataset = load_dataset(DATASET_NAME)


Repo card metadata block was not found. Setting CardData to empty.


# Preprocesamiento y tokenización.

In [None]:


# Name of the pretrained checkpoint the tokenizer is loaded from.
MODEL_NAME = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)  # Tokenizer of the base BERT checkpoint

def preprocess(batch):
    """Tokenize a batch of headlines.

    Args:
        batch: Mapping with a "headline" entry holding the texts to encode
            (a list of strings when used with ``map(..., batched=True)``).

    Returns:
        The tokenizer output for ``batch["headline"]``, truncated to at most
        128 tokens and left unpadded.
    """
    # Deliberately no padding here: padding every headline to 128 tokens makes
    # each training/evaluation step process mostly pad tokens. Padding is left
    # to batch time instead (Trainer pads each batch to its longest sequence
    # when it is given the tokenizer).
    return tokenizer(batch["headline"], truncation=True, max_length=128)

# Tokenize in batches, dropping the raw text/URL columns once they are encoded,
# then expose the target column as "labels" so it is compatible with Trainer.
dataset = (
    dataset
    .map(preprocess, batched=True, remove_columns=["headline", "article_link"])
    .rename_column("is_sarcastic", "labels")
)

# Return the listed columns in "torch" format.
dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])




Map:   0%|          | 0/26709 [00:00<?, ? examples/s]

# Configuración del modelo y entrenamiento.

In [None]:
# Load a sequence-classification model with 2 labels from the pretrained checkpoint
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
# Training configuration, grouped by purpose
args = TrainingArguments(
    output_dir="sarcasm-checkpoint",
    # --- optimisation schedule ---
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    # --- evaluate and checkpoint once per epoch, keep the most accurate one ---
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    # --- logging ---
    logging_strategy="steps",
    logging_steps=200,
)

def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: Object exposing ``predictions`` (per-class scores) and
            ``label_ids`` (reference labels), as handed over by ``Trainer``.

    Returns:
        A dict with a single "accuracy" entry: the fraction of examples whose
        highest-scoring class equals the reference label.
    """
    predicted = eval_pred.predictions.argmax(-1)
    return {"accuracy": (predicted == eval_pred.label_ids).mean()}


# Build the Trainer.
# NOTE(review): the "test" split is used both for per-epoch evaluation and for
# selecting the best checkpoint, so the final test accuracy is an optimistic
# estimate — consider carving a validation split out of "train".
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    # `tokenizer=` is deprecated in Trainer; `processing_class=` replaces it.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)
# Fine-tune
trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2236,0.07022,0.977386
2,0.0881,0.018745,0.99502
3,0.0253,0.006071,0.998615


TrainOutput(global_step=5367, training_loss=0.12810042949724135, metrics={'train_runtime': 8136.6382, 'train_samples_per_second': 10.552, 'train_steps_per_second': 0.66, 'total_flos': 5647481470010880.0, 'train_loss': 0.12810042949724135, 'epoch': 3.0})

# Evaluación y prueba

In [None]:
# Evaluate on the test set (the Trainer's eval_dataset).
# NOTE(review): the logged validation loss is below the training loss at every
# epoch and accuracy reaches ~99.9% — check that the test split shares no
# headlines with the train split before trusting this number.
metrics = trainer.evaluate()
print(metrics)  # Dict with eval_loss, eval_accuracy, runtime statistics, etc.


{'eval_loss': 0.0060714962892234325, 'eval_accuracy': 0.9986146991650754, 'eval_runtime': 760.3585, 'eval_samples_per_second': 35.127, 'eval_steps_per_second': 1.098, 'epoch': 3.0}


In [None]:

# Basic inference: build a text-classification pipeline from the published
# model and tokenizer.
# `return_all_scores` is deprecated in this pipeline; the default behaviour
# already returns only the top-scoring label, so the argument is omitted.
# NOTE(review): this reads "EARSV/sarcasm-detector" from the Hub, while the
# upload section comes later in the notebook — on a fresh run the repo must
# already contain the model; consider passing the in-memory model/tokenizer.
sarcasm_pipe = pipeline(
    "text-classification",
    model="EARSV/sarcasm-detector",
    tokenizer="EARSV/sarcasm-detector",
)
# Quick smoke test on hand-written headlines
examples = [
    "scientist discovers water is wet, wins nobel prize for groundbreaking revelation",
    "local man finally learns to parallel park after 20 years of daily practice",
    "city council announces new plan to solve traffic by building more traffic lights",
    "study confirms people who sleep 8 hours feel more rested",
    "ceo of fast-food chain urges employees to eat healthier during unpaid overtime",
    "new app reminds users to blink regularly while staring at screens",
    "weather forecast predicts rain during outdoor wedding, couple devastated",
    "man wins lifetime supply of broccoli, considers moving to another country",
    "government proposes tax on air to fund climate change initiatives",
    "new yoga studio opens downtown, offers free classes this weekend"
]

# The pipeline reports generic LABEL_<id> names, which read oddly after
# "Sarcástico:". Map them to a yes/no answer, assuming `is_sarcastic == 1`
# marks sarcastic headlines (TODO confirm against the dataset card).
# Label names not listed here are printed unchanged.
LABEL_NAMES = {"LABEL_0": "No", "LABEL_1": "Sí"}

for text in examples:
    res = sarcasm_pipe(text)[0]
    verdict = LABEL_NAMES.get(res["label"], res["label"])
    print(f"Texto: {text}\n  Sarcástico: {verdict} ({res['score']:.4f})\n")

Device set to use cuda:0


Texto: scientist discovers water is wet, wins nobel prize for groundbreaking revelation
  Sarcástico: LABEL_0 (0.9507)

Texto: local man finally learns to parallel park after 20 years of daily practice
  Sarcástico: LABEL_1 (0.9999)

Texto: city council announces new plan to solve traffic by building more traffic lights
  Sarcástico: LABEL_1 (0.9993)

Texto: study confirms people who sleep 8 hours feel more rested
  Sarcástico: LABEL_1 (0.9999)

Texto: ceo of fast-food chain urges employees to eat healthier during unpaid overtime
  Sarcástico: LABEL_1 (0.9999)

Texto: new app reminds users to blink regularly while staring at screens
  Sarcástico: LABEL_1 (0.9998)

Texto: weather forecast predicts rain during outdoor wedding, couple devastated
  Sarcástico: LABEL_0 (0.9976)

Texto: man wins lifetime supply of broccoli, considers moving to another country
  Sarcástico: LABEL_1 (0.9998)

Texto: government proposes tax on air to fund climate change initiatives
  Sarcástico: LABEL_0 (0.9969)

# Subir el modelo a Hugging Face

In [None]:
# Hub repository that receives the model, and its local working copy.
REPO_ID = "EARSV/sarcasm-detector"
LOCAL_DIR = "sarcasm-detector-local"

# Clone the Hub repo (or reuse the folder if it is already a clone).
# NOTE(review): huggingface_hub documents `Repository` as deprecated in favour
# of the HTTP-based `HfApi` methods (e.g. `upload_folder`) — consider migrating.
repo = Repository(
    local_dir=LOCAL_DIR,
    clone_from=REPO_ID
)
# When the folder already is a clone it may be behind the remote; sync first
# so the push below is not rejected.
repo.git_pull()

# Write the fine-tuned model and the tokenizer into the working copy
model.save_pretrained(f"{LOCAL_DIR}/")
tokenizer.save_pretrained(f"{LOCAL_DIR}/")

# Write the model card. The YAML block must be the very first thing in the
# file for the Hub to read it as model-card metadata; the bullet list keeps
# each fact on its own line when the markdown is rendered.
with open(f"{LOCAL_DIR}/README.md", "w", encoding="utf-8") as f:
    f.write("""\
---
language: en
library_name: transformers
pipeline_tag: text-classification
base_model: bert-base-uncased
datasets:
- raquiba/Sarcasm_News_Headline
metrics:
- accuracy
---

# Sarcasm Detector

Fine-tuned `bert-base-uncased` on raquiba/Sarcasm_News_Headline.

- **Exactitud (test)**: 99.86 %
- **Epochs**: 3
- **Batch size**: 16
- **Dataset**: titulares con etiqueta `is_sarcastic`
""")

# Commit and push
repo.push_to_hub(commit_message="Initial upload of sarcasm-detector")


d:\Copia de Seguridad\Documentos\Carrera Ciencia de Datos\8.Octavo Semestre\Procesamiento de Lenguaje Natural\Practicas\P6\sarcasm-detector-local is already a clone of https://huggingface.co/EARSV/sarcasm-detector. Make sure you pull the latest changes with `repo.git_pull()`.


Upload file model.safetensors:   0%|          | 1.00/418M [00:00<?, ?B/s]

remote: [33m-------------------------------------------------------------------------[0m        
remote: [33mhelp: https://huggingface.co/docs/hub/model-cards#model-card-metadata[0m        
remote: [33m-------------------------------------------------------------------------[0m        
remote: [32m-------------------------------------------------------------------------[0m        
remote: [32mPlease find the documentation at:[0m        
remote: [32mhttps://huggingface.co/docs/hub/model-cards#model-card-metadata[0m        
remote: [32m[0m        
remote: [32m-------------------------------------------------------------------------[0m        
To https://huggingface.co/EARSV/sarcasm-detector
   093b3f9..b630f4b  main -> main



'https://huggingface.co/EARSV/sarcasm-detector/commit/b630f4b8093b3e7507041edfbb7f6f0e342c6e54'