In [1]:
from huggingface_hub import login

# Cargamos el token de HuggingFace que lo tenemos en un fichero oculto e iniciamos sesión

with open("/home/adrian/Escritorio/DeepSexist/DatasetManagement/EXIST2025DatasetV0.3/huggingfaceToken.txt", "r") as file:
    hf_token = file.read().strip()

login(hf_token)

In [2]:
import torch
torch.cuda.empty_cache()


In [None]:
import pandas as pd
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments,
)
from peft import LoraConfig, get_peft_model
from trl import SFTTrainer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, f1_score

listado_etiquetas = ['IDEOLOGICAL-INEQUALITY','STEREOTYPING-DOMINANCE','OBJECTIFICATION','SEXUAL-VIOLENCE','MISOGYNY-NON-SEXUAL-VIOLENCE']
etiqueta = listado_etiquetas[0]

# 1. Configuración
model_name = "meta-llama/Llama-3.2-1B-Instruct"
device = "cuda" if torch.cuda.is_available() else "cpu"

# 2. Carga y formato del dataset
df = pd.read_csv("/home/adrian/Escritorio/DeepSexist/DatasetManagement/EXIST2025DatasetV0.3/EXIST 2025 Videos Dataset/training/EXIST2025_training_task3_3_ALL.csv")
df = df.rename(columns={str(etiqueta): "label"})

# División estratificada
train_df, test_df = train_test_split(df, test_size=0.1, stratify=df["label"], random_state=42)

def format_prompt(row):
    return f"""### Instrucción:
Decide si el siguiente mensaje tiene contenido de DESIGUALDAD IDEOLÓGICA: el texto desacredita el movimiento feminista, rechaza la desigualdad entre hombres y mujeres o presenta a los hombres como víctimas de la opresión de género. Responde solo con "1" si lo es o "0" si no lo es.
### Texto:
{row['text']}

### Respuesta:
{"YES" if row['label'] == 1 else "NO"}"""

# Aplicar formato a ambos splits
for split_df in (train_df, test_df):
    split_df["prompt"] = split_df.apply(format_prompt, axis=1)
    split_df["completion"] = split_df["label"].astype(str)

# Convertir a Dataset HuggingFace
train_dataset = Dataset.from_pandas(train_df[["prompt", "completion", "label"]])
test_dataset = Dataset.from_pandas(test_df[["prompt", "completion", "label"]])

# 3. Tokenizador
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# 4. Cuantización y carga del modelo
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
)

# 5. LoRA config
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, lora_config)

# 6. Preprocesamiento
def formatting_func(example):
    return f"{example['prompt']} {example['completion']}"

def preprocess(example):
    return tokenizer(
        formatting_func(example),
        truncation=True,
        padding="max_length",
        max_length=64,
    )

train_dataset = train_dataset.map(preprocess)
test_dataset = test_dataset.map(preprocess)

# 7. Argumentos de entrenamiento
training_args = TrainingArguments(
    output_dir="./outputs_llm_peft",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=3,
    logging_steps=10,
    save_strategy="epoch",
    report_to="none",
)

# 8. Entrenador
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    args=training_args,
    peft_config=lora_config,
    tokenizer=tokenizer,
)

trainer.train()




Map:   0%|          | 0/2257 [00:00<?, ? examples/s]

Map:   0%|          | 0/251 [00:00<?, ? examples/s]

  trainer = SFTTrainer(


Map:   0%|          | 0/2257 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/2257 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/2257 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/2257 [00:00<?, ? examples/s]

  0%|          | 0/6771 [00:00<?, ?it/s]

{'loss': 3.9265, 'grad_norm': 2.53337025642395, 'learning_rate': 4.992615566386059e-05, 'mean_token_accuracy': 0.3186500116428796, 'epoch': 0.0}
{'loss': 3.4969, 'grad_norm': 2.9931998252868652, 'learning_rate': 4.985231132772117e-05, 'mean_token_accuracy': 0.39275411641839775, 'epoch': 0.01}
{'loss': 3.5968, 'grad_norm': 3.1474406719207764, 'learning_rate': 4.977846699158175e-05, 'mean_token_accuracy': 0.3781121485097704, 'epoch': 0.01}
{'loss': 3.0993, 'grad_norm': 4.0319013595581055, 'learning_rate': 4.9704622655442326e-05, 'mean_token_accuracy': 0.4277257379504517, 'epoch': 0.02}
{'loss': 2.6994, 'grad_norm': 3.061086416244507, 'learning_rate': 4.963077831930291e-05, 'mean_token_accuracy': 0.5201531473749046, 'epoch': 0.02}
{'loss': 2.6263, 'grad_norm': 4.010173797607422, 'learning_rate': 4.955693398316349e-05, 'mean_token_accuracy': 0.5546146603514954, 'epoch': 0.03}
{'loss': 2.8511, 'grad_norm': 2.592853546142578, 'learning_rate': 4.948308964702408e-05, 'mean_token_accuracy': 0.5

TrainOutput(global_step=6771, training_loss=2.295069775189101, metrics={'train_runtime': 410.0615, 'train_samples_per_second': 16.512, 'train_steps_per_second': 16.512, 'total_flos': 8400770288640000.0, 'train_loss': 2.295069775189101})

In [5]:
# 1. Generar texto para cada entrada
from tqdm import tqdm

trainer.model.eval()
trainer.model.to("cuda")
tokenizer.padding_side = "left"

predictions = []
for example in tqdm(test_dataset):
    inputs = tokenizer(example["prompt"], return_tensors="pt", truncation=True, padding=True).to("cuda")
    outputs = trainer.model.generate(**inputs, max_new_tokens=10)
    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Extrae la última línea (respuesta del modelo)
    response = decoded.split("### Respuesta:")[-1].strip()
    predictions.append(1 if "YES" in response.upper() else 0)

# 2. Comparar con los verdaderos
from sklearn.metrics import classification_report

true_labels = test_df["label"].tolist()
print(classification_report(true_labels, predictions, digits=4))


  0%|          | 0/251 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  0%|          | 1/251 [00:00<00:57,  4.34it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  1%|          | 2/251 [00:00<00:42,  5.86it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  1%|          | 3/251 [00:00<00:38,  6.42it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  2%|▏         | 4/251 [00:00<00:35,  7.03it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  2%|▏         | 5/251 [00:00<00:34,  7.24it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  2%|▏         | 6/251 [00:00<00:32,  7.48it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  3%|▎         | 7/251 [00:01<00:32,  7.47it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  3%|▎         | 8/251 [00:01<00:31,  7.71it/s]Setting `pad_toke

              precision    recall  f1-score   support

           0     1.0000    0.8626    0.9262       131
           1     0.8696    1.0000    0.9302       120

    accuracy                         0.9283       251
   macro avg     0.9348    0.9313    0.9282       251
weighted avg     0.9376    0.9283    0.9281       251






Prediccion sobre el fichero de test

In [None]:
import pandas as pd
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments,
)
from peft import LoraConfig, get_peft_model
from trl import SFTTrainer

listado_etiquetas = ['IDEOLOGICAL-INEQUALITY','STEREOTYPING-DOMINANCE','OBJECTIFICATION','SEXUAL-VIOLENCE','MISOGYNY-NON-SEXUAL-VIOLENCE']
etiqueta = listado_etiquetas[0]

# 1. Configuración
model_name = "meta-llama/Llama-3.2-1B-Instruct"
device = "cuda" if torch.cuda.is_available() else "cpu"

# 2. Carga y formato del dataset de entrenamiento
df = pd.read_csv("/home/adrian/Escritorio/DeepSexist/DatasetManagement/EXIST2025DatasetV0.3/EXIST 2025 Videos Dataset/training/EXIST2025_training_task3_3_ALL.csv")
df = df.rename(columns={str(etiqueta): "label"})

def format_prompt(row):
    return f"""### Instrucción:
Te voy a enseñar frases que pueden pertenece a la clase de DESIGUALDAD IDEOLÓGICA o no y después te indico la respuesta
### Texto:
{row['text']}

### Respuesta:
{"YES" if row['label'] == 1 else "NO"}"""

df["prompt"] = df.apply(format_prompt, axis=1)
df["completion"] = df["label"].astype(str)
train_dataset = Dataset.from_pandas(df[["prompt", "completion"]])

# 3. Tokenizador
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# 4. Cuantización y carga del modelo
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
)

# 5. LoRA config
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, lora_config)

# 6. Tokenización del entrenamiento
def formatting_func(example):
    return f"{example['prompt']} {example['completion']}"

def preprocess(example):
    return tokenizer(
        formatting_func(example),
        truncation=True,
        padding="max_length",
        max_length=64,
    )

train_dataset = train_dataset.map(preprocess)

# 7. Argumentos de entrenamiento
training_args = TrainingArguments(
    output_dir="./outputs_llm_peft",
    per_device_train_batch_size=1,
    num_train_epochs=3,
    logging_steps=10,
    save_strategy="epoch",
    report_to="none",
)

# 8. Entrenador
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    args=training_args,
    peft_config=lora_config,
    tokenizer=tokenizer,
)

trainer.train()

# ================================================
# 9. Cargar y predecir sobre un dataset sin etiquetas
# ================================================

df_test = pd.read_json("/home/adrian/Escritorio/DeepSexist/DatasetManagement/EXIST2025DatasetV0.3/EXIST 2025 Videos Dataset/test/EXIST2025_test_clean.json", orient="index")
df_test["prompt"] = df_test["text"].apply(lambda x: f"""### Instrucción:
Decide si el siguiente mensaje tiene contenido de DESIGUALDAD IDEOLÓGICA: el texto desacredita el movimiento feminista, rechaza la desigualdad entre hombres y mujeres o presenta a los hombres como víctimas de la opresión de género. Responde solo con "1" si lo es o "0" si no lo es.
### Texto:
{x}

### Respuesta:""")
test_dataset = Dataset.from_pandas(df_test[["prompt"]])

# Tokenización
def preprocess_test(example):
    return tokenizer(
        example["prompt"],
        truncation=True,
        padding="max_length",
        max_length=64,
    )

test_dataset = test_dataset.map(preprocess_test)

# Predicción
trainer.model.eval()
trainer.model.to(device)
outputs = trainer.model.generate(
    input_ids=torch.tensor(test_dataset["input_ids"]).to(device),
    attention_mask=torch.tensor(test_dataset["attention_mask"]).to(device),
    max_new_tokens=10
)

decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
df_test["prediction"] = decoded_outputs

# Guardar resultados
df_test[["id_EXIST", "text", "prediction"]].to_csv("/home/adrian/Escritorio/DeepSexist/TrainingBooks/results/Task3.3/LLM_cuaderno/predicciones_test_" + etiqueta + ".csv", index=False)


ValueError: Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules in 32-bit, you need to set `llm_int8_enable_fp32_cpu_offload=True` and pass a custom `device_map` to `from_pretrained`. Check https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu for more details. 

: 

In [4]:
df_test.value_counts('prediction')

prediction
### Instrucción:\nDecide si el siguiente mensaje tiene contenido de DESIGUALDAD IDEOLÓGICA: el texto desacredita el movimiento feminista, rechaza la desigualdad entre hombres y mujeres o presenta a los hombres como víctimas de la opresión de género. Responde solo con "1"    674
Name: count, dtype: int64

In [5]:
from tqdm import tqdm

# Generación en lotes pequeños si es necesario
generated_outputs = []
batch_size = 2  # ajusta si tienes poca memoria

for i in tqdm(range(0, len(test_dataset), batch_size)):
    batch = test_dataset.select(range(i, min(i + batch_size, len(test_dataset))))
    inputs = tokenizer(
        batch["prompt"],
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=64
    ).to(device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=10,
            do_sample=False,
            temperature=0.0
        )
    decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    generated_outputs.extend(decoded)

# Limpieza y extracción de respuestas
def extract_label(text):
    text = text.strip().upper()
    if "YES" in text:
        return 1
    elif "NO" in text:
        return 0
    else:
        return -1  # valor desconocido

predicted_labels = [extract_label(output) for output in generated_outputs]

# Añadir al DataFrame
df_test["prediction_raw"] = generated_outputs
df_test["predicted_label"] = predicted_labels

# Guardar
df_test[["id_EXIST", "text", "predicted_label"]].to_csv("/home/adrian/Escritorio/DeepSexist/TrainingBooks/results/Task3.3/LLM_cuaderno/predicciones_test_" + etiqueta + ".csv", index=False)


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  0%|          | 1/337 [00:00<01:05,  5.11it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  1%|          | 2/337 [00:00<00:58,  5.71it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  1%|          | 3/337 [00:00<00:58,  5.73it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  1%|          | 4/337 [00:00<00:55,  6.03it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  1%|▏         | 5/337 [00:00<00:54,  6.07it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  2%|▏         | 6/337 [00:00<00:53,  6.21it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  2%|▏         | 7/337 [00:01<00:53,  6.16it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  2%|▏         | 8/337 [00:01<00:53,  6.20it/s]Setting `pad_token_id` to `eos_token_id`:None for open-e

In [6]:
df_test = pd.read_csv("/home/adrian/Escritorio/DeepSexist/TrainingBooks/results/Task3.3/LLM_cuaderno/predicciones_test_" + etiqueta + ".csv")

df_test.value_counts('predicted_label')

predicted_label
-1    674
Name: count, dtype: int64

In [None]:
df_test_json = df_test.to_json()

df_test_json

In [7]:
import pandas as pd
import json

# Cargar el CSV
#df = pd.read_csv("predicciones_test.csv")

# Convertir los valores
output = []
for _, row in df_test.iterrows():
    entry = {
        "test_case": "EXIST2025",
        "id": str(row["id_EXIST"]),
        "value": "YES" if row["predicted_label"] == 1 else "NO"
    }
    output.append(entry)

# Guardar como JSON
with open("/home/adrian/Escritorio/DeepSexist/TrainingBooks/results/Task3.3/LLM_cuaderno/predicciones_test_" + etiqueta + ".csv", "w") as f:
    json.dump(output, f, indent=2)

print("✅ Archivo guardado como 'predicciones_formato_EXIST2025.json'")


✅ Archivo guardado como 'predicciones_formato_EXIST2025.json'


In [8]:
df_json = pd.read_json("/home/adrian/Escritorio/DeepSexist/TrainingBooks/results/Task3.3/LLM_cuaderno/predicciones_test_" + etiqueta + ".csv")

df_json.value_counts('value')

value
NO    674
Name: count, dtype: int64

In [3]:
import pandas as pd
import torch
import re
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments,
)
from peft import LoraConfig, get_peft_model
from trl import SFTTrainer

listado_etiquetas = ['IDEOLOGICAL-INEQUALITY','STEREOTYPING-DOMINANCE','OBJECTIFICATION','SEXUAL-VIOLENCE','MISOGYNY-NON-SEXUAL-VIOLENCE']
etiqueta = listado_etiquetas[0]

# 1. Configuración
model_name = "meta-llama/Llama-3.2-1B-Instruct"
device = "cuda" if torch.cuda.is_available() else "cpu"

# 2. Carga y formato del dataset de entrenamiento
df = pd.read_csv("/home/adrian/Escritorio/DeepSexist/DatasetManagement/EXIST2025DatasetV0.3/EXIST 2025 Videos Dataset/training/EXIST2025_training_task3_3_ALL.csv")
df = df.rename(columns={str(etiqueta): "label"})

# Usar "1"/"0" en el prompt y completion
def format_prompt(row):
    return f"""### Instrucción:
    Decide si el siguiente mensaje tiene contenido de DESIGUALDAD IDEOLÓGICA: el texto desacredita el movimiento feminista, rechaza la desigualdad entre hombres y mujeres o presenta a los hombres como víctimas de la opresión de género. Responde solo con "1" si lo es o "0" si no lo es.
    ### Texto:
    {row['text']}

    ### Respuesta:
    {row['label']}"""

df["prompt"] = df.apply(format_prompt, axis=1)
df["completion"] = df["label"].astype(str)
train_dataset = Dataset.from_pandas(df[["prompt", "completion"]])

# 3. Tokenizador
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# 4. Cuantización y carga del modelo
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
)

# 5. LoRA config
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, lora_config)

# 6. Tokenización del entrenamiento
def formatting_func(example):
    return f"{example['prompt']} {example['completion']}"

def preprocess(example):
    return tokenizer(
        formatting_func(example),
        truncation=True,
        padding="max_length",
        max_length=64,
    )

train_dataset = train_dataset.map(preprocess)

# 7. Argumentos de entrenamiento
training_args = TrainingArguments(
    output_dir="./outputs_llm_peft",
    per_device_train_batch_size=1,
    num_train_epochs=3,
    logging_steps=10,
    save_strategy="epoch",
    report_to="none",
)

# 8. Entrenador
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    args=training_args,
    peft_config=lora_config,
    tokenizer=tokenizer,
)

trainer.train()

# ================================================
# 9. Cargar y predecir sobre un dataset sin etiquetas
# ================================================

df_test = pd.read_json("/home/adrian/Escritorio/DeepSexist/DatasetManagement/EXIST2025DatasetV0.3/EXIST 2025 Videos Dataset/test/EXIST2025_test_clean.json", orient="index")

df_test["prompt"] = df_test["text"].apply(lambda x: f"""### Instrucción:
Decide si el siguiente mensaje tiene contenido de DESIGUALDAD IDEOLÓGICA: el texto desacredita el movimiento feminista, rechaza la desigualdad entre hombres y mujeres o presenta a los hombres como víctimas de la opresión de género. Responde solo con "1" si lo es o "0" si no lo es.
### Texto:
{x}

### Respuesta:""")

test_dataset = Dataset.from_pandas(df_test[["prompt"]])

# Tokenización
def preprocess_test(example):
    return tokenizer(
        example["prompt"],
        truncation=True,
        padding="max_length",
        max_length=64,
    )

test_dataset = test_dataset.map(preprocess_test)

# 10. Predicción
trainer.model.eval()
trainer.model.to(device)

outputs = trainer.model.generate(
    input_ids=torch.tensor(test_dataset["input_ids"]).to(device),
    attention_mask=torch.tensor(test_dataset["attention_mask"]).to(device),
    max_new_tokens=5
)

decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)

# 11. Limpieza de las predicciones (extraer "1" o "0")
# def extract_label(text):
#     match = re.search(r"\b[01]\b", text)
#     return int(match.group()) if match else -1  # -1 si no se puede detectar

import re

def extract_label(text):
    # Limpia espacios y saltos de línea
    text = text.strip()

    # Intenta encontrar un 0 o 1 aislado, posiblemente entre comillas
    match = re.search(r'["\']?([01])["\']?', text)
    if match:
        return int(match.group(1))
    else:
        # Si no hay coincidencia clara, busca heurísticamente
        if "1" in text:
            return 1
        elif "0" in text:
            return 0
        return -1  # Fallback si no se puede determinar



df_test["predicted_label"] = [extract_label(out) for out in decoded_outputs]

# 12. Guardar resultados
df_test[["id_EXIST", "text", "predicted_label"]].to_csv(
    f"/home/adrian/Escritorio/DeepSexist/TrainingBooks/results/Task3.3/LLM_cuaderno/predicciones_test_{etiqueta}.csv",
    index=False
)

print(f"✅ Predicciones guardadas para la clase {etiqueta}")


Map:   0%|          | 0/2524 [00:00<?, ? examples/s]

  trainer = SFTTrainer(


Map:   0%|          | 0/2524 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/2524 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/2524 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/2524 [00:00<?, ? examples/s]

  0%|          | 0/7572 [00:00<?, ?it/s]

{'loss': 3.429, 'grad_norm': 2.184516429901123, 'learning_rate': 4.9933967247754884e-05, 'mean_token_accuracy': 0.4004102966430465, 'epoch': 0.0}
{'loss': 2.8565, 'grad_norm': 2.6146552562713623, 'learning_rate': 4.986793449550978e-05, 'mean_token_accuracy': 0.46811229239132646, 'epoch': 0.01}
{'loss': 2.8395, 'grad_norm': 2.256992816925049, 'learning_rate': 4.980190174326466e-05, 'mean_token_accuracy': 0.4909543594128105, 'epoch': 0.01}
{'loss': 2.7282, 'grad_norm': 2.2187490463256836, 'learning_rate': 4.973586899101955e-05, 'mean_token_accuracy': 0.5197358345432839, 'epoch': 0.02}
{'loss': 2.2355, 'grad_norm': 4.149733066558838, 'learning_rate': 4.9669836238774436e-05, 'mean_token_accuracy': 0.618944974115393, 'epoch': 0.02}
{'loss': 2.0137, 'grad_norm': 3.1141090393066406, 'learning_rate': 4.9603803486529324e-05, 'mean_token_accuracy': 0.6659987566408871, 'epoch': 0.02}
{'loss': 1.593, 'grad_norm': 3.1099398136138916, 'learning_rate': 4.9537770734284205e-05, 'mean_token_accuracy': 0

Map:   0%|          | 0/674 [00:00<?, ? examples/s]

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


✅ Predicciones guardadas para la clase IDEOLOGICAL-INEQUALITY


In [4]:
df_test.value_counts('predicted_label')

predicted_label
-1    674
Name: count, dtype: int64

Formato de la subida *PyEvall*

In [18]:
from pyevall.evaluation import PyEvALLEvaluation
from pyevall.utils.utils import PyEvALLUtils

predictions = "/home/adrian/Escritorio/DeepSexist/TrainingBooks/results/exist2025_I2C-UHU-Sirius/task3_1_hard_I2C-UHU-Sirius_1/Llama-3.2-1B-Instruct.json"
#gold = "/home/adrian/Escritorio/DeepSexist/TrainingBooks/results/exist2025_I2C-UHU-Sirius/task3_1_hard_I2C-UHU-Sirius_1/prediction.json"
gold = "/home/adrian/Escritorio/DeepSexist/DatasetManagement/EXIST2025DatasetV0.3/evaluation/golds/EXIST2025_training_task3_1_gold_hard.json"
test = PyEvALLEvaluation()
params= dict()
params[PyEvALLUtils.PARAM_REPORT]= PyEvALLUtils.PARAM_OPTION_REPORT_EMBEDDED
metrics=["ICM", "ICMNorm" ,"FMeasure"]
report= test.evaluate(predictions, gold, metrics, **params)
report.print_report()

2025-05-18 14:18:46,153 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICM', 'ICMNorm', 'FMeasure']
2025-05-18 14:18:46,230 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2025-05-18 14:18:46,487 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM Normalized evaluation method
2025-05-18 14:18:46,488 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2025-05-18 14:18:46,741 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2025-05-18 14:18:47,183 - pyevall.metrics.metrics - INFO -             evaluate() - Executing fmeasure evaluation method
cargado 29
{
  "metrics": {
    "ICM": {
      "name": "Information Contrast model",
      "acronym": "ICM",
      "description": "Coming soon!",
      "status": "OK",
      "results": {
        "test_cases": [{
          "name": "EXIST2025",
          "average