# Taller I2C: Entrenamiento con LLMs

## Instalación de dependencias y preparación del kernel

In [None]:
# !pip install -q -U torch=='2.0.0'

In [None]:
# !pip install -q -U accelerate=='0.25.0' peft=='0.7.1' bitsandbytes=='0.41.3.post2' transformers=='4.36.1' trl=='0.7.4'

In [None]:
!pip install bitsandbytes
!pip install pytorch_lightning
!pip install datasets
!pip install trl

#### Configuración de variables de entorno

In [None]:
import os

# Process-wide switches, set before torch / tokenizers are imported below.
os.environ.update(
    {
        "CUDA_VISIBLE_DEVICES": "0",
        "TOKENIZERS_PARALLELISM": "false",
        "TF_DETERMINISTIC_OPS": "1",
    }
)

In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
import numpy as np
import pandas as pd
import os
import random
import bitsandbytes as bnb
import torch
import torch.nn as nn
import transformers
import matplotlib.pyplot as plt
import seaborn as sns
from pytorch_lightning import seed_everything
from tqdm import tqdm
from datasets import Dataset
from peft import LoraConfig, PeftConfig
from trl import SFTTrainer
from transformers import (AutoModelForCausalLM,
                          AutoTokenizer,
                          BitsAndBytesConfig,
                          TrainingArguments,
                          pipeline,
                          logging,
                          EarlyStoppingCallback)
from sklearn.metrics import (accuracy_score,
                             classification_report,
                             confusion_matrix)
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc
from huggingface_hub import login

In [None]:
# One seed shared by every random number generator used in the notebook.
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)
# Request deterministic cuDNN behaviour and turn off its benchmark mode.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
# Use seed_val here too (it used to be a separate hard-coded 42), so changing
# the seed in one place changes it everywhere.
seed_everything(seed_val, workers=True)
# Do not truncate long text cells when displaying DataFrames.
pd.set_option('display.max_colwidth', None)

In [None]:
# Check for a GPU and always define `device`, so later cells can rely on it
# on CPU-only runtimes as well (it used to be set only on the GPU branch).
if torch.cuda.device_count() > 0:
    print(f'GPU detected. Currently using: "{torch.cuda.get_device_name(0)}"')
    device = torch.device("cuda")
else:
    print('Currently using CPU. To utilize GPU acceleration, change the runtime type in the \'runtime\' tab.')
    device = torch.device("cpu")

## Preparación del conjunto de datos

#### Carga de datos

In [None]:
# Mount Google Drive at /content/drive; the dataset CSVs and every output
# folder used below live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# # Dataset Refugiados
# train_filename = "/content/drive/MyDrive/Colab Notebooks/Talleres/LLMs/data/refugiados_train_df.csv"
# test_filename = "/content/drive/MyDrive/Colab Notebooks/Talleres/LLMs/data/refugiados_test_df.csv"
# valid_filename = "/content/drive/MyDrive/Colab Notebooks/Talleres/LLMs/data/refugiados_valid_df.csv"
# campo_texto = 'text'
# campo_etiqueta = 'label'
# clase_0 = 'NO'
# clase_1 = 'SI'

In [None]:
# "Alimenticio" dataset: CSV locations, column names and class labels.
_data_dir = "/content/drive/MyDrive/Colab Notebooks/Talleres/LLMs/data"
train_filename = f"{_data_dir}/train_df.csv"
test_filename = f"{_data_dir}/test_df.csv"
valid_filename = f"{_data_dir}/valid_df.csv"
# Names of the text and label columns in those CSVs.
campo_texto, campo_etiqueta = 'text', 'label'
# Label strings for the negative (0) and positive (1) class.
clase_0, clase_1 = '0', '1'

In [None]:
# # Dataset HomoMEX
# train_filename = "/content/drive/MyDrive/Colab Notebooks/Talleres/LLMs/data/homomex_train_df.csv"
# test_filename = "/content/drive/MyDrive/Colab Notebooks/Talleres/LLMs/data/homomex_test_df.csv"
# valid_filename = "/content/drive/MyDrive/Colab Notebooks/Talleres/LLMs/data/homomex_valid_df.csv"
# campo_texto = 'content'
# campo_etiqueta = 'label'
# clase_0 = 'NP'
# clase_1 = 'P'

In [None]:
# Load the train, test and validation splits.
def load_data(train_file, test_file, valid_file):
    """Read the three CSV splits, keeping only the text and label columns.

    Column names come from the notebook globals ``campo_texto`` and
    ``campo_etiqueta``. Bytes that are not valid UTF-8 are replaced rather
    than raising.

    Returns:
        Tuple ``(train, test, validation)`` of DataFrames.
    """
    def _read_split(path):
        frame = pd.read_csv(path, sep=',', encoding="utf-8", encoding_errors="replace")
        return frame[[campo_texto, campo_etiqueta]]

    return _read_split(train_file), _read_split(test_file), _read_split(valid_file)


train_df, test_df, valid_df = load_data(train_filename, test_filename, valid_filename)

In [None]:
# def filter_classes(df, classes_to_keep):
#     return df[df[campo_etiqueta].isin(classes_to_keep)]

# # Filtrar datasets para quedarnos con las clases 0 y 1
# train_df = filter_classes(train_df, [clase_0, clase_1])
# test_df = filter_classes(test_df, [clase_0, clase_1])
# valid_df = filter_classes(valid_df, [clase_0, clase_1])

In [None]:
# Working names for the three splits, plus the ground-truth test labels.
X_train = train_df
X_test = test_df
X_eval = valid_df
Y_true = X_test[campo_etiqueta]

#### Mezclar y reorganizar los datos

In [None]:
# Shuffle a split and give it a fresh 0..n-1 index.
def shuffle_and_reset_index(data, seed = 10):
    """Return a row-shuffled copy of ``data`` with a new consecutive index.

    Args:
        data: DataFrame to shuffle.
        seed: Random state passed to ``DataFrame.sample``.
    """
    shuffled = data.sample(frac = 1, random_state = seed)
    return shuffled.reset_index(drop = True)

# Shuffle train and validation; the test split keeps its row order and only
# gets a fresh index.
X_train, X_eval = (shuffle_and_reset_index(split) for split in (X_train, X_eval))
X_test = X_test.reset_index(drop = True)

#### Visualización de la distribución de clases

In [None]:
# Print the class distribution of a split.
def show_class_distribution(data, name):
    """Print how many rows of ``data`` carry each label value.

    Args:
        data: DataFrame containing the label column ``campo_etiqueta``.
        name: Name of the split, shown in the printed header.
    """
    header = f"\nDistribución de clases en el conjunto de {name}:"
    print(header)
    counts = data[campo_etiqueta].value_counts()
    print(counts)

# Report the label balance of each split, in train / test / validation order.
for _split, _split_name in (
    (X_train, "entrenamiento"),
    (X_test, "prueba"),
    (X_eval, "validación"),
):
    show_class_distribution(_split, _split_name)

#### Generación de Prompts

In [None]:
# Index into `prompts` of the instruction used throughout the notebook.
i_prompt = 0

# Prompt 0: eating-disorder detection, answers "1" / "0".
_prompt_trastorno = """
            [INST]
            Analiza el siguiente tweet para determinar si la persona que lo escribió muestra indicios de un trastorno alimenticio.
            Considera el contenido, tono y posibles referencias a hábitos alimenticios, percepción corporal o conducta relacionada con la alimentación.
            Devuelve exclusivamente **solo** la etiqueta "1" si hay signos de un trastorno alimenticio, o "0" si no los hay. No incluyas texto adicional.
            [/INST]"""

# Prompt 1: hate detection, answers "SI" / "NO".
_prompt_odio = """
            [INST]
            Analiza el siguiente tweet para determinar si contiene odio o no.
            Devuelve exclusivamente **solo** la etiqueta "SI" si el tweet incluye odio o "NO" si no lo incluye, sin texto adicional.
            [/INST]
            """

prompts = [_prompt_trastorno, _prompt_odio]

In [None]:
# Funciones para generar los prompts de entrenamiento y prueba
def generate_prompt(data_point, prompt):
    """Build a training example: instruction, tagged tweet and its gold label.

    Args:
        data_point: Row exposing the ``campo_texto`` and ``campo_etiqueta`` fields.
        prompt: Instruction text placed before the tweet.

    Returns:
        The assembled prompt with surrounding whitespace stripped.
    """
    text = data_point[campo_texto]
    label = data_point[campo_etiqueta]
    pieces = (
        "\n            ",
        f"{prompt}",
        "\n\n            ",
        f"[TWEET: {text}] = {label} ",
    )
    return "".join(pieces).strip()

def generate_test_prompt(data_point, prompt):
    """Build an inference prompt: instruction and tagged tweet, label left blank.

    The tweet is wrapped as ``[TWEET: ...] =`` exactly as in the training
    prompts, so the model is queried in the format it was fine-tuned on
    (the test prompt previously dropped the ``TWEET:`` tag).

    Args:
        data_point: Row exposing the ``campo_texto`` field.
        prompt: Instruction text placed before the tweet.

    Returns:
        The assembled prompt with surrounding whitespace stripped, ending
        right after the ``=`` sign.
    """
    return f"""
            {prompt}

            [TWEET: {data_point[campo_texto]}] = """.strip()

# Turn every train / validation row into a full prompt (instruction + tweet + label).
X_train = pd.DataFrame(X_train.apply(lambda row: generate_prompt(row, prompts[i_prompt]), axis=1), columns=[campo_texto])
X_eval = pd.DataFrame(X_eval.apply(lambda row: generate_prompt(row, prompts[i_prompt]), axis=1), columns=[campo_texto])

# Capture the gold test labels before X_test is replaced by its prompt-only
# frame. Read them through the configured label column instead of the
# hard-coded `.label` attribute, so a dataset with another column name works.
Y_true = X_test[campo_etiqueta]
X_test = pd.DataFrame(X_test.apply(lambda row: generate_test_prompt(row, prompts[i_prompt]), axis=1), columns=[campo_texto])

In [None]:
X_train.head()

In [None]:
X_test.head()

#### Conversión a Dataset de HuggingFace

In [None]:
# Wrap each pandas split in a HuggingFace Dataset.
train_data, eval_data, test_data = (
    Dataset.from_pandas(frame) for frame in (X_train, X_eval, X_test)
)

In [None]:
train_data

In [None]:
eval_data

In [None]:
test_data

## Función de evaluación

In [None]:
def evaluate(y_true, y_pred, metodo):
    """Score binary predictions, save the confusion-matrix and ROC plots, and return the metrics.

    Args:
        y_true: Ground-truth labels, given either as 0/1 or as the class
            names ``clase_0`` / ``clase_1``.
        y_pred: Predicted labels, in either of the same encodings.
        metodo: Short tag for the evaluated setup; used in the plot titles and
            in the names of the saved image files.

    Returns:
        dict with ``accuracy``; per-class ``precision_*``, ``recall_*``,
        ``f1_score_*`` and ``support_*`` (suffix ``_0`` / ``_1``); the
        confusion-matrix cells ``TN``, ``FP``, ``FN``, ``TP``; ``roc_auc``;
        and the paths of the two saved figures (``matriz_path``,
        ``curva_roc_path``).

    Reads the notebook globals ``clase_0``, ``clase_1``, ``i_model`` and
    ``i_prompt``. Values that match no known label are counted as class 0.
    """
    labels = [clase_0, clase_1]
    class_ids = [0, 1]

    # Accept numeric labels as well as the dataset's own class names. With
    # only {0: 0, 1: 1}, string ground truth such as 'SI'/'NO' matched nothing
    # and every row was silently scored as class 0.
    mapping = {0: 0, 1: 1, clase_0: 0, clase_1: 1}
    y_true = pd.Series(y_true).map(mapping).fillna(0).astype(int)
    y_pred = pd.Series(y_pred).map(mapping).fillna(0).astype(int)

    # Overall accuracy
    accuracy = accuracy_score(y_true, y_pred)
    print(f'Accuracy: {accuracy:.3f}')

    # Classification report. Passing labels= keeps both classes in the report
    # even when one of them never occurs in y_true / y_pred.
    print('\nClassification Report:')
    class_report = classification_report(
        y_true, y_pred, labels=class_ids, target_names=labels,
        output_dict=True, zero_division=0,
    )
    print(classification_report(
        y_true, y_pred, labels=class_ids, target_names=labels, zero_division=0,
    ))

    # Confusion matrix, forced to 2x2 so the TN/FP/FN/TP unpacking below
    # cannot fail when only one class is present.
    print('\nConfusion Matrix:')
    conf_matrix = confusion_matrix(y_true, y_pred, labels=class_ids)
    print(conf_matrix)

    # Plot the confusion matrix
    plt.figure(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, cmap='Reds', fmt='d', xticklabels=labels, yticklabels=labels)
    plt.title(f'Matriz Confusion - {i_model} (Prompt {i_prompt}). {metodo}')
    plt.ylabel('Actual')
    plt.xlabel('Predicted')

    # Save the confusion matrix, creating the output folder if needed
    matriz_path = f'/content/drive/MyDrive/Colab Notebooks/Talleres/LLMs/matrices/matriz_{i_model}_prompt{i_prompt}_{metodo}.jpeg'
    os.makedirs(os.path.dirname(matriz_path), exist_ok=True)
    plt.savefig(matriz_path)
    plt.show()

    # Rows are true classes and columns are predictions, in class_ids order
    TN, FP, FN, TP = conf_matrix.ravel()

    # ROC curve. y_pred holds hard 0/1 decisions rather than scores, so the
    # curve is built from those two values only.
    fpr, tpr, thresholds = roc_curve(y_true, y_pred)
    roc_auc = auc(fpr, tpr)

    # Plot the ROC curve
    plt.figure()
    plt.plot(fpr, tpr, color='darkblue', lw=2, label='Curva ROC (AUC = {:.2f})'.format(roc_auc))
    plt.plot([0, 1], [0, 1], color='lightgrey', linestyle='--')  # chance diagonal
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.xlabel('Tasa de Falsos Positivos')
    plt.ylabel('Tasa de Verdaderos Positivos')
    plt.title(f'Curva ROC - {i_model} (Prompt {i_prompt}). {metodo}')
    plt.legend(loc='lower right')
    plt.grid()

    # Save the ROC plot, creating the output folder if needed
    curva_roc_path = f'/content/drive/MyDrive/Colab Notebooks/Talleres/LLMs/curvas/curvaroc_{i_model}_prompt{i_prompt}_{metodo}.jpeg'
    os.makedirs(os.path.dirname(curva_roc_path), exist_ok=True)
    plt.savefig(curva_roc_path)
    plt.show()

    # Per-class metrics; the report is keyed by the class names in `labels`
    positive = class_report[clase_1]
    negative = class_report[clase_0]

    return {
        'accuracy': accuracy,
        'precision_1': positive['precision'],
        'recall_1': positive['recall'],
        'f1_score_1': positive['f1-score'],
        'support_1': positive['support'],
        'precision_0': negative['precision'],
        'recall_0': negative['recall'],
        'f1_score_0': negative['f1-score'],
        'support_0': negative['support'],
        'TN': TN,
        'FP': FP,
        'FN': FN,
        'TP': TP,
        'roc_auc': roc_auc,
        'curva_roc_path': curva_roc_path,
        'matriz_path': matriz_path
    }


## Uso del modelo

#### Definición de hiperparámetros

In [None]:
# Total number of training epochs, and how many epochs each training block runs.
num_epochs, num_epochs_bloque = 4, 2

#### Enumeración de modelos

In [None]:
# Index of the model used throughout the notebook.
i_model = 0

# (Hub identifier, short name used in output file names) for each candidate model.
_model_catalog = [
    ('meta-llama/Llama-3.2-1B-Instruct', 'llama-1b'),
    ('tiiuae/falcon-7b', 'falcon-7b'),
    ('BSC-LT/salamandra-7b-instruct', 'salamandra-7b'),
    ('meta-llama/Llama-3.2-3B-Instruct', 'llama-3b'),
    ('Qwen/Qwen2.5-3B-Instruct', 'qwen-3b'),
    ('google/gemma-2-2b-it', 'gemma-2b'),
    ('apry/best_2b', 'best-2b'),
]
model_names = [hub_id for hub_id, _ in _model_catalog]
model_shorts = [short for _, short in _model_catalog]

In [None]:
# Tags that label results: metodo1 for the untrained model, metodo2 for the fine-tuned one.
metodo1, metodo2 = 'Base', 'PreprocesadoES'

#### Configuración del modelo y carga de tokenizer

In [None]:
# Hugging Face token. Read it from the HF_TOKEN environment variable instead
# of pasting it into the notebook, where it would be saved and shared with the
# file. If the variable is not set, login() falls back to its interactive prompt.
hf_token = os.environ.get("HF_TOKEN", "")
login(token=hf_token or None)

In [None]:
def load_model_and_tokenizer(model_name, hf_token=None, quantization=True):
    """Load a causal language model and its tokenizer from the Hugging Face Hub.

    Args:
        model_name: Hub identifier of the model.
        hf_token: Optional access token; ``None`` relies on the credentials
            stored by a previous ``login()``.
        quantization: If True, load the weights in 4-bit NF4 with float16
            compute; if False, load without a quantization config.

    Returns:
        ``(model, tokenizer)``, or ``(None, None)`` if loading fails, in which
        case the error is printed.
    """
    try:
        # 4-bit quantization settings
        if quantization:
            bnb_config = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_use_double_quant=False,
                bnb_4bit_quant_type="nf4",
                bnb_4bit_compute_dtype=torch.float16,
            )
        else:
            bnb_config = None  # no quantization

        # `token` replaces the deprecated `use_auth_token` argument.
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            device_map="auto",
            quantization_config=bnb_config,
            token=hf_token,
        )

        # Model-specific tweaks, applied only when the config has the attribute
        if hasattr(model.config, "use_cache"):
            model.config.use_cache = False
        if hasattr(model.config, "pretraining_tp"):
            model.config.pretraining_tp = 1  # LLaMA models

        tokenizer = AutoTokenizer.from_pretrained(
            model_name,
            trust_remote_code=True,
            padding_side="left",
            add_eos_token=True,
            token=hf_token,
        )
        # Reuse the EOS token as the padding token
        tokenizer.pad_token = tokenizer.eos_token

        return model, tokenizer

    except Exception as e:
        # Best effort: report the problem and let the caller handle (None, None)
        print(f"Error al cargar el modelo {model_name}: {e}")
        return None, None

In [None]:
# Load the model selected by i_model together with its tokenizer.
# No token is passed, so hf_token keeps its default of None; both results are
# None if loading failed.
model_name = model_names[i_model]
model, tokenizer = load_model_and_tokenizer(model_name)

#### Función de predicción

In [None]:
X_test

In [None]:
# Prediction helper
def predict(X_test, model, tokenizer):
    """Generate a 0/1 prediction for every prompt in ``X_test``.

    Args:
        X_test: Indexable collection of rows, each exposing the prompt under
            ``campo_texto``.
        model: Causal language model used for generation.
        tokenizer: Tokenizer matching ``model``.

    Returns:
        List of ints: 1 when the generated text contains ``clase_1`` but not
        ``clase_0`` (case-insensitive), otherwise 0.
    """
    # Greedy text-generation pipeline that returns only the newly generated tokens
    generator = pipeline(
        task="text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=2,
        do_sample=False,
        return_full_text=False,
        truncation=True
    )

    positive_tag = clase_1.lower()
    negative_tag = clase_0.lower()

    y_pred = []
    for idx in tqdm(range(len(X_test))):
        # The EOS token doubles as the padding token during generation
        result = generator(X_test[idx][campo_texto], pad_token_id=tokenizer.eos_token_id)
        generated_text = result[0]['generated_text'].strip().lower()
        print(generated_text)

        # Positive only when the positive tag appears and the negative one does
        # not; anything else, including an unrecognised answer, counts as 0.
        is_positive = positive_tag in generated_text and negative_tag not in generated_text
        y_pred.append(1 if is_positive else 0)

    return y_pred

# Smoke test: predict the first few test prompts and compare with the gold labels.
_n_muestra = 10
Y_pred = predict(test_data.select(range(_n_muestra)), model, tokenizer)
print("\n", Y_pred)
print((Y_pred == Y_true.iloc[0:_n_muestra]).tolist())

In [None]:
# Baseline: predict the whole test set with the model as loaded (before any
# fine-tuning) and evaluate it under the metodo1 tag.
Y_pred = predict(test_data, model, tokenizer)
evaluate(Y_true, Y_pred, metodo1)

In [None]:
# # Contar cuántos True y False hay en la comparación
# true_count = sum(Y_pred == Y_true)
# false_count = len(Y_pred) - true_count
# print(f"True count: {true_count}")
# print(f"False count: {false_count}")

#### Configuración de PEFT (LoRA)

In [None]:
from peft import LoraConfig

# LoRA (PEFT) configuration
def setup_peft():
    """Return the LoRA configuration used for causal-LM fine-tuning."""
    lora_settings = {
        "lora_alpha": 16,
        "lora_dropout": 0.05,
        "r": 64,
        "bias": "none",
        "task_type": "CAUSAL_LM",
    }
    return LoraConfig(**lora_settings)

peft_config = setup_peft()

#### Entrenamiento del modelo

In [None]:
# Report real free / total device memory. The previous version printed
# memory_reserved() and memory_allocated() under these labels, which are the
# allocator's reserved and in-use byte counts, not free or total memory.
_free_bytes, _total_bytes = torch.cuda.mem_get_info()
print(f"Memoria libre: {_free_bytes / 1e9} GB")
print(f"Memoria total: {_total_bytes / 1e9} GB")

In [None]:
# Free cached GPU memory that PyTorch is no longer using, before the trainer is built.
torch.cuda.empty_cache()

In [None]:
# # Configuración del entrenamiento
# def setup_training_arguments():
#     return TrainingArguments(
#         output_dir = "logs",
#         num_train_epochs = num_epochs,
#         per_device_train_batch_size = 8,
#         gradient_accumulation_steps = 8,
#         optim = "paged_adamw_32bit",
#         save_steps = 0,
#         logging_steps = 25,
#         learning_rate = 5e-5,
#         weight_decay = 0.01,
#         fp16 = True,
#         bf16 = False,
#         max_grad_norm = 0.5,
#         max_steps = -1,
#         warmup_ratio = 0.1,
#         group_by_length = True,
#         lr_scheduler_type = "cosine",
#         report_to = "tensorboard",
#         save_strategy="epoch",
#         evaluation_strategy = "epoch",
#         load_best_model_at_end = True,
#         metric_for_best_model = "eval_loss",
#         greater_is_better = False,
#     )

# training_arguments  =  setup_training_arguments()

# # Inicialización del trainer
# trainer = SFTTrainer(
#     model = model,
#     train_dataset = train_data,
#     eval_dataset = eval_data,
#     peft_config = peft_config,
#     dataset_text_field = campo_texto,
#     tokenizer = tokenizer,
#     args = training_arguments,
#     packing = False,
#     max_seq_length = 256,
#     callbacks = [
#         EarlyStoppingCallback(early_stopping_patience = 5)
#     ],
# )

In [None]:
# trainer.train()
# # Guardar el modelo entrenado
# trainer.model.save_pretrained(f"/content/drive/MyDrive/Colab Notebooks/Talleres/LLMs/models/{model_shorts[i_model]}_prompt{i_prompt}_{num_epochs}_{metodo2}")

In [None]:
# Training configuration
def setup_training_arguments():
    """Return the TrainingArguments for one training block.

    Each run covers ``num_epochs_bloque`` epochs, saves and evaluates once per
    epoch, and reloads the checkpoint with the lowest ``eval_loss`` at the end.
    """
    settings = {
        # Output and run length
        "output_dir": "logs",
        "num_train_epochs": num_epochs_bloque,
        "max_steps": -1,
        # Batching
        "per_device_train_batch_size": 8,
        "gradient_accumulation_steps": 8,
        "group_by_length": True,
        # Optimisation
        "optim": "paged_adamw_32bit",
        "learning_rate": 5e-5,
        "weight_decay": 0.01,
        "max_grad_norm": 0.5,
        "warmup_ratio": 0.1,
        "lr_scheduler_type": "cosine",
        # Precision
        "fp16": True,
        "bf16": False,
        # Logging, checkpointing and evaluation
        "logging_steps": 25,
        "report_to": "tensorboard",
        "save_steps": 0,
        "save_strategy": "epoch",
        "evaluation_strategy": "epoch",
        # Best-model selection
        "load_best_model_at_end": True,
        "metric_for_best_model": "eval_loss",
        "greater_is_better": False,
    }
    return TrainingArguments(**settings)

training_arguments = setup_training_arguments()

# Early-stopping callback with a patience of 5 evaluations.
_early_stopping = EarlyStoppingCallback(early_stopping_patience = 5)

# Supervised fine-tuning trainer: LoRA adapters on `model`, trained on the
# prompt text column, sequences capped at 256 tokens, no packing.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    args = training_arguments,
    peft_config = peft_config,
    train_dataset = train_data,
    eval_dataset = eval_data,
    dataset_text_field = campo_texto,
    max_seq_length = 256,
    packing = False,
    callbacks = [_early_stopping],
)

In [None]:
# Train in blocks of num_epochs_bloque epochs until num_epochs is reached.
_n_blocks = num_epochs // num_epochs_bloque
for block in range(1, _n_blocks + 1):
    print(f"Entrenando bloque {block} de {num_epochs_bloque} épocas...")
    # NOTE(review): every trainer.train() call starts a new run of
    # num_epochs_bloque epochs on the already-updated weights; presumably the
    # warm-up and cosine schedule restart each block - confirm this is intended.
    trainer.train()

    # Save an intermediate checkpoint every 2 blocks
    if block % 2 == 0:
        # Epochs completed so far. The name used a hard-coded `block * 10`,
        # which only matched reality when a block was 10 epochs long.
        epochs_done = block * num_epochs_bloque
        checkpoint_dir = (
            f"/content/drive/MyDrive/Colab Notebooks/Talleres/LLMs/models_checks/"
            f"{model_shorts[i_model]}_prompt{i_prompt}_{num_epochs}_epoch{epochs_done}_{metodo2}"
        )
        trainer.model.save_pretrained(checkpoint_dir)
        tokenizer.save_pretrained(checkpoint_dir)
        print(f"Modelo temporal guardado en: {checkpoint_dir}")

In [None]:
# Save the trained model and its tokenizer in the same folder.
final_model_dir = f"/content/drive/MyDrive/Colab Notebooks/Talleres/LLMs/models/{model_shorts[i_model]}_prompt{i_prompt}_{num_epochs}_{metodo2}"
for _artefact in (trainer.model, tokenizer):
    _artefact.save_pretrained(final_model_dir)

print(f"Modelo final guardado en: {final_model_dir}")

In [None]:
# # Cargar modelo entrenado
# model_path = f"/content/drive/MyDrive/Colab Notebooks/Talleres/LLMs/models/{model_shorts[i_model]}_prompt{i_prompt}_{metodo1}"
# model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto")

In [None]:
# Free cached GPU memory left over from training before running inference.
torch.cuda.empty_cache()

#### Predicción final

In [None]:
# Predict the test set after training
Y_pred = predict(test_data, model, tokenizer)

# Per-example results. The tweet text is stored under 'text'; it used to be
# written under a column named 'label', next to the real labels in 'Y_true'.
predictions = pd.DataFrame({'text': test_df[campo_texto],
                            'Y_true': test_df[campo_etiqueta],
                            'Y_pred': Y_pred})

# Write the CSV, creating the output folder if it does not exist yet
_preds_path = f"/content/drive/MyDrive/Colab Notebooks/Talleres/LLMs/preds/test_predictions_{model_shorts[i_model]}_prompt{i_prompt}_{num_epochs}.csv"
os.makedirs(os.path.dirname(_preds_path), exist_ok=True)
predictions.to_csv(_preds_path, index=False)

#### Evaluación de las predicciones

In [None]:
# Evaluate the post-training predictions under the metodo2 tag and keep the
# returned metrics dict for the results file written below.
metrics = evaluate(Y_true, Y_pred, metodo2)

#### Grabado de las evaluaciones

In [None]:
results_file = "/content/drive/MyDrive/Colab Notebooks/Talleres/LLMs/resultados.csv"

# Column order of the results file
_result_columns = ['i_prompt', 'model', 'epochs', 'accuracy', 'precision_si', 'recall_si',
                   'f1_score_si', 'support_si', 'precision_no', 'recall_no',
                   'f1_score_no', 'support_no', 'TN', 'FP', 'FN', 'TP', 'roc_auc']

# One row with the run settings and its metrics
fila = {
    'i_prompt': i_prompt,
    'model': model_name,
    'epochs': num_epochs,
    'accuracy': metrics['accuracy'],
    'precision_si': metrics['precision_1'],
    'recall_si': metrics['recall_1'],
    'f1_score_si': metrics['f1_score_1'],
    'support_si': metrics['support_1'],
    'precision_no': metrics['precision_0'],
    'recall_no': metrics['recall_0'],
    'f1_score_no': metrics['f1_score_0'],
    'support_no': metrics['support_0'],
    'TN': metrics['TN'],
    'FP': metrics['FP'],
    'FN': metrics['FN'],
    'TP': metrics['TP'],
    'roc_auc': metrics['roc_auc']
}
_fila_df = pd.DataFrame([fila], columns=_result_columns)

# Append to the existing file, or start a new one with this row. The row is
# no longer concatenated onto an empty DataFrame, which pandas deprecates.
if os.path.exists(results_file):
    df_results = pd.concat([pd.read_csv(results_file), _fila_df], ignore_index=True)
else:
    df_results = _fila_df

# Write the updated table back to the CSV
df_results.to_csv(results_file, index=False)

print(f"Métricas guardadas correctamente en {results_file}")