# Clasificación - RoBERTa 

Este Notebook hace _fine-tuning_ de RoBERTa para predecir si un conjunto de líneas pertenece o no a una película de inmigración, y usa esas predicciones para construir un índice de contenido de inmigración. Se corrió desde Google Colab usando una GPU T4. Pasos:

1. [Definir longitud de los ejemplos](#trunk)
2. [Configuración del entrenamiento](#config)
3. [Entrenamiento](#entrenam)
4. [Predicciones para usar como input de índice de contenido de inmigración](#predic)


Inspirado en https://github.com/nanom/llm_adaptation_2nlp_workshop?tab=readme-ov-file -Copyright (c) 2023 Hernán J. Maina- y posteriormente adaptado para la tarea, modelo y datos específicos.

In [None]:
# Instalar e importar librerías necesarias
!pip install datasets huggingface_hub  --quiet
!pip install -U transformers[torch] --quiet
!pip install tensorboard --quiet

import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
from datasets import Dataset
from transformers import (
    RobertaTokenizerFast,
    RobertaForSequenceClassification,
    TrainingArguments,
    Trainer,
    AutoConfig,
    pipeline
)
from huggingface_hub import HfFolder, notebook_login

from sklearn import metrics
from sklearn.metrics import confusion_matrix,  ConfusionMatrixDisplay,f1_score, accuracy_score, precision_score, roc_auc_score, balanced_accuracy_score

In [None]:
# Base checkpoint to fine-tune and Hub repository that will hold the fine-tuned model
model_id = "roberta-base"
repository_id = "wenbrau/roberta-base_immifilms"

# Log in to the Hugging Face Hub from the notebook
notebook_login()

# Tokenizer that matches the base checkpoint
tokenizer = RobertaTokenizerFast.from_pretrained(model_id)

## Importar datos

In [None]:
# Mount Google Drive; the data files are read from it afterwards
from google.colab import drive

drive.mount('/content/drive')

# Subtitle lines, grouped so that each example respects the maximum length
# accepted by the pretrained RoBERTa
films_path = "/content/drive/MyDrive/TESIS DATA/data/grouped_lines_sample_inputdf.pkl"
films = pd.read_pickle(films_path)


## Estructura de datasets, división en train, test y validación

In [None]:
# Read the grouped subtitle lines from Drive
films = pd.read_pickle("/content/drive/MyDrive/TESIS DATA/data/grouped_lines_sample_inputdf.pkl")

# Wrap the frame in a Hugging Face Dataset, expose the target column as "label",
# and hold out 20% of the rows with a fixed seed
df = (
    Dataset.from_pandas(films)
    .rename_column("just_migra", "label")
    .train_test_split(test_size=0.2, seed=9)
)

# Mean of the label column on each side of the split
# (the positive-class share if labels are 0/1)
for split_name in ("test", "train"):
    print(np.mean(df[split_name]["label"]))

# The hold-out is cut into two shards: shard 0 is the test set, shard 1 the validation set
train_df = df["train"]
holdout = df["test"]
test_df = holdout.shard(num_shards=2, index=0)
val_df = holdout.shard(num_shards=2, index=1)

<a id='trunk'></a>

## Definir longitud de los ejemplos

In [None]:
# Token-length distribution of the training examples
def tokenize(example):
    """Tokenize one example's "line" text without padding or truncation."""
    return tokenizer(example["line"], padding=False, truncation=False)


tokenized_dataset = train_df.map(tokenize, remove_columns=["line"], batched=False)

# Number of tokens in each example, shown as a histogram
n_tokens = list(map(len, tokenized_dataset["input_ids"]))
plt.hist(n_tokens)
plt.show()

In [None]:
# Quartiles and maximum of the per-example token counts
percentiles = np.percentile(n_tokens, [25, 50, 75])
p25, p50, p75 = percentiles

print(f"Percentil 25: {p25}")
print(f"Percentil 50 (Mediana): {p50}")
print(f"Percentile 75: {p75}")
print(f"Max: {np.max(n_tokens)}")

In [None]:
# Tokenize the input text with the RoBERTa tokenizer. Every sequence is truncated or
# padded to one common length: the 75th percentile of the training token counts,
# capped at the tokenizer's own maximum length.
MAX_LENGTH = min(int(percentiles[2]), tokenizer.model_max_length)


def tokenize(batch):
    """Tokenize a batch of "line" texts to exactly MAX_LENGTH tokens each."""
    # padding="max_length" pads to MAX_LENGTH whatever else is in the batch.
    # padding=True would only pad to the longest sequence of each batch, so the
    # splits were not guaranteed to share the same width.
    return tokenizer(
        batch["line"],
        padding="max_length",
        truncation=True,
        max_length=MAX_LENGTH,
    )


# With a fixed target length the output no longer depends on the batch composition,
# so the default batch size is used instead of loading each split as a single batch.
train_df = train_df.map(tokenize, batched=True)
val_df = val_df.map(tokenize, batched=True)
test_df = test_df.map(tokenize, batched=True)

<a id='config'></a>
## Configuración de entrenamiento

In [None]:
# Return PyTorch tensors for the model inputs and the target in every split
model_columns = ["input_ids", "attention_mask", "label"]
for split in (train_df, val_df, test_df):
    split.set_format("torch", columns=model_columns)

# Label mapping. Both directions are written to the config so that id2label and
# label2id stay consistent; setting only id2label leaves label2id at its default.
id2label = {1: "migrac", 0: "no migrac"}
label2id = {name: idx for idx, name in id2label.items()}
config = AutoConfig.from_pretrained(model_id)
config.update({"id2label": id2label, "label2id": label2id})

# Free memory
torch.cuda.empty_cache()
import gc
gc.collect()

In [None]:
import shutil

# Hyperparameters
BATCH_SIZE = 48
LEARNING_RATE = 2e-5
EPOCHS = 3
# Log about once per epoch; at least 1 so a small training set never yields 0 steps
LOGGING_STEPS = max(1, len(train_df) // BATCH_SIZE)
CUSTOM_MODEL_CHECKPOINT = repository_id

# Remove checkpoints from previous runs. This runs after CUSTOM_MODEL_CHECKPOINT is
# defined, so it also works on a fresh kernel; a missing directory is not an error.
shutil.rmtree(CUSTOM_MODEL_CHECKPOINT, ignore_errors=True)

# Training configuration
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` - confirm against the installed version.
training_args = TrainingArguments(
    output_dir=CUSTOM_MODEL_CHECKPOINT,
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    num_train_epochs=EPOCHS,
    optim='adamw_torch',
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    warmup_steps=500,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    fp16=True,
    logging_dir=f"{repository_id}/logs",
    logging_strategy="steps",
    logging_steps=LOGGING_STEPS,
    report_to="tensorboard",
    push_to_hub=True,
    hub_strategy="every_save",
    hub_model_id=repository_id,
    save_strategy="epoch",
    load_best_model_at_end=True,
    save_total_limit=2
)

# Model instantiation. The classification head is not part of the base checkpoint
# and is randomly initialised, so the seed is fixed first to make it reproducible.
torch.manual_seed(42)
model = RobertaForSequenceClassification.from_pretrained(model_id, config=config)

# Trainer instantiation; test_df is the split evaluated at the end of every epoch
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_df,
    eval_dataset=test_df,
)

# NOTE(review): overrides a private attribute of the training arguments, presumably
# to force single-GPU behaviour - confirm it is still needed.
trainer.args._n_gpu = 1

<a id='entrenam'></a>

## Entrenamiento

In [None]:
# Fix the seed for reproducibility of results
torch.manual_seed(42)

# Start training
trainer.train()

# Save the tokenizer and the trained model to the output directory right away
tokenizer.save_pretrained(CUSTOM_MODEL_CHECKPOINT)
trainer.save_model(CUSTOM_MODEL_CHECKPOINT)

# Evaluation metrics on the Trainer's eval_dataset after training
# (this is a classification evaluation, not a perplexity estimate)
eval_results = trainer.evaluate()
print(eval_results)

# Write the model card and push once, after evaluation has run
trainer.create_model_card()
trainer.push_to_hub()


<a id='predic'></a>

## Predicciones para usar como input de índice de contenido de inmigración

In [None]:
# Load both classifiers as text-classification pipelines: the base checkpoint and
# the fine-tuned model
pipeline_kwargs = dict(max_length=512, truncation=True)
roberta_base = pipeline('text-classification', model_id, **pipeline_kwargs)
roberta_tuned = pipeline('text-classification', repository_id, **pipeline_kwargs)

results_path = "/content/drive/MyDrive/TESIS DATA/data/roberta_tuning_validation_results.pkl"


def _top_labels(classifier, texts):
    """Return the first predicted label for each text, one pipeline call per text."""
    return [classifier(text)[0]["label"] for text in tqdm(texts, position=0, leave=True)]


# Predicted labels before and after fine-tuning, to compare and measure improvement
validation = pd.DataFrame()
validation["text"] = val_df["line"]
validation["label"] = val_df["label"]
validation["tconst"] = val_df["tconst"]

# Despite the "pred_score_*" names, these columns hold the predicted label strings
validation["pred_score_tuned"] = _top_labels(roberta_tuned, validation.text)
# Intermediate save, before the base-model pass
validation.to_pickle(results_path)
validation["pred_score_base"] = _top_labels(roberta_base, validation.text)

# 0/1 versions of the predicted labels; each model uses its own label names
validation["pred_score_tuned_01"] = validation.pred_score_tuned.map({"migrac": 1, "no migrac": 0})
validation["pred_score_base_01"] = validation.pred_score_base.map({"LABEL_1": 1, "LABEL_0": 0})
validation.to_pickle(results_path)

validation.head()

In [None]:
# Compare metrics after vs. before fine-tuning; each printed line is "<tuned> <base>".
# Both metrics are computed on the 0/1 predicted labels.
y_true = validation.label
y_tuned = validation.pred_score_tuned_01
y_base = validation.pred_score_base_01

for metric in (roc_auc_score, balanced_accuracy_score):
    print(metric(y_true, y_tuned), metric(y_true, y_base))

In [None]:
# Scores and predicted labels for the whole validation and test samples
for conjunto, c in zip(["val", "test"], [val_df, test_df]):
    print(conjunto)

    aux = pd.DataFrame()
    aux["text"] = c["line"]
    aux["label"] = c["label"]
    aux["tconst"] = c["tconst"]

    # Run the fine-tuned model once per text and reuse the result for both columns
    # (previously every text was classified twice: once for score, once for label).
    top_preds = [roberta_tuned(x)[0] for x in tqdm(aux.text, position=0, leave=True)]

    # NOTE(review): "score" is the pipeline's score for the returned label, so for
    # rows labelled "no migrac" it is presumably the confidence in "no migrac" -
    # confirm the downstream index accounts for this.
    aux["pred_score_tuned"] = [pred["score"] for pred in top_preds]
    aux["pred_lab_tuned"] = [pred["label"] for pred in top_preds]
    aux["pred_lab_tuned_01"] = aux.pred_lab_tuned.map({"migrac": 1, "no migrac": 0})
    aux.to_pickle(f"/content/drive/MyDrive/TESIS DATA/data/roberta_tuned_{conjunto}_scores.pkl")
    print(aux)

    # Drop the frame before building the next split's
    del aux