## **Tarea PLN**

En este notebook se presenta el entrenamiento del modelo seleccionado, un *ensemble* (Ensemble Learning) de BERT en español y XLM-RoBERTa cuyas probabilidades se promedian, y la predicción final sobre el dataset *sem_eval_test_blank_es*.

### Importación de datos

In [7]:
import pandas as pd
# Load the training dataset.
# NOTE(review): the training cell further down reloads `df` from
# 'sem_eval_train_es (1).csv' — confirm which file name is the intended one.
df = pd.read_csv('/kaggle/input/trainn/sem_eval_train_es.csv')

### Entrenamiento Ensemble Learning

In [10]:
import gc
import os
import re

import joblib
import nltk
import numpy as np
import pandas as pd
import torch
from nltk.corpus import stopwords
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from transformers import BertTokenizer, BertForSequenceClassification, XLMRobertaTokenizer, XLMRobertaForSequenceClassification, Trainer, TrainingArguments

# Switch off the Weights & Biases integration via its environment variable.
os.environ.update({"WANDB_DISABLED": "true"})

# Make sure the NLTK stopword corpus is available, then collect the Spanish
# stopwords into a set for fast membership checks.
nltk.download('stopwords')
stop_words = set()
stop_words.update(stopwords.words('spanish'))

# Text-cleaning helper
def limpiar_texto(text):
    """Normalize a raw tweet string.

    The steps run in this order: lowercase the text; remove @mentions,
    #hashtags and tokens starting with ``http``; delete every character that
    is not a lowercase a-z letter, one of ``áéíóúñü`` or whitespace; collapse
    whitespace runs into single spaces and trim both ends.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string (possibly empty).
    """
    substitutions = (
        (r'(@\w+|#\w+|http\S+)', ''),
        (r'[^a-záéíóúñü\s]', ''),
        (r'\s+', ' '),
    )
    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

# Stopword-removal helper
def eliminar_stopwords(text):
    """Drop stopwords from a whitespace-tokenized string.

    Args:
        text: The string to filter; it is split on whitespace.

    Returns:
        The tokens that are not in the module-level ``stop_words`` set,
        joined back together with single spaces.
    """
    return ' '.join(token for token in text.split() if token not in stop_words)

# Read the training CSV and run every tweet through both cleaning helpers
# (character cleanup first, stopword removal second).
df = pd.read_csv('/kaggle/input/trainn/sem_eval_train_es (1).csv')
df['Tweet'] = df['Tweet'].apply(lambda tweet: eliminar_stopwords(limpiar_texto(tweet)))

# Features are the cleaned tweets; targets are every column from position 2 on.
X, y = df['Tweet'], df.iloc[:, 2:]

# Hold out 20% of the rows for validation, using a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# One pretrained tokenizer per model in the ensemble.
xlm_tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
bert_tokenizer = BertTokenizer.from_pretrained('dccuchile/bert-base-spanish-wwm-uncased')

# Dataset wrapper used for both models (it only depends on the tokenizer it is given)
class BERTDataset(Dataset):
    """Map-style dataset that tokenizes one text per item.

    Args:
        texts: pandas Series of strings, read positionally through ``.iloc``.
        labels: pandas DataFrame with one row per text, read positionally
            through ``.iloc`` and cast to float; or ``None`` for unlabeled
            data, in which case items carry no ``'labels'`` entry.
        tokenizer: Tokenizer invoked as ``tokenizer(text, ...)``; it must
            return a mapping holding ``'input_ids'`` and ``'attention_mask'``
            tensors.
        max_len: Length every sequence is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the encoded text at position ``idx`` (plus labels, if any)."""
        # Calling the tokenizer directly replaces the deprecated `encode_plus`.
        encoding = self.tokenizer(
            self.texts.iloc[idx],
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        # The tokenizer output is flattened to 1-D so each item has no
        # leading batch dimension.
        item = {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
        }
        if self.labels is not None:
            label_row = self.labels.iloc[idx].values.astype(float)
            item['labels'] = torch.tensor(label_row, dtype=torch.float)
        return item

# Build the training / validation datasets, one pair per tokenizer.
train_dataset_bert = BERTDataset(X_train, y_train, bert_tokenizer, max_len=128)
test_dataset_bert = BERTDataset(X_test, y_test, bert_tokenizer, max_len=128)
train_dataset_xlm = BERTDataset(X_train, y_train, xlm_tokenizer, max_len=128)
test_dataset_xlm = BERTDataset(X_test, y_test, xlm_tokenizer, max_len=128)

# One output per label column (multi-label classification).
num_labels = y_train.shape[1]

# Hyperparameters shared by both fine-tuning runs. `save_steps` is left out on
# purpose: it is only consulted when save_strategy="steps", which neither run
# uses.
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version before upgrading.
shared_training_kwargs = dict(
    output_dir='./results',
    num_train_epochs=8,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    learning_rate=3e-5,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",
    save_total_limit=1,  # keep only the most recent checkpoint
)

# ---------------------------------------------------------------- BERT ----
model_bert = BertForSequenceClassification.from_pretrained(
    'dccuchile/bert-base-spanish-wwm-uncased', num_labels=num_labels
)
# Checkpoint once per epoch during the BERT run.
training_args = TrainingArguments(save_strategy="epoch", **shared_training_kwargs)
trainer_bert = Trainer(
    model=model_bert,
    args=training_args,
    train_dataset=train_dataset_bert,
    eval_dataset=test_dataset_bert,
)
trainer_bert.train()

# Validation logits -> per-label probabilities.
predictions_bert = trainer_bert.predict(test_dataset_bert)
probabilities_bert = torch.sigmoid(torch.tensor(predictions_bert.predictions)).numpy()

# Persist the fine-tuned BERT model and its tokenizer.
model_bert.save_pretrained('/kaggle/working/bert_model')
bert_tokenizer.save_pretrained('/kaggle/working/bert_tokenizer')

# Release BERT before XLM-R is loaded. The Trainer holds its own reference to
# the model, so deleting only `model_bert` would not free anything.
del trainer_bert, model_bert
gc.collect()
torch.cuda.empty_cache()

# --------------------------------------------------------- XLM-RoBERTa ----
# Loaded only now, so both models are never resident at the same time.
model_xlm = XLMRobertaForSequenceClassification.from_pretrained(
    'xlm-roberta-base', num_labels=num_labels
)
# No intermediate checkpoints for this run. A fresh TrainingArguments is built
# instead of mutating the previous one, so the value goes through the normal
# argument validation.
training_args = TrainingArguments(save_strategy="no", **shared_training_kwargs)
trainer_xlm = Trainer(
    model=model_xlm,
    args=training_args,
    train_dataset=train_dataset_xlm,
    eval_dataset=test_dataset_xlm,
)
trainer_xlm.train()

# Validation logits -> per-label probabilities.
predictions_xlm = trainer_xlm.predict(test_dataset_xlm)
probabilities_xlm = torch.sigmoid(torch.tensor(predictions_xlm.predictions)).numpy()

# Persist the fine-tuned XLM-R model and its tokenizer.
model_xlm.save_pretrained('/kaggle/working/xlm_model')
xlm_tokenizer.save_pretrained('/kaggle/working/xlm_tokenizer')

# Ensemble = unweighted mean of the two models' probabilities.
probabilities_ensemble = (probabilities_bert + probabilities_xlm) / 2

# Pick, for every label, the threshold that maximizes F1 on the held-out split.
# NOTE: this is the same split used as eval_dataset above, so F1 measured on it
# with these thresholds is optimistic.
candidate_thresholds = np.arange(0.1, 0.9, 0.01)
optimal_thresholds = []
for i in range(num_labels):
    y_true = y_test.iloc[:, i]
    # Fall back to 0.5 when no candidate achieves F1 > 0.
    best_threshold, best_f1 = 0.5, 0
    for threshold in candidate_thresholds:
        y_pred_bin = (probabilities_ensemble[:, i] >= threshold).astype(int)
        # zero_division=0 keeps the score at 0 (without a warning) when the
        # threshold yields no positive predictions.
        f1 = f1_score(y_true, y_pred_bin, zero_division=0)
        if f1 > best_f1:
            best_f1 = f1
            # Store plain Python floats so the pickle holds no numpy scalars.
            best_threshold = float(threshold)
    optimal_thresholds.append(best_threshold)

# Persist the per-label thresholds for the prediction step.
joblib.dump(optimal_thresholds, '/kaggle/working/optimal_thresholds.pkl')


[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-uncased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss
1,0.3761,0.387972
2,0.3432,0.341893
3,0.3145,0.319048
4,0.2393,0.318191
5,0.2047,0.315419
6,0.1738,0.32334
7,0.1269,0.331108
8,0.1342,0.331562




Epoch,Training Loss,Validation Loss
1,0.3857,0.391638
2,0.389,0.383708
3,0.3784,0.364458
4,0.3431,0.354653
5,0.3213,0.332313
6,0.3035,0.326647
7,0.2763,0.33014
8,0.3094,0.327486


['/kaggle/working/optimal_thresholds.pkl']

### Predecir sobre el dataset final

In [14]:
import os
import joblib
import torch
from transformers import BertTokenizer, BertForSequenceClassification, XLMRobertaTokenizer, XLMRobertaForSequenceClassification, Trainer
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from torch.utils.data import Dataset

# Make sure the NLTK stopword corpus is available, then collect the Spanish
# stopwords into a set for fast membership checks.
nltk.download('stopwords')
stop_words = set()
stop_words.update(stopwords.words('spanish'))

# Text-cleaning helper
def limpiar_texto(text):
    """Normalize a raw tweet string.

    The steps run in this order: lowercase the text; remove @mentions,
    #hashtags and tokens starting with ``http``; delete every character that
    is not a lowercase a-z letter, one of ``áéíóúñü`` or whitespace; collapse
    whitespace runs into single spaces and trim both ends.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string (possibly empty).
    """
    substitutions = (
        (r'(@\w+|#\w+|http\S+)', ''),
        (r'[^a-záéíóúñü\s]', ''),
        (r'\s+', ' '),
    )
    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

# Stopword-removal helper
def eliminar_stopwords(text):
    """Drop stopwords from a whitespace-tokenized string.

    Args:
        text: The string to filter; it is split on whitespace.

    Returns:
        The tokens that are not in the module-level ``stop_words`` set,
        joined back together with single spaces.
    """
    return ' '.join(token for token in text.split() if token not in stop_words)

# Dataset wrapper used for both models (it only depends on the tokenizer it is given)
class BERTDataset(Dataset):
    """Map-style dataset that tokenizes one text per item.

    Args:
        texts: pandas Series of strings, read positionally through ``.iloc``.
        labels: pandas DataFrame with one row per text, read positionally
            through ``.iloc`` and cast to float; or ``None`` for unlabeled
            data, in which case items carry no ``'labels'`` entry.
        tokenizer: Tokenizer invoked as ``tokenizer(text, ...)``; it must
            return a mapping holding ``'input_ids'`` and ``'attention_mask'``
            tensors.
        max_len: Length every sequence is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the encoded text at position ``idx`` (plus labels, if any)."""
        # Calling the tokenizer directly replaces the deprecated `encode_plus`.
        encoding = self.tokenizer(
            self.texts.iloc[idx],
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        # The tokenizer output is flattened to 1-D so each item has no
        # leading batch dimension.
        item = {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
        }
        if self.labels is not None:
            label_row = self.labels.iloc[idx].values.astype(float)
            item['labels'] = torch.tensor(label_row, dtype=torch.float)
        return item

# numpy is needed below and is not imported at the top of this cell.
import numpy as np

# Same switch the training cell sets, so this cell does not depend on it
# having been run in the same kernel session.
os.environ["WANDB_DISABLED"] = "true"

# Load every saved artifact BEFORE anything uses it: thresholds first (their
# count is the number of labels), then tokenizers and fine-tuned models.
optimal_thresholds = joblib.load('/kaggle/working/optimal_thresholds.pkl')
num_labels = len(optimal_thresholds)
bert_tokenizer = BertTokenizer.from_pretrained('/kaggle/working/bert_tokenizer')
xlm_tokenizer = XLMRobertaTokenizer.from_pretrained('/kaggle/working/xlm_tokenizer')
model_bert = BertForSequenceClassification.from_pretrained('/kaggle/working/bert_model', num_labels=num_labels)
model_xlm = XLMRobertaForSequenceClassification.from_pretrained('/kaggle/working/xlm_model', num_labels=num_labels)

# Load the blank test set and apply the same text preprocessing as in training.
test_df = pd.read_csv('/kaggle/input/trainn/sem_eval_test_blank_es.csv')
test_df['Tweet'] = test_df['Tweet'].apply(limpiar_texto).apply(eliminar_stopwords)
X_test = test_df['Tweet']
tweet_ids = test_df['ID']

# The test set has no labels; an all-zero placeholder frame of the right width
# is passed to the dataset. Only the models' logits are used below.
placeholder_labels = pd.DataFrame(np.zeros((X_test.shape[0], num_labels)))
test_dataset_bert = BERTDataset(X_test, placeholder_labels, bert_tokenizer, max_len=128)
test_dataset_xlm = BERTDataset(X_test, placeholder_labels, xlm_tokenizer, max_len=128)

# Trainers are used here only as a convenient batched prediction loop.
trainer_bert = Trainer(model=model_bert)
trainer_xlm = Trainer(model=model_xlm)

# Logits -> per-label probabilities for each model.
predictions_bert = trainer_bert.predict(test_dataset_bert)
probabilities_bert = torch.sigmoid(torch.tensor(predictions_bert.predictions)).numpy()

predictions_xlm = trainer_xlm.predict(test_dataset_xlm)
probabilities_xlm = torch.sigmoid(torch.tensor(predictions_xlm.predictions)).numpy()

# Ensemble = unweighted mean of the two models' probabilities.
probabilities_ensemble = (probabilities_bert + probabilities_xlm) / 2

# Apply each label's own threshold; the 1-D threshold vector broadcasts across
# the rows, giving a boolean (n_samples, n_labels) matrix.
y_pred_bin = probabilities_ensemble >= np.asarray(optimal_thresholds)

# NOTE(review): these names are assumed to be in the same order as the label
# columns used for training (df.iloc[:, 2:]) — confirm against the training CSV.
label_columns = ['anger', 'anticipation', 'disgust', 'fear', 'joy', 'love', 'optimism', 'pessimism', 'sadness', 'surprise', 'trust']
pred_df = pd.DataFrame(y_pred_bin, columns=label_columns)
# Insert the IDs positionally so the result does not depend on index alignment.
pred_df.insert(0, 'ID', tweet_ids.to_numpy())
pred_df = pred_df.astype({col: 'bool' for col in pred_df.columns if col != 'ID'})

# Write the submission file.
pred_df.to_csv('/kaggle/working/soluciones_Vicent_Munoz_Correcher.csv', index=False)


[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
