# Binary Classification with BETO
 * Classification of comments as related or not related to the video content
 * Classes: 1. yes (comment is related to the video or its content), and 2. no (comment is not related to the video or its content).

## Requirements

In [None]:
!pip install torch transformers datasets

In [None]:
!pip install num2words torch

In [None]:
!python -m spacy download es_core_news_sm

In [None]:
import numpy as np
import torch
import torch.nn as nn
import seaborn as sns
import copy
import warnings
import torch.optim as optim
import pandas as pd
import re
import string
import nltk
import spacy
from nltk.corpus import stopwords
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from torch.utils.data import DataLoader
from datasets import Dataset
from sklearn.model_selection import train_test_split
from num2words import num2words
from datasets import load_dataset
from transformers import BertModel,BertTokenizer, TrainingArguments, Trainer, AutoConfig
from torch.optim import AdamW
from torch.nn import CrossEntropyLoss

# Data

For preprocessing, use the ***Preprocessing.ipynb*** script.

In [None]:
# Loading preprocessed data
input_csv = "sample_Task1.csv"  # change to sample_Task2.csv for experiment 2
data = pd.read_csv(input_csv)
# All later steps work on a copy, so `data` keeps the frame exactly as loaded
dataD_copy = data.copy()

Removing rows with NaN values and rows whose comment has only one character

In [None]:
# Drop every row whose cleaned comment is missing (NaN)
dataD_copy = dataD_copy.dropna(subset=["comment_cleaned"])

# Removing rows with only one character
def filter_comments(text):
    """Return True when `text` is a usable comment, False when it should be dropped.

    A value is dropped when it is not a string (for example NaN) or when,
    after stripping surrounding whitespace, fewer than two characters remain.
    The "fewer than two" check also rejects empty and whitespace-only
    strings, which carry no content for the classifier.

    Args:
        text: The cell value to check; may be any type.

    Returns:
        bool: True to keep the row, False to discard it.
    """
    if not isinstance(text, str):  # skip if NaN or non-string
        return False
    # Surrounding spaces do not count towards the length
    return len(text.strip()) > 1

# Keep only the rows whose cleaned comment passes filter_comments
keep_mask = dataD_copy['comment_cleaned'].apply(filter_comments)
dataD_copy = dataD_copy[keep_mask]

In [None]:
# Inspect the column names available after filtering
dataD_copy.columns

# Splitting datasets

In [None]:
# Encode the string annotation as an integer class id: "yes" -> 0, "no" -> 1
label_mapping = dict(yes=0, no=1)
encoded_labels = dataD_copy['related_video'].map(label_mapping)
dataD_copy['label'] = encoded_labels
# Class counts of the encoded labels (displayed as the cell output)
dataD_copy['label'].value_counts()

In [None]:
# Target share of each split: train / validation / test
train_ratio, validation_ratio, test_ratio = 0.75, 0.15, 0.10

# Stage 1: carve out the training set; the remainder (validation + test) goes to df_temp
holdout_fraction = 1 - train_ratio
df_train, df_temp = train_test_split(
    dataD_copy,
    test_size=holdout_fraction,
    random_state=42,
)

# Stage 2: divide df_temp between validation and test.
# val_size is the validation share *within* df_temp, so the test share is its complement.
val_size = validation_ratio / (validation_ratio + test_ratio)
test_fraction = 1 - val_size
df_val, df_test = train_test_split(
    df_temp,
    test_size=test_fraction,
    random_state=42,
)

# Report the resulting split sizes (optional sanity check)
print(
    f"Train size: {len(df_train)}",
    f"Validation size: {len(df_val)}",
    f"Test size: {len(df_test)}",
    sep="\n",
)


In [None]:
# Dropping the id column from every split.
# The drop is done in place so df_train / df_val / df_test keep referring to the trimmed frames.
for frame in (df_train, df_val, df_test):
    frame.drop(columns=['id'], inplace=True)

In [None]:
# Converting each pandas split into a Hugging Face Dataset
train_set, val_set, test_set = (
    Dataset.from_pandas(frame) for frame in (df_train, df_val, df_test)
)

In [None]:
# Removing the '__index_level_0__' column from all three datasets
index_column = ['__index_level_0__']
train_set, val_set, test_set = (
    split.remove_columns(index_column) for split in (train_set, val_set, test_set)
)

# Loading Model
* Using transformer BERT based model - BETO
* More information available at: https://github.com/dccuchile/beto

In [None]:
# Set up device: use the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
# Loading the tokenizer that belongs to the BETO checkpoint
beto_checkpoint = "dccuchile/bert-base-spanish-wwm-uncased"
tokenizer = BertTokenizer.from_pretrained(beto_checkpoint)

In [None]:
# Tokenize the dataset
def tokenize_function(examples):
    """Tokenize the 'comment_cleaned' field of `examples` with the module-level tokenizer.

    Sequences are padded with padding="max_length" and truncated when too
    long. The tokenizer's output is returned unchanged.
    """
    texts = examples['comment_cleaned']
    encoded = tokenizer(texts, truncation=True, padding="max_length")
    return encoded

In [None]:
# Tokenizing all sets (train, validation, and test) in batched mode
tokenized_train_set, tokenized_val_set, tokenized_test_set = (
    split.map(tokenize_function, batched=True)
    for split in (train_set, val_set, test_set)
)

In [None]:
# Loading BETO (transformer-based BERT for Spanish) with a sequence-classification head.
# The bare `BertModel` encoder has no classification head: it returns neither a
# loss (required by `Trainer` for fine-tuning) nor `.logits` (read by `predict`).
# `BertForSequenceClassification` provides both; `num_labels=2` matches the
# binary yes/no label encoding used in this notebook.
from transformers import BertForSequenceClassification

model = BertForSequenceClassification.from_pretrained(
    "dccuchile/bert-base-spanish-wwm-uncased",
    num_labels=2,
).to(device)

In [None]:
# Defining the training arguments
training_config = {
    # where outputs and logs are written
    'output_dir': './results',
    'logging_dir': './logs',
    'logging_steps': 10,
    # evaluate at the end of each epoch
    'evaluation_strategy': "epoch",
    # optimisation settings
    'learning_rate': 2e-5,
    'weight_decay': 0.01,
    'num_train_epochs': 8,
    # batch sizes per device for training and evaluation
    'per_device_train_batch_size': 64,
    'per_device_eval_batch_size': 64,
}
training_args = TrainingArguments(**training_config)

In [None]:
# Function for computing metrics during fine-tuning
def compute_metrics(eval_pred):
    """Compute the evaluation metrics reported during fine-tuning.

    Args:
        eval_pred: A pair that unpacks into `(logits, labels)`.

    Returns:
        dict: 'cross_entropy_loss' (CrossEntropyLoss of the logits against
        the labels, as a Python float) and 'accuracy' (accuracy of the
        argmax predictions against the labels).
    """
    raw_logits, true_labels = eval_pred

    # CrossEntropyLoss works on tensors, so both inputs are wrapped first
    criterion = CrossEntropyLoss()
    ce_loss = criterion(torch.tensor(raw_logits), torch.tensor(true_labels)).item()

    # The predicted class is the index of the largest logit
    predicted_classes = np.argmax(raw_logits, axis=-1)
    acc = accuracy_score(true_labels, predicted_classes)

    return {'cross_entropy_loss': ce_loss, 'accuracy': acc}


In [None]:
# Creating the Trainer that fine-tunes BETO
trainer_config = {
    # model and its tokenizer
    'model': model,
    'tokenizer': tokenizer,
    # training configuration
    'args': training_args,
    # tokenized training and validation data
    'train_dataset': tokenized_train_set,
    'eval_dataset': tokenized_val_set,
    # metrics reported at evaluation time
    'compute_metrics': compute_metrics,
}
trainer = Trainer(**trainer_config)

In [None]:
# Fine-tuning the model with the settings in training_args
# (evaluation_strategy="epoch", so the validation set is evaluated after every epoch)
trainer.train()

# Predicting over Test Sample

In [None]:
# Creating the test loader: batches of 64 examples.
# No shuffle argument is passed, which matters because the predictions are
# later attached to df_test by position.
test_loader = DataLoader(tokenized_test_set, batch_size=64)

In [None]:
# Expose only the model inputs, as torch tensors, so that `predict` can call
# `.to(device)` on batch['input_ids'] and batch['attention_mask']
tokenized_test_set.set_format(type='torch', columns=['input_ids', 'attention_mask'])

In [None]:
# Function for predictions
def predict(model, dataloader):
    """Run inference over `dataloader` and return the predicted class ids.

    Args:
        model: A torch module whose forward pass accepts `input_ids` and
            `attention_mask` keyword arguments and returns an object with a
            `logits` tensor.
        dataloader: Iterable of batches; each batch is a mapping holding
            tensors under 'input_ids' and 'attention_mask'.

    Returns:
        list: One NumPy array per batch, containing the argmax over the last
        logits dimension for every example of that batch.

    Note:
        The model is switched to evaluation mode and is left in that mode.
    """
    # Inference has to run in eval mode; in training mode dropout stays
    # active and the predictions are no longer deterministic.
    model.eval()

    # Move the inputs to wherever the model's weights live instead of
    # relying on a module-level device variable.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    predictions = []
    with torch.no_grad():  # no gradients are needed for prediction
        for batch in dataloader:
            input_ids = batch['input_ids'].to(target_device)
            attention_mask = batch['attention_mask'].to(target_device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits

            # Highest-scoring class per example, moved back to the CPU as NumPy
            predictions.append(logits.argmax(dim=-1).cpu().numpy())

    return predictions

In [None]:
# Predicting on the test set; the result is a list with one array of class ids per batch
predictions = predict(model, test_loader)

In [None]:
# Merging the per-batch prediction arrays into one flat list
flattened = []
for batch_predictions in predictions:
    flattened.extend(batch_predictions)
predictions = flattened

In [None]:
# Adding predictions to the test set DataFrame.
# The list is assigned by position, so this relies on the predictions being
# in the same row order as df_test.
df_test['Predicted_Label'] = predictions

## Evaluation of test set predictions

In [None]:
# Getting the true and the predicted labels from the test DataFrame
y_true, y_pred = df_test['label'], df_test['Predicted_Label']

In [None]:
# Calculating metrics; 'yes' is encoded as 0, so 0 is used as the positive class
positive_class = 0
accuracy = accuracy_score(y_true, y_pred)
precision, recall, f1 = (
    scorer(y_true, y_pred, pos_label=positive_class)
    for scorer in (precision_score, recall_score, f1_score)
)
conf_matrix = confusion_matrix(y_true, y_pred)

# Printing the metrics, one label/value pair per line
for metric_label, metric_value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1 Score:", f1),
    ("Confusion Matrix:\n", conf_matrix),
):
    print(metric_label, metric_value)
print("\nClassification Report:\n", classification_report(y_true, y_pred))

# Predicting over the complete data

In [None]:
# Loading complete raw data
# NOTE(review): tokenize_function reads a 'comment_cleaned' column — confirm that
# this file provides it before running the cells below.
data_comp_df = pd.read_csv("Raw_Data.csv")

In [None]:
# Converting the complete data into Hugging Face Dataset format
data_comp_set = Dataset.from_pandas(data_comp_df)

In [None]:
# Tokenizing the complete data set with the same tokenize_function used for the splits
tokenized_data_comp_set = data_comp_set.map(tokenize_function, batched=True)

In [None]:
# Creating the data loader for the complete data.
# The dataset is switched to torch format first, exactly as is done for the
# test set: `predict` calls `.to(...)` on batch['input_ids'] and
# batch['attention_mask'], which requires those two fields to be tensors.
tokenized_data_comp_set.set_format(type='torch', columns=['input_ids', 'attention_mask'])
data_comp_loader = DataLoader(tokenized_data_comp_set, batch_size=64)

In [None]:
# Getting predictions for the complete data (a list with one array of class ids per batch)
predictions_comp_data = predict(model, data_comp_loader)

In [None]:
# Merging the per-batch prediction arrays into one flat list
predictions_comp_data_list = []
for batch_predictions in predictions_comp_data:
    predictions_comp_data_list.extend(batch_predictions)

In [None]:
# Adding predictions to the complete-data DataFrame (assigned by position)
data_comp_df['Predicted_Labels'] = predictions_comp_data_list

In [None]:
# To save the results, set the output file name and uncomment the line below
#data_comp_df.to_csv("add_file_name.csv")