In [None]:
#!pip install datasets

## Imports

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import TrainingArguments, Trainer
from datasets import Dataset, DatasetDict
from torch.utils.data import DataLoader, TensorDataset

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

In [None]:
from torch.utils.data import DataLoader, TensorDataset
from transformers import AdamW
import torch
import torch.nn.functional as F
import os
from collections import defaultdict

In [None]:
# Mount Google Drive at /content/drive so files stored there can be read from this runtime
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Switch the working directory to the `datasets` folder on Drive; later cells open files by relative path
%cd /content/drive/My Drive/datasets

/content/drive/.shortcut-targets-by-id/1CNLowRdVtLOrHbCZbdjg3IVys53iPk7m/datasets


## Dataset



In [None]:
import csv
import pandas as pd

input_file = 'articles.csv'
output_file = 'corrected_articles.csv'  # not used by the loading code in this cell

# Read the CSV with the csv module.
# newline='' lets the csv module handle line endings itself, which keeps line
# breaks embedded in quoted fields intact (as the csv documentation recommends).
with open(input_file, 'r', encoding='utf-8', newline='') as infile:
    reader = csv.reader(infile)
    header = next(reader)  # the first row holds the column names
    data = list(reader)    # every remaining row, one list of strings per record

# Build the DataFrame from the collected rows
art_df = pd.DataFrame(data, columns=header)


In [None]:
# Drop the columns that are not used for training.
# errors='ignore' keeps this cell re-runnable once the columns are already gone.
art_df = art_df.drop(columns=['title', 'date', 'subcategory', 'link'], errors='ignore')

As per the documentation on the Hugging Face website, it's necessary to tokenize the texts that will be used in training for fine-tuning these models. Therefore, also for memory considerations, I chose to keep only the columns containing the news body and its corresponding category


In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the textual categories as integer ids and expose them as the `label` column
label_encoder = LabelEncoder()
art_df = (
    art_df
    .assign(category=label_encoder.fit_transform(art_df['category']))
    .rename(columns={'category': 'label'})
)


The classes are stored as text, which is not suitable for model training. Therefore, a `LabelEncoder` is used to convert the nominal classes into integer codes.

In [None]:
# Use pandas together with tqdm so the apply below can report its progress
from tqdm import tqdm
tqdm.pandas()  # Enable tqdm for pandas (progress_apply is used further down)

# Cut the text right after its last period
def truncate_text(s):
    """Drop whatever follows the last '.' in `s`.

    Args:
        s: Article text. Anything that is not a `str` (e.g. a missing value)
            is treated as empty.

    Returns:
        str: `s` up to and including its last '.'; `s` unchanged when it
        contains no '.'; '' when `s` is not a string.
    """
    if not isinstance(s, str):
        return ''
    last_period = s.rfind('.')
    if last_period == -1:
        # No period at all: keep the text instead of slicing it down to ''.
        return s
    return s[:last_period + 1]

# Apply the truncation function to every article body
art_df['text'] = art_df['text'].progress_apply(truncate_text)


100%|██████████| 167053/167053 [00:00<00:00, 425392.92it/s]


During tokenization I ran into problems: it returned errors and did not work properly until, after going through some tutorials, I realized that the problem was that the texts did not end with '.'. In other words, even though the body of the news article had ended, it was followed by noise such as advertisements, so cleaning was needed to keep the texts consistent. The Transformer interprets '.' as a stopping point, so texts that were not properly terminated were causing errors. `truncate_text` resolves this issue by cutting each text right after its last '.'.

In [None]:
# Memory constraints: train on a tiny random sample (0.01% of the rows) and
# set all remaining rows aside as the held-out pool.
df_train, df_temp = train_test_split(art_df, test_size=0.9999, random_state=10)

# Draw the validation and test rows from the held-out pool, never from df_train,
# so that no evaluation row has been seen during training. The pool is limited
# to the classes present in df_train and capped at len(df_train) rows to stay small.
df_holdout = df_temp[df_temp['label'].isin(df_train['label'])]
df_holdout = df_holdout.sample(n=min(len(df_train), len(df_holdout)), random_state=42)

# Split the held-out sample into validation (50%) and test (50%)
df_val, df_test = train_test_split(df_holdout, test_size=0.5, random_state=42)

This data splitting stage is quite delicate. The dataset is very large and was causing memory issues, so I had to use a configuration that allowed me to run the experiments. Since I believe the goal of the activity is to test our skills with the models, learn how to preprocess them, among other aspects, I don't see a problem with the size of the data I can use in the experiments.

In [None]:
# Show the (rows, columns) shape of the training and test splits
df_train.shape, df_test.shape

((16, 2), (8, 2))

In [None]:
import pandas as pd
import itertools

def remove_missing_classes(df_train, df_test):
  """
  Drop the training rows whose label never occurs in the test set.

  Args:
      df_train (pandas.DataFrame): Training data with a 'label' column.
      df_test (pandas.DataFrame): Test data with a 'label' column.

  Returns:
      pandas.DataFrame: The rows of df_train whose label is also found in df_test.
  """
  # Labels seen in training but absent from the test set
  train_only_labels = set(df_train['label'].unique()) - set(df_test['label'].unique())

  is_train_only = df_train['label'].isin(train_only_labels)
  return df_train[~is_train_only]

# Filter the training split against the test split, then print the label sets of both
df_train = remove_missing_classes(df_train, df_test)
print(set(df_train['label']))
print(set(df_test['label'].unique()))


{35, 9, 12, 46, 26}
{35, 9, 12, 46, 26}


Because so little data is used, some classes are present in the training set but absent from the test set, which causes issues. Therefore, the `remove_missing_classes` function drops those classes from the training set so that both sets contain the same classes.

## Training Function
The train_evaluate_model function was created to be used with both requested models, enabling a fair comparison of results. For this purpose, it receives the models, tokenizers, as well as the pre-tokenized training and test datasets

In [None]:
def train_evaluate_model(tokenizer, model, tokenized_train, tokenized_test, y_train, y_test):
    """Fine-tune `model` for one epoch, then print its metrics on the test split.

    Accuracy and weighted precision are printed for every evaluation batch.
    The final "Average" figures and the evaluation loss are computed over all
    test samples, so a smaller last batch does not skew them.

    Args:
        tokenizer: Tokenizer paired with `model`; only its length is used, to
            resize the model's token embeddings.
        model: Sequence-classification model whose forward call accepts
            `input_ids`, `attention_mask` and `labels` and returns an object
            exposing `.loss` and `.logits`.
        tokenized_train: Mapping holding the "input_ids" and "attention_mask"
            tensors of the training texts.
        tokenized_test: Same as `tokenized_train`, for the test texts.
        y_train: Integer class labels aligned with `tokenized_train`.
        y_test: Integer class labels aligned with `tokenized_test`.

    Returns:
        None. All metrics are printed.

    Raises:
        ValueError: If `y_test` is empty.
    """
    if len(y_test) == 0:
        raise ValueError("y_test is empty: there is nothing to evaluate on")

    os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

    train_dataset = TensorDataset(
        tokenized_train["input_ids"].clone().detach(),
        tokenized_train["attention_mask"].clone().detach(),
        torch.tensor(y_train)
    )
    test_dataset = TensorDataset(
        tokenized_test["input_ids"].clone().detach(),
        tokenized_test["attention_mask"].clone().detach(),
        torch.tensor(y_test)
    )

    train_dataloader = DataLoader(train_dataset, batch_size=5, shuffle=True)
    # Shuffling does not change the overall metrics; a fixed order keeps the
    # per-batch printout reproducible.
    test_dataloader = DataLoader(test_dataset, batch_size=5, shuffle=False)

    # torch.optim.AdamW replaces the deprecated transformers.AdamW; eps and
    # weight_decay are set to that optimizer's defaults to keep the same update rule.
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)
    # Everything runs on the CPU; use torch.device("cuda") here to train on a GPU.
    device = 'cpu'
    model.resize_token_embeddings(len(tokenizer))
    model.to(device)

    for epoch in range(1):  # Loop over epochs
        model.train()
        for input_ids, attention_mask, labels in train_dataloader:
            optimizer.zero_grad()

            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            outputs.loss.backward()
            optimizer.step()

    model.eval()
    total_loss = 0.0
    all_labels = []
    all_predictions = []

    with torch.no_grad():
        for input_ids, attention_mask, labels in test_dataloader:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            predictions = torch.argmax(outputs.logits, dim=1)

            # scikit-learn needs host-side arrays, whatever device the model runs on
            batch_labels = labels.cpu().numpy()
            batch_predictions = predictions.cpu().numpy()
            all_labels.extend(batch_labels.tolist())
            all_predictions.extend(batch_predictions.tolist())

            # Weight the batch loss by the batch size so the final figure is a
            # per-sample mean (assumes the model reports a batch-mean loss).
            total_loss += outputs.loss.item() * len(batch_labels)

            accuracy = accuracy_score(batch_labels, batch_predictions)
            # zero_division=0 scores never-predicted classes as 0 without a warning
            precision = precision_score(batch_labels, batch_predictions,
                                        average="weighted", zero_division=0)

            print(f"Accuracy: {accuracy:.3f}")
            print(f"Precision: {precision:.3f}")

    # Metrics over the whole test set rather than an unweighted mean of batch scores
    average_accuracy = accuracy_score(all_labels, all_predictions)
    average_precision = precision_score(all_labels, all_predictions,
                                        average="weighted", zero_division=0)

    print(f"Average Accuracy: {average_accuracy:.3f}")
    print(f"Average Precision: {average_precision:.3f}")
    print(f"Evaluation Loss: {total_loss / len(all_labels)}")





##  Bertimbal



In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Load the tokenizer and the sequence-classification model from the "tiagoblima/newsdata-bertimbal" checkpoint
# NOTE(review): no num_labels is passed here, so the classification head is whatever the checkpoint provides — confirm its label ids match the re-indexed labels used for training below.

bertimbal_tokenizer = AutoTokenizer.from_pretrained("tiagoblima/newsdata-bertimbal")
bertimbal_model = AutoModelForSequenceClassification.from_pretrained("tiagoblima/newsdata-bertimbal")


## Final Data Preparation
The datasets need to be dataset-type objects. Therefore, it's necessary to convert both the training and test sets to this format. Additionally, since the data selection was previously done randomly, the resulting classes in my dataset may not be ordered. Hence, the convert_class_labels function associates each class with a numerical order (0, 1, 2, etc.), preventing issues during training.

In [None]:
# Wrap the pandas splits as `Dataset` objects
dataset_train = Dataset.from_pandas(df_train)
dataset_test = Dataset.from_pandas(df_test)

In [None]:
# Pull the label column out of each dataset
y_train = dataset_train['label']
y_test = dataset_test['label']

In [None]:
def convert_class_labels(labels, classes=None):
  """
  Re-indexes class labels to consecutive integers starting at 0.

  The classes are sorted and the i-th smallest class is mapped to index i.

  Args:
      labels (list): Class labels to convert.
      classes (iterable, optional): The full set of classes that defines the
          mapping. Defaults to the distinct values found in `labels`. Pass the
          same `classes` when converting several splits so that a class gets
          the same index in each of them.

  Returns:
      list: The index of each label, in the same order as `labels`.

  Raises:
      ValueError: If a label does not appear in `classes`.
  """
  labels = list(labels)
  known_classes = set(labels) if classes is None else set(classes)

  # Map every class to its rank among the sorted classes
  class_to_index = {label: i for i, label in enumerate(sorted(known_classes))}

  unknown = set(labels) - known_classes
  if unknown:
    raise ValueError(f"labels not present in classes: {sorted(unknown)}")

  return [class_to_index[label] for label in labels]

In [None]:
# Re-index the labels of each split so they start at 0.
# NOTE(review): each call builds its own mapping from the labels it receives, so the
# indices only line up across splits when both contain exactly the same classes.
y_train = convert_class_labels(y_train)
y_test =  convert_class_labels(y_test)

In [None]:
# Sanity check: distinct labels per split, and text/label counts for the test and training sets
set(y_train), set(y_test), len(dataset_test["text"]), len(y_test), len(dataset_train["text"]), len(y_train)

({0, 1, 2, 3, 4}, {0, 1, 2, 3, 4}, 8, 8, 12, 12)

## Tokenization, training and evaluation

In [None]:
# Tokenize the training texts: pad/truncate every sequence to 300 tokens and return PyTorch tensors
tokenized_bertimbau_train = bertimbal_tokenizer(dataset_train["text"], padding="max_length", truncation=True, max_length=300, return_tensors="pt")

In [None]:
# Tokenize the test texts with the same settings as the training texts
tokenized_bertimbau_test = bertimbal_tokenizer(dataset_test["text"], padding="max_length", truncation=True, max_length=300, return_tensors="pt")

In [None]:
# Train the newsdata-bertimbal model and print its evaluation metrics
train_evaluate_model(bertimbal_tokenizer, bertimbal_model, tokenized_bertimbau_train, tokenized_bertimbau_test, y_train, y_test)



Accuracy: 1.000
Precision: 1.000
Accuracy: 1.000
Precision: 1.000
Average Accuracy: 1.000
Average Precision: 1.000
Evaluation Loss: 0.0019725957536138594


The model is evaluated in batches, so I print the accuracies and precisions for each batch. Since both are 1, we can't visualize which classes had more errors/accuracies. Additionally, the loss is quite small, which makes sense given these metrics.

## Doing the same for BERT

In [None]:
def train_evaluate_model(tokenizer, model, tokenized_train, tokenized_test, y_train, y_test):
    """Fine-tune `model` for one epoch, then print its metrics and its mistakes.

    This redefines (and therefore shadows) the earlier `train_evaluate_model`
    of this notebook; in addition to the metrics it lists every misclassified
    test sample as a (predicted, correct) pair.

    Accuracy and weighted precision are printed for every evaluation batch.
    The final "Average" figures and the evaluation loss are computed over all
    test samples, so a smaller last batch does not skew them.

    Args:
        tokenizer: Tokenizer paired with `model`; only its length is used, to
            resize the model's token embeddings.
        model: Sequence-classification model whose forward call accepts
            `input_ids`, `attention_mask` and `labels` and returns an object
            exposing `.loss` and `.logits`.
        tokenized_train: Mapping holding the "input_ids" and "attention_mask"
            tensors of the training texts.
        tokenized_test: Same as `tokenized_train`, for the test texts.
        y_train: Integer class labels aligned with `tokenized_train`.
        y_test: Integer class labels aligned with `tokenized_test`.

    Returns:
        None. All metrics are printed.

    Raises:
        ValueError: If `y_test` is empty.
    """
    if len(y_test) == 0:
        raise ValueError("y_test is empty: there is nothing to evaluate on")

    os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

    train_dataset = TensorDataset(
        tokenized_train["input_ids"].clone().detach(),
        tokenized_train["attention_mask"].clone().detach(),
        torch.tensor(y_train)
    )
    test_dataset = TensorDataset(
        tokenized_test["input_ids"].clone().detach(),
        tokenized_test["attention_mask"].clone().detach(),
        torch.tensor(y_test)
    )

    train_dataloader = DataLoader(train_dataset, batch_size=5, shuffle=True)
    # Shuffling does not change the overall metrics; a fixed order keeps the
    # per-batch printout reproducible.
    test_dataloader = DataLoader(test_dataset, batch_size=5, shuffle=False)

    # torch.optim.AdamW replaces the deprecated transformers.AdamW; eps and
    # weight_decay are set to that optimizer's defaults to keep the same update rule.
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)
    # Everything runs on the CPU; use torch.device("cuda") here to train on a GPU.
    device = 'cpu'
    model.resize_token_embeddings(len(tokenizer))
    model.to(device)

    for epoch in range(1):  # Loop over epochs
        model.train()
        for input_ids, attention_mask, labels in train_dataloader:
            optimizer.zero_grad()

            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            outputs.loss.backward()
            optimizer.step()

    model.eval()
    total_loss = 0.0
    all_labels = []
    all_predictions = []
    guessed_classes = []  # (predicted, correct) pairs of the misclassified samples

    with torch.no_grad():
        for input_ids, attention_mask, labels in test_dataloader:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            predictions = torch.argmax(outputs.logits, dim=1)

            # scikit-learn needs host-side arrays, whatever device the model runs on
            batch_labels = labels.cpu().numpy()
            batch_predictions = predictions.cpu().numpy()
            all_labels.extend(batch_labels.tolist())
            all_predictions.extend(batch_predictions.tolist())

            for guessed, correct in zip(batch_predictions.tolist(), batch_labels.tolist()):
                if guessed != correct:
                    guessed_classes.append((guessed, correct))

            # Weight the batch loss by the batch size so the final figure is a
            # per-sample mean (assumes the model reports a batch-mean loss).
            total_loss += outputs.loss.item() * len(batch_labels)

            accuracy = accuracy_score(batch_labels, batch_predictions)
            # zero_division=0 scores never-predicted classes as 0 without a warning
            precision = precision_score(batch_labels, batch_predictions,
                                        average="weighted", zero_division=0)

            print(f"Accuracy: {accuracy:.3f}")
            print(f"Precision: {precision:.3f}")

    # Metrics over the whole test set rather than an unweighted mean of batch scores
    average_accuracy = accuracy_score(all_labels, all_predictions)
    average_precision = precision_score(all_labels, all_predictions,
                                        average="weighted", zero_division=0)

    print(f"Average Accuracy: {average_accuracy:.3f}")
    print(f"Average Precision: {average_precision:.3f}")
    print(f"Evaluation Loss: {total_loss / len(all_labels)}")

    for guessed, correct in guessed_classes:
        print(f"Guessed: {guessed} (Correct: {correct})")




In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Load the tokenizer and model from "neuralmind/bert-base-portuguese-cased"; the classification head is sized to the number of distinct training labels

bert_tokenizer = AutoTokenizer.from_pretrained("neuralmind/bert-base-portuguese-cased")
bert_model = AutoModelForSequenceClassification.from_pretrained("neuralmind/bert-base-portuguese-cased", num_labels=len(set(y_train)))


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Tokenize the training texts: pad/truncate every sequence to 300 tokens and return PyTorch tensors
tokenized_bert_train = bert_tokenizer(dataset_train["text"], padding="max_length", truncation=True, max_length=300, return_tensors="pt")

In [None]:
# Tokenize the test texts with the same settings as the training texts
tokenized_bert_test = bert_tokenizer(dataset_test["text"], padding="max_length", truncation=True, max_length=300, return_tensors="pt")

As before, the model is evaluated in batches, and the accuracy and precision of each batch are printed. This time the scores are below 1, so the misclassified samples are also listed to show which class was predicted and which one was correct.

In [None]:
# Train the BERT model and print its evaluation metrics
train_evaluate_model(bert_tokenizer, bert_model, tokenized_bert_train, tokenized_bert_test, y_train, y_test)

  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.400
Precision: 0.400
Accuracy: 0.667
Precision: 0.500
Average Accuracy: 0.533
Average Precision: 0.450
Evaluation Loss: 1.2417539358139038
Guessed: 1 (Correct: 4)
Guessed: 1 (Correct: 0)
Guessed: 1 (Correct: 2)
Guessed: 1 (Correct: 0)


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Display the re-indexed training labels
y_train

[3, 4, 1, 3, 1, 3, 1, 1, 0, 2, 2, 0]

As BERT presented errors, I created a modified function for it that allows us to know which class it predicted and which one was correct. We can observe that it generally predicts class 1, which is the class with the highest number of samples.

 ## Conclusion

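The `tiagoblima/newsdata-bertimbal` model had the best performance. Note, however, that the "BERT" checkpoint used here, `neuralmind/bert-base-portuguese-cased`, is also a Portuguese model, so the pre-training language alone does not explain the gap. A more likely explanation is that its classification head was newly initialized (see the warning printed when it was loaded) and trained for a single epoch on only a handful of samples, whereas the `newsdata-bertimbal` checkpoint, judging by its name, had presumably already been fine-tuned on news data. Additionally, BERT tends to predict the class with the most samples in the training data.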