Classificação de Texto Multilabel


José Augusto de Almeida Neto


## Importar bibliotecas


In [1]:
import os
import time
import numpy as np
import pandas as pd
import torch
from dotenv import load_dotenv
from sklearn.metrics import (classification_report, f1_score, precision_score,
                             recall_score)
from torch.utils.data import DataLoader
from transformers import BertForSequenceClassification, BertTokenizer

  from .autonotebook import tqdm as notebook_tqdm


## Carregar datasets


In [2]:
# Pull the settings stored in the local .env file into the process environment
load_dotenv()

# Map each fold name to the location held in its STEMMING_DF_FOLD<n> variable
file_urls = {
    f'df_fold{fold}': os.getenv(f'STEMMING_DF_FOLD{fold}')
    for fold in range(1, 6)
}

def load_dataset(url):
    """Read one fold's CSV into a DataFrame.

    Args:
        url: Path, URL or file-like object accepted by ``pandas.read_csv``.

    Returns:
        The parsed ``pandas.DataFrame``.

    Raises:
        ValueError: If ``url`` is ``None`` or a blank string, which is what
            ``os.getenv`` yields when the fold's environment variable is
            unset or empty.
    """
    # Fail early with an actionable message instead of an opaque pandas error
    if url is None or (isinstance(url, str) and not url.strip()):
        raise ValueError(
            "Dataset location is missing; check that the STEMMING_DF_FOLD* "
            "variables are defined in the environment or in the .env file.")
    return pd.read_csv(url)


# Materialize every fold, in order, as its own DataFrame
df_fold1, df_fold2, df_fold3, df_fold4, df_fold5 = (
    load_dataset(file_urls[f'df_fold{fold}']) for fold in range(1, 6)
)

In [3]:
# Label columns, in the order used for the target vectors and report rows
labels = [
    'ambiente',
    'bebida',
    'comida',
    'geral',
    'localização',
    'outros',
    'preço',
    'serviço',
]

In [4]:
def remove_empty_sentenca_rows(df):
    """Drop rows whose ``sentenca`` text is missing, empty or only whitespace.

    Whitespace-only sentences are treated as empty because they carry no
    text to classify.

    Args:
        df: DataFrame with a ``sentenca`` column.

    Returns:
        A new DataFrame without the blank rows and with a fresh 0..n-1
        index. The input frame is left untouched.
    """
    sentences = df['sentenca']
    # astype(str) turns NaN/None into non-blank text, so nulls are tested separately
    is_blank = sentences.isnull() | sentences.astype(str).str.strip().eq('')
    return df.loc[~is_blank].reset_index(drop=True)


# Strip the blank-sentence rows from every fold
df_fold1, df_fold2, df_fold3, df_fold4, df_fold5 = map(
    remove_empty_sentenca_rows,
    (df_fold1, df_fold2, df_fold3, df_fold4, df_fold5),
)

## Listas e hiperparâmetros


In [5]:
# Accumulators for the test-set scores; run_fold appends one value per fold
f1_score_list_macro, precision_list_macro, recall_list_macro = [], [], []
f1_score_list_micro, precision_list_micro, recall_list_micro = [], [], []

In [6]:
# Training / evaluation configuration
max_len = 128             # sequence length handed to the tokenizer
epochs = 5                # training passes per fold
learning_rate = 5e-5      # step size given to the optimizer
threshold = 0.35          # probability cutoff for predicting a label
train_batch_size, valid_batch_size, test_batch_size = 16, 4, 4
num_labels = len(labels)  # one model output per label column

## Funções auxiliares


In [7]:
class TokenizeDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one sentence per item.

    Each item is a dict holding the flattened tokenizer outputs
    (``input_ids``, ``attention_mask``, ``token_type_ids``) and a float
    ``targets`` vector with one entry per label column.
    """

    def __init__(self, df, tokenizer, max_len, label_columns=None):
        """Store the sentences and the label matrix.

        Args:
            df: DataFrame with a ``sentenca`` text column and one column per
                label.
            tokenizer: Object exposing ``encode_plus`` (e.g. a
                ``BertTokenizer``).
            max_len: Length requested from the tokenizer for padding and
                truncation.
            label_columns: Names of the target columns. Defaults to the
                module-level ``labels`` list.
        """
        if label_columns is None:
            label_columns = labels
        self.tokenizer = tokenizer
        self.df = df
        self.title = df['sentenca']
        # Convert once to a float matrix so non-numeric labels fail here,
        # not in the middle of a training epoch
        self.targets = df[list(label_columns)].to_numpy(dtype='float32')
        self.max_len = max_len

    def __len__(self):
        """Return the number of sentences."""
        return len(self.title)

    def __getitem__(self, index):
        """Tokenize the sentence at position ``index`` and attach its targets."""
        # Positional lookup: a label-based ``self.title[index]`` raises or
        # returns the wrong row when the frame's index is not exactly 0..n-1
        title = str(self.title.iloc[index])
        # Collapse any run of whitespace into a single space
        title = " ".join(title.split())

        inputs = self.tokenizer.encode_plus(
            title,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        # The tokenizer returns tensors with a leading batch axis of size 1;
        # flatten() drops it so the DataLoader can stack the items itself
        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'token_type_ids': inputs["token_type_ids"].flatten(),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [8]:
def train_model(model, train_loader, optimizer, criterion, sigmoid):
    """Run one training epoch over ``train_loader``.

    Args:
        model: Module called with ``input_ids``, ``attention_mask`` and
            ``token_type_ids`` whose output exposes ``.logits``.
        train_loader: Iterable of batch dicts with the keys ``input_ids``,
            ``attention_mask``, ``token_type_ids`` and ``targets``.
        optimizer: Optimizer that updates the model's parameters.
        criterion: Loss called as ``criterion(logits, targets)``; it
            receives the raw logits, not probabilities.
        sigmoid: Unused. Kept in the signature so existing calls keep
            working.

    Returns:
        The mean batch loss of the epoch as a float, or ``None`` when the
        loader yields no batches.
    """
    model.train()

    # Follow whatever device the model lives on; a model without
    # parameters leaves the batches where they are
    first_param = next(model.parameters(), None)
    device = None if first_param is None else first_param.device

    total_loss = 0.0
    num_batches = 0
    for batch in train_loader:
        input_ids, attention_mask, token_type_ids, targets = (
            batch[key] if device is None else batch[key].to(device)
            for key in ('input_ids', 'attention_mask',
                        'token_type_ids', 'targets')
        )

        # Forward pass
        outputs = model(input_ids=input_ids,
                        attention_mask=attention_mask,
                        token_type_ids=token_type_ids)

        # The loss is computed directly on the logits, so no sigmoid is
        # applied during training
        loss = criterion(outputs.logits, targets)

        # Backpropagation and parameter update
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1

    return total_loss / num_batches if num_batches else None

In [9]:
def evaluate_model(model, val_loader, sigmoid, prob_threshold=None):
    """Predict binary labels for every batch of ``val_loader``.

    Args:
        model: Module called with ``input_ids``, ``attention_mask`` and
            ``token_type_ids`` whose output exposes ``.logits``.
        val_loader: Iterable of batch dicts with the keys ``input_ids``,
            ``attention_mask``, ``token_type_ids`` and ``targets``.
        sigmoid: Callable applied to the logits to obtain probabilities.
        prob_threshold: Probability above which a label is predicted.
            Defaults to the module-level ``threshold``.

    Returns:
        A ``(all_targets, all_predictions)`` pair of lists holding one
        NumPy row per sample.
    """
    cutoff = threshold if prob_threshold is None else prob_threshold

    model.eval()

    # Follow whatever device the model lives on; a model without
    # parameters leaves the batches where they are
    first_param = next(model.parameters(), None)
    device = None if first_param is None else first_param.device

    all_targets = []
    all_predictions = []
    with torch.no_grad():
        for batch in val_loader:
            input_ids, attention_mask, token_type_ids, targets = (
                batch[key] if device is None else batch[key].to(device)
                for key in ('input_ids', 'attention_mask',
                            'token_type_ids', 'targets')
            )

            # Forward pass (inference)
            outputs = model(input_ids=input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids)

            # Logits -> probabilities -> 0/1 decisions
            predicted_probs = sigmoid(outputs.logits)
            predicted_labels = (predicted_probs > cutoff).float()

            # extend() with a 2-D array appends one row per sample
            all_targets.extend(targets.cpu().numpy())
            all_predictions.extend(predicted_labels.cpu().numpy())
    return all_targets, all_predictions

In [10]:
def _macro_micro_scores(targets, predictions):
    """Return macro/micro F1, precision and recall keyed as '<metric>_<average>'."""
    scorers = {'f1': f1_score, 'precision': precision_score,
               'recall': recall_score}
    # zero_division=0 yields the same values as sklearn's default ("warn")
    # but without raising UndefinedMetricWarning for labels never predicted
    return {
        f'{name}_{average}': scorer(
            targets, predictions, average=average, zero_division=0)
        for average in ('macro', 'micro')
        for name, scorer in scorers.items()
    }


def _print_macro_micro(scores):
    """Print one macro line and one micro line from a ``_macro_micro_scores`` dict."""
    for average in ('macro', 'micro'):
        print(
            f"F1-Score ({average}): {scores[f'f1_{average}']:.4f}, "
            f"Precision ({average}): {scores[f'precision_{average}']:.4f}, "
            f"Recall ({average}): {scores[f'recall_{average}']:.4f}")


def run_fold(train_data, val_data, test_data, seed=None):
    """Fine-tune the BERT classifier on one fold and report its metrics.

    Trains for ``epochs`` epochs, printing validation scores after each
    one, then evaluates the final model on ``test_data``, appends the six
    test scores to the module-level metric lists and prints the elapsed
    time.

    Args:
        train_data: DataFrame used for training.
        val_data: DataFrame scored after every epoch.
        test_data: DataFrame scored once training has finished.
        seed: Optional integer passed to ``torch.manual_seed`` before the
            loaders and the model are created. ``None`` leaves the random
            state untouched.

    Returns:
        The ``(test_targets, test_predictions)`` pair returned by
        ``evaluate_model`` for the test set.
    """
    start_time = time.time()

    if seed is not None:
        torch.manual_seed(seed)

    checkpoint = 'neuralmind/bert-base-portuguese-cased'

    # Load the BERT tokenizer
    tokenizer = BertTokenizer.from_pretrained(checkpoint)

    # Wrap each split in a tokenizing dataset
    tokenized_train_data = TokenizeDataset(train_data, tokenizer, max_len)
    tokenized_val_data = TokenizeDataset(val_data, tokenizer, max_len)
    tokenized_test_data = TokenizeDataset(test_data, tokenizer, max_len)

    # Batch the data; only the training split is shuffled
    train_loader = DataLoader(tokenized_train_data,
                              batch_size=train_batch_size, shuffle=True)
    val_loader = DataLoader(
        tokenized_val_data, batch_size=valid_batch_size, shuffle=False)
    test_loader = DataLoader(
        tokenized_test_data, batch_size=test_batch_size, shuffle=False)

    # Model, optimizer and loss function
    model = BertForSequenceClassification.from_pretrained(
        checkpoint, num_labels=num_labels,
        problem_type="multi_label_classification")
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    criterion = torch.nn.BCEWithLogitsLoss()
    sigmoid = torch.nn.Sigmoid()

    # NOTE(review): validation scores are only reported; the model tested
    # below is the one left after the last epoch, not the best epoch.
    for epoch in range(epochs):
        train_model(model, train_loader, optimizer, criterion, sigmoid)

        val_targets, val_predictions = evaluate_model(
            model, val_loader, sigmoid)
        val_scores = _macro_micro_scores(val_targets, val_predictions)

        print(f"\nEpoch {epoch + 1}:")
        _print_macro_micro(val_scores)

    # Test
    test_targets, test_predictions = evaluate_model(
        model, test_loader, sigmoid)
    test_scores = _macro_micro_scores(test_targets, test_predictions)

    # Record this fold's test scores in the notebook-level lists
    f1_score_list_macro.append(test_scores['f1_macro'])
    precision_list_macro.append(test_scores['precision_macro'])
    recall_list_macro.append(test_scores['recall_macro'])
    f1_score_list_micro.append(test_scores['f1_micro'])
    precision_list_micro.append(test_scores['precision_micro'])
    recall_list_micro.append(test_scores['recall_micro'])

    print("\nMétricas no Conjunto de Teste:")
    _print_macro_micro(test_scores)

    # Total wall-clock time of the fold (tokenizer/model loading included)
    elapsed_time = time.time() - start_time
    hours, rem = divmod(elapsed_time, 3600)
    minutes, seconds = divmod(rem, 60)

    # Seconds are zero-padded too, so the output is a uniform HH:MM:SS.ss
    print(
        f"\nTempo de execução: {int(hours):02}:{int(minutes):02}:{seconds:05.2f}")

    return test_targets, test_predictions

# BERT


## 1º Fold


In [11]:
# Fold 1 split: folds 2-4 train, fold 5 validates, fold 1 is held out for testing
val_data, test_data = df_fold5, df_fold1
train_data = pd.concat((df_fold2, df_fold3, df_fold4), ignore_index=True)

test_targets_f1, test_predictions_f1 = run_fold(
    train_data, val_data, test_data)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1:
F1-Score (macro): 0.7586, Precision (macro): 0.7441, Recall (macro): 0.7824
F1-Score (micro): 0.8849, Precision (micro): 0.8524, Recall (micro): 0.9200

Epoch 2:
F1-Score (macro): 0.8237, Precision (macro): 0.8460, Recall (macro): 0.8496
F1-Score (micro): 0.9052, Precision (micro): 0.8840, Recall (micro): 0.9274

Epoch 3:
F1-Score (macro): 0.8653, Precision (macro): 0.8924, Recall (macro): 0.8697
F1-Score (micro): 0.9196, Precision (micro): 0.8915, Recall (micro): 0.9496

Epoch 4:
F1-Score (macro): 0.8626, Precision (macro): 0.8718, Recall (macro): 0.8648
F1-Score (micro): 0.9225, Precision (micro): 0.9060, Recall (micro): 0.9395

Epoch 5:
F1-Score (macro): 0.8665, Precision (macro): 0.8653, Recall (macro): 0.8759
F1-Score (micro): 0.9220, Precision (micro): 0.9038, Recall (micro): 0.9409

Métricas no Conjunto de Teste:
F1-Score (macro): 0.8744, Precision (macro): 0.8670, Recall (macro): 0.8898
F1-Score (micro): 0.9243, Precision (micro): 0.9073, Recall (micro): 0.9420

Tempo

In [12]:
# zero_division=0 gives the same scores as sklearn's default ("warn")
# without raising UndefinedMetricWarning for ill-defined averages
print(classification_report(test_targets_f1, test_predictions_f1,
                            target_names=labels, zero_division=0))

              precision    recall  f1-score   support

    ambiente       0.93      0.96      0.95       269
      bebida       0.81      0.98      0.88        82
      comida       0.94      0.95      0.95       418
       geral       0.84      0.83      0.83       206
 localização       0.91      0.97      0.94        30
      outros       0.67      0.45      0.54        22
       preço       0.92      0.99      0.96       161
     serviço       0.93      0.99      0.96       277

   micro avg       0.91      0.94      0.92      1465
   macro avg       0.87      0.89      0.87      1465
weighted avg       0.91      0.94      0.92      1465
 samples avg       0.91      0.93      0.91      1465



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## 2º Fold


In [13]:
# Fold 2 split: folds 3-5 train, fold 1 validates, fold 2 is held out for testing
val_data, test_data = df_fold1, df_fold2
train_data = pd.concat((df_fold3, df_fold4, df_fold5), ignore_index=True)

test_targets_f2, test_predictions_f2 = run_fold(
    train_data, val_data, test_data)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Epoch 1:
F1-Score (macro): 0.7420, Precision (macro): 0.7817, Recall (macro): 0.7301
F1-Score (micro): 0.8919, Precision (micro): 0.9057, Recall (micro): 0.8785

Epoch 2:
F1-Score (macro): 0.8796, Precision (macro): 0.8693, Recall (macro): 0.8943
F1-Score (micro): 0.9267, Precision (micro): 0.9093, Recall (micro): 0.9447

Epoch 3:
F1-Score (macro): 0.8733, Precision (macro): 0.8566, Recall (macro): 0.8944
F1-Score (micro): 0.9176, Precision (micro): 0.9013, Recall (micro): 0.9345

Epoch 4:
F1-Score (macro): 0.8749, Precision (macro): 0.8441, Recall (macro): 0.9102
F1-Score (micro): 0.9163, Precision (micro): 0.8957, Recall (micro): 0.9379

Epoch 5:
F1-Score (macro): 0.8788, Precision (macro): 0.8511, Recall (macro): 0.9102
F1-Score (micro): 0.9182, Precision (micro): 0.8987, Recall (micro): 0.9386

Métricas no Conjunto de Teste:
F1-Score (macro): 0.8600, Precision (macro): 0.8417, Recall (macro): 0.8821
F1-Score (micro): 0.9107, Precision (micro): 0.8877, Recall (micro): 0.9350

Tempo

In [14]:
# zero_division=0 gives the same scores as sklearn's default ("warn")
# without raising UndefinedMetricWarning for ill-defined averages
print(classification_report(test_targets_f2, test_predictions_f2,
                            target_names=labels, zero_division=0))

              precision    recall  f1-score   support

    ambiente       0.94      0.93      0.94       241
      bebida       0.93      0.86      0.89        72
      comida       0.92      0.98      0.95       401
       geral       0.73      0.88      0.80       243
 localização       0.82      0.97      0.89        33
      outros       0.50      0.50      0.50        24
       preço       0.93      0.99      0.96       152
     serviço       0.96      0.95      0.96       280

   micro avg       0.89      0.93      0.91      1446
   macro avg       0.84      0.88      0.86      1446
weighted avg       0.89      0.93      0.91      1446
 samples avg       0.89      0.92      0.90      1446



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## 3º Fold


In [15]:
# Fold 3 split: folds 1, 4 and 5 train, fold 2 validates, fold 3 is held out for testing
val_data, test_data = df_fold2, df_fold3
train_data = pd.concat((df_fold1, df_fold4, df_fold5), ignore_index=True)

test_targets_f3, test_predictions_f3 = run_fold(
    train_data, val_data, test_data)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Epoch 1:
F1-Score (macro): 0.7212, Precision (macro): 0.7594, Recall (macro): 0.7237
F1-Score (micro): 0.8813, Precision (micro): 0.8669, Recall (micro): 0.8963

Epoch 2:
F1-Score (macro): 0.8641, Precision (macro): 0.8715, Recall (macro): 0.8654
F1-Score (micro): 0.9142, Precision (micro): 0.9074, Recall (micro): 0.9212

Epoch 3:
F1-Score (macro): 0.8505, Precision (macro): 0.8979, Recall (macro): 0.8545
F1-Score (micro): 0.9060, Precision (micro): 0.8868, Recall (micro): 0.9260

Epoch 4:
F1-Score (macro): 0.8314, Precision (macro): 0.8818, Recall (macro): 0.8492
F1-Score (micro): 0.9042, Precision (micro): 0.8741, Recall (micro): 0.9364

Epoch 5:
F1-Score (macro): 0.8450, Precision (macro): 0.8857, Recall (macro): 0.8657
F1-Score (micro): 0.9076, Precision (micro): 0.8739, Recall (micro): 0.9440

Métricas no Conjunto de Teste:
F1-Score (macro): 0.8694, Precision (macro): 0.8867, Recall (macro): 0.8834
F1-Score (micro): 0.9119, Precision (micro): 0.8742, Recall (micro): 0.9529

Tempo

In [16]:
# zero_division=0 gives the same scores as sklearn's default ("warn")
# without raising UndefinedMetricWarning for ill-defined averages
print(classification_report(test_targets_f3, test_predictions_f3,
                            target_names=labels, zero_division=0))

              precision    recall  f1-score   support

    ambiente       0.88      0.96      0.92       245
      bebida       0.86      0.93      0.89        83
      comida       0.90      0.99      0.94       424
       geral       0.72      0.91      0.80       254
 localização       0.94      0.96      0.95        48
      outros       0.90      0.38      0.53        24
       preço       0.95      0.96      0.95       161
     serviço       0.95      0.99      0.97       270

   micro avg       0.87      0.95      0.91      1509
   macro avg       0.89      0.88      0.87      1509
weighted avg       0.88      0.95      0.91      1509
 samples avg       0.89      0.94      0.90      1509



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## 4º Fold


In [17]:
# Fold 4 split: folds 1, 2 and 5 train, fold 3 validates, fold 4 is held out for testing
val_data, test_data = df_fold3, df_fold4
train_data = pd.concat((df_fold1, df_fold2, df_fold5), ignore_index=True)

test_targets_f4, test_predictions_f4 = run_fold(
    train_data, val_data, test_data)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Epoch 1:
F1-Score (macro): 0.7538, Precision (macro): 0.7633, Recall (macro): 0.7602
F1-Score (micro): 0.8733, Precision (micro): 0.8524, Recall (micro): 0.8953

Epoch 2:
F1-Score (macro): 0.8603, Precision (macro): 0.8806, Recall (macro): 0.8706
F1-Score (micro): 0.9146, Precision (micro): 0.8838, Recall (micro): 0.9476

Epoch 3:
F1-Score (macro): 0.8802, Precision (macro): 0.8634, Recall (macro): 0.9011
F1-Score (micro): 0.9079, Precision (micro): 0.8747, Recall (micro): 0.9437

Epoch 4:
F1-Score (macro): 0.8882, Precision (macro): 0.8962, Recall (macro): 0.8828
F1-Score (micro): 0.9284, Precision (micro): 0.9284, Recall (micro): 0.9284

Epoch 5:
F1-Score (macro): 0.8799, Precision (macro): 0.8863, Recall (macro): 0.8894
F1-Score (micro): 0.9263, Precision (micro): 0.9083, Recall (micro): 0.9450

Métricas no Conjunto de Teste:
F1-Score (macro): 0.8852, Precision (macro): 0.9030, Recall (macro): 0.8842
F1-Score (micro): 0.9381, Precision (micro): 0.9274, Recall (micro): 0.9492

Tempo

In [18]:
# zero_division=0 gives the same scores as sklearn's default ("warn")
# without raising UndefinedMetricWarning for ill-defined averages
print(classification_report(test_targets_f4, test_predictions_f4,
                            target_names=labels, zero_division=0))

              precision    recall  f1-score   support

    ambiente       0.95      0.94      0.94       264
      bebida       0.90      0.98      0.94        93
      comida       0.92      0.98      0.95       431
       geral       0.85      0.89      0.87       246
 localização       0.88      0.96      0.92        48
      outros       0.75      0.38      0.50        16
       preço       0.99      0.99      0.99       143
     serviço       0.98      0.97      0.97       293

   micro avg       0.93      0.95      0.94      1534
   macro avg       0.90      0.88      0.89      1534
weighted avg       0.93      0.95      0.94      1534
 samples avg       0.93      0.94      0.93      1534



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## 5º Fold


In [19]:
# Fold 5 split: folds 1-3 train, fold 4 validates, fold 5 is held out for testing
val_data, test_data = df_fold4, df_fold5
train_data = pd.concat((df_fold1, df_fold2, df_fold3), ignore_index=True)

test_targets_f5, test_predictions_f5 = run_fold(
    train_data, val_data, test_data)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1:
F1-Score (macro): 0.7644, Precision (macro): 0.7931, Recall (macro): 0.7605
F1-Score (micro): 0.8956, Precision (micro): 0.8779, Recall (micro): 0.9140

Epoch 2:
F1-Score (macro): 0.8778, Precision (macro): 0.9129, Recall (macro): 0.8501
F1-Score (micro): 0.9234, Precision (micro): 0.9317, Recall (micro): 0.9153

Epoch 3:
F1-Score (macro): 0.8921, Precision (macro): 0.8994, Recall (macro): 0.8884
F1-Score (micro): 0.9366, Precision (micro): 0.9294, Recall (micro): 0.9439

Epoch 4:
F1-Score (macro): 0.8764, Precision (macro): 0.8840, Recall (macro): 0.8736
F1-Score (micro): 0.9347, Precision (micro): 0.9269, Recall (micro): 0.9426

Epoch 5:
F1-Score (macro): 0.8757, Precision (macro): 0.8887, Recall (macro): 0.8744
F1-Score (micro): 0.9303, Precision (micro): 0.9170, Recall (micro): 0.9439

Métricas no Conjunto de Teste:
F1-Score (macro): 0.8810, Precision (macro): 0.9009, Recall (macro): 0.8766
F1-Score (micro): 0.9278, Precision (micro): 0.9107, Recall (micro): 0.9456

Tempo

In [20]:
# zero_division=0 gives the same scores as sklearn's default ("warn")
# without raising UndefinedMetricWarning for ill-defined averages
print(classification_report(test_targets_f5, test_predictions_f5,
                            target_names=labels, zero_division=0))

              precision    recall  f1-score   support

    ambiente       0.92      0.97      0.95       238
      bebida       0.91      0.86      0.89        74
      comida       0.94      0.94      0.94       417
       geral       0.78      0.94      0.85       244
 localização       0.90      0.88      0.89        42
      outros       0.82      0.45      0.58        20
       preço       0.98      0.99      0.98       167
     serviço       0.95      0.98      0.96       286

   micro avg       0.91      0.95      0.93      1488
   macro avg       0.90      0.88      0.88      1488
weighted avg       0.91      0.95      0.93      1488
 samples avg       0.91      0.94      0.92      1488



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Resultado final (média dos 5 folds)


In [21]:
# Dump the raw per-fold test scores, one metric per line
print("Listas de Métricas:")
print(
    *(
        f"{caption} {values}"
        for caption, values in (
            ("F1-Score Macro:", f1_score_list_macro),
            ("Precision Macro:", precision_list_macro),
            ("Recall Macro:", recall_list_macro),
            ("F1-Score Micro:", f1_score_list_micro),
            ("Precision Micro:", precision_list_micro),
            ("Recall Micro:", recall_list_micro),
        )
    ),
    sep="\n",
)

Listas de Métricas:
F1-Score Macro: [0.8744412242059474, 0.8599600919668292, 0.8694115621000182, 0.885213377274209, 0.8809547510198024]
Precision Macro: [0.8669535527645051, 0.8416805811328647, 0.8866600414331209, 0.9029695377056264, 0.900875832864955]
Recall Macro: [0.889847964471866, 0.8820546408956357, 0.8834406828463164, 0.8842410868299201, 0.8765910176498183]
F1-Score Micro: [0.9243134628265238, 0.9107443583698215, 0.911857958148383, 0.9381443298969072, 0.9277942631058358]
Precision Micro: [0.9072978303747534, 0.8877216021011162, 0.8741641337386018, 0.9273885350318471, 0.9106796116504854]
Recall Micro: [0.9419795221843004, 0.9349930843706777, 0.9529489728296885, 0.9491525423728814, 0.9455645161290323]


In [22]:
# Report each metric averaged over the five folds
print(
    *(
        f"{caption}: {np.mean(values):.4f}"
        for caption, values in (
            ("F1-Score Macro médio", f1_score_list_macro),
            ("Precision Macro média", precision_list_macro),
            ("Recall Macro média", recall_list_macro),
            ("F1-Score Micro médio", f1_score_list_micro),
            ("Precision Micro média", precision_list_micro),
            ("Recall Micro média", recall_list_micro),
        )
    ),
    sep="\n",
)

F1-Score Macro médio: 0.8740
Precision Macro média: 0.8798
Recall Macro média: 0.8832
F1-Score Micro médio: 0.9226
Precision Micro média: 0.9015
Recall Micro média: 0.9449
