Classificação de Texto Multilabel


José Augusto de Almeida Neto


## Importar bibliotecas


In [1]:
import os
import time
import numpy as np
import pandas as pd
import torch
from dotenv import load_dotenv
from sklearn.metrics import (classification_report, f1_score, precision_score,
                             recall_score)
from torch.utils.data import DataLoader
from transformers import BertForSequenceClassification, BertTokenizer

  from .autonotebook import tqdm as notebook_tqdm


## Carregar datasets


In [2]:
# Load the variables declared in the .env file into the process environment
load_dotenv()

# Map every fold name to the location stored in its STEMMING_DF_FOLD<n> variable
file_urls = {
    f'df_fold{fold}': os.getenv(f'STEMMING_DF_FOLD{fold}')
    for fold in range(1, 6)
}

def load_dataset(url):
    """Read one fold of the dataset from a CSV source.

    Args:
        url: Path, URL or file-like object accepted by ``pandas.read_csv``.

    Returns:
        pandas.DataFrame: The parsed CSV contents.

    Raises:
        ValueError: If ``url`` is ``None`` or a blank string, which is what
            ``os.getenv`` returns for an unset or empty environment variable.
    """
    # Fail early with an actionable message instead of an opaque pandas error
    if url is None or (isinstance(url, str) and not url.strip()):
        raise ValueError(
            "Dataset location is missing; make sure the corresponding "
            "environment variable is set (e.g. in the .env file)."
        )
    return pd.read_csv(url)


# Read the five folds, in order, each into its own DataFrame
df_fold1, df_fold2, df_fold3, df_fold4, df_fold5 = (
    load_dataset(file_urls[f'df_fold{fold}']) for fold in range(1, 6)
)

In [3]:
# Label column names; must stay a list because it is used as a DataFrame column selector
labels = [
    'ambiente',
    'bebida',
    'comida',
    'geral',
    'localização',
    'outros',
    'preço',
    'serviço',
]

In [4]:
def remove_empty_sentenca_rows(df):
    """Drop rows whose ``sentenca`` value is missing, empty or only whitespace.

    Whitespace-only sentences are removed too: once whitespace is collapsed
    they are indistinguishable from empty strings.

    Args:
        df: pandas.DataFrame with a ``sentenca`` column.

    Returns:
        pandas.DataFrame: A new frame without the blank rows and with a fresh
        0..n-1 index. The input frame is left untouched.

    Raises:
        KeyError: If ``df`` has no ``sentenca`` column.
    """
    sentences = df['sentenca']
    # astype(str) keeps non-string values (e.g. numbers) from being treated as blank
    is_blank = sentences.isna() | sentences.astype(str).str.strip().eq('')
    return df.loc[~is_blank].reset_index(drop=True)


# Drop the rows with an empty sentence from every fold
df_fold1, df_fold2, df_fold3, df_fold4, df_fold5 = map(
    remove_empty_sentenca_rows,
    (df_fold1, df_fold2, df_fold3, df_fold4, df_fold5),
)

## Listas e hiperparâmetros


In [5]:
# Per-fold test metrics; run_fold appends one value to each list
f1_score_list_macro, precision_list_macro, recall_list_macro = [], [], []
f1_score_list_micro, precision_list_micro, recall_list_micro = [], [], []

In [6]:
# Hyperparameters
max_len = 128  # tokenizer max_length (sentences are padded/truncated to it)
train_batch_size, valid_batch_size, test_batch_size = 16, 4, 4
epochs, learning_rate = 10, 4e-5
threshold = 0.28  # probability cutoff applied to the sigmoid outputs
num_labels = len(labels)

## Funções auxiliares


In [7]:
class TokenizeDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one ``sentenca`` row per item.

    Args:
        df: pandas.DataFrame with a ``sentenca`` column and the label columns.
        tokenizer: Object exposing ``encode_plus`` (e.g. a ``BertTokenizer``).
        max_len: Length every sentence is padded or truncated to.
        label_columns: Optional list of target column names. When omitted,
            the notebook-level ``labels`` list is used.
    """

    def __init__(self, df, tokenizer, max_len, label_columns=None):
        self.tokenizer = tokenizer
        self.df = df
        self.title = df['sentenca']
        # Fall back to the module-level ``labels`` to stay backward-compatible
        self.label_columns = list(
            labels if label_columns is None else label_columns)
        self.targets = self.df[self.label_columns].values
        self.max_len = max_len

    def __len__(self):
        """Return the number of sentences in the dataset."""
        return len(self.title)

    def __getitem__(self, index):
        """Tokenize the sentence at position ``index``.

        Returns:
            dict with 1-D tensors ``input_ids``, ``attention_mask`` and
            ``token_type_ids`` plus a float tensor ``targets`` holding the
            row's label values.
        """
        # Positional lookup: a label-based ``self.title[index]`` raises
        # KeyError whenever the DataFrame index is not exactly 0..n-1.
        title = str(self.title.iloc[index])
        # Collapse every run of whitespace into a single space
        title = " ".join(title.split())

        inputs = self.tokenizer.encode_plus(
            title,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            # The tokenizer returns (1, max_len) tensors; flatten drops the batch axis
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'token_type_ids': inputs["token_type_ids"].flatten(),
            'targets': torch.tensor(self.targets[index], dtype=torch.float32)
        }

In [8]:
def train_model(model, train_loader, optimizer, criterion, sigmoid):
    """Run one training epoch over ``train_loader``.

    Args:
        model: ``torch.nn.Module`` whose forward pass accepts ``input_ids``,
            ``attention_mask`` and ``token_type_ids`` and returns an object
            with a ``logits`` attribute.
        train_loader: Iterable of batches (dicts with ``input_ids``,
            ``attention_mask``, ``token_type_ids`` and ``targets`` tensors).
        optimizer: Optimizer that updates the model parameters.
        criterion: Loss function called as ``criterion(logits, targets)``.
        sigmoid: Unused. Kept only so existing callers keep working; the loss
            is computed on the raw logits, so no activation is applied here.

    Returns:
        float: Mean of the per-batch losses, or ``nan`` when the loader
        yields no batch.
    """
    model.train()  # training mode

    # Send every batch to wherever the model lives (no-op on CPU)
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    total_loss = 0.0
    n_batches = 0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        targets = batch['targets'].to(device)

        # Forward pass
        outputs = model(input_ids=input_ids,
                        attention_mask=attention_mask, token_type_ids=token_type_ids)

        # The loss compares the raw logits with the true labels
        loss = criterion(outputs.logits, targets)

        # Backpropagation and parameter update
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        n_batches += 1

    return total_loss / n_batches if n_batches else float('nan')

In [9]:
def evaluate_model(model, val_loader, sigmoid, decision_threshold=None):
    """Predict binary labels for every batch of ``val_loader``.

    Args:
        model: ``torch.nn.Module`` whose forward pass accepts ``input_ids``,
            ``attention_mask`` and ``token_type_ids`` and returns an object
            with a ``logits`` attribute.
        val_loader: Iterable of batches (dicts with ``input_ids``,
            ``attention_mask``, ``token_type_ids`` and ``targets`` tensors).
        sigmoid: Callable that turns logits into probabilities.
        decision_threshold: Probability above which a label is predicted.
            When omitted, the notebook-level ``threshold`` is used.

    Returns:
        tuple: ``(all_targets, all_predictions)``, two lists with one numpy
        array per sample (true labels and thresholded predictions).
    """
    cutoff = threshold if decision_threshold is None else decision_threshold

    model.eval()  # evaluation mode

    # Send every batch to wherever the model lives (no-op on CPU)
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    all_targets = []
    all_predictions = []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            token_type_ids = batch['token_type_ids'].to(device)
            targets = batch['targets']

            # Forward pass (inference)
            outputs = model(
                input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)

            # Logits -> probabilities -> 0/1 labels
            predicted_probs = sigmoid(outputs.logits)
            predicted_labels = (predicted_probs > cutoff).float()

            # extend() adds one row (sample) at a time to each list
            all_targets.extend(targets.cpu().numpy())
            all_predictions.extend(predicted_labels.cpu().numpy())
    return all_targets, all_predictions

In [10]:
def _compute_metrics(targets, predictions):
    """Return macro/micro F1, precision and recall keyed as '<metric>_<average>'."""
    scorers = {'f1': f1_score, 'precision': precision_score,
               'recall': recall_score}
    # zero_division=0 yields the same scores as the default ("warn") without
    # emitting an UndefinedMetricWarning for labels that were never predicted.
    return {
        f'{name}_{average}': scorer(
            targets, predictions, average=average, zero_division=0)
        for average in ('macro', 'micro')
        for name, scorer in scorers.items()
    }


def _print_metrics(metrics):
    """Print the macro line and then the micro line of a metrics dict."""
    print(
        f"F1-Score (macro): {metrics['f1_macro']:.4f}, Precision (macro): {metrics['precision_macro']:.4f}, Recall (macro): {metrics['recall_macro']:.4f}")
    print(
        f"F1-Score (micro): {metrics['f1_micro']:.4f}, Precision (micro): {metrics['precision_micro']:.4f}, Recall (micro): {metrics['recall_micro']:.4f}")


def run_fold(train_data, val_data, test_data,
             model_name='neuralmind/bert-base-portuguese-cased'):
    """Fine-tune a BERT classifier on one fold and evaluate it on the test split.

    The model is trained for ``epochs`` epochs; validation metrics are printed
    after each epoch and the model from the last epoch is evaluated on the
    test split. The six test metrics are appended to the notebook-level
    metric lists (``f1_score_list_macro`` etc.).

    Args:
        train_data: DataFrame used for training.
        val_data: DataFrame used for the per-epoch validation.
        test_data: DataFrame used for the final evaluation.
        model_name: Checkpoint passed to ``from_pretrained`` for both the
            tokenizer and the model.

    Returns:
        tuple: ``(test_targets, test_predictions)`` as returned by
        ``evaluate_model`` on the test split.
    """
    start_time = time.time()

    # Load the BERT tokenizer
    tokenizer = BertTokenizer.from_pretrained(model_name)

    # Wrap each split in a tokenizing dataset
    tokenized_train_data = TokenizeDataset(train_data, tokenizer, max_len)
    tokenized_val_data = TokenizeDataset(val_data, tokenizer, max_len)
    tokenized_test_data = TokenizeDataset(test_data, tokenizer, max_len)

    # Batch the data; only the training split is shuffled
    train_loader = DataLoader(tokenized_train_data,
                              batch_size=train_batch_size, shuffle=True)
    val_loader = DataLoader(
        tokenized_val_data, batch_size=valid_batch_size, shuffle=False)
    test_loader = DataLoader(
        tokenized_test_data, batch_size=test_batch_size, shuffle=False)

    # Model, optimizer and loss function
    model = BertForSequenceClassification.from_pretrained(
        model_name, num_labels=num_labels, problem_type="multi_label_classification")
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    criterion = torch.nn.BCEWithLogitsLoss()
    sigmoid = torch.nn.Sigmoid()

    for epoch in range(epochs):
        # Training
        train_model(model, train_loader, optimizer, criterion, sigmoid)

        # Validation
        val_targets, val_predictions = evaluate_model(
            model, val_loader, sigmoid)
        val_metrics = _compute_metrics(val_targets, val_predictions)

        # Report the validation metrics at the end of every epoch
        print(f"\nEpoch {epoch + 1}:")
        _print_metrics(val_metrics)

    # Test
    test_targets, test_predictions = evaluate_model(
        model, test_loader, sigmoid)
    test_metrics = _compute_metrics(test_targets, test_predictions)

    # Record this fold's test metrics in the notebook-level lists
    f1_score_list_macro.append(test_metrics['f1_macro'])
    precision_list_macro.append(test_metrics['precision_macro'])
    recall_list_macro.append(test_metrics['recall_macro'])
    f1_score_list_micro.append(test_metrics['f1_micro'])
    precision_list_micro.append(test_metrics['precision_micro'])
    recall_list_micro.append(test_metrics['recall_micro'])

    print("\nMétricas no Conjunto de Teste:")
    _print_metrics(test_metrics)

    # Wall-clock time of the whole fold (loading, training and evaluation)
    end_time = time.time()
    elapsed_time = end_time - start_time
    hours, rem = divmod(elapsed_time, 3600)
    minutes, seconds = divmod(rem, 60)

    print(
        f"\nTempo de execução: {int(hours):02}:{int(minutes):02}:{seconds:.2f}")

    return test_targets, test_predictions

# BERT


## 1º Fold


In [11]:
# Fold 1: test on fold 1, validate on fold 5, train on folds 2-4
test_data = df_fold1
val_data = df_fold5
train_data = pd.concat([df_fold2, df_fold3, df_fold4], ignore_index=True)

test_targets_f1, test_predictions_f1 = run_fold(
    train_data=train_data, val_data=val_data, test_data=test_data)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1:
F1-Score (macro): 0.7307, Precision (macro): 0.6668, Recall (macro): 0.8136
F1-Score (micro): 0.8528, Precision (micro): 0.7830, Recall (micro): 0.9362

Epoch 2:
F1-Score (macro): 0.8674, Precision (macro): 0.8607, Recall (macro): 0.8811
F1-Score (micro): 0.9110, Precision (micro): 0.8847, Recall (micro): 0.9388

Epoch 3:
F1-Score (macro): 0.8634, Precision (macro): 0.8489, Recall (macro): 0.8810
F1-Score (micro): 0.9154, Precision (micro): 0.8937, Recall (micro): 0.9382

Epoch 4:
F1-Score (macro): 0.8670, Precision (macro): 0.8415, Recall (macro): 0.8975
F1-Score (micro): 0.9203, Precision (micro): 0.8988, Recall (micro): 0.9429

Epoch 5:
F1-Score (macro): 0.8802, Precision (macro): 0.8696, Recall (macro): 0.8992
F1-Score (micro): 0.9181, Precision (micro): 0.8857, Recall (micro): 0.9530

Epoch 6:
F1-Score (macro): 0.8488, Precision (macro): 0.8638, Recall (macro): 0.8656
F1-Score (micro): 0.9166, Precision (micro): 0.8829, Recall (micro): 0.9530

Epoch 7:
F1-Score (macro): 

In [12]:
# Per-label precision/recall/F1 on the test split of fold 1
print(
    classification_report(
        y_true=test_targets_f1,
        y_pred=test_predictions_f1,
        target_names=labels,
    )
)

              precision    recall  f1-score   support

    ambiente       0.94      0.93      0.94       269
      bebida       0.78      0.98      0.87        82
      comida       0.93      0.95      0.94       418
       geral       0.75      0.92      0.83       206
 localização       0.78      0.97      0.87        30
      outros       0.67      0.36      0.47        22
       preço       0.91      0.99      0.95       161
     serviço       0.96      0.98      0.97       277

   micro avg       0.89      0.95      0.92      1465
   macro avg       0.84      0.89      0.85      1465
weighted avg       0.90      0.95      0.92      1465
 samples avg       0.90      0.94      0.91      1465



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## 2º Fold


In [13]:
# Fold 2: test on fold 2, validate on fold 1, train on folds 3-5
test_data = df_fold2
val_data = df_fold1
train_data = pd.concat([df_fold3, df_fold4, df_fold5], ignore_index=True)

test_targets_f2, test_predictions_f2 = run_fold(
    train_data=train_data, val_data=val_data, test_data=test_data)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Epoch 1:
F1-Score (macro): 0.7644, Precision (macro): 0.7208, Recall (macro): 0.8166
F1-Score (micro): 0.8803, Precision (micro): 0.8305, Recall (micro): 0.9365

Epoch 2:
F1-Score (macro): 0.8469, Precision (macro): 0.8939, Recall (macro): 0.8655
F1-Score (micro): 0.9156, Precision (micro): 0.8888, Recall (micro): 0.9440

Epoch 3:
F1-Score (macro): 0.8875, Precision (macro): 0.8723, Recall (macro): 0.9135
F1-Score (micro): 0.9182, Precision (micro): 0.8814, Recall (micro): 0.9584

Epoch 4:
F1-Score (macro): 0.8925, Precision (macro): 0.8869, Recall (macro): 0.9116
F1-Score (micro): 0.9263, Precision (micro): 0.8945, Recall (micro): 0.9604

Epoch 5:
F1-Score (macro): 0.8978, Precision (macro): 0.8872, Recall (macro): 0.9186
F1-Score (micro): 0.9306, Precision (micro): 0.9025, Recall (micro): 0.9604

Epoch 6:
F1-Score (macro): 0.9105, Precision (macro): 0.9029, Recall (macro): 0.9204
F1-Score (micro): 0.9347, Precision (micro): 0.9223, Recall (micro): 0.9474

Epoch 7:
F1-Score (macro): 

In [14]:
# Per-label precision/recall/F1 on the test split of fold 2
print(
    classification_report(
        y_true=test_targets_f2,
        y_pred=test_predictions_f2,
        target_names=labels,
    )
)

              precision    recall  f1-score   support

    ambiente       0.93      0.95      0.94       241
      bebida       0.86      0.89      0.88        72
      comida       0.93      0.98      0.96       401
       geral       0.78      0.88      0.83       243
 localização       0.86      0.91      0.88        33
      outros       0.53      0.33      0.41        24
       preço       0.93      0.99      0.96       152
     serviço       0.94      0.97      0.96       280

   micro avg       0.90      0.94      0.92      1446
   macro avg       0.85      0.86      0.85      1446
weighted avg       0.90      0.94      0.92      1446
 samples avg       0.90      0.93      0.90      1446



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## 3º Fold


In [15]:
# Fold 3: test on fold 3, validate on fold 2, train on folds 1, 4 and 5
test_data = df_fold3
val_data = df_fold2
train_data = pd.concat([df_fold1, df_fold4, df_fold5], ignore_index=True)

test_targets_f3, test_predictions_f3 = run_fold(
    train_data=train_data, val_data=val_data, test_data=test_data)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Epoch 1:
F1-Score (macro): 0.6945, Precision (macro): 0.6726, Recall (macro): 0.7280
F1-Score (micro): 0.8750, Precision (micro): 0.8362, Recall (micro): 0.9177

Epoch 2:
F1-Score (macro): 0.8261, Precision (macro): 0.8803, Recall (macro): 0.8404
F1-Score (micro): 0.8953, Precision (micro): 0.8618, Recall (micro): 0.9315

Epoch 3:
F1-Score (macro): 0.8339, Precision (macro): 0.8372, Recall (macro): 0.8477
F1-Score (micro): 0.9076, Precision (micro): 0.8799, Recall (micro): 0.9371

Epoch 4:
F1-Score (macro): 0.8516, Precision (macro): 0.8490, Recall (macro): 0.8633
F1-Score (micro): 0.9143, Precision (micro): 0.8958, Recall (micro): 0.9336

Epoch 5:
F1-Score (macro): 0.8473, Precision (macro): 0.8586, Recall (macro): 0.8520
F1-Score (micro): 0.9052, Precision (micro): 0.8866, Recall (micro): 0.9246

Epoch 6:
F1-Score (macro): 0.8392, Precision (macro): 0.8580, Recall (macro): 0.8467
F1-Score (micro): 0.9091, Precision (micro): 0.8890, Recall (micro): 0.9302

Epoch 7:
F1-Score (macro): 

In [16]:
# Per-label precision/recall/F1 on the test split of fold 3
print(
    classification_report(
        y_true=test_targets_f3,
        y_pred=test_predictions_f3,
        target_names=labels,
    )
)

              precision    recall  f1-score   support

    ambiente       0.92      0.96      0.94       245
      bebida       0.87      0.92      0.89        83
      comida       0.95      0.97      0.96       424
       geral       0.81      0.89      0.84       254
 localização       0.90      0.98      0.94        48
      outros       0.88      0.29      0.44        24
       preço       0.95      0.97      0.96       161
     serviço       0.95      0.99      0.97       270

   micro avg       0.91      0.94      0.93      1509
   macro avg       0.90      0.87      0.87      1509
weighted avg       0.92      0.94      0.93      1509
 samples avg       0.91      0.93      0.91      1509



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## 4º Fold


In [17]:
# Fold 4: test on fold 4, validate on fold 3, train on folds 1, 2 and 5
test_data = df_fold4
val_data = df_fold3
train_data = pd.concat([df_fold1, df_fold2, df_fold5], ignore_index=True)

test_targets_f4, test_predictions_f4 = run_fold(
    train_data=train_data, val_data=val_data, test_data=test_data)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Epoch 1:
F1-Score (macro): 0.6868, Precision (macro): 0.7061, Recall (macro): 0.6978
F1-Score (micro): 0.8540, Precision (micro): 0.8384, Recall (micro): 0.8701

Epoch 2:
F1-Score (macro): 0.8443, Precision (macro): 0.8713, Recall (macro): 0.8430
F1-Score (micro): 0.8997, Precision (micro): 0.8686, Recall (micro): 0.9331

Epoch 3:
F1-Score (macro): 0.8869, Precision (macro): 0.8912, Recall (macro): 0.8897
F1-Score (micro): 0.9239, Precision (micro): 0.9068, Recall (micro): 0.9417

Epoch 4:
F1-Score (macro): 0.8869, Precision (macro): 0.8938, Recall (macro): 0.8873
F1-Score (micro): 0.9253, Precision (micro): 0.9113, Recall (micro): 0.9397

Epoch 5:
F1-Score (macro): 0.8934, Precision (macro): 0.8809, Recall (macro): 0.9082
F1-Score (micro): 0.9216, Precision (micro): 0.9017, Recall (micro): 0.9423

Epoch 6:
F1-Score (macro): 0.8971, Precision (macro): 0.8939, Recall (macro): 0.9023
F1-Score (micro): 0.9286, Precision (micro): 0.9140, Recall (micro): 0.9437

Epoch 7:
F1-Score (macro): 

In [18]:
# Per-label precision/recall/F1 on the test split of fold 4
print(
    classification_report(
        y_true=test_targets_f4,
        y_pred=test_predictions_f4,
        target_names=labels,
    )
)

              precision    recall  f1-score   support

    ambiente       0.85      0.98      0.91       264
      bebida       0.88      0.98      0.93        93
      comida       0.90      0.99      0.94       431
       geral       0.83      0.89      0.86       246
 localização       0.90      0.96      0.93        48
      outros       1.00      0.06      0.12        16
       preço       0.97      1.00      0.99       143
     serviço       0.97      0.97      0.97       293

   micro avg       0.90      0.96      0.93      1534
   macro avg       0.91      0.85      0.83      1534
weighted avg       0.90      0.96      0.92      1534
 samples avg       0.90      0.95      0.92      1534



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## 5º Fold


In [19]:
# Fold 5: test on fold 5, validate on fold 4, train on folds 1-3
test_data = df_fold5
val_data = df_fold4
train_data = pd.concat([df_fold1, df_fold2, df_fold3], ignore_index=True)

test_targets_f5, test_predictions_f5 = run_fold(
    train_data=train_data, val_data=val_data, test_data=test_data)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Epoch 1:
F1-Score (macro): 0.7543, Precision (macro): 0.7808, Recall (macro): 0.7641
F1-Score (micro): 0.8945, Precision (micro): 0.8630, Recall (micro): 0.9283

Epoch 2:
F1-Score (macro): 0.8609, Precision (macro): 0.9058, Recall (macro): 0.8773
F1-Score (micro): 0.9252, Precision (micro): 0.8865, Recall (micro): 0.9674

Epoch 3:
F1-Score (macro): 0.8571, Precision (macro): 0.8597, Recall (macro): 0.8666
F1-Score (micro): 0.9210, Precision (micro): 0.8968, Recall (micro): 0.9465

Epoch 4:
F1-Score (macro): 0.8730, Precision (macro): 0.8702, Recall (macro): 0.8796
F1-Score (micro): 0.9293, Precision (micro): 0.9121, Recall (micro): 0.9472

Epoch 5:
F1-Score (macro): 0.8615, Precision (macro): 0.8556, Recall (macro): 0.8764
F1-Score (micro): 0.9329, Precision (micro): 0.9065, Recall (micro): 0.9609

Epoch 6:
F1-Score (macro): 0.8713, Precision (macro): 0.8699, Recall (macro): 0.8751
F1-Score (micro): 0.9374, Precision (micro): 0.9235, Recall (micro): 0.9518

Epoch 7:
F1-Score (macro): 

In [20]:
# Per-label precision/recall/F1 on the test split of fold 5
print(
    classification_report(
        y_true=test_targets_f5,
        y_pred=test_predictions_f5,
        target_names=labels,
    )
)

              precision    recall  f1-score   support

    ambiente       0.94      0.97      0.95       238
      bebida       0.88      0.92      0.90        74
      comida       0.90      0.96      0.93       417
       geral       0.85      0.86      0.86       244
 localização       0.89      0.81      0.85        42
      outros       0.88      0.35      0.50        20
       preço       0.96      0.98      0.97       167
     serviço       0.92      0.98      0.95       286

   micro avg       0.91      0.94      0.92      1488
   macro avg       0.90      0.85      0.86      1488
weighted avg       0.91      0.94      0.92      1488
 samples avg       0.92      0.94      0.92      1488



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Resultado final (média dos 5 folds)


In [21]:
# Show the per-fold test metrics that run_fold appended to each list
print(
    "Listas de Métricas:",
    f"F1-Score Macro: {f1_score_list_macro}",
    f"Precision Macro: {precision_list_macro}",
    f"Recall Macro: {recall_list_macro}",
    f"F1-Score Micro: {f1_score_list_micro}",
    f"Precision Micro: {precision_list_micro}",
    f"Recall Micro: {recall_list_micro}",
    sep="\n",
)

Listas de Métricas:
F1-Score Macro: [0.8540000799399854, 0.8510891002903889, 0.8682968814366243, 0.8307164867740235, 0.8644002119260752]
Precision Macro: [0.8409660989182532, 0.8459881542036363, 0.9047240514980501, 0.913263471318437, 0.9042060860351993]
Recall Macro: [0.8859868307665915, 0.8634570606000744, 0.8694928275565699, 0.8544394577420373, 0.8538714155575661]
F1-Score Micro: [0.9182930863380747, 0.9186635167060412, 0.9285481239804242, 0.9271982351087299, 0.9231788079470199]
Precision Micro: [0.8908857509627728, 0.8971654581410679, 0.9145244215938303, 0.8974984746796827, 0.9099216710182768]
Recall Micro: [0.9474402730375426, 0.9412171507607192, 0.9430086149768059, 0.9589308996088657, 0.9368279569892473]


In [22]:
# Average of each test metric over the folds that were run
print(
    f"F1-Score Macro médio: {np.mean(f1_score_list_macro):.4f}",
    f"Precision Macro média: {np.mean(precision_list_macro):.4f}",
    f"Recall Macro média: {np.mean(recall_list_macro):.4f}",
    f"F1-Score Micro médio: {np.mean(f1_score_list_micro):.4f}",
    f"Precision Micro média: {np.mean(precision_list_micro):.4f}",
    f"Recall Micro média: {np.mean(recall_list_micro):.4f}",
    sep="\n",
)

F1-Score Macro médio: 0.8537
Precision Macro média: 0.8818
Recall Macro média: 0.8654
F1-Score Micro médio: 0.9232
Precision Micro média: 0.9020
Recall Micro média: 0.9455
