Classificação de Texto Multilabel


José Augusto de Almeida Neto


## Importar bibliotecas


In [13]:
import os
import time
import pandas as pd
import torch
from dotenv import load_dotenv
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from transformers import BertForSequenceClassification, BertTokenizer

## Carregar datasets


In [14]:
# Pull the variables declared in the local .env file into the environment.
load_dotenv()

# Map each fold's variable name to the location stored under
# STEMMING_DF_FOLD<n>; the value is None when that variable is not defined.
file_urls = {
    f'df_fold{fold}': os.getenv(f'STEMMING_DF_FOLD{fold}')
    for fold in range(1, 6)
}

def load_dataset(url):
    """Read one CSV dataset into a DataFrame.

    Args:
        url: Path, URL, or file-like object accepted by ``pandas.read_csv``.

    Returns:
        pandas.DataFrame with the parsed CSV contents.

    Raises:
        ValueError: If ``url`` is None or a blank string, which is what an
            unset or empty environment variable yields.
    """
    # Fail early with an actionable message instead of letting read_csv
    # complain about an invalid buffer type or a nameless file.
    if url is None or (isinstance(url, str) and not url.strip()):
        raise ValueError(
            "Dataset location is missing or empty; check that the "
            "STEMMING_DF_FOLD* variables are defined in the .env file."
        )
    return pd.read_csv(url)


# Read every fold from the location configured for it in file_urls.
df_fold1, df_fold2, df_fold3, df_fold4, df_fold5 = (
    load_dataset(file_urls[f'df_fold{fold}']) for fold in range(1, 6)
)

# Case-study dataset, read from a path relative to the notebook.
df = pd.read_csv('../datasets/dataset-case_study-tratado.csv')

In [15]:
# Names of the DataFrame columns used as classification targets.
labels = [
    'ambiente',
    'bebida',
    'comida',
    'geral',
    'localização',
    'outros',
    'preço',
    'serviço',
]

In [16]:
def remove_empty_sentenca_rows(df):
    """Drop rows whose 'sentenca' value is missing or an empty string.

    Args:
        df: DataFrame containing a 'sentenca' column.

    Returns:
        A new DataFrame holding only the remaining rows, with the index
        renumbered from 0. The input frame is left untouched.
    """
    sentences = df['sentenca']
    # Keep a row only when its sentence is present and not the empty string.
    has_text = sentences.notna() & sentences.ne('')
    return df[has_text].reset_index(drop=True)


# Discard rows without sentence text from every fold.
df_fold1, df_fold2, df_fold3, df_fold4, df_fold5 = map(
    remove_empty_sentenca_rows,
    (df_fold1, df_fold2, df_fold3, df_fold4, df_fold5),
)

# Give the case-study text column the name the rest of the notebook reads.
df = df.rename(columns={'comment_text_translated': 'sentenca'})

## Hiperparâmetros


In [17]:
# Hyperparameters
max_len = 128  # maximum token sequence length handed to the tokenizer
train_batch_size, valid_batch_size, test_batch_size = 16, 4, 4
epochs = 5
learning_rate = 4e-5  # step size passed to the optimizer
threshold = 0.28  # probabilities above this value are predicted as positive
num_labels = len(labels)  # one model output per target column

## Funções auxiliares


In [18]:
class TokenizeDataset(torch.utils.data.Dataset):
    """Dataset that tokenizes one sentence per item for a BERT-style model.

    Args:
        df: DataFrame with a 'sentenca' text column and, for labeled data,
            one column per target label.
        tokenizer: Object exposing ``encode_plus`` that returns 'input_ids',
            'attention_mask' and 'token_type_ids' tensors.
        max_len: Length every sequence is padded or truncated to.
        label_columns: Optional list of target column names. When omitted,
            the notebook-level ``labels`` list is used.

    Each item is a dict with 'input_ids', 'attention_mask' and
    'token_type_ids' (flattened to 1-D). A float32 'targets' tensor is added
    only when every label column exists in ``df``.
    """

    def __init__(self, df, tokenizer, max_len, label_columns=None):
        self.tokenizer = tokenizer
        self.df = df
        self.title = df['sentenca']
        self.max_len = max_len

        # Fall back to the notebook-level list when no columns are given.
        columns = list(labels if label_columns is None else label_columns)
        # A frame without the label columns (e.g. data to be predicted) gets
        # no targets instead of failing with a KeyError at construction.
        if all(column in df.columns for column in columns):
            self.targets = self.df[columns].values
        else:
            self.targets = None

    def __len__(self):
        return len(self.title)

    def __getitem__(self, index):
        # Positional access: a label lookup (self.title[index]) raises or
        # returns the wrong row when the frame keeps a non-contiguous index,
        # for instance after filtering or splitting without reset_index.
        title = str(self.title.iloc[index])
        # Collapse every run of whitespace into a single space.
        title = " ".join(title.split())

        inputs = self.tokenizer.encode_plus(
            title,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        # flatten() turns each returned tensor into a 1-D tensor so the
        # DataLoader can stack items into a batch.
        item = {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'token_type_ids': inputs["token_type_ids"].flatten(),
        }
        if self.targets is not None:
            item['targets'] = torch.tensor(
                self.targets[index], dtype=torch.float32)
        return item

In [19]:
def train_model(model, train_loader, optimizer, criterion, sigmoid):
    """Run one training pass over ``train_loader``.

    Args:
        model: Module called with ``input_ids``, ``attention_mask`` and
            ``token_type_ids`` keywords, returning an object with ``.logits``.
        train_loader: Iterable of batches; each batch is a dict with
            'input_ids', 'attention_mask', 'token_type_ids' and 'targets'.
        optimizer: Optimizer bound to the model's parameters.
        criterion: Loss callable applied to the raw logits and the targets.
        sigmoid: Unused; kept so existing callers keep working. Probabilities
            are not needed here because the loss is computed on the logits.

    Returns:
        The mean batch loss as a float, or None when the loader is empty.
    """
    model.train()  # training mode

    # Send each batch to wherever the model's parameters live, so the loop
    # also works after the caller moved the model to another device.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    total_loss = 0.0
    num_batches = 0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        targets = batch['targets'].to(device)

        # Forward pass.
        outputs = model(input_ids=input_ids,
                        attention_mask=attention_mask,
                        token_type_ids=token_type_ids)

        # Loss between the raw logits and the true labels.
        loss = criterion(outputs.logits, targets)

        # Backpropagation and parameter update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1

    return total_loss / num_batches if num_batches else None

In [20]:
def evaluate_model(model, val_loader, sigmoid, decision_threshold=None):
    """Predict labels for every batch in ``val_loader`` without gradients.

    Args:
        model: Module called with ``input_ids``, ``attention_mask`` and
            ``token_type_ids`` keywords, returning an object with ``.logits``.
        val_loader: Iterable of batches; each batch is a dict with
            'input_ids', 'attention_mask', 'token_type_ids' and 'targets'.
        sigmoid: Callable that maps logits to probabilities.
        decision_threshold: Optional cut-off; a probability strictly greater
            than it is predicted as 1. When None, the notebook-level
            ``threshold`` is used.

    Returns:
        ``(all_targets, all_predictions)``: two lists holding one NumPy row
        per sample, in the order the loader produced them.
    """
    cutoff = threshold if decision_threshold is None else decision_threshold

    model.eval()  # evaluation mode

    # Send the inputs to wherever the model's parameters live.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    all_targets = []
    all_predictions = []
    with torch.no_grad():
        for batch in val_loader:
            # Forward pass (inference), same call as in training.
            outputs = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
                token_type_ids=batch['token_type_ids'].to(device))

            # Turn logits into probabilities, then apply the cut-off.
            predicted_probs = sigmoid(outputs.logits)
            predicted_labels = (predicted_probs > cutoff).float()

            # Collect true and predicted rows as NumPy arrays so the caller
            # can compute metrics over the whole set.
            all_targets.extend(batch['targets'].cpu().numpy())
            all_predictions.extend(predicted_labels.cpu().numpy())
    return all_targets, all_predictions

In [21]:
def run_training(train_data, val_data):
    """Fine-tune the Portuguese BERT checkpoint and report validation metrics.

    Args:
        train_data: DataFrame used to build the training dataset.
        val_data: DataFrame used to build the validation dataset.

    Returns:
        The model after ``epochs`` passes of training.

    After every epoch the macro and micro F1, precision and recall on the
    validation data are printed; the total elapsed time is printed at the end.
    """
    started_at = time.time()
    checkpoint = 'neuralmind/bert-base-portuguese-cased'

    # Tokenizer matching the checkpoint that is fine-tuned below.
    tokenizer = BertTokenizer.from_pretrained(checkpoint)

    # Datasets that tokenize each sentence when it is requested.
    train_dataset = TokenizeDataset(train_data, tokenizer, max_len)
    val_dataset = TokenizeDataset(val_data, tokenizer, max_len)

    # Batch loaders; only the training data is shuffled.
    train_loader = DataLoader(
        train_dataset, batch_size=train_batch_size, shuffle=True)
    val_loader = DataLoader(
        val_dataset, batch_size=valid_batch_size, shuffle=False)

    # Model, optimizer and loss function.
    model = BertForSequenceClassification.from_pretrained(
        checkpoint,
        num_labels=num_labels,
        problem_type="multi_label_classification",
    )
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    criterion = torch.nn.BCEWithLogitsLoss()
    sigmoid = torch.nn.Sigmoid()

    for epoch_number in range(1, epochs + 1):
        train_model(model, train_loader, optimizer, criterion, sigmoid)
        val_targets, val_predictions = evaluate_model(
            model, val_loader, sigmoid)

        # Compute every metric before printing anything for this epoch.
        scores = {
            average: (
                f1_score(val_targets, val_predictions, average=average),
                precision_score(val_targets, val_predictions, average=average),
                recall_score(val_targets, val_predictions, average=average),
            )
            for average in ('macro', 'micro')
        }

        print(f"\nEpoch {epoch_number}:")
        for average, (f1, precision, recall) in scores.items():
            print(
                f"F1-Score ({average}): {f1:.4f}, "
                f"Precision ({average}): {precision:.4f}, "
                f"Recall ({average}): {recall:.4f}")

    # Split the elapsed wall-clock time into hours, minutes and seconds.
    elapsed = time.time() - started_at
    hours, remainder = divmod(elapsed, 3600)
    minutes, seconds = divmod(remainder, 60)
    print(
        f"\nTempo de treinamento: {int(hours):02}:{int(minutes):02}:{seconds:.2f}")

    return model

In [22]:
def run_prediction(model, df_unlabeled):
    """Predict labels for every row of ``df_unlabeled``.

    Args:
        model: Trained model called with ``input_ids``, ``attention_mask``
            and ``token_type_ids`` keywords, returning an object with
            ``.logits``.
        df_unlabeled: DataFrame handed to ``TokenizeDataset``.

    Returns:
        A list with one NumPy array of 0/1 predictions per row, in the same
        order as the rows of ``df_unlabeled``.

    The elapsed prediction time is printed before returning.
    """
    start_time = time.time()

    # Load the BERT tokenizer.
    tokenizer = BertTokenizer.from_pretrained(
        'neuralmind/bert-base-portuguese-cased')

    # Tokenize the dataset.
    tokenized_unlabeled_data = TokenizeDataset(df_unlabeled, tokenizer, max_len)

    # shuffle must stay False: the predictions are returned as a plain list,
    # so any reordering here would detach them from their input rows.
    unlabeled_loader = DataLoader(tokenized_unlabeled_data,
                                  batch_size=test_batch_size, shuffle=False)

    model.eval()

    # Send the inputs to wherever the model's parameters live.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    all_predictions = []
    sigmoid = torch.nn.Sigmoid()
    with torch.no_grad():
        for batch in unlabeled_loader:
            outputs = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
                token_type_ids=batch['token_type_ids'].to(device))

            # Probabilities above the notebook-level threshold become 1.
            predicted_probs = sigmoid(outputs.logits)
            predicted_labels = (predicted_probs > threshold).float()

            all_predictions.extend(predicted_labels.cpu().numpy())

    # Split the elapsed wall-clock time into hours, minutes and seconds.
    elapsed_time = time.time() - start_time
    hours, rem = divmod(elapsed_time, 3600)
    minutes, seconds = divmod(rem, 60)

    print(
        f"\nTempo de predição: {int(hours):02}:{int(minutes):02}:{seconds:.2f}")

    return all_predictions

## Treinamento do modelo


In [23]:
# Seed PyTorch's global generator so that the randomly initialised classifier
# weights and the training shuffle start from the same state on every run.
SEED = 42
torch.manual_seed(SEED)

# Merge the five folds and hold out 10% of the rows for validation.
all_data = pd.concat(
    [df_fold1, df_fold2, df_fold3, df_fold4, df_fold5], ignore_index=True)
train_data, val_data = train_test_split(
    all_data, test_size=0.1, random_state=SEED)

# TokenizeDataset is indexed by position, so renumber both splits from 0.
train_data = train_data.reset_index(drop=True)
val_data = val_data.reset_index(drop=True)

model = run_training(train_data, val_data)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1:
F1-Score (macro): 0.8118, Precision (macro): 0.8856, Recall (macro): 0.8328
F1-Score (micro): 0.9015, Precision (micro): 0.8650, Recall (micro): 0.9413

Epoch 2:
F1-Score (macro): 0.8718, Precision (macro): 0.8442, Recall (macro): 0.9048
F1-Score (micro): 0.9176, Precision (micro): 0.8927, Recall (micro): 0.9439

Epoch 3:
F1-Score (macro): 0.8872, Precision (macro): 0.8808, Recall (macro): 0.8962
F1-Score (micro): 0.9233, Precision (micro): 0.9072, Recall (micro): 0.9399

Epoch 4:
F1-Score (macro): 0.8858, Precision (macro): 0.8961, Recall (macro): 0.8952
F1-Score (micro): 0.9139, Precision (micro): 0.8810, Recall (micro): 0.9493

Epoch 5:
F1-Score (macro): 0.8867, Precision (macro): 0.8899, Recall (macro): 0.8925
F1-Score (micro): 0.9191, Precision (micro): 0.8991, Recall (micro): 0.9399

Tempo de treinamento: 02:53:13.28


In [24]:
# Directory that receives the fine-tuned checkpoint.
output_dir = './bertimbau'

# exist_ok=True creates the directory only when needed, without a separate
# existence check.
os.makedirs(output_dir, exist_ok=True)

# Save the model and the tokenizer. The tokenizer is reloaded from the same
# checkpoint name used for training because run_training does not return it.
model.save_pretrained(output_dir)
BertTokenizer.from_pretrained(
    'neuralmind/bert-base-portuguese-cased').save_pretrained(output_dir)

## Aplicação do modelo

In [25]:
# name = 'teste1'

# # Fazer previsões no dataset sem anotações
# predictions = run_prediction(model, df)

# # Salvar as previsões em um arquivo CSV
# predictions_df = pd.DataFrame(predictions, columns=labels)
# predictions_df.to_csv(f'../datasets/dataset-case_study_predictions-{name}.csv', index=False)