In [None]:
from collections import Counter
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
import evaluate
import os
import numpy as np
import torch
from torch.nn import CrossEntropyLoss
from datasets import concatenate_datasets
import unidecode
import nltk
nltk.download('stopwords')
from nltk import tokenize


In [None]:
# Portuguese stop-word list from NLTK's 'stopwords' corpus (fetched by the
# nltk.download('stopwords') call in the import cell).
irrelevant_words = nltk.corpus.stopwords.words('portuguese')

In [None]:
# Accent-stripped, lower-cased stop words, built once when this cell runs instead
# of once per example. Re-run this cell if `irrelevant_words` changes.
_STOPWORDS_ASCII = frozenset(
    unidecode.unidecode(word).lower() for word in irrelevant_words
)
_WORD_PUNCT_TOKENIZER = tokenize.WordPunctTokenizer()


def stopWordsUnidecode(example):
    """Add a cleaned copy of ``example['text']`` under ``'processing_Unidecode'``.

    The text is transliterated to ASCII with ``unidecode``, split with NLTK's
    ``WordPunctTokenizer``, and reduced to purely alphabetic tokens that are not
    in the (transliterated) stop-word list. Surviving tokens keep their original
    casing and are joined with single spaces.

    Args:
        example: Mapping with a ``'text'`` entry. A ``None`` or empty value is
            treated as an empty string.

    Returns:
        The same ``example`` object with the ``'processing_Unidecode'`` key set.
    """
    # unidecode raises on None, so a missing text is treated as empty.
    ascii_text = unidecode.unidecode(example['text'] or '')
    tokens = _WORD_PUNCT_TOKENIZER.tokenize(ascii_text)
    # The stop-word list is lower-case; compare lower-cased so capitalised stop
    # words (e.g. at the start of a sentence) are removed as well.
    kept = [
        token for token in tokens
        if token.isalpha() and token.lower() not in _STOPWORDS_ASCII
    ]
    example['processing_Unidecode'] = ' '.join(kept)

    return example

In [None]:
# Authenticate with the Hugging Face Hub. The token is read from the HF_TOKEN
# environment variable so it is never stored in the notebook; when the variable
# is unset, login() receives None and asks for the token interactively.
login(token=os.environ.get("HF_TOKEN"))

# token=True makes load_dataset reuse the locally stored Hub credentials.
dataset = load_dataset("manueltonneau/portuguese-hate-speech-superset", token=True)

dataset = dataset['train']
dataset = dataset.remove_columns(['target', 'nb_annotators', 'dataset', 'source'])

new_dataset = load_dataset('franciellevargas/HateBR')

new_dataset = new_dataset['train']
dt = new_dataset.remove_columns(['id', 'anotator1', 'anotator2', 'anotator3', 'links_post', 'account_post'])
# Keep only the HateBR rows whose final label is 1.
hate = dt.filter(lambda x: x['label_final'] == 1)
# Rename so the columns line up for concatenation
# (presumably the superset exposes 'text' and 'labels' -- TODO confirm).
hate = hate.rename_columns({'comentario': 'text', 'label_final': 'labels'})
dataset = concatenate_datasets([hate, dataset])
# Adds the 'processing_Unidecode' column with the cleaned text.
dataset = dataset.map(stopWordsUnidecode)

# Preview a few rows only; displaying the whole column floods the notebook output.
dataset[:5]['text']


In [None]:
# Preview the cleaned text for the first rows instead of dumping the full column.
dataset[:5]['processing_Unidecode']

In [None]:
# Uncomment to restrict the run to a single GPU:
# os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Pretrained checkpoint, loaded with a two-class classification head.
model_name = "ruanchaves/bert-large-portuguese-cased-hatebr"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Treinando no dispositivo: {device}")
print(torch.cuda.device_count())
print("Local rank:", os.environ.get("LOCAL_RANK"))
# current_device()/get_device_name() raise when no CUDA device exists, so they
# are only queried when CUDA is available (the CPU fallback above stays usable).
if torch.cuda.is_available():
    print("Current device:", torch.cuda.current_device())
    print("Device name:", torch.cuda.get_device_name(torch.cuda.current_device()))

In [None]:

def tokenize_function(examples):
    """Tokenize the cleaned text of a batch.

    Reads ``examples["processing_Unidecode"]`` and encodes it with the
    notebook-level ``tokenizer``, truncating and padding every sequence to
    exactly 70 tokens.

    Returns:
        The tokenizer's output for the batch.
    """
    cleaned_texts = examples["processing_Unidecode"]
    encoded = tokenizer(
        cleaned_texts,
        max_length=70,
        truncation=True,
        padding="max_length",
    )
    return encoded

# Tokenize the whole dataset in batches.
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# 80/20 train/evaluation split; the fixed seed makes the split (and therefore
# the reported metrics) reproducible across kernel restarts.
train_test_split = tokenized_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split["train"]
# NOTE: the misspelled name is kept on purpose; the Trainer cell refers to it.
eveal_dataset = train_test_split["test"]

# Metric objects used by compute_metrics.
accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")

In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 for an evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class is the argmax
            of ``logits`` over the last axis.

    Returns:
        Dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    logits, labels = eval_pred
    predicted_classes = np.argmax(logits, axis=-1)
    scores = {
        "accuracy": accuracy.compute(
            predictions=predicted_classes, references=labels
        )["accuracy"],
        "f1": f1.compute(
            predictions=predicted_classes, references=labels, average="macro"
        )["f1"],
    }
    return scores

In [None]:
# Training configuration. The effective batch size per device is
# 8 samples x 8 accumulation steps = 64.
training_args = TrainingArguments(
    output_dir="./results1/classificador_odio.model",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=8,
    # Mixed precision needs a CUDA device; enabling it unconditionally makes
    # TrainingArguments raise on CPU-only machines.
    fp16=torch.cuda.is_available(),
    num_train_epochs=3,
    logging_dir="./logs",
    # Evaluate and checkpoint once per epoch so the best checkpoint (highest
    # macro F1, as reported by compute_metrics) can be restored at the end.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    report_to="none"
)

# Per-class loss weights: class 1 counts ten times as much as class 0.
# (The literal previously read `1 0.0`, which is a syntax error.)
class_weights = torch.tensor([1.0, 10.0])
# Stand-alone weighted loss; the custom trainer builds its own from class_weights.
loss_fn = CrossEntropyLoss(weight=class_weights)


In [None]:
class CustomLossTrainer(Trainer):
    """Trainer whose loss is a cross-entropy with optional per-class weights.

    Args:
        class_weight: Optional keyword argument; a tensor passed as ``weight``
            to ``CrossEntropyLoss``. It is removed from ``kwargs`` before the
            remaining arguments are forwarded unchanged to ``Trainer.__init__``.
    """

    def __init__(self, *args, **kwargs):
        self.class_weight = kwargs.pop("class_weight", None)
        super().__init__(*args, **kwargs)

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the (optionally class-weighted) cross-entropy loss for a batch.

        Args:
            model: Model to run the forward pass with.
            inputs: Batch mapping; must contain ``"labels"``.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                only the loss.
            **kwargs: Extra arguments passed by the Trainer; accepted and ignored.

        Returns:
            The loss tensor, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        labels = inputs.get("labels")
        # Forward without the labels so the model does not also compute its own
        # loss, which would be discarded. `inputs` itself is left unmodified.
        model_inputs = {key: value for key, value in inputs.items() if key != "labels"}
        outputs = model(**model_inputs)
        logits = outputs.get("logits") if isinstance(outputs, dict) else outputs[0]

        # Use the class weights when they were provided, otherwise the default
        # unweighted loss. Weights and labels follow the logits' device.
        weight = None
        if self.class_weight is not None:
            weight = self.class_weight.to(logits.device)
        loss_fct = CrossEntropyLoss(weight=weight)

        loss = loss_fct(logits, labels.to(logits.device))

        return (loss, outputs) if return_outputs else loss

In [None]:
# Assemble the trainer with the weighted loss, then run fine-tuning.
trainer = CustomLossTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    class_weight=class_weights,  # consumed by CustomLossTrainer, not by Trainer
    train_dataset=train_dataset,
    eval_dataset=eveal_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

In [None]:
example = "Esse trabalho é uma ótimo arrasou!"

# Pick this process's GPU when CUDA is available, otherwise run on the CPU
# (building a "cuda:N" device unconditionally fails on CPU-only machines).
local_rank = int(os.environ.get("LOCAL_RANK", 0))
if torch.cuda.is_available():
    device = torch.device(f"cuda:{local_rank}")
else:
    device = torch.device("cpu")
model.to(device)

# Apply the same cleaning and truncation used for the training data so the model
# sees inputs in the form it was fine-tuned on.
cleaned_example = stopWordsUnidecode({"text": example})["processing_Unidecode"]
inputs = tokenizer(cleaned_example, return_tensors="pt", truncation=True, max_length=70)
inputs = {k: v.to(device) for k, v in inputs.items()}

model.eval()
with torch.no_grad():
    outputs = model(**inputs)

# Predicted class = index of the largest logit.
pred = torch.argmax(outputs.logits, dim=-1).item()

# Show the result
print(f"Frase: {example}")
print(f"Classe prevista: {pred}")