In [1]:
import pandas as pd
import os
import zipfile

## Criando o dataframe

In [2]:
df = pd.read_csv('/content/train_data2.csv')

## Pre-processamento do dataframe
- Remoção de espaços em branco seguidos
- Colocar todos os caracteres em minúsculo
- Tranformar as labels em valores inteiros

In [3]:
def preprocess(df):
    df = df.dropna()
    df = df.drop_duplicates()
    df['query'] = df['query'].str.replace('[^\w\s]','')
    df['query'] = df['query'].str.lower()
    df["label"] = df["label"].astype(int)
    return df

df = preprocess(df)

In [None]:
!pip install datasets

In [5]:
from datasets import Dataset

dataset = Dataset.from_pandas(df)
dataset = dataset.train_test_split(test_size=0.2)

In [6]:
dataset['train'][1]

{'query': "'você tá gorda demais, tá feia' 'eu te amo, mas você precisa se arrumar mais'",
 'label': 1,
 '__index_level_0__': 1420}

## Criação das funções de tokenização

In [7]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("neuralmind/bert-base-portuguese-cased")

def tokenize_function(examples):
    return tokenizer(
        examples["query"],
        padding="max_length",
        truncation=True,
        add_special_tokens=True,
        max_length=512
        )

tokenized_datasets = dataset.map(tokenize_function, batched=True)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/43.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/647 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/210k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



Map:   0%|          | 0/1998 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [8]:
from transformers import AutoModelForSequenceClassification, TrainingArguments

model = AutoModelForSequenceClassification.from_pretrained("neuralmind/bert-base-portuguese-cased", num_labels=2)

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Definição dos parâmetros para o modelo

In [10]:
!pip install evaluate

import evaluate
import numpy as np
from transformers import TrainingArguments, Trainer


training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=5,
    eval_strategy="epoch",
    save_strategy="no"
)

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels)

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [11]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.577191,0.718
2,0.494700,0.62996,0.724
3,0.494700,1.159619,0.722
4,0.164100,1.245447,0.716
5,0.164100,1.455266,0.712


TrainOutput(global_step=1250, training_loss=0.2785097351074219, metrics={'train_runtime': 1002.9594, 'train_samples_per_second': 9.961, 'train_steps_per_second': 1.246, 'total_flos': 2628479443046400.0, 'train_loss': 0.2785097351074219, 'epoch': 5.0})

## Avaliação do modelo após o treinamento

In [13]:
trainer.evaluate()

{'eval_loss': 1.455265760421753,
 'eval_accuracy': 0.712,
 'eval_runtime': 14.6547,
 'eval_samples_per_second': 34.119,
 'eval_steps_per_second': 4.299,
 'epoch': 5.0}

In [None]:
trainer.save_model("../model")