In [1]:
import pandas as pd
import os
import zipfile

In [2]:
df = pd.read_csv('../Dados/train_data.csv')

In [3]:
def preprocess(df):
    df = df.dropna()
    df = df.drop_duplicates()
    df['query'] = df['query'].str.replace('[^\w\s]','')
    df['query'] = df['query'].str.lower()
    df["label"] = df["label"].astype(int)
    return df

df = preprocess(df)

In [4]:
from datasets import Dataset

dataset = Dataset.from_pandas(df)
dataset = dataset.train_test_split(test_size=0.2)

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
dataset['train'][1]

{'query': ' a imprensa brasileira não pode dar ênfase a essas notícias. do contrário, não aceitaremos receber refugiados muçulmanos.',
 'label': 0,
 '__index_level_0__': 3259}

In [6]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("neuralmind/bert-base-portuguese-cased")

def tokenize_function(examples):
    return tokenizer(
        examples["query"],
        padding="max_length",
        truncation=True,
        add_special_tokens=True,
        max_length=512
        )

tokenized_datasets = dataset.map(tokenize_function, batched=True)


Map: 100%|██████████| 3172/3172 [00:00<00:00, 5486.34 examples/s]
Map: 100%|██████████| 794/794 [00:00<00:00, 5752.91 examples/s]


In [7]:
from transformers import AutoModelForSequenceClassification, TrainingArguments

model = AutoModelForSequenceClassification.from_pretrained("neuralmind/bert-base-portuguese-cased", num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
import evaluate
import numpy as np
from transformers import TrainingArguments, Trainer


training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=5,
    eval_strategy="epoch",
    save_strategy="no"
)

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels)

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [9]:
trainer.train()

                                                  
 20%|██        | 397/1985 [02:06<07:06,  3.73it/s]

{'eval_loss': 0.5568761825561523, 'eval_accuracy': 0.698992443324937, 'eval_runtime': 8.4921, 'eval_samples_per_second': 93.499, 'eval_steps_per_second': 11.776, 'epoch': 1.0}


 25%|██▌       | 500/1985 [02:37<07:36,  3.25it/s]  

{'loss': 0.5236, 'grad_norm': 1.8097693920135498, 'learning_rate': 3.7405541561712845e-05, 'epoch': 1.26}


                                                  
 40%|████      | 794/1985 [04:15<05:18,  3.74it/s]

{'eval_loss': 0.458881139755249, 'eval_accuracy': 0.8047858942065491, 'eval_runtime': 8.5882, 'eval_samples_per_second': 92.453, 'eval_steps_per_second': 11.644, 'epoch': 2.0}


 50%|█████     | 1000/1985 [05:17<05:00,  3.27it/s]

{'loss': 0.3796, 'grad_norm': 10.678564071655273, 'learning_rate': 2.4811083123425694e-05, 'epoch': 2.52}


                                                   
 60%|██████    | 1191/1985 [06:24<03:35,  3.68it/s]

{'eval_loss': 0.6836014986038208, 'eval_accuracy': 0.7682619647355163, 'eval_runtime': 8.58, 'eval_samples_per_second': 92.541, 'eval_steps_per_second': 11.655, 'epoch': 3.0}


 76%|███████▌  | 1500/1985 [07:57<02:29,  3.23it/s]

{'loss': 0.2354, 'grad_norm': 14.997559547424316, 'learning_rate': 1.2216624685138539e-05, 'epoch': 3.78}


                                                   
 80%|████████  | 1588/1985 [08:33<01:48,  3.66it/s]

{'eval_loss': 0.9864291548728943, 'eval_accuracy': 0.7909319899244333, 'eval_runtime': 8.6859, 'eval_samples_per_second': 91.413, 'eval_steps_per_second': 11.513, 'epoch': 4.0}


                                                   
100%|██████████| 1985/1985 [10:42<00:00,  3.09it/s]

{'eval_loss': 1.1703320741653442, 'eval_accuracy': 0.7884130982367759, 'eval_runtime': 8.6533, 'eval_samples_per_second': 91.757, 'eval_steps_per_second': 11.556, 'epoch': 5.0}
{'train_runtime': 642.5945, 'train_samples_per_second': 24.681, 'train_steps_per_second': 3.089, 'train_loss': 0.3159244950532313, 'epoch': 5.0}





TrainOutput(global_step=1985, training_loss=0.3159244950532313, metrics={'train_runtime': 642.5945, 'train_samples_per_second': 24.681, 'train_steps_per_second': 3.089, 'total_flos': 4172941338009600.0, 'train_loss': 0.3159244950532313, 'epoch': 5.0})

In [10]:
trainer.evaluate()

100%|██████████| 100/100 [00:08<00:00, 11.75it/s]


{'eval_loss': 1.1703320741653442,
 'eval_accuracy': 0.7884130982367759,
 'eval_runtime': 8.647,
 'eval_samples_per_second': 91.824,
 'eval_steps_per_second': 11.565,
 'epoch': 5.0}

In [11]:
trainer.save_model("../model")