## Training without attachments

In [2]:
import pandas as pd
from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
import evaluate
import numpy as np
from transformers import pipeline

In [None]:
tokenizer = AutoTokenizer.from_pretrained(#"distilbert-base-uncased",
                                          "neuralmind/bert-base-portuguese-cased",
                                          padding=True,
                                          truncation=True,
                                          max_length=512,
                                          add_special_tokens = True)

Downloading (…)okenizer_config.json:   0%|          | 0.00/43.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/647 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/210k [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

### Data preprocessing

In [None]:
og_df = pd.read_csv('emails_triagem.csv')

In [None]:
df = og_df[['Descrição', 'Tipo de registro do caso']].dropna().reset_index(drop=True).rename(columns={'Descrição': 'text',
                                                                                                      'Tipo de registro do caso': 'label'})

In [None]:
weights = compute_class_weight(class_weight='balanced', classes=df['label'].unique(), y=df['label'])

In [None]:
id2label = {0: "Solicitação de Cotação",
            1: "Faturamento Incorreto",
            2: "Análise de Crédito",
            3: "Aditamento Contratual",
            4: "Medição Incorreta",
            5: "Solicitação de Contrato",
            6: "Precificação de Condição Comercial",
            7: "Registro Incorreto",
            8: "Garantias",
            9: "Não Recebimento da Fatura",
            10: "Conferência de tabela de preço"}

label2id = {"Solicitação de Cotação": 0,
            "Faturamento Incorreto": 1,
            "Análise de Crédito": 2,
            "Aditamento Contratual": 3,
            "Medição Incorreta": 4,
            "Solicitação de Contrato": 5,
            "Precificação de Condição Comercial": 6,
            "Registro Incorreto": 7,
            "Garantias": 8,
            "Não Recebimento da Fatura": 9,
            "Conferência de tabela de preço": 10}

In [None]:
df['label'] = df['label'].replace(label2id)

In [None]:
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

In [None]:
train_ds = Dataset.from_pandas(df_train.reset_index(drop=True))
test_ds = Dataset.from_pandas(df_test.reset_index(drop=True))

In [None]:
dataset = DatasetDict()

dataset['train'] = train_ds
dataset['test'] = test_ds

In [None]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 2841
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 711
    })
})

In [None]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True, max_length=512)

In [None]:
tokenized_copel = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/2841 [00:00<?, ? examples/s]

Map:   0%|          | 0/711 [00:00<?, ? examples/s]

In [None]:
tokenized_copel['train']

Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 2841
})

### Training

In [None]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
accuracy = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [None]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [None]:
model = AutoModelForSequenceClassification.from_pretrained(
    #"distilbert-base-uncased", num_labels=11, id2label=id2label, label2id=label2id
    "neuralmind/bert-base-portuguese-cased", num_labels=11, id2label=id2label, label2id=label2id
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
training_args = TrainingArguments(
    output_dir="copel_initial_model",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_copel["train"],
    eval_dataset=tokenized_copel["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.392082,0.902954
2,0.599300,0.370846,0.909986


TrainOutput(global_step=712, training_loss=0.5263417704721515, metrics={'train_runtime': 25543.3338, 'train_samples_per_second': 0.222, 'train_steps_per_second': 0.028, 'total_flos': 1487470540377024.0, 'train_loss': 0.5263417704721515, 'epoch': 2.0})

In [None]:
trainer.save_model('model_without_weights')

AttributeError: ignored

### Assessing the model

In [None]:
text = "Prezados (as), Boa Tarde! Segue o Balanço Energético das Unidades da Mauser, referente ao consumo de Janeiro/2023. Favor registrar 355,000 MWh. Faturar o Montante das NFs da Seguinte forma: 86,944 MWh Mauser TaubatÃ© (CNPJ: 08.246.617/0010-94) 146,182 MWh Mauser MatÃ£o (CNPJ: 08.246.617/0006-08) 8,792 MWh Mauser Queimados (CNPJ: 08.246.617/0009-50) 113,082 MWh Mauser Louveira (CNPJ: 08.246.617/0013-37) Atenciosamente, <https://uploaddeimagens.com.br/images/004/182/811/original/marcelo.png?1669 660813> <https://www.fluxoenergia.com/> <https://wa.me/5511941208776?text=OlÃƒÂ¡> <https://www.linkedin.com/company/fluxoenergia/> <https://www.instagram.com/fluxoenergia/> <https://facebook.com/fluxoenergia>"

In [None]:
num = 40

In [None]:
text = dataset['test']['text'][num]

In [None]:
dataset['test']['label'][num]

1

In [None]:
print(text)

Prezados. Favor encontrar abaixo a memória de cálculo referente ao ressarcimento TUSD para o cliente Reobote. O ressarcimento totaliza o valor de R$ 2.055,81. Por gentileza solicitamos a validação dos percentuais aplicados e o ?de acordo? do valor de ressarcimento. Tão logo tenhamos retorno, daremos sequência a emissão da nota de débito por parte do cliente. Aguardamos o seu retorno e permanecemos à disposição.


In [None]:
classifier = pipeline("sentiment-analysis", model="./model_without_weights")
classifier(text)

ValueError: ignored

In [None]:
from sklearn.metrics import f1_score

X = df['text']
y = df['label']
y_pred = classifier(X)
f1_score(y, y_pred)

NameError: ignored

### With Weights

In [None]:
model = AutoModelForSequenceClassification.from_pretrained(
    #"distilbert-base-uncased", num_labels=11, id2label=id2label, label2id=label2id
    "neuralmind/bert-base-portuguese-cased", num_labels=11, id2label=id2label, label2id=label2id
)

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
weights

array([ 2.88311688,  8.07272727,  0.1260379 ,  0.72891443, 20.18181818,
        3.3289597 , 21.52727273, 64.58181818,  8.96969697,  1.84519481,
        6.3315508 ])

In [None]:
torch.tensor(weights)

tensor([ 2.8831,  8.0727,  0.1260,  0.7289, 20.1818,  3.3290, 21.5273, 64.5818,
         8.9697,  1.8452,  6.3316], dtype=torch.float64)

In [None]:
# overwriting trainer class to compute weights
from torch import nn
import torch

class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels")
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get('logits')
        # compute custom loss
        loss_fct = nn.CrossEntropyLoss(weight=torch.tensor(weights))
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

In [None]:
training_args = TrainingArguments(
    output_dir="copel_initial_model",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_copel["train"],
    eval_dataset=tokenized_copel["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


RuntimeError: ignored

### Model evaluation

In [5]:
classifier = pipeline("sentiment-analysis", model="../models/model_without_weights")
classifier(text)

NameError: name 'text' is not defined