In [39]:
from transformers import TFAutoModel, AutoTokenizer
import tensorflow as tf
from datasets import load_dataset

# Hub checkpoint name, shared by the encoder and its tokenizer.
MODEL_CHECKPOINT = 'neuralmind/bert-base-portuguese-cased'

# Load the pretrained encoder.
model = TFAutoModel.from_pretrained(MODEL_CHECKPOINT)

# Load the tokenizer from the same checkpoint as the encoder.
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)


Some layers from the model checkpoint at neuralmind/bert-base-portuguese-cased were not used when initializing TFBertModel: ['mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFBertModel were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['bert/pooler/dense/bias:0', 'bert/pooler/dense/kernel:0']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [40]:
import pandas as pd
# Read the raw intent dataset and display it for a quick visual check.
df = pd.read_csv("dados_intencoes.csv")
df

Unnamed: 0,Texto,Intencao
0,remarcar consulta,remarcar_consulta
1,remarcar minha consulta,remarcar_consulta
2,remarcar a data da consulta,remarcar_consulta
3,remarcar uma consulta,remarcar_consulta
4,Quero mudar o dia da minha consulta,remarcar_consulta
...,...,...
1325,Jamais,negacao
1326,Não concordo,negacao
1327,"Não, de forma alguma",negacao
1328,não,negacao


In [41]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict

# Fixed seed so the train/validation/test split (and therefore every metric
# computed on it) is reproducible across kernel restarts.
SEED = 42

df = pd.read_csv('dados_intencoes.csv')

# Rename the columns to shorter, code-friendly names.
df.columns = ['text', 'label_text']

# Map every intent name to an integer id (ids follow order of first appearance).
label_dict = {label: idx for idx, label in enumerate(df['label_text'].unique())}
df['label'] = df['label_text'].map(label_dict)

# Split into train/validation/test (80/10/10), stratified by label so every
# split keeps the class proportions of the full dataset.
train_df, temp_df = train_test_split(
    df, test_size=0.2, stratify=df['label'], random_state=SEED
)
val_df, test_df = train_test_split(
    temp_df, test_size=0.5, stratify=temp_df['label'], random_state=SEED
)

# Convert the DataFrames into Hugging Face datasets. The pandas index comes
# along as an extra `__index_level_0__` column (visible in the printed features).
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)

# Bundle the three splits into a single DatasetDict.
dataset_dict = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset,
    'test': test_dataset
})

print(dataset_dict)




DatasetDict({
    train: Dataset({
        features: ['text', 'label_text', 'label', '__index_level_0__'],
        num_rows: 1064
    })
    validation: Dataset({
        features: ['text', 'label_text', 'label', '__index_level_0__'],
        num_rows: 133
    })
    test: Dataset({
        features: ['text', 'label_text', 'label', '__index_level_0__'],
        num_rows: 133
    })
})


In [42]:
# Drop the `__index_level_0__` column (the leftover pandas index) from every split.
# The membership check keeps the cell idempotent: running it a second time is a
# no-op instead of failing on a column that was already removed.
for split_name in ('train', 'validation', 'test'):
    if '__index_level_0__' in dataset_dict[split_name].column_names:
        dataset_dict[split_name] = dataset_dict[split_name].remove_columns(['__index_level_0__'])


In [43]:
# Display the splits to check their columns and row counts.
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['text', 'label_text', 'label'],
        num_rows: 1064
    })
    validation: Dataset({
        features: ['text', 'label_text', 'label'],
        num_rows: 133
    })
    test: Dataset({
        features: ['text', 'label_text', 'label'],
        num_rows: 133
    })
})

In [44]:
def tokenize(batch):
    """Tokenize the ``"text"`` entry of a batch with the notebook-level ``tokenizer``.

    Sequences are padded to, and truncated at, a maximum length of 32 tokens.

    Args:
        batch: Mapping whose ``"text"`` entry holds the raw input strings.

    Returns:
        The result of calling ``tokenizer`` on those texts.
    """
    texts = batch["text"]
    return tokenizer(
        texts,
        padding='max_length',
        truncation=True,
        max_length=32,
    )

In [45]:
# Tokenize every split in batched mode; batch_size=None passes each split to
# `tokenize` as one single batch.
dataset_encoded = dataset_dict.map(
    tokenize,
    batched=True,
    batch_size=None,
)

Map: 100%|██████████| 1064/1064 [00:00<00:00, 24401.09 examples/s]
Map: 100%|██████████| 133/133 [00:00<00:00, 13174.68 examples/s]
Map: 100%|██████████| 133/133 [00:00<00:00, 14619.66 examples/s]


In [46]:
# Display the encoded splits to check which columns tokenization added.
dataset_encoded

DatasetDict({
    train: Dataset({
        features: ['text', 'label_text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1064
    })
    validation: Dataset({
        features: ['text', 'label_text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 133
    })
    test: Dataset({
        features: ['text', 'label_text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 133
    })
})

In [47]:
# Switch 'input_ids', 'attention_mask', 'token_type_ids' and 'label' to the
# TensorFlow format: from here on, reading the dataset returns these columns
# as `tf.Tensor` objects.
dataset_encoded.set_format(
    type='tf',
    columns=['input_ids', 'attention_mask', 'token_type_ids', 'label'],
)

# Number of examples per batch; 16 or 8 depending on the size of the dataset.
BATCH_SIZE = 16

def order(inp):
    '''
    Split one formatted batch into a ``(features, label)`` pair.

    Every tensor is looked up by its column name. Positional access through
    ``list(inp.values())`` depends on the key order of the mapping; with the
    column order displayed for ``dataset_encoded`` (``token_type_ids`` comes
    before ``attention_mask``) it would pass those two tensors to BERT swapped.

    Args:
        inp: Mapping with the keys ``'input_ids'``, ``'attention_mask'``,
            ``'token_type_ids'`` and ``'label'``.

    Returns:
        A tuple ``(features, label)``: ``features`` is a dict holding the three
        BERT inputs under their own names, ``label`` is ``inp['label']``.

    Raises:
        KeyError: If one of the four expected keys is missing from ``inp``.
    '''
    features = {
        'input_ids': inp['input_ids'],
        'attention_mask': inp['attention_mask'],
        'token_type_ids': inp['token_type_ids'],
    }
    return features, inp['label']

# Build the training pipeline from the formatted train split.
train_dataset = tf.data.Dataset.from_tensor_slices(dataset_encoded['train'][:])
# Shuffle individual examples BEFORE batching so every epoch sees differently
# composed batches; batching first would only reorder a fixed set of batches.
# The buffer covers the whole split, so the shuffle is uniform.
train_dataset = train_dataset.shuffle(len(dataset_encoded['train'])).batch(BATCH_SIZE)

# Regroup every batch into the (features, label) pair produced by `order`.
train_dataset = train_dataset.map(order, num_parallel_calls=tf.data.AUTOTUNE)

# Test pipeline: same steps, without shuffling.
test_dataset = tf.data.Dataset.from_tensor_slices(dataset_encoded['test'][:])
test_dataset = test_dataset.batch(BATCH_SIZE)
test_dataset = test_dataset.map(order, num_parallel_calls=tf.data.AUTOTUNE)

In [48]:
# Pull a single batch out of the training pipeline to inspect inputs and labels.
batch_iterator = iter(train_dataset)
inp, out = next(batch_iterator)
print(inp, '\n\n', out)

{'input_ids': <tf.Tensor: shape=(16, 32), dtype=int64, numpy=
array([[  101,  3213,   117, 18691, 10551,   123, 16700,   170,  3983,
        22281,   119,   102,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0],
       [  101,   807,   441,   453,   260, 10652,   125, 10995,  6176,
          221, 16700,   136,   102,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0],
       [  101,   231, 22283,   117,  2745, 20885, 22280,   336,  1039,
          136,   102,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0],
       [  101, 20013,  3784,   125, 10010,   578,   146,   644, 16736,
          243,   221,  7122, 16700,   102,     0,     0,     0,     0,
            0,     0,    

In [49]:
class BERTForClassification(tf.keras.Model):
    """Keras model that stacks a softmax ``Dense`` layer on top of a BERT encoder.

    Args:
        bert_model: Encoder that is called on the model inputs; element ``[1]``
            of its output is what gets classified.
        num_classes: Number of units of the final softmax layer.
    """

    def __init__(self, bert_model, num_classes):
        super().__init__()
        self.bert = bert_model
        self.fc = tf.keras.layers.Dense(units=num_classes, activation='softmax')

    def call(self, inputs):
        """Run the encoder on ``inputs`` and return the softmax layer's output."""
        encoder_outputs = self.bert(inputs)
        # Element 1 of the encoder output -- presumably the pooled output of
        # the Hugging Face BERT model; TODO confirm against the model's docs.
        pooled = encoder_outputs[1]
        return self.fc(pooled)

In [50]:
# One output unit per distinct intent. The count is taken from `label_dict`
# (built together with the integer labels) so it always matches the data
# instead of relying on a hard-coded 14.
classifier = BERTForClassification(model, num_classes=len(label_dict))

# Integer labels + softmax outputs -> sparse categorical cross-entropy on
# probabilities (the loss's default, i.e. not logits).
classifier.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics=['accuracy']
)

In [51]:
# Train for three epochs on the training pipeline; `history` holds the object
# returned by `fit`.
history = classifier.fit(train_dataset, epochs=3)


Epoch 1/3
Epoch 2/3
Epoch 3/3


In [52]:
# Validation pipeline: slice the formatted split, batch it, then regroup each
# batch with `order`.
validation_dataset = (
    tf.data.Dataset.from_tensor_slices(dataset_encoded['validation'][:])
    .batch(BATCH_SIZE)
    .map(order, num_parallel_calls=tf.data.AUTOTUNE)
)

# Evaluate the model on the validation data.
validation_loss, validation_accuracy = classifier.evaluate(validation_dataset)
print(f"Loss de Validação: {validation_loss}")
print(f"Acurácia de Validação: {validation_accuracy}")

Loss de Validação: 0.1251409947872162
Acurácia de Validação: 0.9924812316894531


In [53]:
# Test pipeline: slice the formatted split, batch it, then regroup each batch
# with `order`.
test_dataset = (
    tf.data.Dataset.from_tensor_slices(dataset_encoded['test'][:])
    .batch(BATCH_SIZE)
    .map(order, num_parallel_calls=tf.data.AUTOTUNE)
)

# Evaluate the model on the test data.
test_loss, test_accuracy = classifier.evaluate(test_dataset)
print(f"Loss de Teste: {test_loss}")
print(f"Acurácia de Teste: {test_accuracy}")

Loss de Teste: 0.13055744767189026
Acurácia de Teste: 0.9774436354637146
