In [1]:
import json
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict

In [2]:
# Load intent data
def load_intent_data(file_path):
    """Read an intents JSON file and return its parsed content.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The object produced by ``json.load`` for the file content.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # Decode explicitly as UTF-8: without an encoding argument open() falls
    # back to the platform locale, which can mis-decode or reject non-ASCII
    # characters in the patterns on some systems (e.g. Windows).
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    return data

In [3]:
# Parse the chatbot intents file that sits next to this notebook.
intent_data = load_intent_data(file_path='Dataset Chatbot.json')

In [5]:
# Prepare data
# Flatten the intents into (pattern, tag) pairs, one pair per example
# utterance, keeping the order in which they appear in the file.
pattern_tag_pairs = [
    (pattern, intent['tag'])
    for intent in intent_data['intents']
    for pattern in intent['patterns']
]
# Parallel lists: texts[i] is an utterance and labels[i] is its intent tag.
texts = [pattern for pattern, _ in pattern_tag_pairs]
labels = [tag for _, tag in pattern_tag_pairs]

In [7]:
# Encode labels
# Sort the distinct tags before numbering them. Iteration order of a set of
# strings is not guaranteed to be the same from one interpreter session to the
# next, so numbering a bare set could assign different ids to the same tag on
# every kernel restart and make saved predictions impossible to decode.
unique_labels = sorted(set(labels))
# tag -> integer class id
label_to_id = {label: i for i, label in enumerate(unique_labels)}
# integer class id -> tag (exact inverse of label_to_id)
id_to_label = {i: label for label, i in label_to_id.items()}

In [8]:
# Swap every string tag for its integer class id (KeyError on an unknown tag).
labels = list(map(label_to_id.__getitem__, labels))

In [26]:
# Split data
# Hold out 40% of the examples for validation. A fixed random_state makes the
# shuffle repeatable, so a fresh "Restart & Run All" trains and evaluates on
# the same examples and metrics are comparable between runs.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts,
    labels,
    test_size=0.4,
    random_state=42,
)

In [27]:
# Load tokenizer and model
# Single place for the checkpoint name so tokenizer and model cannot drift apart.
MODEL_NAME = 'indobenchmark/indobert-base-p1'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# The classification head is sized to the number of intent tags. The
# id <-> tag mapping is handed to the model config so that it is written out
# together with the weights by save_pretrained(); without it a reloaded model
# only exposes generic "LABEL_n" names and predictions cannot be mapped back
# to intent tags.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(unique_labels),
    id2label=id_to_label,
    label2id=label_to_id,
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
# Tokenize data
def tokenize_function(texts):
    """Run the notebook-level ``tokenizer`` over ``texts``.

    Every sequence is truncated to, and padded up to, 128 tokens.

    Args:
        texts: The text input(s) handed straight to the tokenizer.

    Returns:
        Whatever the tokenizer call returns for these inputs.
    """
    encoded = tokenizer(
        texts,
        padding='max_length',
        truncation=True,
        max_length=128,
    )
    return encoded


In [21]:
# Tokenize the training split first, then the validation split.
train_encodings, val_encodings = map(tokenize_function, (train_texts, val_texts))

In [22]:
# Create datasets
# Only these tokenizer outputs are carried into the datasets.
ENCODING_COLUMNS = ('input_ids', 'attention_mask')

# Training split: selected encoding columns plus the integer class ids.
train_columns = {column: train_encodings[column] for column in ENCODING_COLUMNS}
train_columns['labels'] = train_labels
train_dataset = Dataset.from_dict(train_columns)

# Validation split, built the same way.
val_columns = {column: val_encodings[column] for column in ENCODING_COLUMNS}
val_columns['labels'] = val_labels
val_dataset = Dataset.from_dict(val_columns)

# Bundle both splits under the names used by the Trainer set-up.
datasets = DatasetDict({'train': train_dataset, 'validation': val_dataset})

In [23]:
# Training arguments
# Hyperparameters are collected in one dict so they can be read at a glance.
training_config = {
    'output_dir': './results',            # where run output is written
    'evaluation_strategy': "epoch",       # run evaluation once per epoch
    'learning_rate': 2e-5,                # optimizer learning rate
    'per_device_train_batch_size': 16,    # training batch size per device
    'per_device_eval_batch_size': 16,     # evaluation batch size per device
    'num_train_epochs': 3,                # passes over the training split
    'weight_decay': 0.01,                 # weight decay strength
    'logging_dir': './logs',              # where logs are written
    'logging_steps': 10,                  # log every 10 steps
}
training_args = TrainingArguments(**training_config)

# Trainer
# Wire the model, the arguments and both dataset splits together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=datasets['train'],
    eval_dataset=datasets['validation'],
)

In [24]:
# Train the model
# Keep the returned training summary and leave it as the last expression so
# the cell still displays it.
train_output = trainer.train()
train_output

  0%|          | 0/105 [00:00<?, ?it/s]

{'loss': 3.7448, 'grad_norm': 6.375153064727783, 'learning_rate': 1.8095238095238097e-05, 'epoch': 0.29}
{'loss': 3.6166, 'grad_norm': 6.693673133850098, 'learning_rate': 1.6190476190476193e-05, 'epoch': 0.57}
{'loss': 3.3987, 'grad_norm': 7.8361616134643555, 'learning_rate': 1.4285714285714287e-05, 'epoch': 0.86}


  0%|          | 0/15 [00:00<?, ?it/s]

{'eval_loss': 3.094558000564575, 'eval_runtime': 24.7, 'eval_samples_per_second': 9.514, 'eval_steps_per_second': 0.607, 'epoch': 1.0}
{'loss': 3.1124, 'grad_norm': 6.947027683258057, 'learning_rate': 1.2380952380952383e-05, 'epoch': 1.14}
{'loss': 2.8456, 'grad_norm': 6.974558353424072, 'learning_rate': 1.0476190476190477e-05, 'epoch': 1.43}
{'loss': 2.7218, 'grad_norm': 7.981402397155762, 'learning_rate': 8.571428571428571e-06, 'epoch': 1.71}
{'loss': 2.5002, 'grad_norm': 17.08779525756836, 'learning_rate': 6.666666666666667e-06, 'epoch': 2.0}


  0%|          | 0/15 [00:00<?, ?it/s]

{'eval_loss': 2.4892399311065674, 'eval_runtime': 23.3457, 'eval_samples_per_second': 10.066, 'eval_steps_per_second': 0.643, 'epoch': 2.0}
{'loss': 2.3476, 'grad_norm': 7.417869567871094, 'learning_rate': 4.761904761904762e-06, 'epoch': 2.29}
{'loss': 2.2354, 'grad_norm': 6.673740386962891, 'learning_rate': 2.8571428571428573e-06, 'epoch': 2.57}
{'loss': 2.2569, 'grad_norm': 6.796047687530518, 'learning_rate': 9.523809523809525e-07, 'epoch': 2.86}


  0%|          | 0/15 [00:00<?, ?it/s]

{'eval_loss': 2.279984951019287, 'eval_runtime': 2.7567, 'eval_samples_per_second': 85.248, 'eval_steps_per_second': 5.441, 'epoch': 3.0}
{'train_runtime': 535.5774, 'train_samples_per_second': 3.058, 'train_steps_per_second': 0.196, 'train_loss': 2.840890748160226, 'epoch': 3.0}


TrainOutput(global_step=105, training_loss=2.840890748160226, metrics={'train_runtime': 535.5774, 'train_samples_per_second': 3.058, 'train_steps_per_second': 0.196, 'train_loss': 2.840890748160226, 'epoch': 3.0})

In [25]:
# Save the model
# Model and tokenizer go into the same directory so they can be reloaded
# together from a single path.
save_dir = './trained_model'
model.save_pretrained(save_dir)
# Last expression: the tuple of written tokenizer files is shown as cell output.
tokenizer.save_pretrained(save_dir)

('./trained_model\\tokenizer_config.json',
 './trained_model\\special_tokens_map.json',
 './trained_model\\vocab.txt',
 './trained_model\\added_tokens.json',
 './trained_model\\tokenizer.json')