Reference: https://pub.aimind.so/fine-tuning-bert-for-intent-classification-from-scratch-7e04be18b733

In [None]:
from datasets import load_dataset

# Fetch the meditation dataset from the Hugging Face Hub (cached after the first call).
ds = load_dataset(path="BuildaByte/Meditation-miniset-v0.2")

In [None]:
# Display the summary of the "train" split.
ds['train']

In [None]:
# Display the feature schema (column names and types) of the "train" split.
ds['train'].features

### Exploration

In [None]:
import pandas as pd

In [None]:
# Copy the "train" split into a pandas DataFrame for ad-hoc exploration.
df = pd.DataFrame(ds['train'])

In [None]:
# Preview the first rows.
df.head()

In [None]:
# List the distinct system prompts present in the data.
df['system_prompt'].unique()

In [None]:
# Number of distinct `context` values (missing values counted as one value).
print(df['context'].nunique(dropna=False))

In [None]:
# Number of distinct `user_prompt` values (missing values counted as one value).
print(df['user_prompt'].nunique(dropna=False))

In [None]:
# Number of distinct `intended_outcome` values, i.e. the number of target classes.
print(df['intended_outcome'].nunique(dropna=False))

In [None]:
# Number of distinct `affirmations_and_mindfulness` values (missing values counted as one value).
print(df['affirmations_and_mindfulness'].nunique(dropna=False))

In [None]:
# df['user_prompt'].unique()

### Datasets

### Train Model - Classification Model

In [None]:
import torch
import evaluate
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset


In [None]:
# Checkpoint identifier; the same name is later used to load the classification model.
MODEL_NAME = "google-bert/bert-base-uncased"
# Tokenizer that belongs to that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [None]:
def tokenize_function(example):
    """Tokenize the ``user_prompt`` field with the module-level ``tokenizer``.

    Every sequence is truncated or padded to exactly 128 tokens, so all
    encoded rows end up with the same length.
    """
    prompts = example["user_prompt"]
    return tokenizer(prompts, max_length=128, padding="max_length", truncation=True)

In [None]:
# Label vocabulary: ids follow the order in which each outcome first appears in `df`.
# The id -> name map is built first and the name -> id map is its exact inverse.
intended_outcome_inverse_dict = dict(enumerate(df['intended_outcome'].unique()))
intended_outcome_dict = {intent: i for i, intent in intended_outcome_inverse_dict.items()}

In [None]:
def add_intended_outcome_id(example):
    """Attach the integer class id of ``example['intended_outcome']``.

    The id is looked up in ``intended_outcome_dict`` and stored under
    ``'intended_outcome_id'``; an outcome missing from the dict yields ``None``.
    The example is modified in place and returned.
    """
    label_id = intended_outcome_dict.get(example['intended_outcome'])
    example['intended_outcome_id'] = label_id
    return example

In [None]:
# --- map inputs and label
# First pass tokenizes prompts in batches; second pass adds the integer label row by row.
tokenized_datasets = (
    ds.map(tokenize_function, batched=True)
    .map(add_intended_outcome_id)
)

In [None]:
# Display the mapped dataset to check the newly added columns.
tokenized_datasets

In [None]:
# rename column to use Hugging Face Trainer
tokenized_datasets = tokenized_datasets.rename_column("intended_outcome_id", "labels")
# Switch the output format in place: only the three model inputs are returned, as torch tensors.
tokenized_datasets.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

In [None]:
# Keep 80 % of the "train" split for training, then halve the 20 % hold-out
# (same seed) into a test part and a validation part.
dataset = tokenized_datasets["train"]
split_dataset = dataset.train_test_split(seed=42, test_size=0.2)
train_dataset = split_dataset["train"]

val_split_dataset = split_dataset["test"].train_test_split(seed=42, test_size=0.5)
# Note the naming: the "train" half of the hold-out is the test set and the
# "test" half is the validation set.
test_dataset, val_dataset = val_split_dataset["train"], val_split_dataset["test"]

In [None]:
# --- training
# Size the classification head from the label vocabulary, not from the training
# split. After set_format("torch") the "labels" column yields tensors, and
# tensors hash by identity, so set(...) over them does not deduplicate and the
# count would equal the number of training rows. A random split can also miss
# a class entirely. Passing id2label / label2id stores the readable label
# names in the saved model config.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(intended_outcome_dict),
    id2label=intended_outcome_inverse_dict,
    label2id=intended_outcome_dict,
)

training_args = TrainingArguments(
    output_dir="./results",          # checkpoints are written here
    eval_strategy="epoch",           # evaluate at the end of each epoch
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir="./logs",            # directory for storing logs
    logging_steps=10,
    save_strategy="epoch",           # must match eval_strategy for load_best_model_at_end
    # logging_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",  # key returned by compute_metrics
)

In [None]:
# Accuracy metric, loaded once and reused by every evaluation pass.
accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    ``eval_pred`` unpacks into ``(logits, labels)``; the predicted class of
    each row is the index of its largest logit.
    """
    scores, references = eval_pred
    predicted_ids = torch.tensor(scores).argmax(dim=-1)
    return accuracy.compute(predictions=predicted_ids, references=references)

# Bundle the model, hyper-parameters, metric callback and data splits.
# The validation split is the one scored during training.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Run the fine-tuning loop.
trainer.train()

In [None]:
# Persist the model and the tokenizer side by side so "./results" can be
# reloaded later with from_pretrained.
trainer.save_model("./results")
tokenizer.save_pretrained("./results")

In [None]:
# Score the held-out test split, which was not used for training or model selection.
results = trainer.evaluate(eval_dataset=test_dataset)
print("Test Results:", results)

In [None]:
# ---- prediction ----
# MODEL_NAME is rebound to the directory written by the save step above, and
# the `tokenizer` / `model` names used during training are replaced by the
# copies reloaded from disk.
MODEL_NAME = "./results"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

In [None]:
def predict_intent(text, model, tokenizer):
    """Return the predicted class id for a single piece of text.

    Args:
        text: One input string. The result is reduced with ``.item()``, so a
            batch of several texts is not supported.
        model: Sequence-classification model; called as ``model(**inputs)``
            and expected to return an object with a ``logits`` attribute.
        tokenizer: Callable that encodes ``text`` into the model's keyword
            inputs (a mapping of tensors).

    Returns:
        The index of the largest logit, as a Python ``int``.
    """
    # Same truncation/padding settings as tokenize_function used for training.
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=128,
    )
    # Move the encoded tensors to wherever the model weights live; without
    # this the call fails for a model sitting on an accelerator. Models that
    # expose no `device` attribute are called with the inputs unchanged.
    device = getattr(model, "device", None)
    if device is not None:
        inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
    with torch.no_grad():  # inference only, no gradients needed
        logits = model(**inputs).logits
    return torch.argmax(logits, dim=-1).item()


# Sample prompts. The trailing number is the class id noted by the author for
# each prompt (presumably the expected or observed prediction - confirm).
sample_texts = [
    "I am stressed out at work.",  # 0
    "I am anxious at work",  # 3
    "I am not feeling confident in my skills",  # 4
]
# Classify every sample; previously the first two were overwritten before use
# and only the last prompt was ever predicted.
for text in sample_texts:
    predicted_class = predict_intent(text, model, tokenizer)
    print(f"Text: {text!r} | Predicted Intent Class: {predicted_class}")

In [None]:
# intended_outcome_dict

### Train Model - Multi-classification model (TODO)