- https://pub.aimind.so/fine-tuning-bert-for-intent-classification-from-scratch-7e04be18b733
- https://huggingface.co/google-bert/bert-base-uncased
- https://huggingface.co/blog/Valerii-Knowledgator/multi-label-classification

### Load Dataset

In [None]:
from datasets import load_dataset

ds = load_dataset("BuildaByte/Meditation-miniset-v0.2")

In [None]:
ds['train']

In [None]:
ds['train'].features

### Exploration

In [None]:
import pandas as pd

In [None]:
df = pd.DataFrame(ds['train'])

In [None]:
df.head()

In [None]:
df['system_prompt'].unique()

In [None]:
print(len(df['context'].unique()))

In [None]:
print(len(df['suggested_techniques'].unique()))

In [None]:
# Distinct technique names after splitting each comma-separated cell and stripping whitespace.
{name.strip() for cell in df['suggested_techniques'].unique() for name in cell.split(',')}

In [None]:
print(len(df['user_prompt'].unique()))

In [None]:
print(len(df['intended_outcome'].unique()))

In [None]:
print(len(df['affirmations_and_mindfulness'].unique()))

In [None]:
# df['user_prompt'].unique()

### Datasets

### Train Model - Classification Model

In [None]:
import torch
import evaluate
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset


In [None]:
MODEL_NAME = "google-bert/bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [None]:
def tokenize_function(example):
    """Tokenize the ``user_prompt`` field of ``example`` with the module-level ``tokenizer``.

    Every sequence is truncated and padded to exactly 128 tokens so that all
    rows end up with the same length.
    """
    prompts = example["user_prompt"]
    encoded = tokenizer(
        prompts,
        max_length=128,        # fixed sequence length
        padding="max_length",  # pad shorter prompts up to max_length
        truncation=True,       # cut longer prompts down to max_length
    )
    return encoded

In [None]:
# id -> outcome, numbered in the order the outcomes first appear in the dataframe;
# the outcome -> id lookup is the same mapping inverted.
intended_outcome_inverse_dict = dict(enumerate(df['intended_outcome'].unique()))
intended_outcome_dict = {intent: i for i, intent in intended_outcome_inverse_dict.items()}

In [None]:
def add_intended_outcome_id(example):
    """Attach the integer class id of ``example['intended_outcome']`` to the row.

    The id is looked up in the module-level ``intended_outcome_dict`` and
    stored under ``'intended_outcome_id'``. The row is modified in place and
    also returned.

    Raises:
        KeyError: if the outcome is not present in ``intended_outcome_dict``.
            Failing here is preferable to silently writing ``None`` as a label.
    """
    example['intended_outcome_id'] = intended_outcome_dict[example['intended_outcome']]
    return example

In [None]:
# --- tokenize the prompts (batched), then attach the integer label to every row
tokenized_datasets = (
    ds.map(tokenize_function, batched=True)
      .map(add_intended_outcome_id)
)

In [None]:
tokenized_datasets

In [None]:
# Rename the id column to "labels" for use with the Hugging Face Trainer, then
# expose only the model inputs and the labels, formatted as torch tensors.
tokenized_datasets = tokenized_datasets.rename_column("intended_outcome_id", "labels")
tokenized_datasets.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

In [None]:
# 80/10/10 split: hold out 20% of the rows, then cut the held-out part in half.
dataset = tokenized_datasets["train"]
split_dataset = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = split_dataset["train"]

# NOTE: of the second split, the "train" half is used as the test set and the
# "test" half as the validation set.
val_split_dataset = split_dataset["test"].train_test_split(test_size=0.5, seed=42)
test_dataset, val_dataset = val_split_dataset["train"], val_split_dataset["test"]

In [None]:
# --- training
# Size the classification head from the full label vocabulary. Counting
# set(train_dataset["labels"]) is unreliable: with the torch format the labels
# are tensors, which hash by identity and are therefore never de-duplicated,
# and a class missing from the train split would not be counted at all.
# id2label / label2id are stored in the model config so the saved model keeps
# the mapping between class ids and outcome names.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(intended_outcome_dict),
    id2label=intended_outcome_inverse_dict,
    label2id=intended_outcome_dict,
    problem_type="single_label_classification",
)

training_args = TrainingArguments(
    output_dir="./results",          # output directory
    eval_strategy="epoch",    # evaluate at each epoch
    learning_rate=5e-5,             # learning rate
    per_device_train_batch_size=16, # batch size for training
    per_device_eval_batch_size=16,  # batch size for evaluation
    num_train_epochs=1,             # number of training epochs
    weight_decay=0.01,              # strength of weight decay
    logging_dir="./logs",           # directory for storing logs
    logging_steps=10,
    save_strategy="epoch",          # must match eval_strategy for load_best_model_at_end
    # logging_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",  # key returned by compute_metrics
)

In [None]:
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Score predictions with the module-level ``accuracy`` metric.

    ``eval_pred`` unpacks into ``(logits, labels)``; the predicted class of
    each row is the index of its largest logit.
    """
    logits, labels = eval_pred
    predicted_ids = torch.as_tensor(logits).argmax(dim=-1)
    return accuracy.compute(predictions=predicted_ids, references=labels)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# Fine-tune the model
trainer.train()

In [None]:
trainer.save_model("./results")
tokenizer.save_pretrained("./results")

In [None]:
results = trainer.evaluate(test_dataset)
print("Test Results:", results)

In [None]:
# ---- prediction ----
MODEL_NAME = "./results"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

In [None]:
def predict_intent(text, model, tokenizer):
    """Return the index of the highest-scoring class for a single ``text``.

    Args:
        text: the prompt to classify; passed straight to ``tokenizer``.
        model: callable taking the tokenizer output as keyword arguments and
            returning an object with a ``.logits`` tensor. It is switched to
            eval mode as a side effect.
        tokenizer: callable producing a mapping of tensors
            (called with ``return_tensors="pt"``).

    Returns:
        The predicted class id as a Python ``int``.
    """
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=128,
    )
    # A model that has just been trained is still in train mode; dropout would
    # make the prediction random, so force inference mode.
    model.eval()
    # Move the inputs to wherever the model lives (e.g. the GPU after training).
    device = getattr(model, "device", None)
    if device is not None:
        inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
    with torch.no_grad():
        logits = model(**inputs).logits
    # .item() requires exactly one prediction, i.e. a single input text.
    return torch.argmax(logits, dim=-1).item()


# Example prompts; the trailing numbers are the class ids noted in the original cell.
example_texts = [
    "I am stressed out at work.",  # 0
    "I am anxious at work",  # 3
    "I am not feeling confident in my skills",  # 4
]
# Score every example instead of overwriting `text` and only classifying the last one.
for text in example_texts:
    predicted_class = predict_intent(text, model, tokenizer)
    print(f"Predicted Intent Class: {predicted_class}")

In [None]:
# intended_outcome_dict

In [None]:
ds

### Train Model - Multi-label Classification Model (TODO)

In [None]:
import torch

In [None]:
# ---- make the multi-label dataset

# Sort the technique names so that every run assigns the same index to the same
# technique. Iterating a set of strings can yield a different order in another
# interpreter session, which would silently remap the labels of a saved model.
suggested_techniques = sorted(
    {t.strip() for s in df['suggested_techniques'].unique() for t in s.split(',')}
)
suggested_technique_dict = {t: i for i, t in enumerate(suggested_techniques)}
suggested_technique_inverse_dict = dict(enumerate(suggested_techniques))


In [None]:
suggested_technique_dict

In [None]:
def add_suggested_techniques_multi_label(example):
    """Attach a multi-hot label vector for the row's suggested techniques.

    ``example["suggested_techniques"]`` is a comma-separated string. Each name
    is stripped and looked up in the module-level ``suggested_technique_dict``;
    the matching positions of a float64 vector of that dict's length are set
    to 1. The vector is stored under ``"suggested_techniques_multi_label"``
    and the (mutated) row is returned.

    Raises:
        KeyError: if a technique name is not in ``suggested_technique_dict``.
    """
    multi_hot = torch.zeros(len(suggested_technique_dict), dtype=torch.float64)
    for name in example["suggested_techniques"].split(','):
        # Index the dict directly: .get() would return None for an unknown
        # name, and `tensor[None] = 1` sets EVERY entry to 1.
        multi_hot[suggested_technique_dict[name.strip()]] = 1
    example["suggested_techniques_multi_label"] = multi_hot
    return example

In [None]:
# Tokenize the prompts (batched), then attach the multi-hot technique vector to every row.
tokenized_datasets = (
    ds.map(tokenize_function, batched=True)
      .map(add_suggested_techniques_multi_label)
)

In [None]:
print(tokenized_datasets["train"][0]["suggested_techniques_multi_label"])
print(len(tokenized_datasets["train"][0]["suggested_techniques_multi_label"]))

In [None]:
# --- rename the multi-hot column to "labels" and expose only the model inputs
# --- and the labels, formatted as torch tensors
tokenized_datasets = tokenized_datasets.rename_column("suggested_techniques_multi_label", "labels")
tokenized_datasets.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

In [None]:
# --- split dataset: 80% train, then the remaining 20% is halved into test and validation
dataset = tokenized_datasets["train"]
train_test_split = dataset.train_test_split(train_size=0.8, seed=42)
train_dataset = train_test_split["train"]

# NOTE: of the second split, the "train" half is used as the test set and the
# "test" half as the validation set.
test_val_split = train_test_split["test"].train_test_split(train_size=0.5, seed=42)
test_dataset, val_dataset = test_val_split["train"], test_val_split["test"]

In [None]:
# inputs = tokenizer(["this movie was great!", "it was terrible"], padding=True, return_tensors="pt")


In [None]:
# --- training
MODEL_NAME = "google-bert/bert-base-uncased"
# One output unit per technique; id2label / label2id are stored in the model
# config so the saved model keeps the technique names.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(suggested_technique_dict),
    id2label=suggested_technique_inverse_dict,
    label2id=suggested_technique_dict,
    problem_type="multi_label_classification",
)

training_args = TrainingArguments(
    # Separate directory from the single-label run ("./results") so the
    # checkpoint folders of the two models do not clash.
    output_dir="./results_multiclass",  # output directory
    eval_strategy="epoch",    # evaluate at each epoch
    learning_rate=5e-5,             # learning rate
    per_device_train_batch_size=16, # batch size for training
    per_device_eval_batch_size=16,  # batch size for evaluation
    num_train_epochs=1,             # number of training epochs
    weight_decay=0.01,              # strength of weight decay
    logging_dir="./logs",           # directory for storing logs
    logging_steps=10,
    save_strategy="epoch",          # must match eval_strategy for load_best_model_at_end
    # logging_strategy="epoch",
    load_best_model_at_end=True,
    # metric_for_best_model is left unset, so the best checkpoint is chosen by evaluation loss.
)

In [None]:
print(model.classifier)
print(model.config.num_labels)

In [None]:
# f1 = evaluate.load("f1")
# accuracy = evaluate.load("accuracy")
# precision = evaluate.load("precision")
# recall = evaluate.load("recall")

import numpy as np

clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])

def sigmoid(x):
    """Logistic function ``1 / (1 + exp(-x))``, evaluated element-wise with ``np.exp``."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator

def compute_metrics(eval_pred):
    """Score multi-label predictions with the module-level ``clf_metrics``.

    ``eval_pred`` unpacks into ``(logits, labels)``. A label counts as
    predicted when its sigmoid probability exceeds 0.5. Predictions and
    references are both flattened to 1-D, so every (row, label) pair is
    scored as one binary decision.
    """
    logits, labels = eval_pred
    flat_preds = (sigmoid(logits) > 0.5).astype(int).ravel()
    flat_labels = labels.astype(int).ravel()

    # compute metrics
    return clf_metrics.compute(predictions=flat_preds, references=flat_labels)
    
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# Fine-tune the model
trainer.train()

In [None]:
# --- save model
# Write the fine-tuned model and the tokenizer to one directory, then reload
# both from disk so the prediction cells below run on the saved artifacts.
# NOTE(review): the constant name is misspelled ("MUTLICLASS"); it is left
# unchanged here because renaming it would be a code change.
MUTLICLASS_MODEL_NAME = "./results_multiclass"
trainer.save_model(MUTLICLASS_MODEL_NAME)
tokenizer.save_pretrained(MUTLICLASS_MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MUTLICLASS_MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MUTLICLASS_MODEL_NAME)

In [None]:
results = trainer.evaluate(test_dataset)
print("Test Results:", results)

In [None]:
def predict_multiclass(text, model, tokenizer, THRESHOLD=0.5):
    """Return a 0/1 tensor marking which labels are predicted for ``text``.

    Args:
        text: the prompt to classify; passed straight to ``tokenizer``.
        model: callable taking the tokenizer output as keyword arguments and
            returning an object with a ``.logits`` tensor. It is switched to
            eval mode as a side effect.
        tokenizer: callable producing a mapping of tensors
            (called with ``return_tensors="pt"``).
        THRESHOLD: a label is predicted when its sigmoid probability is
            strictly greater than this value.

    Returns:
        An int tensor with the same shape as the logits; 1 where the label is
        predicted, 0 otherwise.
    """
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=128,
    )
    # A model that has just been trained is still in train mode; dropout would
    # make the prediction random, so force inference mode.
    model.eval()
    # Move the inputs to wherever the model lives (e.g. the GPU after training).
    device = getattr(model, "device", None)
    if device is not None:
        inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
    with torch.no_grad():
        logits = model(**inputs).logits
    # Use torch's own sigmoid: the logits are a torch tensor, and the result
    # must stay a tensor for the .int() call below.
    probs = torch.sigmoid(logits)
    return (probs > THRESHOLD).int()

In [None]:
def get_classes_from_predictions(predicted_classes):
    """Translate rows of 0/1 flags into lists of technique names.

    For every row in ``predicted_classes``, each position whose flag equals 1
    is looked up in the module-level ``suggested_technique_inverse_dict``.
    Returns one list of names per row, in position order.
    """
    return [
        [
            suggested_technique_inverse_dict.get(position)
            for position, flag in enumerate(row)
            if flag == 1
        ]
        for row in predicted_classes
    ]

In [None]:
# --- prediction
# Example prompts; the trailing comments are the technique lists noted in the original cell.
example_texts = [
    "I am stressed out at work.",  # ['guided imagery', 'deep breathing', 'progressive muscle relaxation']
    "I am anxious at work",  # ['gentle breathing', 'grounding exercises', 'body scan']
    "I am not feeling confident in my skills",  # ['gentle breathing', 'grounding exercises', 'body scan']
]
# Score every example instead of overwriting `text` and only classifying the last one.
for text in example_texts:
    predicted_classes = predict_multiclass(text, model, tokenizer)
    predicted_classes_names = get_classes_from_predictions(predicted_classes)
    print(f"Predicted Multi-Class: {predicted_classes}; Classes Names: {predicted_classes_names}")