#### LLM Full Finetuning (flan-t5-small)

#### Intent classifier for Travel bot (text classification)

In [25]:
from typing import List, Tuple, Union

import evaluate
import nltk
import numpy as np
import pandas as pd
import torch
from datasets import Dataset, DatasetDict, concatenate_datasets
from huggingface_hub import HfFolder
from nltk.tokenize import sent_tokenize
from sklearn.metrics import classification_report
from tqdm.auto import tqdm
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
)

In [2]:
# F1 scorer from the `evaluate` library; compute_metrics() calls it with average="macro".
metric = evaluate.load("f1")

In [3]:
def preprocess_function(sample: Dataset, padding: str = "max_length") -> dict:
    """Tokenize a batch of utterances and their intent targets.

    Reads the module-level ``tokenizer``, ``max_source_length`` and
    ``max_target_length``.

    Args:
        sample: Batch exposing the ``"utterance"`` and ``"intent"`` columns.
        padding: Padding strategy forwarded to both tokenizer calls.

    Returns:
        The tokenized utterances with an extra ``"labels"`` entry holding the
        tokenized intents.
    """
    # Utterances are passed to the tokenizer unchanged (no task prefix is added).
    utterances = list(sample["utterance"])
    encoded = tokenizer(
        utterances, max_length=max_source_length, padding=padding, truncation=True
    )

    # Targets are tokenized through the `text_target` keyword argument.
    target_ids = tokenizer(
        text_target=sample["intent"],
        max_length=max_target_length,
        padding=padding,
        truncation=True,
    )["input_ids"]

    # With fixed-length padding, swap pad ids for -100 so that padded
    # positions are ignored by the loss.
    if padding == "max_length":
        pad_id = tokenizer.pad_token_id
        target_ids = [
            [-100 if token == pad_id else token for token in row]
            for row in target_ids
        ]

    encoded["labels"] = target_ids
    return encoded


def postprocess_text(
    preds: List[str], labels: List[str]
) -> Tuple[List[str], List[str]]:
    """Normalize decoded predictions and reference labels before scoring.

    Intent names are short phrases that are compared by exact match, so the
    only normalization applied is trimming surrounding whitespace. Sentence
    splitting (a ROUGE-specific step) is intentionally not performed: it adds
    nothing for classification labels and would require an extra NLTK
    tokenizer resource at runtime.

    Args:
        preds: Decoded model outputs.
        labels: Decoded reference intents.

    Returns:
        A ``(preds, labels)`` tuple of new lists with whitespace stripped.
    """
    cleaned_preds = [pred.strip() for pred in preds]
    cleaned_labels = [label.strip() for label in labels]
    return cleaned_preds, cleaned_labels


def compute_metrics(eval_preds):
    """Score generated intents against the references.

    Reads the module-level ``tokenizer`` and ``metric`` and calls
    ``postprocess_text``.

    Args:
        eval_preds: ``(predictions, labels)`` pair of token-id arrays; the
            predictions may be wrapped in a tuple, in which case the first
            element is used.

    Returns:
        The metric's results scaled to percentages (rounded to 4 decimals)
        plus ``"gen_len"``, the mean number of non-pad tokens per prediction.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id
    # -100 marks ignored positions and cannot be decoded; replace it with the
    # pad id in both arrays before decoding.
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    # The F1 metric scores integer class ids, not strings: give every distinct
    # decoded text (including unexpected generations) its own stable id.
    class_ids = {
        text: idx
        for idx, text in enumerate(sorted(set(decoded_labels) | set(decoded_preds)))
    }
    result = metric.compute(
        predictions=[class_ids[text] for text in decoded_preds],
        references=[class_ids[text] for text in decoded_labels],
        average="macro",
    )
    result = {k: round(v * 100, 4) for k, v in result.items()}
    result["gen_len"] = np.mean(
        [np.count_nonzero(pred != pad_id) for pred in preds]
    )
    return result

In [5]:
# Load Data

In [7]:
# Intent name -> integer class id; load_dataset() applies it when integer
# labels are requested. Keys must match the intent strings in the data exactly
# (no surrounding whitespace).
label2id = {
    "Cab Booking": 0,
    "Flight Booking": 1,
    "Fetch Booking": 2,
    "Modify Booking": 3,
    "FAQ": 4,
}
# Inverse lookup: integer class id -> intent name.
id2label = {idx: label for label, idx in label2id.items()}

def load_dataset(model_type: str = "") -> DatasetDict:
    """Load the travel-assistant utterances and split them into train/test.

    The spreadsheet is read without a header row; its two columns are named
    ``utterance`` and ``intent`` and both are cast to strings.

    Args:
        model_type: When equal to ``"AutoModelForSequenceClassification"`` the
            intent names are replaced by integer ids from ``label2id``;
            for any other value the intents stay as text.

    Returns:
        A ``DatasetDict`` with ``"train"`` and ``"test"`` splits (80/20).

    Raises:
        ValueError: If integer labels are requested and the data contains an
            intent that ``label2id`` does not know about.
    """
    frame = pd.read_excel(
        "Travel Assistant_MLUtterances.xlsx",
        header=None,
        names=["utterance", "intent"],
    )
    frame["utterance"] = frame["utterance"].astype(str)
    # Trim stray whitespace so equal intents compare equal.
    frame["intent"] = frame["intent"].astype(str).str.strip()

    if model_type == "AutoModelForSequenceClassification":
        # Convert labels to integers. Keys are normalized the same way as the
        # data so a whitespace difference cannot silently yield NaN labels.
        normalized = {name.strip(): idx for name, idx in label2id.items()}
        unknown = sorted(set(frame["intent"]) - set(normalized))
        if unknown:
            raise ValueError(f"Intents missing from label2id: {unknown}")
        frame["intent"] = frame["intent"].map(normalized)

    full_dataset = Dataset.from_pandas(frame)
    full_dataset = full_dataset.shuffle(seed=42)
    # Seed the split as well; otherwise train/test membership changes per run.
    return full_dataset.train_test_split(test_size=0.2, seed=42)

# Default model_type: intents are kept as text (no label2id mapping is applied).
dataset = load_dataset()

In [8]:
# Load Model

In [9]:
# Identifier of the base checkpoint; the model and the tokenizer are both loaded from it.
MODEL_ID = "google/flan-t5-small"
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_ID)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)



In [13]:
# Combine train and test data to compute the max token length for source (utterance) and target (intent)

In [14]:
# Directory for training output and TensorBoard logs (relative to the working
# directory). It was referenced below without ever being defined.
REPOSITORY_ID = f"{MODEL_ID.split('/')[-1]}-intent-classifier"

# Measure lengths over train + test together so the limits cover every example.
all_examples = concatenate_datasets([dataset["train"], dataset["test"]])

# The maximum total sequence length for source text after tokenization.
tokenized_inputs = all_examples.map(
    lambda x: tokenizer(x["utterance"], truncation=True),
    batched=True,
    remove_columns=["utterance", "intent"],
)
max_source_length = max(len(ids) for ids in tokenized_inputs["input_ids"])
print(f"Max source length: {max_source_length}")

# The maximum total sequence length for target text after tokenization.
# Sequences longer than this will be truncated, sequences shorter will be padded.
tokenized_targets = all_examples.map(
    lambda x: tokenizer(x["intent"], truncation=True),
    batched=True,
    remove_columns=["utterance", "intent"],
)
max_target_length = max(len(ids) for ids in tokenized_targets["input_ids"])
print(f"Max target length: {max_target_length}")

# Define training args.
# Only the training loss is logged (once per epoch); no evaluation strategy is
# set here.
training_args = Seq2SeqTrainingArguments(
    output_dir=REPOSITORY_ID,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    fp16=False,  # Overflows with fp16
    learning_rate=3e-4,
    num_train_epochs=10,
    logging_dir=f"{REPOSITORY_ID}/logs",
    logging_strategy="epoch",
    load_best_model_at_end=False,
    report_to="tensorboard",
)

Map:   0%|          | 0/175 [00:00<?, ? examples/s]

Max source length: 25


Map:   0%|          | 0/175 [00:00<?, ? examples/s]

Max target length: 4


In [15]:
# Tokenize both splits; the raw text columns are dropped afterwards.
tokenized_dataset = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=["utterance", "intent"],
)
print(f"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}")

# Label positions filled with this value are ignored by the loss, so the
# collator pads labels with it instead of the tokenizer's pad token.
label_pad_token_id = -100

# Collator that batches the examples and pads them to a multiple of 8.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8,
)

# Trainer wiring: model, arguments, collator, both splits and the metric hook.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    compute_metrics=compute_metrics,
)



Map:   0%|          | 0/140 [00:00<?, ? examples/s]

Map:   0%|          | 0/35 [00:00<?, ? examples/s]

Keys of tokenized dataset: ['input_ids', 'attention_mask', 'labels']


In [16]:
# Train Model
# Runs fine-tuning with the settings in `training_args`; the returned
# TrainOutput is displayed as the cell result.
trainer.train()

Step,Training Loss
18,2.6928
36,0.5505
54,0.2525
72,0.1372
90,0.1071
108,0.0586
126,0.0658
144,0.0484
162,0.0265
180,0.0191


TrainOutput(global_step=180, training_loss=0.3958520162436697, metrics={'train_runtime': 234.6185, 'train_samples_per_second': 5.967, 'train_steps_per_second': 0.767, 'total_flos': 16265419161600.0, 'train_loss': 0.3958520162436697, 'epoch': 10.0})

In [17]:
# Save Model
# The tokenizer and the trained model are written to the same directory so
# that both can be reloaded from that single path.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
local_path_to_Save_model ="D:/projects/llm-finetuning/intent-classifier/flan-t5-small-intent-classifier"
tokenizer.save_pretrained(local_path_to_Save_model)
trainer.save_model(local_path_to_Save_model)

In [18]:
# Load Finetuned Model
# Reload tokenizer and model from the directory written by the save step;
# classify() uses these two objects for inference.
tokenizer_finetuned = AutoTokenizer.from_pretrained(local_path_to_Save_model)
model_finetuned = AutoModelForSeq2SeqLM.from_pretrained(local_path_to_Save_model)

In [None]:
# Test

In [19]:
def classify(texts_to_classify: Union[str, List[str]]) -> List[str]:
    """Predict the intent of one utterance or of a batch of utterances.

    Uses the module-level ``tokenizer_finetuned`` and ``model_finetuned``.

    Args:
        texts_to_classify: A single utterance or a list of utterances.

    Returns:
        The decoded intent for each input, in input order. A single string
        input yields a one-element list; an empty batch yields ``[]``.
    """
    # Nothing to tokenize or generate for an empty batch.
    if isinstance(texts_to_classify, (list, tuple)) and not texts_to_classify:
        return []

    inputs = tokenizer_finetuned(
        texts_to_classify,
        padding="max_length",
        truncation=True,
        max_length=256,
        return_tensors="pt",
    )
    # Put the inputs on the device the model lives on. Picking "cuda" whenever
    # it is available fails with a device mismatch while the model is on CPU.
    inputs = inputs.to(model_finetuned.device)

    with torch.no_grad():
        outputs = model_finetuned.generate(
            inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=150,
            num_beams=2,
            early_stopping=True,
        )

    return tokenizer_finetuned.batch_decode(outputs, skip_special_tokens=True)

In [29]:
# Quick sanity check on a single utterance; the result is a one-element list.
classify('Book cab from Noida to Gurugram')

['Cab Booking']

In [35]:
# Evaluate

In [27]:
def evaluate(batch_size: int = 16):
    """Classify the test split in batches and print a classification report.

    Uses the module-level ``dataset`` and ``classify``.

    NOTE: this function has the same name as the imported ``evaluate``
    package, so once this definition has run, ``evaluate.load(...)`` no
    longer refers to the library. The name is kept so existing calls work.

    Args:
        batch_size: Number of utterances sent to ``classify`` at once; adjust
            to the available GPU/CPU memory.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    test_split = dataset["test"]
    # Read each column once instead of re-reading it for every batch.
    utterances = list(test_split["utterance"])
    labels_list = [str(label) for label in test_split["intent"]]

    predictions_list = []
    batch_starts = range(0, len(utterances), batch_size)
    # The context manager closes the bar even if classification raises.
    with tqdm(total=len(batch_starts), desc="Evaluating") as progress_bar:
        for start in batch_starts:
            batch_texts = utterances[start : start + batch_size]
            predictions_list.extend(classify(batch_texts))
            progress_bar.update(1)

    report = classification_report(labels_list, predictions_list)
    print(report)

In [28]:
# Print the classification report for the test split.
evaluate()

Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

                precision    recall  f1-score   support

   Cab Booking       1.00      1.00      1.00        18
           FAQ       1.00      1.00      1.00         8
 Fetch Booking       1.00      1.00      1.00         2
Flight Booking       1.00      1.00      1.00         5
Modify Booking       1.00      1.00      1.00         2

      accuracy                           1.00        35
     macro avg       1.00      1.00      1.00        35
  weighted avg       1.00      1.00      1.00        35

