## Preliminaries

We begin by installing and loading the required dependencies. This notebook requires Python ≥ 3.9.

In [None]:
!pip install transformers
!pip install bitsandbytes
!pip install accelerate
!pip install huggingface_hub
!pip install peft

In [None]:
from datasets import load_dataset, Dataset
from typing import List, Union

import numpy as np 
import pandas as pd
import torch
import transformers
import wandb
import warnings

# Restrict the transformers logger to error-level messages.
transformers.logging.set_verbosity_error()
# Initialise Weights & Biases in "disabled" mode.
wandb.init(mode="disabled")
# Silence Python warnings for the remainder of the session.
warnings.filterwarnings('ignore')

## Dataset

We will be using MTOP Domain (English), a publicly available dataset hosted on Hugging Face. It consists of user utterances, each belonging to one of the eleven domains shown below:

['messaging', 'calling', 'event', 'timer', 'music', 'weather', 'alarm', 'people', 'reminder', 'recipes', 'news']

The dataset has 15,667 training samples and 2,235 validation samples.

In [None]:
from datasets import load_dataset
import pandas as pd 

# Hugging Face Hub identifier of the dataset to load.
dataset = 'mteb/mtop_domain'
# NOTE(review): trust_remote_code=True permits code shipped with the dataset repository to run locally — confirm it is still needed.
dset = load_dataset(dataset, trust_remote_code=True)
# Convert the splits to pandas DataFrames; the remaining cells work on these.
train_data = pd.DataFrame(dset['train'])
val_data = pd.DataFrame(dset['validation'])
print(f"Num train samples : {len(train_data)}")
print(f"Num validation samples : {len(val_data)}")

## Evaluation metric

For the purpose of benchmarking our experiments, we choose Weighted F1 score as the metric. We also display the classification report and confusion matrix for detailed interpretation.

In [None]:
from sklearn.metrics import (classification_report, confusion_matrix,
                              ConfusionMatrixDisplay, f1_score)

import matplotlib.pyplot as plt

def fetch_performance_metrics(y_true: np.ndarray, y_pred: np.ndarray, exp_name: str,
                              display_report: bool = True, display_confusion_matrix: bool = True,
                             label_list: List[str] = ['messaging', 'calling', 'event', 'timer', 'music', 'weather',
                                                      'alarm', 'people', 'reminder', 'recipes', 'news'],
                              num_labels: int = 11) -> dict:
    """
    Compute the weighted F1 score and optionally show a classification report and confusion matrix.

    Labels are expected to be integers, where label ``i`` corresponds to ``label_list[i]``.

    Args:
        y_true (np.ndarray): Array containing true labels.
        y_pred (np.ndarray): Array containing predicted labels.
        exp_name (str): Name of the experiment. Used as the file name passed to `plt.savefig`
            when the confusion matrix is drawn.
        display_report (bool, optional): Whether to print the classification report. Defaults to True.
        display_confusion_matrix (bool, optional): Whether to plot and save the confusion matrix. Defaults to True.
        label_list (list, optional): Display names of the labels, indexed by integer label. May hold more than
            `num_labels` entries (e.g. a trailing 'no_match'). Defaults to the eleven MTOP domains.
        num_labels (int, optional): Number of labels covered by the classification report
            (labels ``0 .. num_labels - 1``). Defaults to 11.

    Returns:
        dict: ``{'F1-score': <weighted F1>}``.
    """
    if display_report:
        print('\nClassification Report:')

        print(classification_report(y_true=y_true, y_pred=y_pred, labels=list(range(num_labels)),
                                    target_names=label_list[:num_labels]))

    if display_confusion_matrix:
        # Pin the label set so the matrix always has one row/column per entry of `label_list`.
        # Without it the matrix only covers labels that occur in the data, and its size can
        # disagree with `display_labels`, which makes the plot fail.
        cm_labels = list(range(len(label_list)))
        cm = confusion_matrix(y_true=y_true, y_pred=y_pred, labels=cm_labels)
        fig, ax = plt.subplots(figsize=(12, 12))
        display = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=label_list)
        display.plot(ax=ax)
        plt.savefig(exp_name)

    return {'F1-score' : f1_score(y_true, y_pred, average='weighted')}

## Baseline : LLM with ICL

We will need to login to Hugging Face hub to be able to access the LLM. We do this via Hugging Face's notebook_login

In [None]:
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub from within the notebook; needed to access the LLM used below.
notebook_login()

## Defining the LLM Preliminaries

In [None]:
from peft import AutoPeftModelForCausalLM
from tqdm import tqdm
from transformers import (AutoTokenizer, AutoModelForCausalLM,
                            BitsAndBytesConfig, pipeline)

import datasets
import gc

def _generate_predictions(example: datasets.formatting.formatting.LazyBatch, 
                          generator: pipeline, text_column: str, 
                          max_new_tokens: int = 9) -> dict:
    """
    Generates predictions using the text generation model for a given example.

    Args:
        example (datasets.formatting.formatting.LazyBatch): Batch of samples from a dataset.
        generator (pipeline): Huggingface pipeline for text generation.
        text_column (str): Prompt for the text generation model.
        max_new_tokens (int, optional): Maximum number of tokens to generate. Defaults to 9.

    Returns:
        dict: A dictionary containing the generated predictions.
    """
    num_examples = len(dataset)
    predictions = []
    batch_results = generator(example[text_column], max_new_tokens=max_new_tokens, num_return_sequences=1)
    predictions.extend([result[0]["generated_text"] for result in batch_results])
    return {'prediction' : predictions}

def infer_LLM(model_name: str, input_ds: Dataset, batch_size: int = 4, max_new_tokens: int = 9,
             text_column: str = 'domain_prompt', finetuned_model_path: str = None) -> Dataset:
    """
    Load a 4-bit quantised LLM, generate text for every prompt in a dataset and release the model.

    Args:
        model_name (str): Name or path of the base LLM; also used to load the tokenizer.
        input_ds (Dataset): Dataset holding the prompts.
        batch_size (int, optional): Batch size for both the pipeline and the dataset mapping. Defaults to 4.
        max_new_tokens (int, optional): Maximum number of tokens to generate per prompt. Defaults to 9.
        text_column (str, optional): Column holding the prompts. Defaults to 'domain_prompt'.
        finetuned_model_path (str, optional): Path to a fine-tuned PEFT model. When given, the model is
            loaded from this path instead of `model_name`. Defaults to None.

    Returns:
        Dataset: `input_ds` extended with a 'prediction' column of generated text.
    """
    nf4_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.float16,
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")

    # Both loaders take the same placement and quantisation arguments.
    load_kwargs = {"device_map": "auto", "quantization_config": nf4_config}
    if finetuned_model_path is not None:
        model = AutoPeftModelForCausalLM.from_pretrained(finetuned_model_path, **load_kwargs)
    else:
        model = AutoModelForCausalLM.from_pretrained(model_name, **load_kwargs)

    text_generator = pipeline("text-generation", model=model, tokenizer=tokenizer,
                              batch_size=batch_size, truncation=False)
    # Reuse the model's end-of-sequence token id for padding.
    text_generator.tokenizer.pad_token_id = model.config.eos_token_id

    # The kwargs dict is passed inline so that no local keeps the pipeline alive past the cleanup below.
    predictions_ds = input_ds.map(
        _generate_predictions,
        fn_kwargs={'generator' : text_generator,
                   'text_column' : text_column,
                   'max_new_tokens' : max_new_tokens},
        batched=True,
        batch_size=batch_size,
    )

    # Drop every local reference to the model before collecting garbage and emptying the CUDA cache.
    del model, text_generator, tokenizer
    gc.collect()
    torch.cuda.empty_cache()

    return predictions_ds

def build_LLM_prompt(input_ds: Dataset, label_column: str = None, prompt_template: Union[str, None] = None, 
                     with_label: bool = False) -> Dataset:
    """Build the LLM prompt for every sample and store it in a 'domain_prompt' column.

    Two formats are produced:

    * ``with_label=True``  -> ``[UTTERANCE]<text>[/UTTERANCE][DOMAIN]<label>[/DOMAIN]``
      (`prompt_template` is NOT prepended in this mode).
    * ``with_label=False`` -> ``<prompt_template>[UTTERANCE]<text>[/UTTERANCE][DOMAIN]``

    Args:
        input_ds (Dataset): Dataset (or pandas DataFrame, which is converted) with a 'text' column.
        label_column (str, optional): Column holding the text label. Required when `with_label` is True. Defaults to None.
        prompt_template (Union[str, None], optional): Instruction prepended to each prompt when `with_label` is False.
            `None` is treated as an empty prefix. Defaults to None.
        with_label (bool, optional): `True` if the prompts should include labels from `label_column`. Defaults to False.

    Returns:
        Dataset: The input dataset with an additional 'domain_prompt' column.

    Raises:
        ValueError: If `with_label` is True but `label_column` is not provided.
    """
    if isinstance(input_ds, pd.DataFrame):
        input_ds = Dataset.from_pandas(input_ds)

    if with_label:
        if label_column is None:
            raise ValueError("`label_column` must be provided when `with_label=True`")

        return input_ds.map(lambda x: {'domain_prompt': '[UTTERANCE]' + x['text'] + '[/UTTERANCE]' + \
                                          '[DOMAIN]' + x[label_column] + '[/DOMAIN]'})

    # A missing template means "no instruction prefix" rather than a `None + str` TypeError.
    prefix = '' if prompt_template is None else prompt_template
    return input_ds.map(lambda x: {'domain_prompt': prefix + '[UTTERANCE]' + x['text'] + '[/UTTERANCE]' + \
                                      '[DOMAIN]'})

def _extract_label(sample: datasets.formatting.formatting.LazyRow, label_list: List[str]) -> dict:
    """Util function to extract the domain from the generated LLM prediction

    Args:
        sample (datasets.formatting.formatting.LazyRow): Batch of samples from a dataset
        label_list (List[str]): List of possible domains

    Returns:
        dict: Dictionary of extracted predicted labels
    """
    prompt_length = len(sample['domain_prompt'])
    generated_answer = sample['prediction'][prompt_length:].split('[/DOMAIN]')[0].lower()

    label_matched = False
    predicted_label = None
    
    for label in label_list:        
        if label in generated_answer:
            predicted_label = label
            label_matched = True
            break
                
    if not label_matched:
        predicted_label = "no_match"
    
    return {'predicted_label' : predicted_label}
     
def run_llm(val_data: pd.DataFrame, prompt_template: str, model_name: str, domain_list: List[str], label_mapping: dict, 
            label_column: str = 'label', batch_size: int = 4, finetuned_model_path: str = None,
           num_labels: int = 11, compute_metrics: bool = True, exp_name: str = 'mistral_7b') -> Union[list, tuple]:
    """Run end-to-end LLM inference: build prompts, generate, map generations to labels and optionally score them.

    Args:
        val_data (pd.DataFrame): Input data with a 'text' column (and `label_column` when metrics are computed).
        prompt_template (str): Text instruction to prepend to each transformed input text sample.
        model_name (str): The name or path of the pre-trained LLM.
        domain_list (List[str]): Possible domains, optionally followed by 'no_match'. The list is not modified.
        label_mapping (dict): Dictionary mapping text labels to integers.
        label_column (str, optional): Column holding the integer ground-truth labels. Only read when
            `compute_metrics` is True. Defaults to 'label'.
        batch_size (int, optional): Batch size for inference. Defaults to 4.
        finetuned_model_path (str, optional): Path to the fine-tuned model, if available. Defaults to None.
        num_labels (int, optional): Number of valid labels. Generations that match no domain are encoded
            as `num_labels`. Defaults to 11.
        compute_metrics (bool, optional): Whether to compute the performance metrics. Defaults to True.
        exp_name (str, optional): Experiment name forwarded to `fetch_performance_metrics` (used as the
            file name of the saved confusion matrix). Defaults to 'mistral_7b'.

    Returns:
        list: The integer predictions, when `compute_metrics` is False.
        tuple: ``(predictions, metrics_dict)`` when `compute_metrics` is True.
    """
    val_ds = build_LLM_prompt(val_data, prompt_template=prompt_template)
    val_ds_with_pred = infer_LLM(model_name, val_ds, batch_size, finetuned_model_path=finetuned_model_path)

    predicted_label_list = val_ds_with_pred.map(_extract_label,
                                  fn_kwargs={"label_list": domain_list[:num_labels]})['predicted_label']

    # Unknown generations ("no_match") fall back to the extra index `num_labels`.
    y_pred = [label_mapping.get(pred, num_labels) for pred in predicted_label_list]

    if not compute_metrics:
        # Ground truth is not needed here, so unlabelled data is accepted.
        return y_pred

    y_true = val_data[label_column].astype(int).values.tolist()

    # Work on a copy: removing 'no_match' from the caller's list would make a later call
    # fail with `ValueError: list.remove(x): x not in list`.
    label_list = list(domain_list)
    if num_labels not in y_pred and 'no_match' in label_list:
        # All LLM predictions match a valid domain from `domain_list`
        label_list.remove('no_match')

    return y_pred, fetch_performance_metrics(y_true, y_pred, exp_name, label_list=label_list,
                                             num_labels=num_labels)

* `build_LLM_prompt` transforms the input text into an LLM prompt.
* `infer_LLM` and `_generate_predictions` instantiate the LLM and run inference with the constructed input prompts.
* `_extract_label` maps the LLM free text outputs to valid domain predictions. If the generated text has no matching domain, the predicted label is set to "no_match".
* `run_llm` invokes `build_LLM_prompt`, `infer_LLM` and `_extract_label` to perform inference, and returns the predictions together with the performance metrics computed on the input validation data.

## Build the LLM prompt 
We select one sample at random for each label and build the prompt prefix to run ICL.

In [None]:
model_name = "mistralai/Mistral-7B-Instruct-v0.2"
seed = 43

# One training utterance per domain (seeded draw) serves as the in-context examples.
sample_data = train_data.groupby('label_text').sample(n=1, random_state=seed).reset_index(drop=True)
domain_list = ['messaging', 'calling', 'event', 'timer', 'music', 'weather',
                'alarm', 'people', 'reminder', 'recipes', 'news']
domain_list_str = ', '.join(domain_list)

transformed_sample_data = build_LLM_prompt(sample_data, with_label=True, label_column='label_text')
samples_str = '\n'.join(transformed_sample_data['domain_prompt'])

# The labelled examples must be part of the prompt prefix for this to be in-context learning.
# The trailing newline separates the last example from the utterance appended by `build_LLM_prompt`.
prompt_template =  "<s>[INST] You are a helpful, respectful and honest assistant. Choose one option that best describes the domain behind the given utterance based on the following comma separated options: " + domain_list_str + "[/INST] </s>" + samples_str + "\n"

## Putting it all to work
We are ready to run our LLM now.

In [None]:
# Build the text label -> integer label lookup from the training split.
mapping_df = train_data[['label', 'label_text']].drop_duplicates().sort_values(by='label')
text_to_label = dict(zip(mapping_df['label_text'], mapping_df['label']))
# 'no_match' goes last, at the index run_llm assigns to generations that match no domain.
llm_domain_list = domain_list + ['no_match']

# With compute_metrics left at its default (True), run_llm returns (predictions, metrics).
_, score = run_llm(val_data, prompt_template, model_name, llm_domain_list, text_to_label,
        batch_size=64)
print(score)

## Our Approach : Self-Training using DQC Toolkit
Our proposed self-training approach is comprised of the following three steps - 
1. Generate LLM Predictions on Unlabelled Data
2. Apply Label Correction using DQC Toolkit
3. Fine-tune LLM using Reliably Labelled Data 
### Step 1 - Generate LLM Predictions on Unlabelled Data
We leverage LLM with ICL to generate initial labels for the data to train our model.

In [None]:
# compute_metrics=False: only the list of integer predictions is returned (no scoring against 'label').
predictions = run_llm(train_data, prompt_template, model_name, llm_domain_list, text_to_label,
                      batch_size=64, compute_metrics=False)

As mentioned before, many predictions can end up being mapped to "no_match". We remove such samples from the data.

In [None]:
# Assign positionally: `predictions` follows the row order of `train_data`, so no index
# alignment is wanted (a Series would be matched on index labels and could introduce NaNs).
train_data['llm_predicted_label'] = list(predictions)
## Only valid label predictions
train_data_with_llm_pred = train_data.loc[train_data['llm_predicted_label'] < len(domain_list)].reset_index(drop=True)

### Step 2 - Apply Label Correction using DQC Toolkit
Currently, DQC toolkit offers `CrossValCurate` for curation of text classification datasets (binary / multi-class) using cross validation based label prediction. We will leverage this module to acquire better quality labels for our data.

We install DQC Toolkit via pypi.

In [None]:
!pip install dqc-toolkit

Next, we curate the data and return corrected predictions

In [None]:
from dqc import CrossValCurate
# Seeded curator; `calibration_method` selects how prediction probabilities are calibrated.
cvc = CrossValCurate(random_state=seed, 
                     calibration_method='calibrate_using_baseline' )

# The LLM predictions are the labels being curated.
train_data_curated = cvc.fit_transform(train_data_with_llm_pred, y_col_name='llm_predicted_label')

`CrossValCurate` is instantiated here with two parameters: *`random_state`* (random seed for reproducibility) and *`calibration_method`* (whether/how to calibrate the prediction probabilities of the model being trained for label correction). You can check out all the hyper-parameters available for the modules in the documentation [here](https://sumanthprabhu.github.io/DQC-Toolkit/latest/api/crossval/).

The returned object ``train_data_curated`` is a Pandas dataframe similar to the input dataframe ``train_data_with_llm_pred`` with the following additional columns -
* '*`label_correctness_score`*' represents a normalized score quantifying the correctness of *`llm_predicted_label`*.
* '*`is_label_correct`*' is a boolean flag indicating whether the *`llm_predicted_label`* is to be considered correct (True) or incorrect (False).
* '*`predicted_label`*' and '*`prediction_probability`*' represent DQC Toolkit's predicted label for a given sample and the corresponding probability score.

We leverage *`is_label_correct`* to identify reliably labelled samples

In [None]:
# Keep only the samples whose LLM label was flagged as correct, then renumber the rows.
train_data_curated = train_data_curated[train_data_curated['is_label_correct']].reset_index(drop=True)

### Step 3 - Fine-tune LLM using Reliably Labelled Data
We fine-tune the LLM using ``train_data_curated`` with *`llm_predicted_label`* as the target variable. First, we map the integer labels to text labels for LLM interpretability.

In [None]:
# Invert the text -> id lookup so integer predictions can be written out as domain names.
label_to_text = dict(zip(text_to_label.values(), text_to_label.keys()))
train_data_curated['llm_predicted_label_text'] = train_data_curated['llm_predicted_label'].map(label_to_text)

Next, we transform the data into instruction prompts for better performance

In [None]:
# Instruction used for fine-tuning. Reassigning `prompt_template` means later cells that read it see this version.
prompt_template =  "<s>[INST] You are a helpful, respectful and honest assistant. Choose one option that best describes the domain that can be mapped to the given utterance based on the following comma separated options: " + domain_list_str + "[/INST] </s>"
label_column = 'llm_predicted_label_text'

# With with_label=True, build_LLM_prompt emits only the utterance/domain part ...
train_data_curated_ds = build_LLM_prompt(train_data_curated, with_label=True, label_column=label_column)
# ... so the instruction is prepended here (bound as a default argument of the lambda).
train_data_curated_ds = train_data_curated_ds.map(lambda example, prompt_template=prompt_template : {'domain_prompt' : prompt_template + example['domain_prompt']})

Then, we define the LLM fine-tuning function

In [None]:
from peft import get_peft_model, LoraConfig, PeftConfig, PeftModel, prepare_model_for_kbit_training
from tqdm import tqdm
from transformers import (AutoModelForCausalLM, AutoTokenizer, 
                          BitsAndBytesConfig, DataCollatorForLanguageModeling,
                          pipeline, Trainer, TrainingArguments 
                          )

import bitsandbytes as bnb
import torch.nn as nn

def tokenize(example: datasets.formatting.formatting.LazyRow, tokenizer: AutoTokenizer ) -> dict:
    """Tokenize the 'domain_prompt' text of a single sample without truncation.

    Args:
        example (datasets.formatting.formatting.LazyRow): Sample whose 'domain_prompt' field is tokenized.
        tokenizer (AutoTokenizer): Callable tokenizer; invoked with the text and `truncation=False`.

    Returns:
        dict: Plain dictionary holding whatever fields the tokenizer returned.
    """
    encoding = tokenizer(example['domain_prompt'], truncation=False)
    # Copy the tokenizer output into a plain dict.
    return dict(encoding)
    
def finetune_LLM(base_model_name: str, train_ds: Dataset,
              save_path: str, seed: int, batch_size: int = 64, num_epochs: int = 1):
    """Fine-tune a 4-bit quantised LLM with LoRA adapters on the given prompts and save the result.

    Args:
        base_model_name (str): The name or path of the LLM model to be finetuned.
        train_ds (Dataset): Input dataset with a 'domain_prompt' column containing the training prompts.
        save_path (str): Output directory for training and path to save the trained model.
        seed (int): Random seed for reproducibility. Applied globally before the adapters are
            created and forwarded to the `Trainer`.
        batch_size (int, optional): Per-device batch size to use during training. Defaults to 64.
        num_epochs (int, optional): Number of training epochs. Defaults to 1.
    """
    # Local import keeps the function usable independently of which import cells have run.
    from transformers import set_seed

    # Seed first so that the randomness used while the LoRA adapters are set up is covered too,
    # not only what happens once the Trainer is constructed.
    set_seed(seed)

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=False,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16
    )

    model = AutoModelForCausalLM.from_pretrained(base_model_name, 
                                                 quantization_config=bnb_config, 
                                                 device_map="auto")
    
    tokenizer = AutoTokenizer.from_pretrained(base_model_name, padding_side="left")
    # Reuse the end-of-sequence token for padding.
    tokenizer.pad_token = tokenizer.eos_token
    
    train_ds = train_ds.map(
        tokenize,
        batched=False,
        fn_kwargs={"tokenizer": tokenizer},
    )
    
    model = prepare_model_for_kbit_training(model)
    
    peft_config = LoraConfig(
        lora_alpha=16,
        lora_dropout=0.1,
        r=64,
        bias="none",
        task_type="CAUSAL_LM",
    )

    # NOTE(review): `report_to=None` may not disable reporting integrations in every transformers
    # version (the documented opt-out is "none") — confirm the intent.
    args = TrainingArguments(
            disable_tqdm=False,
            output_dir=save_path,
            warmup_steps=1,
            per_device_train_batch_size=batch_size,
            num_train_epochs=num_epochs,
            learning_rate=2e-4,
            fp16=True,
            optim="paged_adamw_8bit",             
            logging_dir="./logs",        
            save_strategy="no",              
            evaluation_strategy="no",                             
            report_to=None,
            seed=seed
        )
    model = get_peft_model(model, peft_config)
    # The generation cache is switched off for training.
    model.config.use_cache = False
    
    trainer = Trainer(
        model=model,
        # Only the tokenized fields are handed to the trainer.
        train_dataset=train_ds.select_columns(['input_ids', 'attention_mask']),
        eval_dataset=None,
        args=args,
        data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
    )

    trainer.train()
    trainer.model.save_pretrained(save_path)
    
    # Cleanup
    del model
    del tokenizer
    del trainer

    gc.collect()
    torch.cuda.empty_cache()

Finally, we are ready to fine-tune the model. The number of training epochs is set to 1 and batch size is set to 64.

In [None]:
# Same base model as the baseline run.
model_name = "mistralai/Mistral-7B-Instruct-v0.2"

# Directory where the fine-tuned model is saved; it is passed to run_llm afterwards.
finetuned_model_path = "selftrained-mistral-mtop"
finetune_LLM(model_name, train_data_curated_ds, save_path=finetuned_model_path, seed=seed)

## Test the Self-Trained Model's Performance

In [None]:
# `prompt_template` now holds the fine-tuning instruction, so evaluation uses the same wording as training.
# NOTE(review): the confusion matrix is saved under the same default experiment name as the baseline run,
# so the earlier image file is overwritten.
_, score = run_llm(val_data, prompt_template, model_name, llm_domain_list, text_to_label,
       finetuned_model_path=finetuned_model_path, batch_size=64)
print(score)