- This notebook fine-tunes the SetFit (Sentence Transformers) Hugging Face model on a small labelled dataset of customer reviews to solve a multiclass classification problem and generate topics/themes for unlabelled data

In [None]:
# Import required packages
import os
from datetime import date

import numpy as np
import pandas as pd
from loguru import logger

from setfit import SetFitModel, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.model_selection import train_test_split

# Configure PyTorch's MPS (Apple-silicon GPU) memory allocator through its
# environment variable. Per the PyTorch MPS documentation a ratio of 0.0
# disables the high-watermark allocation limit.
# NOTE(review): this removes the allocator's out-of-memory safeguard -- confirm
# it is intended; presumably it only takes effect if set before MPS memory is
# first allocated.
os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0"

In [None]:
# Set Main Variables
# Checkpoint name handed to SetFitModel.from_pretrained() in model_finetuning().
TRAINING_MODEL = 'sentence-transformers/paraphrase-mpnet-base-v2'
# Value passed as `test_size` to train_test_split(), i.e. the share of labelled
# rows held out for validation.
TEST_SIZE = 0.15
# SetFit training configuration, passed as `args` to the Trainer in
# model_finetuning(): batch size, number of epochs and the training seed.
Training_Arguments = TrainingArguments(
    batch_size=36,
    num_epochs=1,
    seed = 123 
)

In [None]:
def data_prep(csv_path=None, random_state=42):
    """
    Reads the raw data from a CSV file, preprocesses it,
    and returns a DataFrame.

    Only the ``text`` and ``label`` columns are kept, rows missing either
    value are dropped, label names are stripped of surrounding whitespace and
    mapped to their integer category ids, and the rows are shuffled.

    Args:
        csv_path: Path of the labelled-reviews CSV. When ``None`` (default)
            ``../data/labelled_reviews.csv`` relative to the working
            directory is read.
        random_state: Seed for the row shuffle. A fixed default keeps the row
            order -- and therefore the later train/validation split --
            identical between runs. Pass ``None`` for an unseeded shuffle.

    Returns:
        pandas.DataFrame with columns ``text`` and ``label``. Labels that are
        not present in the category mapping are left as NaN (a warning is
        logged) so the caller decides how to handle them.
    """
    if csv_path is None:
        csv_path = os.path.join("..", "data", "labelled_reviews.csv")

    category_mapping = {
        'Flights and Departures': 1,
        'Entertainment and Food': 2,
        'Cabin Comfort and Baggage': 3,
        'Lounge Experience': 4,
        'Boarding and Crew Experience': 5,
        'Bookings and Refunds': 6,
    }

    df_raw = pd.read_csv(csv_path)
    # .copy() so the column assignment below writes to an independent frame
    # rather than to a slice of the frame that was read.
    df_raw = df_raw[['text', 'label']].dropna(subset=['text', 'label']).copy()

    label_names = df_raw['label'].str.strip()
    df_raw['label'] = label_names.map(category_mapping)

    # Surface label names the mapping does not know instead of letting them
    # turn into NaN unnoticed.
    unmapped = df_raw['label'].isna()
    if unmapped.any():
        logger.warning(
            "{} row(s) have a label that is not in the category mapping: {}",
            int(unmapped.sum()),
            label_names[unmapped].dropna().unique().tolist(),
        )

    # Seeded shuffle: an unseeded one would make every downstream step
    # non-reproducible even though the split and the training are seeded.
    return df_raw.sample(frac=1, random_state=random_state)

In [None]:
def data_split_train_test(responses, test_size):
    """
    Splits the labelled dataset into stratified train and validation sets.

    Rows without a label are excluded from the split. The input DataFrame is
    left untouched (the rows are dropped on a copy, not in place).

    Args:
        responses: DataFrame with at least a ``label`` column.
        test_size: Passed to ``train_test_split`` as ``test_size``; the share
            (or number) of rows placed in the validation set.

    Returns:
        Tuple ``(train_pd, val_pd)`` of DataFrames, each with a fresh
        0-based index.

    Raises:
        ValueError: Propagated from ``train_test_split`` when the stratified
            split is not possible (e.g. a label with too few rows).
    """
    labelled = responses.dropna(subset=['label'])
    train, val = train_test_split(
        labelled,
        test_size=test_size,
        random_state=42,  # fixed seed so the split is repeatable
        shuffle=True,
        stratify=labelled['label'],  # keep label proportions in both parts
    )
    return train.reset_index(drop=True), val.reset_index(drop=True)

In [None]:
def data_transformation(train_pd, val_pd):
    """
    Convert the train and validation DataFrames into ``datasets.Dataset``
    objects, the input format used for fine-tuning.

    Returns the pair ``(train, val)`` in the same order as the arguments.
    """
    # Same conversion for both frames; the tuple keeps the argument order.
    return tuple(Dataset.from_pandas(frame) for frame in (train_pd, val_pd))

In [None]:
def model_finetuning(MODEL, TrainingArguments, Train_Data, Val_Data) :
    """
    Fine-tunes the specified model on the provided training data and evaluates it on the validation data.

    Args:
        MODEL: Model name or path given to ``SetFitModel.from_pretrained``.
        TrainingArguments: Training configuration passed to the ``Trainer``
            as ``args``. NOTE: inside this function the parameter name
            shadows the imported ``TrainingArguments`` class; the name is
            kept because it is part of the function's call signature.
        Train_Data: Dataset used for training.
        Val_Data: Dataset used for evaluation.

    Returns:
        Tuple ``(trainer, metrics)``: the trained ``Trainer`` and the result
        of ``trainer.evaluate()``.

    Side effects:
        Saves the fine-tuned model to a directory named
        ``<YYYYMMDD>-reviews-text-classification`` relative to the current
        working directory.
    """
    model = SetFitModel.from_pretrained(MODEL)
    trainer = Trainer(
        model=model,
        args=TrainingArguments,
        train_dataset=Train_Data,
        eval_dataset=Val_Data,
    )
    logger.info('fine-tuning the Setfit model on dataset')
    trainer.train()

    logger.info('saving the fine-tuned model')
    # Date-stamped directory so runs on different days do not overwrite
    # each other.
    model_directory_timestamp = f'{date.today().strftime("%Y%m%d")}-reviews-text-classification'
    trainer.model.save_pretrained(model_directory_timestamp)

    metrics = trainer.evaluate()
    logger.info(f"Performance of the fine-tuned model: {metrics}")
    return trainer, metrics

In [None]:
def main():
    """
    Run the whole pipeline: load and prepare the labelled reviews, split them
    into train/validation sets, convert both to datasets and fine-tune the
    model.

    Returns the ``(trainer, metrics)`` pair produced by ``model_finetuning``.
    """
    labelled_reviews = data_prep()
    split_frames = data_split_train_test(labelled_reviews, TEST_SIZE)
    train_data, val_data = data_transformation(*split_frames)
    fitted_trainer, eval_metrics = model_finetuning(
        TRAINING_MODEL, Training_Arguments, train_data, val_data
    )
    return fitted_trainer, eval_metrics

In [None]:
# Run the pipeline only when executed as a script/notebook cell, keeping the
# trainer and its evaluation metrics available at module level afterwards.
if __name__ == "__main__":
    trainer, metrics = main()