- This notebook fine-tunes a SetFit (Sentence Transformers) Hugging Face model on a small labelled dataset of customer reviews to solve a multiclass classification problem and to generate topics/themes for unlabelled data.

- It also includes a quick fine-tuning demo on sample data from the Hugging Face Datasets library.

In [None]:
# Install required packages
!pip install setfit==1.0.3
!pip install loguru==0.6.0

In [None]:
# Import required packages
import os
from datetime import date
from google.colab import files

import numpy as np
import pandas as pd
from loguru import logger

from huggingface_hub import notebook_login
from setfit import SetFitModel, Trainer, TrainingArguments
from datasets import Dataset,load_dataset

from sklearn.model_selection import train_test_split

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")


In [None]:
# Main configuration shared by the cells below
# Pretrained sentence-transformers checkpoint that SetFit fine-tunes
SETFIT_TRAINING_MODEL = 'sentence-transformers/paraphrase-mpnet-base-v2'
# Fraction of the labelled data held out for validation
TEST_SIZE = 0.15
# batch_size: samples per training batch; num_epochs: full passes over the
# training data; seed: fixed for repeatable runs
Training_Arguments = TrainingArguments(batch_size=16, num_epochs=1, seed=42)

### 1. Data Preparation

In [None]:
def data_prep():
    """
    Upload the labelled reviews CSV, clean it, and return it as a DataFrame.

    The Colab upload widget is shown first. The uploaded bytes are parsed
    directly, so the file does not have to be named 'labelled_reviews.csv'.
    If nothing is uploaded, 'labelled_reviews.csv' is read from the working
    directory instead.

    Cleaning steps: keep only the ``text`` and ``label`` columns, drop rows
    with a missing value in either, strip whitespace from the labels, encode
    the labels as the integer ids in ``category_mapping`` (rows whose label
    is not in the mapping are dropped with a warning), then shuffle with a
    fixed seed.

    Returns:
        pandas.DataFrame: shuffled rows with a ``text`` column and an
        integer ``label`` column (values 1-6).

    Raises:
        KeyError: if the CSV has no ``text`` or ``label`` column.
    """
    import io  # local import: only needed to wrap the uploaded bytes

    default_csv = 'labelled_reviews.csv'
    # Prompt user to upload the reviews csv file
    logger.info('Uploading the labeled reviews dataset from local machine')
    uploaded = files.upload()
    if uploaded:
        # files.upload() returns {file name: file content as bytes}. Parsing
        # those bytes avoids depending on the name the file was saved under.
        if default_csv in uploaded:
            file_name = default_csv
        else:
            file_name = next(iter(uploaded))
        df_raw = pd.read_csv(io.BytesIO(uploaded[file_name]))
    else:
        # Nothing was uploaded: fall back to a copy already on disk
        df_raw = pd.read_csv(default_csv)

    # Keep the two columns used for training and drop incomplete rows
    df_raw = df_raw[['text', 'label']].dropna(subset=['text', 'label']).copy()
    # astype(str) keeps .str.strip() from failing on a non-string column
    df_raw['label'] = df_raw['label'].astype(str).str.strip()

    # Apply a numeric mapping for the categories
    category_mapping = {
        'Flights and Departures': 1,
        'Entertainment and Food': 2,
        'Cabin Comfort and Baggage': 3,
        'Lounge Experience': 4,
        'Boarding and Crew Experience': 5,
        'Bookings and Refunds': 6,
    }
    encoded = df_raw['label'].map(category_mapping)

    # Labels missing from the mapping come back as NaN; drop those rows
    # explicitly so the label column stays integer-typed.
    unknown = encoded.isna()
    if unknown.any():
        logger.warning(
            'Dropping {} rows with labels missing from the category mapping: {}',
            int(unknown.sum()),
            sorted(df_raw.loc[unknown, 'label'].unique()),
        )
    df_raw = df_raw.loc[~unknown].copy()
    df_raw['label'] = encoded[~unknown].astype(int)

    # Shuffle the rows; the fixed seed keeps the later split reproducible
    df_raw = df_raw.sample(frac=1, random_state=42)
    return df_raw

In [None]:
# Prompt for the labelled reviews CSV, then clean it, encode the labels as
# integers, and shuffle the rows
df_training = data_prep()

### 2. Split Train/test

In [None]:
def data_split_train_test(responses, test_size):
    """
    Split the labelled reviews into stratified train and validation sets.

    Args:
        responses: pandas DataFrame with a ``label`` column. It is not
            modified; rows with a missing label are excluded from the split.
        test_size: size of the validation set, passed straight to
            ``train_test_split``.

    Returns:
        tuple: ``(train_pd, val_pd)`` DataFrames, each with a fresh 0..n-1
        index.
    """
    # Filter into a new frame instead of dropping in place, so the caller's
    # DataFrame is left untouched
    labelled = responses.dropna(subset=['label'])
    # Stratify on the label so both sets keep the same class proportions
    train, val = train_test_split(
        labelled,
        test_size=test_size,
        random_state=42,
        shuffle=True,
        stratify=labelled['label'],
    )
    return train.reset_index(drop=True), val.reset_index(drop=True)

In [None]:
# Stratified train/validation split; TEST_SIZE sets the validation share
train_pd, val_pd = data_split_train_test(df_training, TEST_SIZE)

### 3. Data Transformation

In [None]:
def data_transformation(train_pd, val_pd):
    """
    Convert the train and validation DataFrames into `datasets.Dataset` objects.

    Args:
        train_pd: pandas DataFrame holding the training rows.
        val_pd: pandas DataFrame holding the validation rows.

    Returns:
        tuple: ``(train, val)`` as `Dataset` objects, in that order.
    """
    # The trainer consumes Dataset objects, so convert each frame in turn
    train_set, val_set = (
        Dataset.from_pandas(frame) for frame in (train_pd, val_pd)
    )
    return train_set, val_set

In [None]:
# Convert both pandas DataFrames into Hugging Face Dataset objects
train_data, val_data = data_transformation(train_pd, val_pd)

### 4. Apply fine-tuning on labeled customers reviews dataset

In [None]:
def model_finetuning(MODEL, TrainingArguments, Train_Data, Val_Data):
    """
    Fine-tune a pretrained SetFit model, save it locally, and evaluate it.

    Args:
        MODEL: model name or path handed to `SetFitModel.from_pretrained`.
        TrainingArguments: training arguments instance passed to the trainer
            as `args`. NOTE: inside this function the parameter shadows the
            imported `TrainingArguments` class.
        Train_Data: dataset used for training.
        Val_Data: dataset used for evaluation.

    Returns:
        tuple: ``(trainer, metrics)``, the trainer after training and the
        result of `trainer.evaluate()`.
    """
    # Load the pretrained model and wire it into a trainer
    setfit_trainer = Trainer(
        model=SetFitModel.from_pretrained(MODEL),
        args=TrainingArguments,
        train_dataset=Train_Data,
        eval_dataset=Val_Data,
    )

    logger.info('fine-tuning the Setfit model on dataset')
    setfit_trainer.train()

    # Save into a directory in the working dir, prefixed with today's date
    logger.info('saving the fine-tuned model')
    today_stamp = date.today().strftime("%Y%m%d")
    output_dir = f'{today_stamp}-reviews-text-classification'
    setfit_trainer.model.save_pretrained(output_dir)

    # Score the fine-tuned model on the evaluation dataset
    eval_metrics = setfit_trainer.evaluate()
    logger.info(f"Performance of the fine-tuned model: , {eval_metrics}")
    return setfit_trainer, eval_metrics

In [None]:
# Fine-tune on the customer reviews train/validation datasets.
# A GPU runtime is recommended here because of the size of the data.
trainer, metrics = model_finetuning(
    MODEL=SETFIT_TRAINING_MODEL,
    TrainingArguments=Training_Arguments,
    Train_Data=train_data,
    Val_Data=val_data,
)

### 5. Run a Fine-tuning demo on sample dataset from Datasets Library

In [None]:
def data_load(samples_per_class=8):
    """
    Load the SetFit/SentEval-CR demo dataset with a few-shot training split.

    Args:
        samples_per_class: number of training examples to draw for each
            label (default 8).

    Returns:
        tuple: ``(train_ds, test_ds)``, the sampled few-shot training set
        and the dataset's full ``test`` split.
    """
    # Local import keeps this cell self-contained; setfit is already a
    # dependency of the notebook.
    from setfit import sample_dataset

    # Load the dataset from datasets library
    dataset = load_dataset("SetFit/SentEval-CR")
    # Draw samples_per_class examples for every label. Taking the first
    # 8 * 2 rows of a shuffle does not guarantee 8 examples of each class.
    train_ds = sample_dataset(
        dataset["train"], label_column="label", num_samples=samples_per_class
    )
    # Get the test dataset
    test_ds = dataset["test"]
    return train_ds, test_ds

In [None]:
# Load the demo data: a small training sample plus the dataset's test split,
# which is used as the validation set
train_ds, val_ds = data_load()

In [None]:
# Fine-tune on the demo train/validation datasets.
# The demo training set is small enough to run on a CPU runtime.
trainer_demo, metrics_demo = model_finetuning(
    MODEL=SETFIT_TRAINING_MODEL,
    TrainingArguments=Training_Arguments,
    Train_Data=train_ds,
    Val_Data=val_ds,
)

### 6. Push the fine-tuned model to hub

In [None]:
# Execute this cell and paste the API access token generated from your Hugging Face
# account; logging in is required before pushing the fine-tuned model to the hub
notebook_login()

In [None]:
# Push the demo trainer's model to the hub. The argument is the target repo id,
# written as "<account name>/<model name>"; replace it with your own.
trainer_demo.push_to_hub("sultanaw/customer_reviews_setfit")