### This notebook includes:

1) Fine-Tuning the SetFit Model:
   - Fine-tune the SetFit (Sentence Transformers) Hugging Face model on a small labeled dataset of customer reviews to solve a multi-class classification problem and generate topics/themes for unlabeled data.

2) Quick Fine-Tuning Demo:
   - Demonstration of quick fine-tuning using sample data from the Datasets library.

3) Model Inference:
   - Perform inference on unseen review data points to predict categories.

4) Baseline Evaluation Mechanism:
   - Evaluate model predictions using Fuzzywuzzy and Regex for a baseline comparison.

In [None]:
# Install required packages
!pip install transformers==4.40.2
!pip install setfit==1.0.3
!pip install fuzzywuzzy==0.18.0
!pip install loguru==0.6.0

In [None]:
# Import libraries
import re
from datetime import date
import warnings

import numpy as np
import pandas as pd
from fuzzywuzzy import fuzz
from google.colab import files
from loguru import logger
from sklearn.model_selection import train_test_split

# Hugging Face and SetFit Imports
from huggingface_hub import notebook_login
from setfit import SetFitModel, Trainer, TrainingArguments
from datasets import Dataset, load_dataset

# Ignore warnings
warnings.filterwarnings("ignore")

In [None]:
# Set Main Variables
# Base sentence-transformers checkpoint that is loaded and fine-tuned below
SETFIT_TRAINING_MODEL = "sentence-transformers/paraphrase-mpnet-base-v2"
# Share of the labeled data held out for validation (passed to data_split_train_test)
TEST_SIZE = 0.15
# Training configuration shared by the full fine-tuning run and the quick demo
Training_Arguments = TrainingArguments(
    batch_size=16,  # batch size used during training
    num_epochs=1,  # number of complete passes over the training data
    seed=42,  # random seed handed to the trainer
)

### 1. Data Preparation for fine-tuning

In [None]:
def data_prep():
    """
    Upload the labeled reviews CSV, clean it, and return it as a DataFrame.

    Opens the Colab upload prompt, then reads ``labelled_reviews.csv`` from the
    working directory. Only the ``text`` and ``label`` columns are kept, rows
    with a missing value in either column are dropped, label names are stripped
    of surrounding whitespace and converted to integer ids (1-6), and the rows
    are shuffled.

    Rows whose label is not one of the six known categories are dropped (with a
    warning) instead of being carried along as NaN, so the returned ``label``
    column is always an integer column.

    Returns:
        pd.DataFrame: columns ``text`` and ``label`` (int64), rows shuffled
        with a fixed seed.
    """
    # Prompt user to upload the reviews csv file
    logger.info("Uploading the labeled reviews dataset from local machine")
    files.upload()
    # Read the csv file as pandas df
    df_raw = pd.read_csv("labelled_reviews.csv")
    # Keep the two relevant columns and drop rows missing either of them
    df_raw = df_raw[["text", "label"]].dropna(subset=["text", "label"])
    df_raw["label"] = df_raw["label"].str.strip()
    # Apply a numeric mapping for the categories
    category_mapping = {
        "Flights and Departures": 1,
        "Entertainment and Food": 2,
        "Cabin Comfort and Baggage": 3,
        "Lounge Experience": 4,
        "Boarding and Crew Experience": 5,
        "Bookings and Refunds": 6,
    }
    df_raw["label"] = df_raw["label"].map(category_mapping)
    # Labels absent from the mapping come back as NaN, which would also turn
    # the whole column into floats; drop those rows and restore an int dtype.
    unmapped = df_raw["label"].isna()
    if unmapped.any():
        logger.warning(
            f"Dropping {int(unmapped.sum())} rows with labels missing from the category mapping"
        )
        df_raw = df_raw.loc[~unmapped].copy()
    df_raw["label"] = df_raw["label"].astype("int64")
    # Shuffle the rows; seeded so the downstream (seeded) split is reproducible
    return df_raw.sample(frac=1, random_state=42)

In [None]:
# Upload, load, and prepare the labeled reviews dataset
# (opens an upload prompt; the result holds the "text" and numeric "label" columns)
df_training = data_prep()

### 2. Split Train/test

In [None]:
def data_split_train_test(responses, test_size):
    """
    Split the labeled dataset into stratified train and validation frames.

    Args:
        responses: DataFrame with a "label" column. Rows with a missing label
            are left out of the split. The frame itself is not modified.
        test_size: Passed to ``train_test_split`` as the size of the
            validation part.

    Returns:
        tuple: ``(train_pd, val_pd)`` DataFrames, each with a fresh 0-based index.
    """
    # Work on a filtered copy rather than dropping rows from the caller's frame
    labeled = responses.dropna(subset=["label"])
    train, val = train_test_split(
        labeled,
        test_size=test_size,
        random_state=42,
        shuffle=True,
        # Keep the label distribution the same in both parts
        stratify=labeled["label"],
    )
    return train.reset_index(drop=True), val.reset_index(drop=True)

In [None]:
# Apply the stratified train/validation split, holding out TEST_SIZE of the labeled rows
train_pd, val_pd = data_split_train_test(df_training, TEST_SIZE)

### 3. Data Transformation

In [None]:
def data_transformation(train_pd, val_pd):
    """
    Convert the train and validation DataFrames into ``Dataset`` objects.

    Args:
        train_pd: pandas DataFrame with the training rows.
        val_pd: pandas DataFrame with the validation rows.

    Returns:
        tuple: ``(train, val)``, each built with ``Dataset.from_pandas``.
    """
    # Same conversion for both frames, training frame first
    train, val = (Dataset.from_pandas(frame) for frame in (train_pd, val_pd))
    return train, val

In [None]:
# Convert the pandas train/validation frames into Dataset objects for the trainer
train_data, val_data = data_transformation(train_pd, val_pd)

### 4. Fine-tune the model on the small labeled customer reviews dataset

In [None]:
def model_finetuning(MODEL, TrainingArguments, Train_Data, Val_Data):
    """
    Fine-tune a SetFit model, save it locally, and evaluate it.

    Args:
        MODEL: Identifier passed to ``SetFitModel.from_pretrained``.
        TrainingArguments: Training configuration object given to the trainer
            as ``args`` (note: inside this function the parameter name hides
            the imported ``TrainingArguments`` class).
        Train_Data: Dataset used for training.
        Val_Data: Dataset used by ``trainer.evaluate()``.

    Returns:
        tuple: ``(trainer, metrics)`` - the trainer after training and the
        value returned by ``trainer.evaluate()``.
    """
    # Build the trainer around a freshly loaded base model
    trainer = Trainer(
        model=SetFitModel.from_pretrained(MODEL),
        args=TrainingArguments,
        train_dataset=Train_Data,
        eval_dataset=Val_Data,
    )

    logger.info("fine-tuning the Setfit model on dataset")
    trainer.train()

    logger.info("saving the fine-tuned model")
    # The local output directory is prefixed with today's date as YYYYMMDD
    output_dir = date.today().strftime("%Y%m%d") + "-reviews-text-classification"
    trainer.model.save_pretrained(output_dir)

    # Score the fine-tuned model on the validation data
    metrics = trainer.evaluate()
    logger.info(f"Performance of the fine-tuned model: , {metrics}")
    return trainer, metrics

In [None]:
# Apply the model_finetuning function on the customer reviews train and val data
# This needs GPU power to get completed due to data size
# model_finetuning returns (trainer, metrics); unpack both so that `trainer`
# is the Trainer object itself rather than a tuple
trainer, training_metrics = model_finetuning(
    SETFIT_TRAINING_MODEL, Training_Arguments, train_data, val_data
)

### 5. Run a fine-tuning demo on a sample dataset from the Datasets library

In [None]:
def load_sample_training_dataset():
    """
    Load the "SetFit/SentEval-CR" dataset for the quick fine-tuning demo.

    The train split is shuffled with a fixed seed and cut down to its first
    16 rows. The ``8 * 2`` budget presumably stands for 8 examples for each of
    2 classes, but the rows are picked at random, so the per-class counts are
    not guaranteed to be exactly 8. The test split is returned in full.

    Returns:
        tuple: ``(train_ds, test_ds)``.
    """
    dataset = load_dataset("SetFit/SentEval-CR")
    # Few-shot budget: 16 rows in total
    sample_size = 8 * 2
    shuffled_train = dataset["train"].shuffle(seed=42)
    train_ds = shuffled_train.select(range(sample_size))
    return train_ds, dataset["test"]

In [None]:
# Load the small train sample and the full test split used as validation data for a quick demo
train_ds, val_ds = load_sample_training_dataset()

In [None]:
# Apply the model fine-tuning function on the demo train and val datasets
# The demo training sample is small, so this is meant to be runnable on CPU
# Returns the trainer (pushed to the hub below) and its evaluation metrics
trainer_demo, metrics = model_finetuning(
    SETFIT_TRAINING_MODEL, Training_Arguments, train_ds, val_ds
)

### 6. Push the fine-tuned model to the Hugging Face Hub

In [None]:
# Execute this cell and paste the API access token generated from your Hugging Face account;
# the login is required before the fine-tuned model can be pushed to the hub
notebook_login()

In [None]:
# Push the demo model to the hub; the argument is the target repo id in the
# form "<account name>/<model name>" - replace it with your own
trainer_demo.push_to_hub("sultanaw/fine_tuned_setfit_pydata_demo")

### 7. Load the input data for inference

In [None]:
def load_sample_inference_data():
    """
    Upload the reviews CSV and return a random 3% sample of its review texts.

    Opens the Colab upload prompt, then reads ``reviews.csv`` from the working
    directory, keeps only the ``ReviewBody`` column, samples 3% of the rows,
    removes rows with a missing review and casts the remaining texts to ``str``.

    Returns:
        pd.DataFrame: single column ``ReviewBody`` holding non-null strings.
    """
    logger.info(
        "Uploading the reviews csv file from local machine and sampling small data"
    )
    files.upload()
    # Read the data as pandas df
    data = pd.read_csv("reviews.csv")
    # Load sample reviews data
    data = data[["ReviewBody"]].sample(frac=0.03)
    # Drop missing reviews BEFORE the cast: astype(str) turns NaN into the
    # literal string "nan", which a later null check can no longer detect
    data = data.dropna(subset=["ReviewBody"])
    data["ReviewBody"] = data["ReviewBody"].astype(str)
    return data

In [None]:
# Load the input reviews data for inference (opens an upload prompt and keeps a 3% sample)
data = load_sample_inference_data()

### 8. Generate inference using the fine-tuned SetFit model

In [None]:
def generate_inference(data: pd.DataFrame):
    """
    Load the fine-tuned customer reviews model from the Hugging Face hub and
    predict a theme for every review in ``data``.

    Args:
        data: DataFrame with a ``ReviewBody`` column of review texts. It is
            modified in place: the columns ``predicted_numeric_label`` and
            ``theme_category`` are added.

    Returns:
        tuple: ``(fine_tuned_model, reverse_category_mapping, data)`` - the
        loaded model, the dict mapping numeric ids back to category names, and
        the same DataFrame that was passed in, now with the two new columns.
    """
    # Load the fine-tuned model from the hub
    fine_tuned_model = SetFitModel.from_pretrained("sultanaw/customer_reviews_setfit")
    # Run inference on the customer reviews to generate labels/themes
    labels = fine_tuned_model.predict(data["ReviewBody"].tolist())
    # The prediction may come back as a tensor/array; convert it to plain
    # Python values (as the sample-review cell does) so that every id compares
    # equal to the integer keys of the reverse mapping below
    if hasattr(labels, "tolist"):
        labels = labels.tolist()
    data["predicted_numeric_label"] = labels
    # Reverse the key-value pairs of the category mapping dictionary to map
    # predicted_numeric_label back to strings (topic categories)
    category_mapping = {
        "Flights and Departures": 1,
        "Entertainment and Food": 2,
        "Cabin Comfort and Baggage": 3,
        "Lounge Experience": 4,
        "Boarding and Crew Experience": 5,
        "Bookings and Refunds": 6,
    }
    reverse_category_mapping = {v: k for k, v in category_mapping.items()}
    # Ids without an entry in the mapping end up as NaN in theme_category
    data["theme_category"] = data["predicted_numeric_label"].map(
        reverse_category_mapping
    )
    logger.info(
        f"The completion of fine-tuned setfit model predictions with number of customer reviews: {len(data)}"
    )
    return fine_tuned_model, reverse_category_mapping, data

In [None]:
# Generate inference (predicted theme categories) for reviews using the loaded fine-tuned model from HF
# Also keeps the model and the id -> category name mapping for the sample predictions below
fine_tuned_model, reverse_category_mapping, df_predictions = generate_inference(data)

In [None]:
# Run inference on a handful of hand-written sample reviews
sample_reviews = [
    "Fantastic entertainment system with a wide variety of movies and shows. Made the flight enjoyable!",
    "Quick and hassle-free refund process.",
    "My baggage arrived damaged, and seats were uncomfortable",
    "Booking was easy, but the flight was delayed. The airline handled it well with regular updates.",
    "The lounge was spacious, clean, and had excellent food and drinks. Great pre-flight relaxation!",
    "Efficient and organized boarding. Clear announcements and helpful staff. Smooth experience!",
]

# Predict numeric labels using the fine_tuned_model; .tolist() turns the result into plain Python values
labels = fine_tuned_model.predict(sample_reviews).tolist()

# Map labels to their respective categories and print one summary per review;
# ids missing from the mapping are shown as "Unknown Category"
for review, label in zip(sample_reviews, labels):
    category_name = reverse_category_mapping.get(label, "Unknown Category")
    print(f"Review: {review}\nPredicted Label: {label} (Category: {category_name})\n")

### 9. Baseline evaluation mechanism using Fuzzywuzzy and Regex

In [None]:
def assign_keywords(row, mapping_dict):
    """
    Look up the keyword list that belongs to a row's predicted theme.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with a
            "theme_category" entry.
        mapping_dict: Dict from theme category name to its list of keywords.

    Returns:
        The keywords stored for the row's theme, or None when the theme has no
        entry in ``mapping_dict``.
    """
    return mapping_dict.get(row["theme_category"])

In [None]:
def keywords_search(labels, content):
    """
    Check whether the review text supports the theme predicted by the model.

    Each keyword is first searched for literally (case-insensitive) in the
    text. If it is not found, the fuzzy similarity between the keyword and the
    whole text is used as a fallback.

    Args:
        labels: Iterable of keyword strings for the predicted theme. ``None``
            (or NaN), which is what a theme without a keyword list produces,
            is treated as "no keywords".
        content: The review text to search in.

    Returns:
        True if any keyword is found in the content or is similar enough to
        it, False otherwise (including when there are no keywords).
    """
    # A theme without a keyword list yields None/NaN instead of a list
    if labels is None or isinstance(labels, float):
        return False
    for label in labels:
        # Escape the keyword so it is matched literally (exact search) and
        # regex metacharacters in a keyword cannot break or alter the pattern
        if re.search(re.escape(label), content, re.IGNORECASE) is not None:
            return True
        # Only fall back to the fuzzy score when there is no literal hit
        similarity = fuzz.ratio(label.lower(), content.lower())
        if similarity > 40:  # Using 40 as a threshold for similarity
            return True
    return False

In [None]:
def model_preds_eval(final_df, topic_mapping_keywords):
    """
    Compare the model's predicted themes with a keyword-based baseline.

    For every row the keyword list of its predicted theme is looked up and the
    review text is searched for those keywords (regex and fuzzywuzzy, via
    ``keywords_search``). The share of rows with and without a keyword hit is
    logged.

    Args:
        final_df: DataFrame with the columns ``theme_category`` and
            ``ReviewBody``. It is modified in place: the columns ``key_words``
            and ``comparison_result`` are added.
        topic_mapping_keywords: Dict from theme category name to its list of
            keywords.

    Returns:
        The same DataFrame, with the two new columns.
    """

    def _keywords_found(row):
        """Return True when the row's review contains one of its theme keywords."""
        keywords = row["key_words"]
        # Themes missing from the mapping have no keyword list (None/NaN);
        # count them as "not confirmed" instead of failing on the iteration
        if keywords is None or isinstance(keywords, float):
            return False
        return keywords_search(keywords, row["ReviewBody"])

    # Create the 'key_words' column: words expected to be present in the review
    final_df["key_words"] = final_df.apply(
        assign_keywords, args=(topic_mapping_keywords,), axis=1
    )
    # Apply the keywords search function
    final_df["comparison_result"] = final_df.apply(_keywords_found, axis=1)
    # Share of True/False results, rounded to two decimals
    value_counts = final_df["comparison_result"].value_counts(normalize=True).round(2)
    logger.info(
        f"The comparison result between the finetuned model's predictions and established themes based on keywords look-up is:\n{value_counts}"
    )
    return final_df

In [None]:
# Define the list of relevant keywords per topic group.
# Keys must match the category names produced by the model's label mapping;
# the values are the words searched for in a review predicted to have that topic.
topic_mapping_keywords = {
    "Boarding and Crew Experience": [
        "attendants",
        "pleasant",
        "crew",
        "member",
        "boarding",
    ],
    "Entertainment and Food": [
        "media",
        "wifi",
        "screen",
        "headphones",
        "electronics",
        "snacks",
        "drinks",
        "food",
        "beverages",  # was misspelled "baverages", which could never match literally
        "catering",
        "served",
        "refreshments",
    ],
    "Cabin Comfort and Baggage": [
        "seats",
        "legging",  # NOTE(review): possibly meant "leg room" - confirm
        "legroom",
        "sleep",
        "space",
        "luggage",
        "baggage",
        "bags",
        "suitcase",
    ],
    "Lounge Experience": ["lounge", "service", "offerings", "business"],
    "Bookings and Refunds": [
        "rebook",
        "tickets",
        "refunds",
        "pound",
        "claim",
        "complaint",
        "call centre",
        "email",
        "online",
        "system",
        "telephone",  # was misspelled "telephoney", which could never match literally
        "points",
    ],
    "Flights and Departures": [
        "takeoff",
        "flights",
        "disruptions",
        "delay",
        "time",
        "cancellations",
        "return",
        "inbound",
        "outbound",
        "desk",
        "terminal",
        "staff",
        "connection",
        "hotel",
    ],
}

In [None]:
# Apply the evaluation exercise to monitor model predictions and compare them with baseline themes using lookup words
# (adds the "key_words" and "comparison_result" columns and logs the share of keyword hits)
final_df = model_preds_eval(df_predictions, topic_mapping_keywords)