In [None]:
# Install required packages and libraries
!pip install setfit==1.0.3
!pip install fuzzywuzzy==0.18.0
!pip install loguru==0.6.0

In [None]:
# Import packages and libraries
import re
import pandas as pd
from fuzzywuzzy import fuzz
from loguru import logger
from setfit import SetFitModel
from google.colab import files

# Ignore warnings
import warnings

# With only the "ignore" action given, the filter applies to every warning
# category, so deprecation notices from the libraries above are hidden as well.
warnings.filterwarnings("ignore")

### 1. Load the input data

In [None]:
def load_sample_data(frac=0.03, random_state=None):
    """
    Upload the reviews CSV from the local machine and return a random sample of it.

    Args:
        frac: Fraction of the rows to sample (default 0.03, i.e. 3%).
        random_state: Optional seed forwarded to ``DataFrame.sample``. Leave it
            as None to draw a different sample on every call.

    Returns:
        DataFrame with the single column ``ReviewBody`` (as strings) holding the
        sampled reviews. Rows whose review text is missing are removed.
    """
    logger.info(
        "Uploading the reviews csv file from local machine and sampling small data"
    )
    # Trigger the upload; the return value is not needed because the file is
    # read by its fixed name below, so it must be uploaded as "reviews.csv".
    files.upload()
    reviews = pd.read_csv("reviews.csv")
    sampled = reviews[["ReviewBody"]].sample(frac=frac, random_state=random_state)
    # Drop missing reviews BEFORE casting to str: astype(str) turns NaN into the
    # literal text "nan", which a null check afterwards can no longer detect.
    sampled = sampled.dropna(subset=["ReviewBody"])
    return sampled.astype({"ReviewBody": str})

In [None]:
# Load the raw input: a random sample of the uploaded reviews, reduced to the
# `ReviewBody` column by load_sample_data
data = load_sample_data()

### 2. Generate inference using the fine-tuned SetFit (SF) model

In [None]:
def generate_inference(data: pd.DataFrame):
    """
    Load the fine-tuned customer-reviews SetFit model from the Hugging Face hub
    and predict a theme for every review in ``data``.

    Args:
        data: DataFrame with a ``ReviewBody`` column of review texts. It is left
            untouched; the predictions are written to a copy.

    Returns:
        Tuple ``(fine_tuned_model, reverse_category_mapping, predictions)``:
        the loaded model, the dict mapping numeric labels to theme names, and a
        copy of ``data`` with the columns ``predicted_numeric_label`` and
        ``theme_category`` added. ``theme_category`` is missing (NaN) for any
        predicted label that has no entry in the mapping.
    """
    # Load the fine-tuned model from the hub
    fine_tuned_model = SetFitModel.from_pretrained("sultanaw/customer_reviews_setfit")
    # Run inference on the customer reviews to generate labels/themes
    labels = fine_tuned_model.predict(data["ReviewBody"].tolist())
    # Normalise tensor/array output to a plain list of Python values so the
    # labels can be matched against the integer keys of the mapping below.
    if hasattr(labels, "tolist"):
        labels = labels.tolist()
    # Work on a copy so the caller's frame is not modified as a side effect and
    # re-running the cell always starts from the same input.
    predictions = data.copy()
    predictions["predicted_numeric_label"] = labels
    # Theme name -> numeric label; inverted below to translate predictions back
    # into theme names.
    category_mapping = {
        "Flights and Departures": 1,
        "Entertainment and Food": 2,
        "Cabin Comfort and Baggage": 3,
        "Lounge Experience": 4,
        "Boarding and Crew Experience": 5,
        "Bookings and Refunds": 6,
    }
    reverse_category_mapping = {v: k for k, v in category_mapping.items()}
    predictions["theme_category"] = predictions["predicted_numeric_label"].map(
        reverse_category_mapping
    )
    logger.info(
        "The completion of fine-tuned setfit model predictions with number of customer reviews: {}".format(
            len(predictions)
        )
    )
    return fine_tuned_model, reverse_category_mapping, predictions

In [None]:
# Generate inference (predicted themes) for reviews using the fine-tuned model.
# generate_inference returns exactly three values: the model, the label -> theme
# mapping and the DataFrame with the predictions. The DataFrame is bound to
# `df_predictions`, the name the evaluation cell passes to model_results_eval.
fine_tuned_model, reverse_category_mapping, df_predictions = generate_inference(
    data
)

In [None]:
# Run inference on sample of reviews
# Hand-written example reviews used as a quick sanity check of the model.
sample_reviews = [
    "Fantastic entertainment system with a wide variety of movies and shows. Made the flight enjoyable!",
    "Quick and hassle-free refund process.",
    "Booking was easy, but the flight was delayed. The airline handled it well with regular updates.",
    "The lounge was spacious, clean, and had excellent food and drinks. Great pre-flight relaxation!",
    "Efficient and organized boarding. Clear announcements and helpful staff. Smooth experience!",
]

# Predict numeric labels using the fine_tuned_model
# `.tolist()` turns the prediction container into a plain Python list, so each
# label can be used as a dictionary key in the lookup below.
labels = fine_tuned_model.predict(sample_reviews).tolist()

# Map labels to their respective categories and print the reviews with mapped labels
# A label without an entry in reverse_category_mapping is shown as "Unknown Category".
for review, label in zip(sample_reviews, labels):
    category_name = reverse_category_mapping.get(label, "Unknown Category")
    print(f"Review Label: {label} (Category: {category_name})\nReview: {review}\n")

### 3. Baseline evaluation mechanism using Fuzzywuzzy and Regex

In [None]:
def assign_keywords(row, mapping_dict):
    """
    Look up the keywords expected in a review, based on its predicted theme.

    Returns the entry of ``mapping_dict`` stored under ``row["theme_category"]``,
    or None when the theme has no entry in the mapping.
    """
    return mapping_dict.get(row["theme_category"])


def search(labels, content, threshold=40):
    """
    Search for keywords in the content (review text) based on the theme
    predicted by the fine-tuned model.

    A keyword counts as present when it occurs in ``content`` as a
    case-insensitive literal substring, or when ``fuzz.ratio`` between the
    lower-cased keyword and the lower-cased content is above ``threshold``.

    Args:
        labels: Iterable of keyword strings. None or an empty iterable means
            there is nothing to look for and yields False.
        content: Review text to search in.
        threshold: Similarity score that has to be exceeded for a fuzzy hit
            (default 40).

    Returns:
        True if any of the labels are present in the content, False otherwise.
    """
    # A theme without a keyword list arrives here as None; treat it as "no match"
    # instead of failing on iteration.
    if not labels:
        return False
    lowered_content = content.lower()  # loop-invariant, computed once
    for label in labels:
        # re.escape keeps the keyword literal, so characters such as "(" or "+"
        # are matched as text rather than interpreted as regex syntax.
        if re.search(re.escape(label), content, re.IGNORECASE) is not None:
            return True
        # Fall back to the similarity score only when the exact search misses.
        if fuzz.ratio(label.lower(), lowered_content) > threshold:
            return True
    return False


def model_results_eval(final_df, topic_mapping_keywords):
    """
    Compare the themes predicted by the model with a keyword-based baseline.

    For every review, the keywords of its predicted theme are looked up with
    ``assign_keywords`` and ``search`` checks whether any of them occurs in the
    review text. The share of True/False outcomes is logged.

    Args:
        final_df: DataFrame with the columns ``ReviewBody`` and
            ``theme_category``. It is left untouched; results go into a copy.
        topic_mapping_keywords: Dict mapping a theme name to its list of keywords.

    Returns:
        Copy of ``final_df`` with two added columns: ``key_words`` (the keyword
        list rendered as a string) and ``comparison_result`` (True when a
        keyword was found, False otherwise, including reviews whose theme has
        no keyword list).
    """
    # Copy first so the caller's frame is not modified and the cell can be
    # re-run on the same input.
    evaluated = final_df.copy()
    # Attach the keyword list that belongs to each row's predicted theme
    evaluated["key_words"] = evaluated.apply(
        assign_keywords, args=(topic_mapping_keywords,), axis=1
    )
    # assign_keywords yields None for a theme without keywords; `or []` turns
    # that into "nothing to search for" so such rows evaluate to False.
    evaluated["comparison_result"] = evaluated.apply(
        lambda row: search(row["key_words"] or [], row["ReviewBody"]), axis=1
    )
    evaluated["key_words"] = evaluated["key_words"].astype(str)
    value_counts = evaluated["comparison_result"].value_counts(normalize=True).round(2)
    # Log the value counts using logger.info()
    logger.info(
        f"The comparison result between finetuned model's predictions and established themes based on keywords is:\n{value_counts}"
    )
    return evaluated

In [None]:
# Define list of relevant key words per topic group.
# Keys must match the theme names produced by the model's category mapping;
# values are the words expected to appear in a review of that theme.
topic_mapping_keywords = {
    "Boarding and Crew Experience": [
        "attendants",
        "pleasant",
        "crew",
        "member",
        "boarding",
    ],
    "Entertainment and Food": [
        "media",
        "wifi",
        "screen",
        "headphones",
        "electronics",
        "snacks",
        "drinks",
        "food",
        "beverages",  # was misspelled "baverages", which never matches real text
        "catering",
        "served",
        "refreshments",
    ],
    "Cabin Comfort and Baggage": [
        "seats",
        # NOTE(review): "legging" looks unintended next to "legroom" - confirm
        "legging",
        "legroom",
        "sleep",
        "space",
        "luggage",
        "baggage",
        "bags",
        "suitcase",
    ],
    "Lounge Experience": ["lounge", "service", "offerings", "business"],
    "Bookings and Refunds": [
        "rebook",
        "tickets",
        "refunds",
        "pound",
        "claim",
        "complaint",
        "call centre",
        "email",
        "online",
        "system",
        "telephony",  # was misspelled "telephoney"
        "points",
    ],
    "Flights and Departures": [
        "takeoff",
        "flights",
        "disruptions",
        "delay",
        "time",
        "cancellations",
        "return",
        "inbound",
        "outbound",
        "desk",
        "terminal",
        "staff",
        "connection",
        "hotel",
    ],
}

In [None]:
# Apply the evaluation exercise to monitor the model predictions and compare them
# with the baseline themes derived from the look-up keywords
final_df = model_results_eval(df_predictions, topic_mapping_keywords)