In [None]:
import os
import re
import pandas as pd
from fuzzywuzzy import fuzz
from loguru import logger
from setfit import SetFitModel


In [None]:
# Name of the fine-tuned model (presumably a Hugging Face Hub repo id — confirm).
# NOTE(review): this constant is not referenced anywhere else in the visible
# notebook; inference() loads the hard-coded "20240504-reviews-text-classification"
# instead — confirm which of the two is the intended model.
FINE_TUNED_MODEL = "sultanaw/customer_reviews_setfit"

In [None]:
def load_sample_data(frac: float = 0.05, random_state=None) -> pd.DataFrame:
    """Load a random sample of customer reviews.

    Reads ``../data/reviews.csv`` (relative to the current working directory),
    keeps only the ``ReviewBody`` column, samples a fraction of the rows, drops
    rows whose review is missing and returns the remaining text as ``str``.

    Args:
        frac: Fraction of rows to sample. Defaults to 0.05 (5%).
        random_state: Optional seed forwarded to ``DataFrame.sample``. The
            default ``None`` draws a different sample on every call; pass an
            integer to make the notebook reproducible.

    Returns:
        A new single-column DataFrame (``ReviewBody``) with no missing values.
    """
    reviews = pd.read_csv(os.path.join("..", "data", "reviews.csv"))
    sampled = reviews[["ReviewBody"]].sample(frac=frac, random_state=random_state)
    # Missing reviews must be dropped BEFORE the str cast: astype(str) turns NaN
    # into the literal string "nan", which notnull()/dropna() can no longer see.
    sampled = sampled.dropna(subset=["ReviewBody"])
    # assign() returns a fresh frame, so downstream column writes do not hit a
    # slice of the raw data.
    return sampled.assign(ReviewBody=sampled["ReviewBody"].astype(str))

In [None]:
# Draw the sample of reviews that the rest of the notebook works on.
data = load_sample_data()

In [None]:
def inference(
    data: pd.DataFrame, model_name: str = "20240504-reviews-text-classification"
) -> pd.DataFrame:
    """Predict a theme for every review with the fine-tuned SetFit model.

    Args:
        data: Frame with a ``ReviewBody`` column holding the review text.
        model_name: Path or identifier handed to ``SetFitModel.from_pretrained``.
            Defaults to the location this notebook has always loaded.

    Returns:
        A copy of ``data`` with two extra columns: ``predicted_numeric_label``
        (the raw model output) and ``theme_category`` (the theme name, or NaN
        when the label is not in the mapping below). The input frame is left
        untouched.
    """
    # load the model
    fine_tuned_model = SetFitModel.from_pretrained(model_name)
    # Run inference on the review texts
    labels = fine_tuned_model.predict(data["ReviewBody"].tolist())

    # Theme name -> numeric label; inverted below to decode the predictions.
    category_mapping = {
        "Flights and Departures": 1,
        "Entertainment and Food": 2,
        "Cabin Comfort and Baggage": 3,
        "Lounge Experience": 4,
        "Boarding and Crew Experience": 5,
        "Bookings and Refunds": 6,
    }
    reverse_category_mapping = {v: k for k, v in category_mapping.items()}

    # Work on a copy: the incoming frame is typically a filtered slice, and
    # writing columns into it mutates the caller's data (and can trigger
    # pandas' SettingWithCopyWarning).
    predictions = data.copy()
    predictions["predicted_numeric_label"] = labels
    predictions["theme_category"] = predictions["predicted_numeric_label"].map(
        reverse_category_mapping
    )

    # Labels missing from the mapping become NaN themes; surface that instead
    # of letting it pass silently into the evaluation step.
    unmapped = int(predictions["theme_category"].isna().sum())
    if unmapped:
        logger.warning(
            "{} prediction(s) had a label outside the known category mapping",
            unmapped,
        )
    logger.info(
        "The completion of fine-tuned setfit model predictions with number of customer reviews: {}".format(
            len(predictions)
        )
    )
    return predictions

In [None]:
# Run the fine-tuned model over the sampled reviews; the returned frame carries
# the predicted numeric label and the decoded theme for each review.
df_predictions = inference(data)

In [None]:
def assign_keywords(row, mapping_dict):
    """
    Look up the keywords expected in a review for the row's predicted theme.

    Reads ``row["theme_category"]`` and returns the matching entry of
    ``mapping_dict``; yields ``None`` when the theme has no entry.
    """
    predicted_theme = row["theme_category"]
    return mapping_dict[predicted_theme] if predicted_theme in mapping_dict else None


def search(labels, content, threshold=40):
    """Check whether any keyword of the predicted theme shows up in a review.

    A keyword counts as present when it occurs in ``content`` as a
    case-insensitive literal substring, or when ``fuzz.ratio`` between the
    keyword and the whole lower-cased ``content`` exceeds ``threshold``.

    Args:
        labels: Iterable of keyword strings. ``None`` or NaN (no keyword list
            for the theme) is treated as "nothing to find".
        content: Raw review text. Non-string values never match.
        threshold: Similarity score the fuzzy comparison must exceed.
            Defaults to 40.

    Returns:
        True if any of the labels are present in the content, False otherwise.
    """
    # assign_keywords yields None for themes without keywords; pandas may turn
    # that into a float NaN. Neither can be iterated.
    if labels is None or isinstance(labels, float):
        return False
    if not isinstance(content, str):
        return False

    lowered_content = content.lower()  # hoisted: identical for every keyword
    for label in labels:
        # Keywords are literal phrases, not regular expressions; escaping keeps
        # characters such as "+", "(" or "." from breaking or widening the match.
        if re.search(re.escape(label), content, re.IGNORECASE):
            return True
        # NOTE(review): fuzz.ratio scores the keyword against the entire review,
        # so it only fires on very short texts — confirm this is the intent.
        if fuzz.ratio(label.lower(), lowered_content) > threshold:
            return True
    return False


def model_results_eval(final_df, topic_mapping_keywords):
    """Compare the model's predicted themes with a keyword-based baseline.

    For every row the keywords of the predicted theme are looked up with
    ``assign_keywords`` and searched for in ``ReviewBody`` with ``search``.
    The share of True/False outcomes is logged.

    Args:
        final_df: Frame with ``theme_category`` and ``ReviewBody`` columns.
        topic_mapping_keywords: Mapping of theme name to its list of keywords.

    Returns:
        A copy of ``final_df`` with two extra columns: ``key_words`` (string
        form of the keyword list, ``"None"`` when the theme has none) and
        ``comparison_result`` (bool). The input frame is left untouched.
    """
    evaluated = final_df.copy()

    # Plain row iteration instead of DataFrame.apply(axis=1): apply may hand
    # back a DataFrame rather than a Series when the frame has no rows.
    keyword_lists = [
        assign_keywords(row, topic_mapping_keywords)
        for _, row in evaluated.iterrows()
    ]
    # A theme without keywords (None) cannot be confirmed by the baseline, so it
    # counts as a mismatch instead of being passed on to search().
    matches = [
        False if keywords is None else search(keywords, review)
        for keywords, review in zip(keyword_lists, evaluated["ReviewBody"])
    ]

    evaluated["key_words"] = [str(keywords) for keywords in keyword_lists]
    evaluated["comparison_result"] = matches

    value_counts = evaluated["comparison_result"].value_counts(normalize=True).round(2)
    # Log the value counts using logger.info()
    logger.info(
        f"The comparison result between finetuned model's predictions and established themes based on keywords is:\n{value_counts}"
    )
    return evaluated

In [None]:
# Keyword baseline: for each theme, the words expected to appear in a review
# that belongs to it. Keywords are matched as case-insensitive substrings of the
# review text, so a misspelled keyword can never match.
topic_mapping_keywords = {
    "Boarding and Crew Experience": [
        "attendants",
        "pleasant",
        "crew",
        "member",
        "boarding",
    ],
    "Entertainment and Food": [
        "media",
        "wifi",
        "screen",
        "headphones",
        "electronics",
        "snacks",
        "drinks",
        "food",
        "beverages",  # was misspelled "baverages"
        "catering",
        "served",
        "refreshments",
    ],
    "Cabin Comfort and Baggage": [
        "seats",
        "legging",  # NOTE(review): looks unintended next to "legroom" — confirm
        "legroom",
        "sleep",
        "space",
        "luggage",
        "baggage",
        "bags",
        "suitcase",
    ],
    "Lounge Experience": ["lounge", "service", "offerings", "business"],
    "Bookings and Refunds": [
        "rebook",
        "tickets",
        "refunds",
        "pound",
        "claim",
        "complaint",
        "call centre",
        "email",
        "online",
        "system",
        "telephone",  # was misspelled "telephoney"
        "points",
    ],
    "Flights and Departures": [
        "takeoff",
        "flights",
        "disruptions",
        "delay",
        "time",
        "cancellations",
        "return",
        "inbound",
        "outbound",
        "desk",
        "terminal",
        "staff",
        "connection",
        "hotel",
    ],
}

In [None]:
# Compare the predicted themes against the keyword baseline; the share of
# matching / non-matching reviews is logged by model_results_eval.
final_df = model_results_eval(df_predictions, topic_mapping_keywords)