In [0]:
# ---------------
# INITIAL IMPORTS 
# ---------------

import sys
# Append the Databricks repo checkout to sys.path, presumably so that the
# project package imported later (`models.room_matcher`) can be resolved.
# NOTE(review): this is a hard-coded, user-specific Repos path; consider
# reading it from configuration so other users can run the notebook.
repo_path ='carlos.delacruz@storaenso.com/cupid_ml_api'
sys.path.append(f"/Workspace/Repos/{repo_path}")
import matplotlib.pyplot as plt
import pickle

In [0]:
import pandas as pd 
# Raw inputs read from DBFS: the reference (hotel) rooms and the supplier rooms.
# NOTE(review): "referance" in the file name looks misspelled; confirm it
# matches the file actually stored in DBFS before renaming anything.
hotel_rooms = pd.read_csv("/dbfs/FileStore/cupid/referance_rooms_1737378184366.csv")
supplier_rooms = pd.read_csv("/dbfs/FileStore/cupid/updated_core_rooms.csv")

In [0]:
# -----
# MERGE 
# -----
# Join hotel rooms to supplier rooms on the shared 'lp_id' key.
# merged1 (left join) keeps every hotel-room row, matched or not;
# merged2 (inner join) keeps only rows whose 'lp_id' exists on both sides.
merged1 = hotel_rooms.merge(supplier_rooms, how='left', on='lp_id')
merged2 = hotel_rooms.merge(supplier_rooms, how='inner', on='lp_id')

# EDA observations

## Frequency & Distribution Analysis

In [0]:
# Count the distinct supplier room names attached to each hotel room_id in the
# left-joined frame, then summarise that distribution.
match_counts = merged1.groupby('room_id')['supplier_room_name'].nunique()
match_counts.describe()

In [0]:
# Histogram of the number of distinct supplier room names per hotel room.
fig, ax = plt.subplots(figsize=(10, 5))
ax.hist(match_counts, bins=50, edgecolor='black')
ax.set_title("Distribution of Supplier Room Matches per Hotel Room")
ax.set_xlabel("Number of Supplier Rooms per Hotel Room")
ax.set_ylabel("Count of Hotel Rooms")
ax.set_yscale("log")  # log scale keeps the sparse long tail visible
ax.grid(True)
plt.show()


In [0]:
# Horizontal box plot of the same per-room match counts, to expose outliers.
fig, ax = plt.subplots(figsize=(8, 5))
ax.boxplot(match_counts, vert=False, patch_artist=True)
ax.set_title("Box Plot of Supplier Room Matches per Hotel Room")
ax.set_xlabel("Number of Supplier Rooms per Hotel Room")
ax.grid(True)
plt.show()

### Observations
- Most hotel rooms have fewer than 20 supplier matches (skewed distribution with a long tail).
- A small number of hotel rooms have 40+ supplier matches, and extreme outliers go up to 100.
- The outliers are substantial (visible in the box plot): some hotel rooms are mapped to an implausibly large number of supplier rooms.

## Text Similarity Score Analysis


In [0]:
# Define a Multiplicity Handling Strategy
# Since one hotel room can match multiple supplier rooms, you need to decide how to filter or rank these matches:

# Option A: Keep only the Top-N matches per hotel room
# Based on similarity scores (if calculated later).
# Example: Keep only the top 3 or top 5 supplier rooms per hotel room.
# Option B: Apply a similarity score threshold
# If a match is below a certain similarity (e.g., cosine similarity < 0.7), discard it.
# Option C: Business Rule Filtering
# Prioritize supplier rooms from trusted suppliers over others.
# Keep supplier rooms that have the most common words with the hotel room name.

## Model 

# RoomMatcher

In [0]:
from models.room_matcher import RoomMatcher

# Build the project's matcher using its default constructor arguments.
matcher = RoomMatcher()

In [0]:
# # 1. SAVE MODEL TO DBS This should be stored in repo 
# hotel_rooms_grouped, supplier_rooms_grouped = matcher.preprocess_data(hotel_rooms, supplier_rooms)
# hotel_vectors, supplier_vectors = matcher.vectorize_data(hotel_rooms_grouped, supplier_rooms_grouped)


In [0]:
# 2.  Train and save the model
#matcher.knn.fit(supplier_vectors)  # Train the kNN model
#matcher.save_model()  # Save the trained models for later API usage

In [0]:
# 3. Load the fitted vectorizer and kNN model from DBFS.
# SECURITY: pickle.load can execute arbitrary code embedded in the file, so
# only load artifacts from a location you trust.
# The `with` blocks close the file handles as soon as loading finishes.
with open("/dbfs/FileStore/cupid/vectorizer.pkl", "rb") as vectorizer_file:
    vectorizer = pickle.load(vectorizer_file)
with open("/dbfs/FileStore/cupid/knn_model.pkl", "rb") as knn_model_file:
    knn_model = pickle.load(knn_model_file)

## 4. Save preprocessed data (one-off step, kept for reference)
# hotel_rooms_grouped.to_csv("/dbfs/FileStore/cupid/preprocessed_hotel_rooms.csv")
# supplier_rooms_grouped.to_csv("/dbfs/FileStore/cupid/preprocessed_supplier_rooms.csv")

# 5. Load the preprocessed room data written by step 4.
# hotel_rooms_grouped is loaded here because later cells sample from it; without
# this line it is undefined after a kernel restart.
# NOTE(review): assumes preprocessed_hotel_rooms.csv was saved alongside the
# supplier file by step 4 — confirm it exists in DBFS.
hotel_rooms_grouped = pd.read_csv("/dbfs/FileStore/cupid/preprocessed_hotel_rooms.csv")
supplier_rooms_grouped = pd.read_csv("/dbfs/FileStore/cupid/preprocessed_supplier_rooms.csv")

In [0]:
from models.room_matcher import match_rooms_test

# Smoke test: run a single free-text room name through the loaded vectorizer
# and kNN model against the preprocessed supplier rooms.
# NOTE(review): 'Roome Deluxe' looks deliberately misspelled to exercise
# approximate matching — confirm.
results, unmapped_rooms = match_rooms_test(
    vectorizer=vectorizer,
    knn_model=knn_model,
    supplier_rooms_grouped=supplier_rooms_grouped,
    room_name='Roome Deluxe',
)

In [0]:
results

# Evaluation Experiments

In [0]:
# Data Splitting: Divide your labeled data into training, validation, and test sets. The test set should remain unseen for final evaluation.

#  Use Existing Datasets
# Leverage your preprocessed datasets (e.g., preprocessed_supplier_rooms.csv and a similar file for hotel rooms) and augment them with labels. This step might involve aligning room names manually for a subset of data.


# 4. Hyperparameter & Model Tuning
# a. Hyperparameters to Consider:

# k in kNN (top_k): Test different values for n_neighbors to see how it affects precision and recall.
# Threshold Value: Experiment with thresholds lower and higher than 0.75 to find the optimal trade-off between false positives and false negatives.
# TF-IDF Parameters: Adjust the n-gram range, max_features, or stop words to see their impact on vector quality.
# b. Tuning Strategy:

# Grid or Random Search: Use grid search or random search methods over the defined hyperparameter space.
# Automated Metrics Logging: Record evaluation metrics (F1, MRR, MAP, etc.) for each configuration to visualize performance improvements.
# 5. Model Explainability
# a. Feature Importance

# Use tools like LIME or SHAP to interpret which words or tokens are contributing most to the similarity scores.
# Case Studies: For a few example room names, generate explanations that highlight key features leading to a match decision.
# b. Transparency:

# Document the explainability findings to provide insights into model decisions, which is particularly useful if you need to justify model behavior to stakeholders.
# 6. Concrete Example & Experiment
# Example Scenario:

# Input: “Deluxe King Suite”
# Expected Matches: You might expect a high similarity score for supplier room names like “deluxe king suite” or “luxury king suite.”
# Evaluation:
# Calculate the cosine similarity for this query against your supplier room vectors.
# If the correct supplier room appears within the top-3 predictions and has a similarity score above the optimal threshold determined through your threshold analysis, it counts as a true positive.
# Record the position of the first true match to compute MRR.
# Experiment:

# Run the model on a subset of the labeled dataset and compute precision, recall, F1, MRR, and MAP.
# Vary the threshold and k values while plotting the corresponding precision-recall curves to visually determine optimal settings.

# 7. Continuous Monitoring & Improvement
# Regular Re-Evaluation: As new data comes in, periodically re-run the evaluation suite to monitor for model drift.
# A/B Testing: Deploy different versions of your API to small subsets of traffic to compare performance in a live environment.
# Summary
# This evaluation plan ensures that you systematically measure both the matching accuracy and the ranking quality of your model. By combining manual labeling, a variety of metrics, automated tests, and hyperparameter tuning, you can continuously improve your room-matching API. This structured approach not only helps in achieving better precision and recall but also enhances the overall robustness and explainability of your model.

# Feel free to ask for further details or clarifications on any of the steps!


In [0]:
from tqdm import tqdm
import pandas as pd
from fuzzywuzzy import fuzz
from concurrent.futures import ThreadPoolExecutor, as_completed
import Levenshtein as lev 

def fuzzy_match_room(hotel_name, hotel_id, filtered_supplier_rooms, fuzzy_threshold):
    """
    Score one hotel room name against every candidate supplier room.

    Each value in the "clean_supplier_room_name" column is compared with
    ``hotel_name`` using ``fuzz.token_set_ratio``. Pairs whose score is at
    least ``fuzzy_threshold`` are returned as a list of dicts with the keys
    "hotel_room_id", "supplier_room_id", "fuzzy_score" and "pseudo_match"
    (always 1). An empty candidate frame yields an empty list.
    """
    candidate_names = filtered_supplier_rooms["clean_supplier_room_name"]
    candidate_ids = filtered_supplier_rooms["supplier_room_id"]

    # Lazily score each candidate once, in row order.
    scored_candidates = (
        (candidate_id, fuzz.token_set_ratio(hotel_name, candidate_name))
        for candidate_name, candidate_id in zip(candidate_names, candidate_ids)
    )

    return [
        {
            "hotel_room_id": hotel_id,
            "supplier_room_id": candidate_id,
            "fuzzy_score": similarity,
            "pseudo_match": 1,
        }
        for candidate_id, similarity in scored_candidates
        if similarity >= fuzzy_threshold
    ]

# ----------------------------------------------------------------------
def filter_supplier_rooms(supplier_rooms_grouped, hotel_name):
    """
    Pre-filter supplier rooms by the length of their cleaned name.

    Keeps the rows whose "clean_supplier_room_name" length is within 5
    characters (inclusive) of ``len(hotel_name)``. Rows with a missing name
    are dropped rather than raising. The original index is preserved.
    """
    target_length = len(hotel_name)
    min_length = target_length - 5  # Allow a little variation
    max_length = target_length + 5  # Allow a little variation

    # .str.len() yields NaN for missing names and between() evaluates NaN as
    # False, so such rows are excluded; calling len() on a NaN float would
    # raise TypeError instead.
    name_lengths = supplier_rooms_grouped['clean_supplier_room_name'].str.len()
    return supplier_rooms_grouped[name_lengths.between(min_length, max_length)]

def filter_by_levenshtein_distance(supplier_rooms_grouped, hotel_name, max_distance=5):
    """
    Keep only supplier rooms whose cleaned name is close to the hotel name.

    A row is kept when ``lev.distance(hotel_name, name)`` for its
    "clean_supplier_room_name" value is at most ``max_distance``.
    """
    def _within_distance(supplier_name):
        return lev.distance(hotel_name, supplier_name) <= max_distance

    keep_mask = supplier_rooms_grouped['clean_supplier_room_name'].apply(_within_distance)
    return supplier_rooms_grouped[keep_mask]

def sample_supplier_rooms(supplier_rooms_grouped, sample_size=0.2):
    """
    Return a random subset of the supplier rooms.

    ``sample_size`` is the fraction of rows to draw (0.2 means 20%). The seed
    is fixed at 42, so the same input always yields the same sample.
    """
    return supplier_rooms_grouped.sample(random_state=42, frac=sample_size)
# ----------------------------------------------------------------------

def generate_pseudo_labels(hotel_rooms_grouped: pd.DataFrame,
                           supplier_rooms_grouped: pd.DataFrame,
                           fuzzy_threshold: int = 90) -> pd.DataFrame:
    """
    Generate pseudo ground-truth labels by fuzzy-matching cleaned room names.

    A 5% sample of the supplier rooms is drawn once. For each hotel room that
    sample is narrowed to names within Levenshtein distance 3 of the hotel
    name, then scored with ``fuzzy_match_room``. Pairs scoring at least
    ``fuzzy_threshold`` are treated as matches.

    Parameters:
        hotel_rooms_grouped: frame with "clean_room_name" and "room_id" columns.
        supplier_rooms_grouped: frame with "clean_supplier_room_name" and
            "supplier_room_id" columns.
        fuzzy_threshold: minimum fuzzy score for a pair to count as a match.

    Returns:
        DataFrame with columns "hotel_room_id", "supplier_room_id",
        "fuzzy_score" and "pseudo_match", in hotel-row order. The columns are
        present even when no pair reaches the threshold.
    """
    result_columns = ["hotel_room_id", "supplier_room_id", "fuzzy_score", "pseudo_match"]

    # sample_supplier_rooms uses a fixed seed, so it returns the same rows on
    # every call; draw the sample once instead of once per hotel room.
    supplier_sample = sample_supplier_rooms(supplier_rooms_grouped, sample_size=0.05)

    pseudo_matches = []

    # Use ThreadPoolExecutor for parallelization
    with ThreadPoolExecutor() as executor:
        futures = []

        hotel_pairs = zip(hotel_rooms_grouped["clean_room_name"], hotel_rooms_grouped["room_id"])
        for hotel_name, hotel_id in tqdm(hotel_pairs,
                                         total=hotel_rooms_grouped.shape[0],
                                         desc="Generating Pseudo Labels"):
            # Pre-filter candidates by edit distance. The length-based
            # filter_supplier_rooms is an alternative pre-filter.
            candidate_rooms = filter_by_levenshtein_distance(supplier_sample, hotel_name, max_distance=3)

            futures.append(
                executor.submit(fuzzy_match_room, hotel_name, hotel_id, candidate_rooms, fuzzy_threshold)
            )

        # Collect in submission order so the output row order is deterministic.
        for future in tqdm(futures, total=len(futures), desc="Collecting Matches"):
            pseudo_matches.extend(future.result())

    # Passing the columns explicitly keeps the schema stable for an empty
    # result, so downstream column selection does not raise KeyError.
    return pd.DataFrame(pseudo_matches, columns=result_columns)

# Build pseudo labels on fixed-seed subsamples so the run is reproducible.
# min(...) caps the sample at the frame size; .sample(n) raises when n exceeds
# the number of rows.
# NOTE(review): hotel_rooms_grouped and supplier_rooms_grouped must already be
# loaded/preprocessed by an earlier cell.
hotel_rooms_grouped_sample = hotel_rooms_grouped.sample(
    n=min(1000, len(hotel_rooms_grouped)), random_state=42
)

supplier_rooms_groupeds_sample = supplier_rooms_grouped.sample(
    n=min(10000, len(supplier_rooms_grouped)), random_state=42
)

pseudo_ground_truth_df = generate_pseudo_labels(hotel_rooms_grouped_sample, supplier_rooms_groupeds_sample, fuzzy_threshold=90)

In [0]:
pseudo_ground_truth_df.head()

In [0]:
# Assume matcher is an instance of RoomMatcher
# Runs the matcher on the raw (un-sampled) hotel and supplier room frames.
model_matches_df = matcher.match_rooms(hotel_rooms, supplier_rooms)
#print(model_matches_df.head())

In [0]:
model_matches_df.head()

In [0]:
pseudo_ground_truth_df[['hotel_room_id', 'supplier_room_id', 'pseudo_match']]

In [0]:
# Derive "<column>_test" join keys on both frames by taking element 0 of each id.
# NOTE(review): `.str[0]` returns the first list element if ids are lists, but
# only the first character if they are plain strings — confirm the id format.
for frame in (model_matches_df, pseudo_ground_truth_df):
    for id_column in ("hotel_room_id", "supplier_room_id"):
        frame[f"{id_column}_test"] = frame[id_column].str[0]


In [0]:
model_matches_df.head()

In [0]:
# Attach the pseudo labels to every model-proposed pair (left join keeps all
# model rows; pairs without a pseudo label get NaN in 'pseudo_match').
evaluation_df = model_matches_df.merge(
    pseudo_ground_truth_df[['hotel_room_id_test', 'supplier_room_id_test', 'pseudo_match']],
    how="left",
    on=["hotel_room_id_test", "supplier_room_id_test"],
)

In [0]:
evaluation_df.head()

In [0]:
# Merge model predictions with pseudo ground truth on hotel_room_id and supplier_room_id.
# Duplicate label keys are dropped first; otherwise the left merge would repeat
# a model row once per duplicate and skew the metrics. 'pseudo_match' is always
# 1, so no information is lost.
pseudo_labels = (
    pseudo_ground_truth_df[['hotel_room_id_test', 'supplier_room_id_test', 'pseudo_match']]
    .drop_duplicates()
)
evaluation_df = pd.merge(model_matches_df,
                         pseudo_labels,
                         on=["hotel_room_id_test", "supplier_room_id_test"],
                         how="left")

# Fill missing pseudo_match values with 0 (i.e., not a match in pseudo labels).
# Assign the result back instead of calling fillna(inplace=True) on the selected
# column: the chained in-place form is deprecated and does not update the frame
# under pandas Copy-on-Write.
evaluation_df["pseudo_match"] = evaluation_df["pseudo_match"].fillna(0)

# For evaluation, use the pseudo_match column as the ground truth
y_true = evaluation_df["pseudo_match"]  # 1 if rule-based match, 0 otherwise
# Your model predicted these pairs as matches, so assign predicted labels as 1 for all candidates.
y_pred = [1] * len(evaluation_df)

print("Evaluation Data Sample:")
evaluation_df.head()


In [0]:
y_true

In [0]:
y_pred

In [0]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Every row of the evaluation set is a pair the model proposed (y_pred is all
# 1s), so precision is the share of proposed pairs confirmed by the pseudo labels.
# Recall and F1 are computed on this candidate set only. A meaningful value
# would need a full set of negatives (non-matches) from a broader sampling
# strategy, so treat these two as a demonstration.
precision, recall, f1 = (
    metric(y_true, y_pred)
    for metric in (precision_score, recall_score, f1_score)
)

print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-score: {f1:.2f}")
