In [0]:
# Remove the %pip install command to avoid conflicts
# %pip install --upgrade pandas

# Build a machine learning API similar to the Cupid API’s Room Match feature. The API should handle POST requests and return sample request/response payloads in a similar format to the Cupid Room Match API. Provide a detailed explanation of your development process, including how you collect and process data, develop models, and scale the system.

# --------------------
# SOLUTION DESCRIPTION
# --------------------

# 1. User expedia has this "supplier_room_names": ["King Deluxe", "Deluxe King Suite", "Standard King"]
# 2. Expedia gets "Deluxe King Room" from Expedia’s Internal Database, A User Searches for a Room
# 3. Expedia sends  "Deluxe King Room" to the api and gets "King Deluxe"
# Open question: is the supplier room ID the value we want to match against the hotel room ID?


# -----
# TODO
# -----




# -----
# QUESTIONS 
# -----
# Data Integration:
# How should you merge the hotel and supplier datasets (what join key, and how do you handle one-to-many relationships)?

# Matching Logic:
# What is the rationale behind matching room names (e.g., string similarity), and how do you set similarity thresholds?

# Scalability:
# Given the huge data size, what strategies (batch processing, ANN, dimensionality reduction) keep the matching tractable?


In [0]:
import sys
# Put the checked-out repo folder on sys.path so its modules can be imported from this notebook.
# NOTE(review): the path embeds a personal user folder; consider reading it from configuration.
repo_path ='carlos.delacruz@storaenso.com/cupid_ml_api'
sys.path.append(f"/Workspace/Repos/{repo_path}")
import matplotlib.pyplot as plt

In [0]:
import pandas as pd 
# Load the two input tables from DBFS: the reference (hotel) rooms and the supplier rooms.
hotel_rooms = pd.read_csv("/dbfs/FileStore/cupid/referance_rooms_1737378184366.csv")
supplier_rooms = pd.read_csv("/dbfs/FileStore/cupid/updated_core_rooms.csv")

In [0]:
import sklearn

In [0]:
sklearn.__version__

In [0]:
# -----
# MERGE 
# -----
# Left join on lp_id: keeps every hotel_rooms row, whether or not a supplier row shares its lp_id.
merged1 = pd.merge( hotel_rooms,supplier_rooms, on='lp_id', how='left')
# Inner join on lp_id: keeps only rows whose lp_id appears in both tables.
merged2 = pd.merge( hotel_rooms,supplier_rooms, on='lp_id', how='inner')

In [0]:
hotel_rooms.head()

In [0]:
supplier_rooms.head()

# Wrapper function test

In [0]:
import re
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
import logging
from typing import Tuple
import pickle
import os
logging.basicConfig(level=logging.INFO)

class RoomMatcher:
    """Match hotel room names to supplier room names via TF-IDF and cosine kNN.

    Pipeline: clean both name columns, collapse duplicate names into one row
    holding the list of IDs that share the name, vectorize the unique names
    with a single TF-IDF vocabulary, then query a cosine-distance
    NearestNeighbors index fitted on the supplier vectors.

    Attributes:
        top_k: Maximum number of supplier candidates retrieved per hotel name.
        threshold: Minimum cosine similarity (exclusive) for a candidate to be kept.
        vectorizer: The TfidfVectorizer fitted by ``vectorize_data``.
        knn: The NearestNeighbors index fitted by ``find_best_matches``.
        model_path: Directory where ``save_model`` / ``load_model`` keep the pickles.
    """

    def __init__(self, top_k: int = 10, threshold: float = 0.75,
                 model_path: str = "/dbfs/FileStore/cupid"):
        """Create an unfitted matcher.

        Args:
            top_k: Number of nearest supplier names to retrieve per hotel name.
            threshold: Similarity cut-off; only scores strictly above it are returned.
            model_path: Directory used to persist/load the pickled vectorizer and kNN model.
        """
        self.top_k = top_k
        self.threshold = threshold
        self.vectorizer = TfidfVectorizer()
        self.knn = NearestNeighbors(n_neighbors=self.top_k, metric="cosine", algorithm="auto")
        self.model_path = model_path

    @staticmethod
    def preprocess_text(text: str) -> str:
        """Lowercase, remove special characters and extra spaces.

        Characters outside ``a-z``, ``0-9`` and space are deleted (not replaced
        by a space), so e.g. "10-Bed" becomes "10bed".
        """
        text = text.lower()
        text = re.sub(r'[^a-z0-9 ]', '', text)  # Keep only alphanumeric
        return re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces

    def preprocess_data(self, hotel_rooms: pd.DataFrame, supplier_rooms: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Cleans and processes room names for vectorization and matching.

        Args:
            hotel_rooms: Frame with at least ``room_name`` and ``room_id`` columns.
            supplier_rooms: Frame with at least ``supplier_room_name`` and
                ``supplier_room_id`` columns.

        Returns:
            ``(hotel_rooms_grouped, supplier_rooms_grouped)``: one row per unique
            cleaned name, with the IDs sharing that name collected into a list.
            The inputs are not modified.
        """
        # Rows without a room name cannot be cleaned (``.lower()`` would fail on NaN), so drop them.
        hotel_rooms = hotel_rooms.dropna(subset=["room_name"]).copy()
        # Supplier rows are dropped when ANY column is missing (original behaviour, kept as-is).
        supplier_rooms = supplier_rooms.dropna().copy()

        # Apply text preprocessing
        hotel_rooms["clean_room_name"] = hotel_rooms["room_name"].apply(self.preprocess_text)
        supplier_rooms["clean_supplier_room_name"] = supplier_rooms["supplier_room_name"].apply(self.preprocess_text)

        # Group by unique cleaned names
        hotel_rooms_grouped = hotel_rooms.groupby("clean_room_name")["room_id"].apply(list).reset_index()
        supplier_rooms_grouped = supplier_rooms.groupby("clean_supplier_room_name")["supplier_room_id"].apply(
            list).reset_index()

        return hotel_rooms_grouped, supplier_rooms_grouped

    def vectorize_data(self, hotel_rooms_grouped: pd.DataFrame, supplier_rooms_grouped: pd.DataFrame) -> Tuple:
        """Vectorizes room names using TF-IDF.

        The vectorizer is fitted on hotel and supplier names together so both
        sides share one vocabulary.

        Returns:
            ``(hotel_vectors, supplier_vectors)``: row slices of the fitted
            TF-IDF matrix, in the same row order as the grouped frames.
        """
        unique_hotel_names = hotel_rooms_grouped["clean_room_name"].tolist()
        unique_supplier_names = supplier_rooms_grouped["clean_supplier_room_name"].tolist()

        all_unique_room_names = unique_hotel_names + unique_supplier_names
        tfidf_matrix = self.vectorizer.fit_transform(all_unique_room_names)

        # Split matrices for hotel and supplier rooms
        n_hotel_names = len(unique_hotel_names)
        hotel_vectors = tfidf_matrix[:n_hotel_names]
        supplier_vectors = tfidf_matrix[n_hotel_names:]

        return hotel_vectors, supplier_vectors

    def find_best_matches(self, hotel_vectors, supplier_vectors, hotel_rooms_grouped,
                          supplier_rooms_grouped) -> pd.DataFrame:
        """Finds the best matches for hotel rooms using kNN and cosine similarity.

        Fits ``self.knn`` on ``supplier_vectors`` and queries it with
        ``hotel_vectors``.

        Returns:
            DataFrame with columns ``hotel_room_id`` (list of room IDs sharing
            the hotel name), ``supplier_room_id`` (list of supplier room IDs
            sharing the matched name) and ``similarity_score``; only pairs
            scoring strictly above ``self.threshold`` are included.
        """
        self.knn.fit(supplier_vectors)  # Fit on supplier room vectors

        # Asking for more neighbours than fitted samples makes kneighbors raise,
        # so cap the request at the number of supplier names available.
        n_neighbors = min(self.top_k, supplier_vectors.shape[0])
        distances, indices = self.knn.kneighbors(hotel_vectors, n_neighbors=n_neighbors,
                                                 return_distance=True)

        hotel_room_ids = hotel_rooms_grouped.room_id.values
        supplier_room_ids = supplier_rooms_grouped.supplier_room_id.values

        matches = []
        for hotel_room_id, neighbor_indices, neighbor_distances in zip(hotel_room_ids, indices, distances):
            # Iterate over the neighbours actually returned rather than assuming top_k columns.
            for supplier_index, distance in zip(neighbor_indices, neighbor_distances):
                similarity_score = 1 - distance  # Convert cosine distance to similarity
                if similarity_score > self.threshold:
                    matches.append((hotel_room_id, supplier_room_ids[supplier_index], similarity_score))

        return pd.DataFrame(matches, columns=["hotel_room_id", "supplier_room_id", "similarity_score"])

    def match_rooms(self, hotel_rooms: pd.DataFrame, supplier_rooms: pd.DataFrame) -> pd.DataFrame:
        """End-to-end process to match rooms: preprocess, vectorize, then kNN match."""
        hotel_rooms_grouped, supplier_rooms_grouped = self.preprocess_data(hotel_rooms, supplier_rooms)
        hotel_vectors, supplier_vectors = self.vectorize_data(hotel_rooms_grouped, supplier_rooms_grouped)
        return self.find_best_matches(hotel_vectors, supplier_vectors, hotel_rooms_grouped, supplier_rooms_grouped)

    # -------------------------------------------------------
    # TEST

    def save_model(self):
        """Pickle the vectorizer and the kNN model into ``self.model_path``.

        Writes ``vectorizer.pkl`` and ``knn_model.pkl``. The TF-IDF vectors
        themselves are not persisted.
        """
        os.makedirs(self.model_path, exist_ok=True)
        artifacts = (("vectorizer.pkl", self.vectorizer), ("knn_model.pkl", self.knn))
        for filename, artifact in artifacts:
            # Context manager guarantees the handle is flushed and closed.
            with open(os.path.join(self.model_path, filename), "wb") as handle:
                pickle.dump(artifact, handle)

    def load_model(self):
        """Load the pickled vectorizer and kNN model from ``self.model_path``.

        NOTE: unpickling executes code embedded in the file; only load
        artifacts that were written by ``save_model`` from a trusted location.
        """
        with open(os.path.join(self.model_path, "vectorizer.pkl"), "rb") as handle:
            self.vectorizer = pickle.load(handle)
        with open(os.path.join(self.model_path, "knn_model.pkl"), "rb") as handle:
            self.knn = pickle.load(handle)

    # -------------------------------------------------------




In [0]:
# https://adb-853125847230356.16.azuredatabricks.net/files/Selfly/sql_db/full_transactions.csv?o=853125847230356

In [0]:
# /dbfs/FileStore/cupid/knn_model.pkl

# https://adb-8153377270222175.15.azuredatabricks.net/files/cupid/preprocessed_supplier_rooms.csv?o=8153377270222175

In [0]:
# 1. Save the model to DBFS. (This code should eventually live in the repo.)

matcher = RoomMatcher()
# hotel_rooms = pd.read_csv("datasets/referance_rooms.csv")  # Load from local or cloud
# supplier_rooms = pd.read_csv("datasets/updated_core_rooms.csv")

# hotel_rooms_sample = hotel_rooms.sample(n=1000)
# supplier_rooms_sample = supplier_rooms.sample(n=1000)

# Clean the names and collapse duplicates: one row per unique cleaned name with its list of IDs.
hotel_rooms_grouped, supplier_rooms_grouped = matcher.preprocess_data(hotel_rooms, supplier_rooms)
# Fit matcher.vectorizer on all unique names and split the TF-IDF matrix into hotel / supplier parts.
hotel_vectors, supplier_vectors = matcher.vectorize_data(hotel_rooms_grouped, supplier_rooms_grouped)

# # # Train and save the model
# matcher.knn.fit(supplier_vectors)  # Train the kNN model
# matcher.save_model()  # Save the trained models for later API usage


In [0]:
# Load the persisted TF-IDF vectorizer and kNN index used by the API prototype below.
# NOTE: unpickling executes code embedded in the file; only load artifacts this project wrote.
# Context managers close the file handles, which the bare open() calls never did.
with open("/dbfs/FileStore/cupid/vectorizer.pkl", "rb") as vectorizer_file:
    vectorizer = pickle.load(vectorizer_file)
with open("/dbfs/FileStore/cupid/knn_model.pkl", "rb") as knn_model_file:
    knn_model = pickle.load(knn_model_file)

# # Load room data
# #hotel_rooms_grouped = pd.read_csv("datasets/preprocessed_hotel_rooms.csv")  # Save this file earlier
# supplier_rooms_grouped = pd.read_csv("datasets/preprocessed_supplier_rooms.csv")  # Save this file earlier

# save these files:
#hotel_rooms_grouped, supplier_rooms_grouped


In [0]:
# Persist the grouped name -> ID-list tables next to the pickled models.
# NOTE(review): with the default to_csv arguments the row index is written as an extra
# column, and the list-valued ID column is stored as text — confirm that readers of
# these files expect that.
hotel_rooms_grouped.to_csv("/dbfs/FileStore/cupid/preprocessed_hotel_rooms.csv")
supplier_rooms_grouped.to_csv("/dbfs/FileStore/cupid/preprocessed_supplier_rooms.csv")

In [0]:
# NOTE(review): dangling, unfinished expression (looks like an abandoned `.to_...` call);
# presumably raises AttributeError when the cell runs — remove or complete it.
hotel_rooms_grouped.to

In [0]:
from flask import Flask, request, jsonify

def match_rooms(room_name: str = "brasil", threshold: float = 0.75):
    """Look up the supplier room names closest to ``room_name``.

    Uses the module-level ``vectorizer``, ``knn_model`` and
    ``supplier_rooms_grouped`` objects.

    Args:
        room_name: Free-text room name to match. Defaults to the original
            hard-coded test value so ``match_rooms()`` keeps working.
        threshold: Cosine-similarity cut-off; neighbours scoring strictly
            above it are reported as matches, the rest as unmapped.

    Returns:
        ``(results, unmapped_rooms)``. ``results`` is a list of dicts with
        ``supplierRoomName`` and a one-element ``mappedRooms`` list
        (``score``, ``supplierRoomId``, ``supplierRoomName``);
        ``unmapped_rooms`` is a list of dicts with ``supplierRoomName`` and
        ``supplierRoomId``.

    Raises:
        ValueError: If ``room_name`` is empty or whitespace only.
    """
    # Validate before the name is used (the original checked only after using it,
    # and answered with jsonify, which needs a Flask application context).
    if not room_name or not room_name.strip():
        raise ValueError("room_name is required")

    # Normalise the query the same way RoomMatcher.preprocess_text normalises the
    # names the vectorizer is fitted on in this notebook.
    cleaned_name = re.sub(r'[^a-z0-9 ]', '', room_name.lower())
    cleaned_name = re.sub(r'\s+', ' ', cleaned_name).strip()

    # The vocabulary holds individual tokens, so test tokens rather than the whole string.
    if not any(token in vectorizer.vocabulary_ for token in cleaned_name.split()):
        print(f"WARNING: no token of {room_name!r} found in the vectorizer vocabulary!")

    # Vectorize the input room name
    room_vector = vectorizer.transform([cleaned_name])

    # Find top-k matches
    distances, indices = knn_model.kneighbors(room_vector, return_distance=True)

    # Initialize matched and unmatched rooms
    results = []
    unmapped_rooms = []

    # Iterate over the indices and distances
    for idx, dist in zip(indices[0], distances[0]):
        # Resolve the row once; guard both branches against an index that does not
        # exist in supplier_rooms_grouped (e.g. model and table out of sync).
        try:
            supplier_row = supplier_rooms_grouped.iloc[idx]
        except IndexError:
            print(f"Warning: Index {idx} is out of bounds for supplier_rooms_grouped!")
            continue

        supplier_room_name = supplier_row["clean_supplier_room_name"]
        supplier_room_id = supplier_row["supplier_room_id"]
        similarity_score = 1 - dist  # Convert cosine distance to similarity

        if similarity_score > threshold:  # Ensure threshold condition
            results.append({
                "supplierRoomName": supplier_room_name,
                "mappedRooms": [
                    {
                        "score": similarity_score,
                        "supplierRoomId": supplier_room_id,
                        "supplierRoomName": supplier_room_name
                    }
                ],
            })
        else:
            # Add to unmapped rooms if similarity score is below threshold
            unmapped_rooms.append({
                "supplierRoomName": supplier_room_name,
                "supplierRoomId": supplier_room_id,
            })

    return results, unmapped_rooms


    # return jsonify({
    #     "Results": results,
    #     "UnmappedRooms": unmapped_rooms
    # })
   
       
results, unmapped_rooms =  match_rooms()

In [0]:
results

In [0]:
unmapped_rooms

In [0]:
# roomDescription
# propertyId
# propertyName
# core_room_id ---> merge 

supplier_rooms.head()

In [0]:
supplier_rooms[supplier_rooms.supplier_room_id ==215728605]

In [0]:
hotel_rooms.head()

In [0]:
import mlflow.pyfunc
# NOTE(review): this rebinds the global `vectorizer` to a new, unfitted TfidfVectorizer,
# replacing whatever fitted vectorizer earlier cells left in the kernel.
vectorizer = TfidfVectorizer()

# NOTE(review): path is an empty string, and a raw TfidfVectorizer is passed as python_model;
# presumably mlflow.pyfunc expects a pyfunc model wrapper here — TODO confirm against the
# MLflow docs before relying on this cell.
mlflow.pyfunc.save_model(python_model=vectorizer, path="")
#mlflow.pyfunc.log_model(python_model=encoder_wrapped, artifact_path=encoder_path)

In [0]:
# Reload the persisted vectorizer; the context manager closes the file handle.
# NOTE: unpickling executes code embedded in the file; only load artifacts this project wrote.
with open("/dbfs/FileStore/cupid/vectorizer.pkl", "rb") as vectorizer_file:
    vectorizer = pickle.load(vectorizer_file)

In [0]:
supplier_rooms[supplier_rooms.supplier_room_name == "room prestige 1"]

supplier_rooms_grouped[supplier_rooms_grouped.clean_supplier_room_name == "room prestige 1"]

In [0]:
supplier_rooms_grouped.head()

In [0]:
supplier_rooms[supplier_rooms.supplier_room_id == 200374592]


In [0]:
supplier_rooms.head()

In [0]:
# ---------------
# TESTING RESULTS 
# ---------------
hotel_rooms[hotel_rooms.room_id == 1143595263]
#hotel_rooms[hotel_rooms.lp_id == "lp6556d8a8"]

In [0]:
supplier_rooms[supplier_rooms.supplier_room_id == 220259669]
#supplier_rooms[supplier_rooms.lp_id == "lp6556d8a8"]

# 0. Preprocess

In [0]:



# ---------
# hotel_rooms
# ---------
# There are more room IDs than hotel room names.
# There is the same number of hotel IDs and lp IDs.
# The room IDs show how many rooms are available.
# --------------
# supplier_rooms
# --------------
# There are more lp than in hotel_rooms
# There is only 1 supplier name 

# This suggests that supplier room names can be very different from hotel room names and that a single hotel room may have multiple supplier room variations.

# The supplier room names contain additional details such as "Suite (Bohemian)", "Suite, Jetted Tub (Bird's Eye)", and "Plunge Pool Smart & Bold".
# This means a text similarity approach is needed to determine the best match, as supplier rooms include extra features.

# Possible Mismatch in Room Categories:

# The hotel has a basic "Single Room", while suppliers provide "Suites" and "Duplex" options.
# This raises a question: Are all these supplier rooms actually equivalent to the hotel's "Single Room"? If not, filtering is needed to avoid incorrect mappings.


# Differences in core_hotel_id Values:

# All supplier rooms are linked to a different core_hotel_id (505554), meaning the supplier may have its own internal hotel ID system.
# A mapping between hotel_id and core_hotel_id might be necessary to align hotels correctly.

#supplier_rooms.info()

# 1. Frequency & Distribution Analysis

In [0]:
# For each hotel room_id in the left-joined table, count the distinct supplier room names
# that share its lp_id, then summarise the distribution of those counts.
match_counts = merged1.groupby('room_id')['supplier_room_name'].nunique()
match_counts.describe()


In [0]:
# Histogram of match_counts (distinct supplier room names per hotel room_id).
plt.figure(figsize=(10, 5))
plt.hist(match_counts, bins=50, edgecolor='black')
plt.title("Distribution of Supplier Room Matches per Hotel Room")
plt.xlabel("Number of Supplier Rooms per Hotel Room")
plt.ylabel("Count of Hotel Rooms")
plt.yscale("log")  # Log scale helps if distribution is highly skewed
plt.grid(True)
plt.show()


In [0]:
# Horizontal box plot of the same match_counts series, to make the outliers visible.
plt.figure(figsize=(8, 5))
plt.boxplot(match_counts, vert=False, patch_artist=True)
plt.title("Box Plot of Supplier Room Matches per Hotel Room")
plt.xlabel("Number of Supplier Rooms per Hotel Room")
plt.grid(True)
plt.show()

In [0]:
# Your histogram and box plot reveal several key insights:

# Most hotel rooms have fewer than 20 supplier matches (skewed distribution with a long tail).
# A small number of hotel rooms have 40+ supplier matches, and extreme outliers go up to 100.
# Outliers are significant (seen in the box plot), meaning some rooms are mapped to way too many supplier rooms.

# 2. Text Similarity Score Analysis


In [0]:
# Define a Multiplicity Handling Strategy
# Since one hotel room can match multiple supplier rooms, you need to decide how to filter or rank these matches:

# Option A: Keep only the Top-N matches per hotel room
# Based on similarity scores (if calculated later).
# Example: Keep only the top 3 or top 5 supplier rooms per hotel room.
# Option B: Apply a similarity score threshold
# If a match is below a certain similarity (e.g., cosine similarity < 0.7), discard it.
# Option C: Business Rule Filtering
# Prioritize supplier rooms from trusted suppliers over others.
# Keep supplier rooms that have the most common words with the hotel room name.

In [0]:
supplier_rooms.isnull().sum()

In [0]:
# ---------------------------------------
# 1. PREPROCESSING & CLEANING DATA
# ---------------------------------------


import pandas as pd
import re
# import nltk
# from nltk.corpus import stopwords
# from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


# nltk.download('stopwords')
# nltk.download('wordnet')


def preprocess_text(text):
    """Normalise a room name for matching.

    The text is lowercased, every character other than a-z, 0-9 and space is
    deleted, runs of whitespace are collapsed to one space, and leading and
    trailing spaces are stripped.
    """
    alnum_only = re.sub(r'[^a-z0-9 ]', '', text.lower())  # drop everything but letters, digits, spaces
    single_spaced = re.sub(r'\s+', ' ', alnum_only)  # squeeze repeated spaces
    return single_spaced.strip()

# Apply cleaning: add a normalised name column to both tables.
# NOTE(review): this cell mutates the global frames — it adds columns and drops, in place,
# every supplier row that has a missing value in any column.
hotel_rooms["clean_room_name"] = hotel_rooms["room_name"].apply(preprocess_text)
supplier_rooms.dropna(inplace=True)
supplier_rooms["clean_supplier_room_name"] = supplier_rooms["supplier_room_name"].apply(preprocess_text)

In [0]:
# vectorizer = TfidfVectorizer()
# all_room_names = hotel_rooms["clean_room_name"].tolist() + supplier_rooms["clean_supplier_room_name"].tolist()

# NOTE: THERE ARE DUPLICATES 
# Q: how to treat duplicates ?  
# INSIGHT: Since each duplicate room name has a different room_id, removing duplicates blindly is risky because we might lose important room mappings. Here's a structured way to proceed:



In [0]:


# NOTE: Now, each unique room name has a list of associated room IDs instead of duplicates.

# Collapse duplicate cleaned names: one row per unique name, holding the list of IDs that share it.
hotel_rooms_grouped = hotel_rooms.groupby("clean_room_name")["room_id"].apply(list).reset_index()
supplier_rooms_grouped = supplier_rooms.groupby("clean_supplier_room_name")["supplier_room_id"].apply(list).reset_index()
# hotel_rooms_grouped[hotel_rooms_grouped.room_id.apply(lambda x:len(x) >1 )] 

# Use only unique room names for TF-IDF vectorization.
# Later, map back to the original room IDs.

# Extract unique room names (same row order as the grouped frames)
unique_hotel_names = hotel_rooms_grouped["clean_room_name"].tolist()
unique_supplier_names = supplier_rooms_grouped["clean_supplier_room_name"].tolist()

# Compute TF-IDF only on unique names; hotel names come first, supplier names after them,
# so both sides share a single fitted vocabulary.
vectorizer = TfidfVectorizer()
all_unique_room_names = unique_hotel_names + unique_supplier_names

tfidf_matrix = vectorizer.fit_transform(all_unique_room_names)
print('FITED  MATRIX')



# Split the matrix back into its hotel rows and supplier rows at the hotel-name count
print('len(unique_hotel_names)', len(unique_hotel_names))

hotel_vectors = tfidf_matrix[:len(unique_hotel_names)]
supplier_vectors = tfidf_matrix[len(unique_hotel_names):]

# ********************************************************************************

# Shape of the supplier part before the kNN index is fitted on it in a later cell
print('supplier_vectors before fitted', supplier_vectors.shape)
print('FITED  MATRIX')
# -------------------------
# FIRST TRY : MEMORY ERROR: 
# -------------------------

# # Compute Cosine Similarity
# print('COMPUTE COSINE SIMILARITY MATRIX')
# similarity_matrix = cosine_similarity(hotel_vectors, supplier_vectors)




In [0]:
# compare 1 to 1 : Why is supplier vector not in the same shape ? 

# hotel_rooms_grouped1,supplier_rooms_grouped1,all_unique_room_names1, vectorizer1, tfidf_matrix1, supplier_vectors1

In [0]:

pd.testing.assert_frame_equal(hotel_rooms_grouped, hotel_rooms_grouped1)
pd.testing.assert_frame_equal(supplier_rooms_grouped, supplier_rooms_grouped1)

In [0]:
assert all_unique_room_names == all_unique_room_names1

In [0]:
assert vectorizer1.get_params() == vectorizer.get_params(), "Vectorizer parameters are different!"


In [0]:
assert (tfidf_matrix != tfidf_matrix1).nnz == 0, "CSR matrices are different!"


In [0]:
tfidf_matrix.shape

In [0]:
tfidf_matrix1.shape

In [0]:
supplier_vectors.shape

In [0]:
supplier_vectors1.shape

In [0]:
# ---------
# OPTION 2: 
# ---------
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors

# Set number of closest matches to retrieve
top_k = 5  

# Initialize kNN model with cosine similarity
knn = NearestNeighbors(n_neighbors=top_k, metric="cosine", algorithm="auto")
knn.fit(supplier_vectors)  # Fit on supplier rooms
print('supplier_vectors after fitted', supplier_vectors.shape)

# Find top-k similar rooms for hotel rooms
distances, indices = knn.kneighbors(hotel_vectors, return_distance=True)
# Each entry is the list of IDs sharing one unique cleaned name (see the groupby above);
# row order must line up with hotel_vectors / supplier_vectors.
hotel_room_ids = hotel_rooms_grouped.room_id.values  # Ensure correct ordering
supplier_room_ids = supplier_rooms_grouped.supplier_room_id.values

# Convert to DataFrame: one row per (hotel name, neighbour) pair
matches = []
for i, hotel_room_id in enumerate(hotel_room_ids):
    for j in range(top_k):
        supplier_index = indices[i][j]
        supplier_room_id = supplier_room_ids[supplier_index]
        similarity_score = 1 - distances[i][j]  # Convert cosine distance to similarity

        matches.append((hotel_room_id, supplier_room_id, similarity_score))

matches_df = pd.DataFrame(matches, columns=["hotel_room_id", "supplier_room_id", "similarity_score"])

# Keep only the pairs whose similarity exceeds 0.75
best_matches_df  = matches_df[matches_df["similarity_score"] > 0.75]



In [0]:
print(hotel_rooms_grouped.shape)
print(supplier_rooms_grouped.shape)
print(len(unique_hotel_names))
print(len(unique_supplier_names))
print('all_unique_room_names', len(all_unique_room_names))
print('supplier_vectors2 shape',supplier_vectors.shape)

In [0]:
best_matches_df.shape

In [0]:
pd.testing.assert_frame_equal(best_matches_df, best_matches_df1)


In [0]:
(671458, 99006)
(27467, 99006)

(27467, 99006)
(671458, 99006)

27467
671458

len(hotel_room_ids)


In [0]:
len(supplier_room_ids)

In [0]:
best_matches_df2.shape

In [0]:
best_matches_df1.shape

In [0]:
# The shapes differ.

In [0]:

# -------------------------
# SECOND TRY : MEMORY ERROR 
# -------------------------

# import faiss
# import numpy as np

# Convert TF-IDF matrices to numpy arrays
# THIS FAILED with memory issue 
# hotel_vectors_np = hotel_vectors.toarray().astype('float32')
# supplier_vectors_np = supplier_vectors.toarray().astype('float32')

# hotel_vectors = hotel_vectors.tocsr()  # Keep as sparse matrix
# supplier_vectors = supplier_vectors.tocsr()

# Ensure vectors are in float32 format and dense
# hotel_vectors = hotel_vectors.toarray().astype(np.float32)  # Convert sparse to dense if needed
# supplier_vectors = supplier_vectors.toarray().astype(np.float32)

In [0]:
# hotel_vectors = hotel_vectors.tocsr()  # Keep as sparse matrix
# supplier_vectors = supplier_vectors.tocsr()

# # Create FAISS index for supplier vectors
# index = faiss.IndexFlatL2(supplier_vectors.shape[1])  # L2 (Euclidean) is similar to cosine similarity for normalized vectors
# index.add(supplier_vectors)

# # Search for top 5 closest supplier rooms per hotel room
# k = 5  # Number of matches per hotel room
# distances, indices = index.search(hotel_vectors, k)

# # Create match dataframe
# matches = []
# for i, hotel_room_id in enumerate(hotel_rooms_grouped["room_id"]):
#     for j in range(k):
#         supplier_index = indices[i][j]
#         if supplier_index != -1:
#             supplier_room_id = supplier_rooms_grouped.iloc[supplier_index]["supplier_room_id"]
#             matches.append((hotel_room_id, supplier_room_id, distances[i][j]))

# matches_df = pd.DataFrame(matches, columns=["hotel_room_id", "supplier_room_id", "similarity_score"])

In [0]:
# from sklearn.metrics.pairwise import cosine_similarity

# batch_size = 100  # Reduce based on available memory
# similarities = []
# hotel_room_ids = hotel_rooms.room_id.unique()
# supplier_room_ids = supplier_rooms.supplier_room_id.unique()

# for i in range(0, hotel_vectors.shape[0], batch_size):
#     batch_hotel_vectors = hotel_vectors[i : i + batch_size]
#     batch_similarity = cosine_similarity(batch_hotel_vectors, supplier_vectors)
#     print('cosine similarity computed')

#     for j, hotel_id in enumerate(hotel_room_ids[i : i + batch_size]):
#         for k, supplier_id in enumerate(supplier_room_ids):
#             similarities.append((hotel_id, supplier_id, batch_similarity[j, k]))

# matches_df = pd.DataFrame(similarities, columns=["hotel_room_id", "supplier_room_id", "similarity_score"])


In [0]:
# from sklearn.metrics.pairwise import cosine_similarity
# import pandas as pd

# batch_size = 100  # Reduce if running out of memory
# similarities = []

# hotel_room_ids = hotel_rooms_grouped.room_id.values  # Ensure correct ordering
# supplier_room_ids = supplier_rooms_grouped.supplier_room_id.values

# assert hotel_vectors.shape[0] == len(hotel_room_ids), "Mismatch in hotel vectors and hotel room IDs"
# assert supplier_vectors.shape[0] == len(supplier_room_ids), "Mismatch in supplier vectors and supplier room IDs"

# for i in range(0, hotel_vectors.shape[0], batch_size):
#     batch_hotel_vectors = hotel_vectors[i : i + batch_size]
#     batch_similarity = cosine_similarity(batch_hotel_vectors, supplier_vectors)
#     print(f'Cosine similarity computed for batch {i} to {i + batch_size}')

#     batch_hotel_ids = hotel_room_ids[i : i + batch_hotel_vectors.shape[0]]  # Ensure alignment

#     for j, hotel_id in enumerate(batch_hotel_ids):
#         for k, supplier_id in enumerate(supplier_room_ids[:batch_similarity.shape[1]]):  # Avoid out-of-bounds
#             similarities.append((hotel_id, supplier_id, batch_similarity[j, k]))

# matches_df = pd.DataFrame(similarities, columns=["hotel_room_id", "supplier_room_id", "similarity_score"])


In [0]:
supplier_rooms[supplier_rooms.supplier_room_id == 221421116]

# 3. Business Rule Consideration

In [0]:
# Define Success Metrics:
# Decide if the primary goal is to return the best match (a single supplier room) or a ranked list of candidates.
# Example Questions:
# Does the business value presenting multiple options ranked by similarity?
# Are there external factors (like supplier partnerships) that might override the similarity ranking?

In [0]:
hotel_rooms[hotel_rooms.room_id == 1143595263]
# 1 Bed in 10-Bed Signature Shared Dormitory, Me...
# Bed in 10-Bed Men Only Shared Dormitory	bed in 10bed men only shared 
# Shared Dormitory, Men only (Bed in 10-Bed Room)	

In [0]:
supplier_rooms[supplier_rooms.duplicated(subset=["clean_supplier_room_name"], keep=False)]

In [0]:
hotel_rooms[hotel_rooms.clean_room_name == 'double or twin room']

In [0]:
supplier_rooms[supplier_rooms.clean_supplier_room_name == 'superior double room']

In [0]:
hotel_rooms[hotel_rooms.duplicated(subset=["clean_room_name"],keep=False)].value_counts()


In [0]:
print(len(all_room_names))
print(len(set(all_room_names)))

In [0]:
# Assume 'similarity_scores' is an array of cosine similarities between a hotel room and its supplier candidates.
plt.hist(similarity_scores, bins=50)
plt.title("Distribution of Similarity Scores")
plt.xlabel("Cosine Similarity")
plt.ylabel("Frequency")
plt.show()

In [0]:
hotel_rooms.room_name.nunique()

In [0]:
supplier_rooms.supplier_room_name.nunique()

In [0]:
hotel_rooms.head()

In [0]:
hotel_rooms[hotel_rooms.room_id == 1142756923]

In [0]:
merged1[merged1.room_id == 1142756923]

In [0]:
# Determine the Join Strategy:
# If your API needs to combine hotel/room details with supplier information, you might perform an inner join or left join using the common column lp_id.

# Text Standardization:
# For text fields like room_name or supplier_room_name, consider lowercasing or removing extra spaces to help with matching queries.

In [0]:

import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from fuzzywuzzy import process  # Fuzzy matching for minor variations

# ---------- STEP 2: NLP SIMILARITY MATCHING ----------
# Vectorizing text using TF-IDF
# NOTE(review): this cell rebinds the global `vectorizer`, `tfidf_matrix`, `hotel_vectors`
# and `supplier_vectors`; rows follow .unique() order here, not the grouped-frame order.
vectorizer = TfidfVectorizer()
hotel_name_list = hotel_rooms['clean_room_name'].unique().tolist()
supplier_name_list = supplier_rooms['clean_supplier_room_name'].unique().tolist()
all_room_names = hotel_name_list + supplier_name_list
tfidf_matrix = vectorizer.fit_transform(all_room_names)
print('matrix created')
# Compute cosine similarity
# The matrix holds one row per UNIQUE name, so split at the unique hotel-name count;
# splitting at len(hotel_rooms) (the row count) put supplier rows into hotel_vectors.
n_hotel_names = len(hotel_name_list)
hotel_vectors = tfidf_matrix[:n_hotel_names]
supplier_vectors = tfidf_matrix[n_hotel_names:]
# NOTE(review): this builds the full dense hotel x supplier matrix; the same computation is
# recorded above as having failed with a memory error on the full data set.
similarity_matrix = cosine_similarity(hotel_vectors, supplier_vectors)

# # ---------- STEP 3: RULE-BASED FILTERING ----------
# def match_rooms(hotel_name, supplier_names):
#     """Match hotel room name with supplier rooms using fuzzy matching."""
#     best_match, score = process.extractOne(hotel_name, supplier_names)
#     return best_match if score > 80 else None  # Threshold to avoid bad matches

# supplier_room_list = supplier_rooms['clean_supplier_room_name'].tolist()
# hotel_rooms['matched_supplier_room'] = hotel_rooms['clean_room_name'].apply(
#     lambda x: match_rooms(x, supplier_room_list)
# )

# # Filter based on bed type, room type, and amenities (Custom Rules)
# def is_valid_match(hotel_room, supplier_room):
#     """Ensure matching rooms have similar attributes (bed type, suite, etc.)."""
#     # Example: "single" should not match with "suite"
#     invalid_keywords = ["suite", "plunge", "jetted", "duplex"]
#     if any(word in supplier_room for word in invalid_keywords) and "single" in hotel_room:
#         return False
#     return True

# hotel_rooms['valid_match'] = hotel_rooms.apply(
#     lambda row: is_valid_match(row['clean_room_name'], row['matched_supplier_room']), axis=1
# )

# # Show final mapping
# print(hotel_rooms[['room_name', 'matched_supplier_room', 'valid_match']])

In [0]:
from sklearn.preprocessing import normalize

# Normalize vectors row-wise (L2 normalization) so that a dot product of two rows
# equals their cosine similarity.
hotel_vectors = normalize(hotel_vectors, norm='l2', axis=1)
supplier_vectors = normalize(supplier_vectors, norm='l2', axis=1)

# Compute similarity efficiently using sparse matrix multiplication
# (rows of the result follow hotel_vectors, columns follow supplier_vectors).
similarity_matrix = hotel_vectors @ supplier_vectors.T

# Model 

In [0]:
# ---------------
# CURRENT SOLUTION
# ---------------
# Breaking Down the Descriptions:
# Both the standard room data and the supplier room data are turned into lists of features. For example, parts of the room description (like “double-person”, “one-double-bed”, “classic-room”, etc.) are split into tokens or categories.

# Comparing Feature Tokens:
# The algorithm then compares these tokens between the two sources. If the tokens (or features) match well, it assigns a high similarity score—in these examples, a perfect score of 1.

# Mapping and Unmapping:
# Rooms that match closely (i.e., have very similar tokens) are grouped together as “mappedRooms.” If the supplier’s room description doesn’t match any known standard description, it ends up in “UnmappedRooms.”


# ---------------
# MODELLING
# ---------------
# 1. NLP for Better Text Understanding
# Text Cleaning and Tokenization:
# Instead of simply comparing raw text, use NLP to clean and break down room names and descriptions into meaningful parts (tokens). This can include lowercasing, removing stop words (common words that add little value), and even stemming or lemmatization (reducing words to their basic form).

# Word Embeddings:
# Use pre-trained models (like Word2Vec, GloVe, or Sentence Transformers) to convert room descriptions into numerical vectors that capture their meaning. This way, you can compare rooms based on their semantic similarity rather than just keyword matches. For example, two descriptions that use different words but have the same meaning can be recognized as similar.

# Cosine Similarity:
# Once you have vector representations, you can compute the cosine similarity between a hotel room’s description and a supplier room’s description. A high cosine similarity score would indicate a strong match even if the texts are not exactly the same.

# 2. Machine Learning for Predictive Matching
# Supervised Learning:
# If you have historical data showing which supplier rooms correctly matched with hotel rooms, you can train a machine learning model (like a neural network, random forest, or support vector machine) to learn the matching patterns. The model can take in features extracted from text (using NLP) and metadata (such as room type or amenities) and predict the best match.

# Clustering:
# Use clustering algorithms (like K-means or hierarchical clustering) on the vectorized room descriptions to automatically group similar rooms together. This can help in identifying patterns and segmenting the rooms into distinct categories without predefined rules.

# Fuzzy Matching:
# Machine learning models can also incorporate fuzzy matching techniques to handle typos, abbreviations, or slight variations in text. This improves the matching when data is messy or inconsistent.

# Deploy

In [0]:
#  Preparing for API Implementation
# Once your EDA and preprocessing are complete, you can use the cleaned and merged data as the data source for your API. The API might then:

# Receive Search Queries:
# For example, matching room types, filtering by supplier, or price ranges (if you add pricing information later).
# Query the Processed Data:
# Use SQL queries or in-memory filtering (if using a framework like Flask with pandas) to return the results based on user input.
# Return a JSON Response:
# Structure your API response to return only the necessary fields (e.g., room details, supplier information).

# Appendix

In [0]:
import re
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer
from sklearn.neighbors import NearestNeighbors
import logging

logging.basicConfig(level=logging.INFO)

def preprocess_text(text):
    """Normalize a room name so that equivalent spellings compare equal.

    The text is lowercased, every character other than ``a-z``, ``0-9`` and
    the space is deleted, runs of spaces are collapsed to one and the ends are
    trimmed.

    Args:
        text: The raw room name. ``None`` and float NaN (how pandas represents
            an empty CSV cell) are treated as missing; any other non-string
            value is converted with ``str()`` before cleaning.

    Returns:
        str: The cleaned name, or ``""`` when the input is missing.
    """
    # `text != text` is only true for NaN; without this guard a missing cell
    # reaches `.lower()` as a float and raises AttributeError.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).lower()
    # Characters are deleted, not replaced by a space, so "king-size" becomes
    # "kingsize" (tabs and newlines are deleted here as well).
    text = re.sub(r'[^a-z0-9 ]', '', text)  # Keep only alphanumerics and spaces
    text = re.sub(r'\s+', ' ', text).strip()  # Collapse repeated spaces
    return text


def _vectorize_room_names(hotel_names, supplier_names):
    """Fit one TF-IDF vocabulary over both name lists and split the result.

    Returns:
        tuple: ``(hotel_vectors, supplier_vectors)``; row ``i`` of each matrix
        corresponds to element ``i`` of the matching input list.
    """
    vectorizer = TfidfVectorizer()
    # Fitting on the concatenation puts both sides in the same feature space,
    # which is what makes hotel and supplier rows comparable afterwards.
    tfidf_matrix = vectorizer.fit_transform(hotel_names + supplier_names)
    n_hotel = len(hotel_names)
    return tfidf_matrix[:n_hotel], tfidf_matrix[n_hotel:]


def preprocessing_steps(hotel_rooms, supplier_rooms):
    """Clean, de-duplicate and TF-IDF-vectorize hotel and supplier room names.

    Neither input frame is modified.

    Args:
        hotel_rooms: DataFrame with at least ``room_name`` and ``room_id``
            columns. Rows whose ``room_name`` is missing are skipped.
        supplier_rooms: DataFrame with at least ``supplier_room_name`` and
            ``supplier_room_id`` columns. Rows with a missing value in any
            column are skipped.

    Returns:
        tuple: ``(hotel_vectors, supplier_vectors, hotel_rooms_grouped,
        supplier_rooms_grouped)``. Each grouped frame has one row per unique
        cleaned name plus the list of IDs sharing that name, and row ``i`` of
        a vector matrix corresponds to row ``i`` of the matching grouped frame.
    """
    # A missing hotel room name would otherwise reach preprocess_text as a
    # float NaN and abort the whole run, so those rows are dropped up front.
    hotel_rooms_clean = (
        hotel_rooms
        .dropna(subset=["room_name"])
        .assign(clean_room_name=lambda df: df["room_name"].apply(preprocess_text))
    )
    # NOTE(review): this drops supplier rows with a missing value in *any*
    # column (unchanged from the previous behaviour) -- confirm that is
    # intended rather than only requiring the name and ID to be present.
    supplier_rooms_clean = (
        supplier_rooms
        .dropna()
        .assign(
            clean_supplier_room_name=lambda df: df["supplier_room_name"].apply(preprocess_text)
        )
    )

    # Collapse duplicates: each unique cleaned name keeps the list of IDs that
    # share it, so every distinct name is vectorized only once.
    hotel_rooms_grouped = (
        hotel_rooms_clean.groupby("clean_room_name")["room_id"].apply(list).reset_index()
    )
    supplier_rooms_grouped = (
        supplier_rooms_clean
        .groupby("clean_supplier_room_name")["supplier_room_id"]
        .apply(list)
        .reset_index()
    )
    print('hotel_rooms_grouped shape', hotel_rooms_grouped.shape)
    print('supplier_rooms_grouped shape', supplier_rooms_grouped.shape)

    unique_hotel_names = hotel_rooms_grouped["clean_room_name"].tolist()
    unique_supplier_names = supplier_rooms_grouped["clean_supplier_room_name"].tolist()
    print('unique_hotel_names', len(unique_hotel_names))
    print('unique_supplier_names', len(unique_supplier_names))

    hotel_vectors, supplier_vectors = _vectorize_room_names(
        unique_hotel_names, unique_supplier_names
    )
    print('hotel_vectors shape', hotel_vectors.shape)
    print('supplier_vectors shape first func', supplier_vectors.shape)

    return hotel_vectors, supplier_vectors, hotel_rooms_grouped, supplier_rooms_grouped


def find_best_matches(supplier_vectors,
                      hotel_vectors,
                      hotel_rooms_grouped,
                      supplier_rooms_grouped, 
                      top_k=5, 
                      threshold=0.75):
    """Match hotel room names to their most similar supplier room names.

    A cosine-distance k-nearest-neighbours index is built on
    ``supplier_vectors`` and queried with every row of ``hotel_vectors``.

    Args:
        supplier_vectors: 2-D matrix with one row per row of
            ``supplier_rooms_grouped``.
        hotel_vectors: 2-D matrix with one row per row of
            ``hotel_rooms_grouped``, in the same feature space.
        hotel_rooms_grouped: DataFrame with a ``room_id`` column, row-aligned
            with ``hotel_vectors``.
        supplier_rooms_grouped: DataFrame with a ``supplier_room_id`` column,
            row-aligned with ``supplier_vectors``.
        top_k: Maximum number of supplier candidates per hotel row. It is
            reduced to the number of supplier rows when fewer are available.
        threshold: Only matches with cosine similarity strictly greater than
            this value are returned.

    Returns:
        pandas.DataFrame: Columns ``hotel_room_id``, ``supplier_room_id`` and
        ``similarity_score``. The ID values are copied unchanged from the
        grouped frames (lists of IDs when those come from
        ``preprocessing_steps``). Empty when either side has no rows.

    Raises:
        ValueError: If a grouped frame and its vector matrix have different
            numbers of rows.
    """
    columns = ["hotel_room_id", "supplier_room_id", "similarity_score"]

    hotel_room_ids = hotel_rooms_grouped["room_id"].values
    supplier_room_ids = supplier_rooms_grouped["supplier_room_id"].values

    n_hotels = hotel_vectors.shape[0]
    n_suppliers = supplier_vectors.shape[0]

    # Neighbour indices are positions in supplier_vectors; they only map to
    # the right IDs if the tables and matrices are row-aligned.
    if len(hotel_room_ids) != n_hotels or len(supplier_room_ids) != n_suppliers:
        raise ValueError(
            "Grouped frames and vector matrices are misaligned: "
            f"{len(hotel_room_ids)} hotel ids vs {n_hotels} hotel vectors, "
            f"{len(supplier_room_ids)} supplier ids vs {n_suppliers} supplier vectors"
        )

    # Nothing to match against (or nothing to match): return an empty result
    # instead of letting the kNN model raise on an empty matrix.
    if n_hotels == 0 or n_suppliers == 0:
        return pd.DataFrame(columns=columns)

    # kneighbors() raises when asked for more neighbours than fitted samples.
    n_neighbors = min(top_k, n_suppliers)

    knn = NearestNeighbors(n_neighbors=n_neighbors, metric="cosine", algorithm="auto")
    knn.fit(supplier_vectors)  # Index the supplier side
    distances, indices = knn.kneighbors(hotel_vectors, return_distance=True)

    # One record per (hotel row, neighbour); cosine similarity = 1 - distance.
    matches = [
        (hotel_room_id, supplier_room_ids[supplier_index], 1 - distance)
        for hotel_room_id, row_indices, row_distances in zip(hotel_room_ids, indices, distances)
        for supplier_index, distance in zip(row_indices, row_distances)
    ]
    matches_df = pd.DataFrame(matches, columns=columns)

    print('matches_df shape', matches_df.shape)
    print('threshold', threshold)

    best_matches_df = matches_df[matches_df["similarity_score"] > threshold]

    return best_matches_df


# Stage 1: clean the raw frames, group duplicate names and vectorize them.
(
    hotel_vectors1,
    supplier_vectors1,
    hotel_rooms_grouped1,
    supplier_rooms_grouped1,
) = preprocessing_steps(hotel_rooms, supplier_rooms)

# Stage 2: for each hotel name, take up to 5 nearest supplier names and keep
# those whose cosine similarity is above 0.75.
best_matches_df1 = find_best_matches(
    supplier_vectors=supplier_vectors1,
    hotel_vectors=hotel_vectors1,
    hotel_rooms_grouped=hotel_rooms_grouped1,
    supplier_rooms_grouped=supplier_rooms_grouped1,
    top_k=5,
    threshold=0.75,
)
