In [31]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.neighbors import NearestNeighbors
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Load the raw product catalogue.
df = pd.read_json("products.json")

# --- rating -----------------------------------------------------------------
# Missing ratings are replaced with random integers in [1, 5]. The generator is
# seeded so that Restart & Run All yields the same imputed values (and therefore
# the same neighbours) on every run.
# NOTE(review): random imputation injects noise into a model feature; consider
# a median/neutral value instead if ratings matter for similarity.
RANDOM_SEED = 42
rng = np.random.default_rng(RANDOM_SEED)

missing_rating = df["rating"].isna()
df.loc[missing_rating, "rating"] = rng.integers(1, 6, size=int(missing_rating.sum()))
# Existing ratings are truncated to whole numbers, matching the original int(x).
df["rating"] = df["rating"].astype(int)

# --- price ------------------------------------------------------------------
# Fill missing prices with the median price of products sharing the same
# category and brand. The medians are computed once per group instead of
# re-filtering the whole frame for every row.
brand_category_median = df.groupby(["category", "brand"])["price"].transform("median")
# Fallbacks for groups where no product has a price at all, so that no NaN price
# is left behind for the feature matrix built later in the notebook.
category_median = df.groupby("category")["price"].transform("median")
overall_median = df["price"].median()

df["price"] = (
    df["price"]
    .fillna(brand_category_median)
    .fillna(category_median)
    .fillna(overall_median)
)

# --- unused columns ---------------------------------------------------------
# Drop the "currency" column
df = df.drop(columns=["currency"])

# Show the (rows, columns) of the cleaned frame as the cell output.
df.shape

# unique_ratings = df["price"].unique()
# print(unique_ratings)
# df.to_json("updated_products.json", orient="records", indent=4)


(503, 18)

In [5]:
# Columns used to measure product similarity, grouped by how they are encoded.
numerical_cols = ["price", "rating"]
categorical_cols = ["category", "brand"]

# Full feature list: numeric columns first, then categorical ones.
features = numerical_cols + categorical_cols

# Numeric columns are standardised; categorical columns are one-hot encoded.
# handle_unknown="ignore" keeps transform() from failing on a category value
# that was not present when the encoder was fitted.
preprocessor = ColumnTransformer(
    [
        ("num", StandardScaler(), numerical_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
    ]
)

# Fit the preprocessor on the selected columns and encode every product.
X = preprocessor.fit_transform(df.loc[:, features])


In [7]:
# Fit a nearest-neighbour index over the encoded product matrix X.
# n_neighbors=5 is only the default k; recommend_products passes its own
# n_neighbors when it queries the model.
nn_model = NearestNeighbors(metric="euclidean", n_neighbors=5)
nn_model.fit(X)

In [12]:
def recommend_products(product_name, df, model, preprocessor, top_n=5, feature_cols=None):
    """
    Recommend the products most similar to a named product.

    Parameters
    ----------
    product_name : str
        Name to look up; matched case-insensitively against ``df["name"]``.
        If several rows share the name, the first one is used.
    df : pandas.DataFrame
        Product catalogue. Neighbour indices returned by ``model`` are read as
        row positions in this frame, so it must hold the rows the model was
        fitted on, in the same order.
    model : object
        Fitted estimator exposing ``kneighbors(X, n_neighbors=...)``.
    preprocessor : object
        Fitted transformer exposing ``transform``.
    top_n : int, default 5
        Maximum number of recommendations to return.
    feature_cols : list of str, optional
        Columns passed to ``preprocessor``. Defaults to the notebook-level
        ``features`` list.

    Returns
    -------
    pandas.DataFrame or str
        Up to ``top_n`` rows with the columns name, brand, category and price,
        or the string "Product not found!" when no row matches the name.
    """
    if feature_cols is None:
        feature_cols = features

    # Locate the product by row position so the lookup agrees with the
    # positional indices returned by the model, whatever the frame's index is.
    name_matches = df["name"].str.lower() == product_name.lower()
    match_positions = name_matches.fillna(False).to_numpy(dtype=bool).nonzero()[0]
    if len(match_positions) == 0:
        return "Product not found!"

    product_pos = match_positions[0]

    # Transform the product's features
    product_features = preprocessor.transform(df.iloc[[product_pos]][feature_cols])

    # Ask for one extra neighbour (the product matches itself), but never for
    # more rows than the catalogue contains.
    n_query = min(top_n + 1, len(df))
    _, indices = model.kneighbors(product_features, n_neighbors=n_query)

    # Remove the query product explicitly: with duplicated or tied feature
    # vectors it is not guaranteed to be the first neighbour returned.
    neighbor_positions = [pos for pos in indices[0] if pos != product_pos][:top_n]
    recommended_products = df.iloc[neighbor_positions]

    return recommended_products[["name", "brand", "category", "price"]]

# Example: look up one product by name and print its closest matches.
product_to_recommend = "Sante Rouge"
recommendations = recommend_products(
    product_name=product_to_recommend,
    df=df,
    model=nn_model,
    preprocessor=preprocessor,
)
print(recommendations)


                                                  name     brand category  \
685  Pacifica Radiant Shimmer Highlighting Creams F...  pacifica   powder   
587                        Sante Soft Cream Foundation     sante    cream   
759                                 Zorah Liquid Liner     zorah   liquid   
724                        Sante Kajal Eyeliner Pencil     sante   pencil   
614  L'Oreal Paris True Match Lumi Glow Liquid Foun...   l'oreal   liquid   

     price  
685  28.00  
587  27.49  
759  24.00  
724  17.29  
614  16.99  
