In [88]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.neighbors import NearestNeighbors
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

df = pd.read_json("products.json")

# Impute missing ratings with random integers from 1 to 5 (inclusive).
# A dedicated, seeded generator keeps the imputed values -- and therefore every
# downstream nearest-neighbour result -- identical across "Restart & Run All".
RATING_SEED = 42
rating_rng = np.random.default_rng(RATING_SEED)

missing_rating = df["rating"].isna()
# Boolean-mask assignment is positional, so it does not depend on the index.
df.loc[missing_rating, "rating"] = rating_rng.integers(
    1, 6, size=int(missing_rating.sum())
)
# Existing ratings are truncated to whole numbers, as before (e.g. 4.7 -> 4).
df["rating"] = df["rating"].astype(int)

# Replace missing (NaN) or zero prices with the median price of products that
# share the same category and brand.
#
# The medians are computed once per (category, brand) group instead of
# re-filtering the whole frame for every row, which was O(n^2).
price = df["price"]
needs_price = price.isna() | (price == 0)

# Only strictly positive prices may contribute to a median; NaN, zero and
# negative values are blanked out before grouping.
positive_price = price.where(price > 0)

# NOTE(review): rows whose category or brand is missing belong to no group and
# receive NaN here (pandas >= 1.5), matching the previous equality-based filter
# where NaN never equals NaN. Groups without any positive price also yield NaN.
group_median_price = positive_price.groupby(
    [df["category"], df["brand"]]
).transform("median")

# Prices that were already valid (including negative ones) are left untouched.
df["price"] = price.mask(needs_price, group_median_price)

# Clean the category column: trim whitespace and replace missing or empty
# values with a default label.
default_category = "unknown"

raw_category = df["category"]

# Stringify and strip only the values that are actually present. Calling
# .astype(str) on the whole column would turn missing values into the literal
# strings "nan" / "None", which fillna() can no longer detect.
stripped_category = raw_category.where(
    raw_category.isna(),
    raw_category.astype(str).str.strip(),
)

# Whitespace-only entries are empty after stripping; treat them as missing too.
df["category"] = (
    stripped_category
    .mask(stripped_category == "")
    .fillna(default_category)
)

# The "currency" column is not used by the recommender, so remove it.
df = df.drop("currency", axis=1)

# Preview the first five cleaned rows.
df.head(5)
# df.shape

# unique_ratings = df["price"].unique()
# print(unique_ratings)
# df.to_json("updated_products.json", orient="records", indent=4)

# with open("updated_products.json", "r", encoding="utf-8") as file:
#     json_data = file.read().replace("\\/", "/")

# with open("updated_products.json", "w", encoding="utf-8") as file:
#     file.write(json_data)

Unnamed: 0,id,brand,name,price,price_sign,image_link,product_link,website_link,description,rating,category,product_type,tag_list,created_at,updated_at,product_api_url,api_featured_image,product_colors
0,1048,colourpop,Lippie Pencil,5.0,$,https://cdn.shopify.com/s/files/1/1338/0845/co...,https://colourpop.com/collections/lippie-pencil,https://colourpop.com,Lippie Pencil A long-wearing and high-intensit...,4,pencil,lip_liner,"[cruelty free, Vegan]",2018-07-08 23:45:08.056000+00:00,2018-07-09 00:53:23.301000+00:00,https://makeup-api.herokuapp.com/api/v1/produc...,//s3.amazonaws.com/donovanbailey/products/api_...,"[{'hex_value': '#B28378', 'colour_name': 'BFF ..."
1,1047,colourpop,Blotted Lip,5.5,$,https://cdn.shopify.com/s/files/1/1338/0845/pr...,https://colourpop.com/collections/lippie-stix?...,https://colourpop.com,Blotted Lip Sheer matte lipstick that creates ...,3,lipstick,lipstick,"[cruelty free, Vegan]",2018-07-08 22:01:20.178000+00:00,2018-07-09 00:53:23.287000+00:00,https://makeup-api.herokuapp.com/api/v1/produc...,//s3.amazonaws.com/donovanbailey/products/api_...,"[{'hex_value': '#b72227', 'colour_name': 'Bee'..."
2,1046,colourpop,Lippie Stix,5.5,$,https://cdn.shopify.com/s/files/1/1338/0845/co...,https://colourpop.com/collections/lippie-stix,https://colourpop.com,"Lippie Stix Formula contains Vitamin E, Mango,...",2,lipstick,lipstick,"[cruelty free, Vegan]",2018-07-08 21:47:49.858000+00:00,2018-07-09 00:53:23.274000+00:00,https://makeup-api.herokuapp.com/api/v1/produc...,//s3.amazonaws.com/donovanbailey/products/api_...,"[{'hex_value': '#F2DEC3', 'colour_name': 'Fair..."
3,1045,colourpop,No Filter Foundation,12.0,$,https://cdn.shopify.com/s/files/1/1338/0845/pr...,https://colourpop.com/products/no-filter-matte...,https://colourpop.com/products/no-filter-matte...,"Developed for the Selfie Age, our buildable fu...",5,liquid,foundation,"[cruelty free, Vegan]",2018-07-08 18:22:25.273000+00:00,2018-07-09 00:53:23.313000+00:00,https://makeup-api.herokuapp.com/api/v1/produc...,//s3.amazonaws.com/donovanbailey/products/api_...,"[{'hex_value': '#F2DEC3', 'colour_name': 'Fair..."
4,1044,boosh,Lipstick,26.0,$,https://cdn.shopify.com/s/files/1/1016/3243/pr...,https://www.boosh.ca/collections/all,https://www.boosh.ca/,All of our products are free from lead and hea...,4,lipstick,lipstick,"[Chemical Free, Organic]",2018-07-08 17:32:28.088000+00:00,2018-09-02 22:52:06.669000+00:00,https://makeup-api.herokuapp.com/api/v1/produc...,//s3.amazonaws.com/donovanbailey/products/api_...,"[{'hex_value': '#CB4975', 'colour_name': 'Babs..."


In [73]:
# Columns fed to the recommender, grouped by how they are encoded.
numerical_cols = ["price", "rating"]
categorical_cols = ["category", "brand"]
features = numerical_cols + categorical_cols

# Keep only rows where every feature is present.
df = df.dropna(subset=features)

# Numeric columns are standardised; categorical columns are one-hot encoded,
# with categories unseen during fitting ignored at transform time.
numeric_step = ("num", StandardScaler(), numerical_cols)
categorical_step = ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

# Fit the preprocessor and build the feature matrix in one pass.
X = preprocessor.fit_transform(df[features])



In [74]:
# Fit a Euclidean nearest-neighbour index (5 neighbours by default) on the
# preprocessed feature matrix; fit() returns the estimator itself.
nn_model = NearestNeighbors(metric="euclidean", n_neighbors=5).fit(X)
nn_model

In [89]:
def recommend_products(product_name, df, model, preprocessor, top_n=5, feature_cols=None):
    """
    Given a product name, recommend the ids of the most similar products.

    Parameters
    ----------
    product_name : str
        Name to look up; compared case-insensitively against ``df["name"]``.
        If several rows match, the first one is used as the query.
    df : pandas.DataFrame
        Product table with at least ``"name"``, ``"id"`` and the feature
        columns. Neighbour indices returned by ``model`` are used as row
        *positions* in ``df``, so ``df`` must be the same frame (same rows,
        same order) the model was fitted on.
    model : fitted neighbours estimator exposing ``kneighbors``
    preprocessor : fitted transformer exposing ``transform``
    top_n : int, default 5
        Maximum number of recommendations to return.
    feature_cols : sequence of str, optional
        Columns passed to ``preprocessor``. Defaults to the preprocessor's
        ``feature_names_in_`` when available, otherwise to the notebook-level
        ``features`` list.

    Returns
    -------
    list or str
        Ids of up to ``top_n`` nearest products, closest first, never
        including the queried product itself. The string
        ``"Product not found!"`` is returned when no name matches.
    """
    # Locate the query row by position so that it lines up with the
    # positional indices returned by kneighbors().
    name_matches = (
        df["name"].str.lower().eq(product_name.lower()).fillna(False).to_numpy(dtype=bool)
    )
    matching_positions = np.flatnonzero(name_matches)
    if len(matching_positions) == 0:
        return "Product not found!"
    query_pos = int(matching_positions[0])

    # Resolve the feature columns without requiring a notebook global.
    if feature_cols is None:
        feature_cols = getattr(preprocessor, "feature_names_in_", None)
        if feature_cols is None:
            feature_cols = features  # fall back to the notebook-level list
    feature_cols = list(feature_cols)

    # Transform the product's features
    product_features = preprocessor.transform(df.iloc[[query_pos]][feature_cols])

    # Ask for one extra neighbour (the query normally matches itself), but
    # never more than the number of fitted samples, which would raise.
    n_requested = top_n + 1
    n_fitted = getattr(model, "n_samples_fit_", None)
    if n_fitted is not None:
        n_requested = min(n_requested, n_fitted)
    distances, indices = model.kneighbors(product_features, n_neighbors=n_requested)

    # Exclude the query explicitly instead of assuming it is the first hit:
    # with duplicate feature rows another product can tie at distance 0 and be
    # returned ahead of the query.
    neighbor_positions = [int(pos) for pos in indices[0] if pos != query_pos][:top_n]

    return df.iloc[neighbor_positions]["id"].tolist()

# Example usage: look up one product by name and print the ids of its
# nearest neighbours.
product_to_recommend = "Pro Lip Cream Palette"
recommendations = recommend_products(
    product_name=product_to_recommend,
    df=df,
    model=nn_model,
    preprocessor=preprocessor,
)
print(recommendations)


[889, 890, 908, 901, 915]
