In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.neighbors import NearestNeighbors
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import joblib

df = pd.read_json("products.json")

# Fill missing ratings with random integers from 1 to 5.
# A dedicated, seeded generator is used instead of the global NumPy RNG so a
# Restart & Run All reproduces exactly the same imputed ratings (and therefore
# the same fitted model further down).
RATING_SEED = 42
rating_rng = np.random.default_rng(RATING_SEED)
rating_missing = df["rating"].isna()
# `integers(1, 6)` draws from the half-open range [1, 6), i.e. 1..5.
random_ratings = pd.Series(rating_rng.integers(1, 6, size=len(df)), index=df.index)
# Existing ratings are truncated to int, as before; only null entries are replaced.
df["rating"] = df["rating"].mask(rating_missing, random_ratings).astype(int)

# Convert "price" to numeric; anything that cannot be parsed becomes NaN.
df["price"] = pd.to_numeric(df["price"], errors="coerce")

# Impute missing / zero prices with the median of the valid (> 0) prices that
# share the same category and brand. A single grouped transform replaces the
# previous row-by-row filtering of the whole frame, which was quadratic in the
# number of rows.
valid_prices = df["price"].where(df["price"] > 0)  # NaN, 0 and negatives do not feed the median
group_median = valid_prices.groupby([df["category"], df["brand"]]).transform("median")
# NOTE(review): rows with a null category/brand belong to no group and are
# expected to get NaN here (pandas >= 1.5 behaviour), matching the original
# equality-based filter -- confirm on the pandas version in use.
needs_imputation = df["price"].isna() | (df["price"] == 0)
df["price"] = df["price"].mask(needs_imputation, group_median)

# Prices that are still NaN (no usable median in the group) fall back to 1.0,
# and every price is floored at 1.0 so none is zero or negative.
df["price"] = df["price"].fillna(1.0).clip(lower=1.0)


# Clean the category column: trim whitespace and replace missing or blank
# values with a default label.
default_category = "unknown"

# Record nulls BEFORE casting to str: `.astype(str)` turns NaN/None into the
# literal strings "nan"/"None", which a later fillna() can no longer detect.
category_missing = df["category"].isna()
category_text = df["category"].astype(str).str.strip()
df["category"] = category_text.mask(
    category_missing | (category_text == ""),
    default_category,
)

# Drop the "currency" column
df = df.drop(columns=["currency"])

# Quick sanity check of the cleaned price values.
unique_prices = df["price"].unique()
print(unique_prices)

# Serialise once in memory, un-escape the "\/" sequences that pandas emits for
# forward slashes, and write the file a single time (previously the file was
# written, read back, and rewritten).
json_data = df.to_json(orient="records", indent=4).replace("\\/", "/")

with open("updated_products.json", "w", encoding="utf-8") as file:
    file.write(json_data)

[ 5.    5.5  12.   26.    6.7   6.9   1.    9.95 22.   27.   32.   20.
  9.    7.   15.    9.5   8.    6.    9.75 10.    8.75 13.    6.5   4.75
 11.    4.    4.5   3.5   7.5  18.   14.    8.5  19.   25.   28.   36.
 17.5  24.   29.5  33.   19.5  39.   38.5  37.   22.25 21.   17.   16.5
 28.5  27.3  34.5  23.   34.   18.9  18.5  15.4  20.5  43.   27.5  26.5
 25.5  65.   45.   43.5  22.5  32.5  36.5  39.5  40.   23.5  29.   77.
 44.   75.   42.   30.   38.   16.   31.   35.   49.   50.   55.   52.
 19.99 46.99 10.49 30.96 11.99 13.99  7.99 14.99 20.99 14.49  3.99 10.29
  8.99  9.99 21.99  4.99 15.99 12.99 28.96 23.49  6.99 13.49 16.49 15.49
 51.   11.49 10.99 11.29  8.29 20.49 17.99 22.99 27.49 14.79 18.29 25.99
 29.99 12.49 18.49 16.99 18.99  9.49 26.99 17.29 18.79 24.49 15.79  4.79
  4.49  6.49  5.99 56.49  1.99 60.    7.49  7.29  9.29  8.96  5.49 27.96
  9.39 21.5   7.79 19.29  2.99 22.49 13.96 11.79  3.49 10.79 13.79  9.79
 13.29]


In [73]:
# Columns fed to the model, split by how they are encoded.
numerical_cols = ["price", "rating"]
categorical_cols = ["category", "brand"]
features = numerical_cols + categorical_cols  # ["price", "rating", "category", "brand"]

# Keep only rows that have a value for every model feature.
df = df.dropna(subset=features)

# Scale the numeric columns and one-hot encode the categorical ones; categories
# not seen during fit are ignored at transform time.
numeric_step = ("num", StandardScaler(), numerical_cols)
categorical_step = ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

# Fit the preprocessor and build the feature matrix in one pass.
X = preprocessor.fit_transform(df[features])



In [91]:
# Fit a Euclidean nearest-neighbours index on the transformed feature matrix.
nn_model = NearestNeighbors(metric="euclidean", n_neighbors=5)
nn_model.fit(X)

# Persist the model together with the fitted preprocessor and the product frame
# as a single tuple in one pickle file.
joblib.dump(
    (nn_model, preprocessor, df),
    "nearest_neighbors_model.pkl",
)

['nearest_neighbors_model.pkl']

In [89]:
def recommend_products(product_name, df, model, preprocessor, top_n=5, feature_cols=None):
    """
    Given a product name, recommend similar products.

    Parameters
    ----------
    product_name : str
        Name of the product to look up (case-insensitive exact match against
        the "name" column).
    df : pandas.DataFrame
        Product frame containing at least "name", "id" and the feature columns.
        Assumed to be row-aligned with the data `model` was fitted on, because
        neighbour indices are used as row positions -- TODO confirm for callers
        that load the model from disk.
    model : object
        Fitted neighbours model exposing `kneighbors(X, n_neighbors=...)`.
    preprocessor : object
        Fitted transformer exposing `transform(frame)`.
    top_n : int, default 5
        Maximum number of recommendations to return.
    feature_cols : list of str, optional
        Columns passed to `preprocessor`. Defaults to the column names the
        preprocessor recorded when it was fitted (`feature_names_in_`), falling
        back to the notebook-level `features` list when that is unavailable.

    Returns
    -------
    list or str
        The "id" values of up to `top_n` nearest products, never including the
        queried product itself, or the string "Product not found!" when no
        product has that name.
    """
    # Locate the product by row POSITION (not index label): neighbour indices
    # returned by the model are positional, so both sides must agree.
    name_matches = df["name"].str.lower().eq(product_name.lower())
    match_positions = np.flatnonzero(name_matches.fillna(False).to_numpy(dtype=bool))
    if match_positions.size == 0:
        return "Product not found!"
    query_pos = int(match_positions[0])

    if feature_cols is None:
        fitted_names = getattr(preprocessor, "feature_names_in_", None)
        feature_cols = list(fitted_names) if fitted_names is not None else features

    # Transform the product's features
    product_features = preprocessor.transform(df.iloc[[query_pos]][feature_cols])

    # Ask for one extra neighbour (the product itself is normally among them),
    # but never more than the number of samples the model holds.
    n_available = getattr(model, "n_samples_fit_", len(df))
    n_query = min(top_n + 1, n_available)
    _, indices = model.kneighbors(product_features, n_neighbors=n_query)

    # Drop the queried product explicitly rather than assuming it is the first
    # hit: products with identical features tie at distance 0, so the query can
    # appear anywhere in (or be absent from) the neighbour list.
    neighbor_positions = [int(pos) for pos in indices[0] if int(pos) != query_pos][:top_n]

    return df.iloc[neighbor_positions]["id"].tolist()

# Example usage: look up one product by name and print the ids of its nearest neighbours.
product_to_recommend = "Pro Lip Cream Palette"
recommendations = recommend_products(
    product_name=product_to_recommend,
    df=df,
    model=nn_model,
    preprocessor=preprocessor,
)
print(recommendations)


[889, 890, 908, 901, 915]
