## Set up Kaggle API

In [1]:
%pip install -qU kaggle python-dotenv pandas scikit-learn


Note: you may need to restart the kernel to use updated packages.


In [3]:
import os
from dotenv import load_dotenv
from kaggle.api.kaggle_api_extended import KaggleApi

# load environment variables from .env file
load_dotenv()

kaggle_username = os.getenv('KAGGLE_USERNAME')
kaggle_key = os.getenv('KAGGLE_KEY')

# authenticate Kaggle API
api = KaggleApi()
api.authenticate()

dataset_name = 'nadyinky/sephora-products-and-skincare-reviews'  
download_path = 'data/' 

# Download dataset
api.dataset_download_files(dataset_name, path=download_path, unzip=True)

print(f"Dataset '{dataset_name}' has been downloaded and extracted to '{download_path}'")

Dataset URL: https://www.kaggle.com/datasets/nadyinky/sephora-products-and-skincare-reviews
Dataset 'nadyinky/sephora-products-and-skincare-reviews' has been downloaded and extracted to 'data/'


## Load Product Data

These selected columns from products and reviews are crucial for building an Item-Based Collaborative Filtering + Content-Based Filtering hybrid recommendation system.

Each row represents a unique skincare product, with essential features for content-based filtering.

Column	Why It’s Important?
product_id	Unique identifier for each product. Needed to match with reviews.
product_name	Helps display recommendations and analyze product popularity.
brand_name	Some users may prefer products from specific brands.
ingredients	Core feature for content-based filtering—we’ll use TF-IDF to compute similarity between products based on their ingredients.
highlights	Includes skincare concerns and attributes (e.g., “Hydrating,” “Anti-Aging”), which are useful for recommending similar products.
price_usd	Helps users filter recommendations based on budget constraints.
primary_category, secondary_category, tertiary_category	Helps group products by skincare type, allowing for better recommendations (e.g., comparing only moisturizers).

* Content-Based Filtering: Uses ingredients + highlights to recommend similar products.
* Hybrid Model: Helps Item-Based CF refine its recommendations based on product category.

In [4]:
import pandas as pd

products_df = pd.read_csv("data/product_info.csv")

products_df = products_df[["product_id", "product_name", "brand_name", "ingredients", "highlights",
                           "price_usd", "primary_category", "secondary_category", "tertiary_category"]]

print(products_df.head())

  product_id               product_name brand_name  \
0    P473671    Fragrance Discovery Set      19-69   
1    P473668    La Habana Eau de Parfum      19-69   
2    P473662  Rainbow Bar Eau de Parfum      19-69   
3    P473660       Kasbah Eau de Parfum      19-69   
4    P473658  Purple Haze Eau de Parfum      19-69   

                                         ingredients  \
0  ['Capri Eau de Parfum:', 'Alcohol Denat. (SD A...   
1  ['Alcohol Denat. (SD Alcohol 39C), Parfum (Fra...   
2  ['Alcohol Denat. (SD Alcohol 39C), Parfum (Fra...   
3  ['Alcohol Denat. (SD Alcohol 39C), Parfum (Fra...   
4  ['Alcohol Denat. (SD Alcohol 39C), Parfum (Fra...   

                                          highlights  price_usd  \
0  ['Unisex/ Genderless Scent', 'Warm &Spicy Scen...       35.0   
1  ['Unisex/ Genderless Scent', 'Layerable Scent'...      195.0   
2  ['Unisex/ Genderless Scent', 'Layerable Scent'...      195.0   
3  ['Unisex/ Genderless Scent', 'Layerable Scent'...      195.0   
4  

## Load and Combine Review Data

Each row represents a user review for a skincare product, which is essential for Item-Based Collaborative Filtering.

Column	Why It’s Important?
author_id	Unique identifier for each user. Helps build a user-product interaction matrix.
product_id	Links reviews to products in the product table (needed for collaborative filtering).
rating	Explicit feedback on a scale of 1-5, used to compute product similarity.
is_recommended	Can be used as an additional signal to infer user preference.
review_text	Optional—if included, can be used for sentiment analysis.
skin_type	Useful for personalized recommendations (e.g., recommend only dry skin products to dry skin users).

* Item-Based Collaborative Filtering: Uses ratings to compute similarity between products.
* Personalization: Skin type can help filter recommendations for users with specific skincare needs.

In [5]:
review_files = ["data/reviews_0-250.csv", "data/reviews_250-500.csv", "data/reviews_500-750.csv",
                "data/reviews_750-1250.csv", "data/reviews_1250-end.csv"]

# ensures all columns have a consistent format
dtype_dict = {
    "author_id": str,         # Treat user IDs as strings (some IDs may be too large for integers)
    "product_id": str,        # Product IDs should be strings
    "rating": float,          # Ratings should be numeric (float)
    "is_recommended": float,  # Ensure binary values are float
    "review_text": str,       # Review text should always be a string
    "skin_type": str          # Skin type should always be a string
}

# low_memory prevents pandas from guessing column types incorrectly
reviews_list = [pd.read_csv(file, dtype=dtype_dict, low_memory=False) for file in review_files]

# Concatenate all reviews into a single DataFrame
reviews_df = pd.concat(reviews_list, ignore_index=True)

# relevant columns
reviews_df = reviews_df[["author_id", "product_id", "rating", "is_recommended", "review_text", "skin_type"]]

print(reviews_df.head())

     author_id product_id  rating  is_recommended  \
0   1741593524    P504322     5.0             1.0   
1  31423088263    P420652     1.0             0.0   
2   5061282401    P420652     5.0             1.0   
3   6083038851    P420652     5.0             1.0   
4  47056667835    P420652     5.0             1.0   

                                         review_text    skin_type  
0  I use this with the Nudestix “Citrus Clean Bal...          dry  
1  I bought this lip mask after reading the revie...          NaN  
2  My review title says it all! I get so excited ...          dry  
3  I’ve always loved this formula for a long time...  combination  
4  If you have dry cracked lips, this is a must h...  combination  


## Clean and Process the Data

In [6]:
# handling missing ingredient values with an empty string
products_df["ingredients"] = products_df["ingredients"].fillna("")
reviews_df["review_text"] = reviews_df["review_text"].fillna("")
reviews_df["skin_type"] = reviews_df["skin_type"].fillna("Unknown")

In [7]:
# normalize text columns
products_df["ingredients"] = products_df["ingredients"].str.lower()
products_df["highlights"] = products_df["highlights"].str.lower()

In [8]:
# convert ratings to numeric
reviews_df["rating"] = pd.to_numeric(reviews_df["rating"], errors="coerce")

## Build the Content Based Filtering Model

### Convert Ingredients & Highlights to TF-IDF Vectors

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Ensure 'ingredients' and 'highlights' exist before combining
products_df["ingredients"] = products_df["ingredients"].fillna("")
products_df["highlights"] = products_df["highlights"].fillna("")

# Create the 'combined_features' column
products_df["combined_features"] = products_df["ingredients"] + " " + products_df["highlights"]

# Now fill NaN values for safety
products_df["combined_features"] = products_df["combined_features"].fillna("")

# Convert text to TF-IDF matrix
tfidf_vectorizer = TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf_vectorizer.fit_transform(products_df["combined_features"])

# Check shape
print(tfidf_matrix.shape)

(8494, 8973)


### Compute Cosine Similarity

In [10]:
from sklearn.metrics.pairwise import cosine_similarity

# Compute similarity scores between products
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Convert similarity matrix to DataFrame
cosine_sim_df = pd.DataFrame(cosine_sim, index=products_df["product_id"], columns=products_df["product_id"])

# Check similarity for a product
print(cosine_sim_df.head())

product_id   P473671   P473668   P473662   P473660   P473658   P473661  \
product_id                                                               
P473671     1.000000  0.749470  0.745681  0.744664  0.717759  0.744664   
P473668     0.749470  1.000000  0.749865  0.868884  0.703762  0.868884   
P473662     0.745681  0.749865  1.000000  0.865095  0.871173  0.865095   
P473660     0.744664  0.868884  0.865095  1.000000  0.797130  1.000000   
P473658     0.717759  0.703762  0.871173  0.797130  1.000000  0.797130   

product_id   P473659   P473666   P472300   P473667  ...   P467660  P306857  \
product_id                                          ...                      
P473671     0.717759  0.673687  0.757190  0.673687  ...  0.029302      0.0   
P473668     0.703762  0.708656  0.742881  0.708656  ...  0.028391      0.0   
P473662     0.871173  0.643791  0.779213  0.643791  ...  0.012342      0.0   
P473660     0.797130  0.665166  0.769431  0.665166  ...  0.012752      0.0   
P473658     1

## Build the Item-Based Collaborative Filtering Model

### Create User-Product Rating Matrix

In [11]:
# "Index contains duplicate entries, cannot reshape"
# aggregate by taking the mean rating per (product_id, author_id)
reviews_df = reviews_df.groupby(["product_id", "author_id"], as_index=False).agg({"rating": "mean"})

# Now pivot the table
user_product_matrix = reviews_df.pivot(index="product_id", columns="author_id", values="rating")

# Fill missing values with 0 (unrated products)
user_product_matrix = user_product_matrix.fillna(0)

print(user_product_matrix.shape)

(2351, 503216)


### Compute Product Similarity

In [12]:
# Compute cosine similarity between products based on user ratings
item_similarity = cosine_similarity(user_product_matrix)

# Convert to DataFrame
item_sim_df = pd.DataFrame(item_similarity, index=user_product_matrix.index, columns=user_product_matrix.index)

# Check product similarity
print(item_sim_df.head())

product_id   P107306   P114902    P12045   P122651   P122661   P122718  \
product_id                                                               
P107306     1.000000  0.001792  0.003287  0.000000  0.000582  0.000000   
P114902     0.001792  1.000000  0.005654  0.000000  0.003570  0.008067   
P12045      0.003287  0.005654  1.000000  0.001163  0.001090  0.005243   
P122651     0.000000  0.000000  0.001163  1.000000  0.022310  0.006326   
P122661     0.000582  0.003570  0.001090  0.022310  1.000000  0.033923   

product_id   P122727   P122762   P122767   P122774  ...    P54509     P6028  \
product_id                                          ...                       
P107306     0.004229  0.002202  0.000000  0.002752  ...  0.002027  0.008101   
P114902     0.000000  0.003998  0.000526  0.008915  ...  0.001398  0.005952   
P12045      0.000000  0.004006  0.000000  0.002525  ...  0.003766  0.026335   
P122651     0.000000  0.007804  0.000000  0.013150  ...  0.003443  0.012123   
P122661

## Hybrid Recommender – Combining Both Models

In [13]:
# Find missing products
missing_products = list(set(cosine_sim_df.index) - set(item_sim_df.index))

# Create a DataFrame with zeros for missing products
missing_sim_matrix = pd.DataFrame(0, index=missing_products, columns=item_sim_df.columns)

# Append to the collaborative filtering similarity matrix
item_sim_df = pd.concat([item_sim_df, missing_sim_matrix])

# Now add missing products as columns (ensuring symmetry)
missing_sim_matrix = pd.DataFrame(0, index=item_sim_df.index, columns=missing_products)
item_sim_df = pd.concat([item_sim_df, missing_sim_matrix], axis=1)

# Sort rows and columns to match content-based filtering
item_sim_df = item_sim_df.loc[cosine_sim_df.index, cosine_sim_df.index]

In [14]:
# Get product IDs in each matrix
content_products = set(cosine_sim_df.index)
cf_products = set(item_sim_df.index)

# Find missing products in each matrix
missing_in_content = cf_products - content_products
missing_in_cf = content_products - cf_products

print(f"Products in CF but missing in Content-Based: {len(missing_in_content)}")
print(f"Products in Content-Based but missing in CF: {len(missing_in_cf)}")

Products in CF but missing in Content-Based: 0
Products in Content-Based but missing in CF: 0


In [15]:
alpha = 0.7  # Weight for Content-Based Filtering (higher means more importance)
hybrid_sim = (alpha * cosine_sim_df) + ((1 - alpha) * item_sim_df)

# Check hybrid similarity scores
print(hybrid_sim.head())

product_id   P473671   P473668   P473662   P473660   P473658   P473661  \
product_id                                                               
P473671     0.700000  0.524629  0.521977  0.521265  0.502432  0.521265   
P473668     0.524629  0.700000  0.524906  0.608219  0.492633  0.608219   
P473662     0.521977  0.524906  0.700000  0.605566  0.609821  0.605566   
P473660     0.521265  0.608219  0.605566  0.700000  0.557991  0.700000   
P473658     0.502432  0.492633  0.609821  0.557991  0.700000  0.557991   

product_id   P473659   P473666   P472300   P473667  ...   P467660  P306857  \
product_id                                          ...                      
P473671     0.502432  0.471581  0.530033  0.471581  ...  0.020511      0.0   
P473668     0.492633  0.496059  0.520016  0.496059  ...  0.019873      0.0   
P473662     0.609821  0.450654  0.545449  0.450654  ...  0.008639      0.0   
P473660     0.557991  0.465616  0.538602  0.465616  ...  0.008926      0.0   
P473658     0

## Making Product Recommendations

In [16]:
products_df["product_id"].sample(5)

3716    P417222
4426    P504465
6160    P378660
8467    P500265
1934    P416366
Name: product_id, dtype: object

In [17]:
def recommend_products(product_id, num_recommendations=5):
    # get product similarity scores
    scores = hybrid_sim[product_id].sort_values(ascending=False)
    
    # exclude itself from recommendations
    recommended_products = scores.iloc[1:num_recommendations+1]
    
    # merge with product info
    return products_df[products_df["product_id"].isin(recommended_products.index)][["product_name", "brand_name", "price_usd"]]

# recommend similar products for a given product ID
recommend_products("P473658") 

Unnamed: 0,product_name,brand_name,price_usd
2,Rainbow Bar Eau de Parfum,19-69,195.0
3,Kasbah Eau de Parfum,19-69,195.0
5,Kasbah Eau de Parfum Travel Spray,19-69,30.0
6,Purple Haze Eau de Parfum Travel Spray,19-69,30.0
10,Rainbow Bar Eau de Parfum Travel Spray,19-69,30.0


## Save the Trained Models

In [33]:
import pickle

# Save the hybrid similarity matrix
with open("hybrid_similarity.pkl", "wb") as file:
    pickle.dump(hybrid_sim, file)

# Save product metadata for lookup
products_df.to_csv("metadata/products.csv", index=False)