# 
Importing the required libraries and loading the scraped Data

In [1]:
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from scipy.sparse import hstack, coo_matrix
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import re
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('wordnet') 
nltk.download('omw-1.4') 
from nltk.stem import WordNetLemmatizer
sns.set_style("darkgrid")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
df = pd.read_csv("kilimall_beauty_health_products_scraped.csv")
df.head()

Unnamed: 0,Product_name,Rating,Price,Discount_rate,Customer_reviews,Category,Sub_Category
0,4 Pcs Men Perfume Set Perfumes Different Fragr...,4.5,999.0,47% off,2923 Customer reviews,Beauty,Fragrance
1,[Promotion] USB Electric Hair Cutting Machine ...,4.4,367.0,(-54%),2782 Customer reviews,Hair Cutting Tools,Hair Clippers & Trimmers
2,(Clearance Price)RichRipple Vintage T9 Hair cu...,4.5,367.0,(-81%),1445 Customer reviews,Hair Cutting Tools,Hair Clippers & Trimmers
3,2 In 1 Curling Iron Hair Straightener Flat Iro...,4.4,859.0,(-70%),1775 Customer reviews,Hair Styling Tools & Appliances,Hair Straighteners
4,2 in 1 Hot Comb Straightener Electric Hair Str...,4.3,898.0,53% off,1540 Customer reviews,Hair Styling Tools & Appliances,Hair Combs


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 718 entries, 0 to 717
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Product_name      718 non-null    object 
 1   Rating            718 non-null    float64
 2   Price             718 non-null    float64
 3   Discount_rate     718 non-null    object 
 4   Customer_reviews  718 non-null    object 
 5   Category          718 non-null    object 
 6   Sub_Category      718 non-null    object 
dtypes: float64(2), object(5)
memory usage: 39.4+ KB


In [4]:
df.isnull().sum()

Product_name        0
Rating              0
Price               0
Discount_rate       0
Customer_reviews    0
Category            0
Sub_Category        0
dtype: int64

In [5]:
df["Category"].value_counts()

Category
Beauty                             212
Face                                64
Body                                42
Men's                               30
Wigs                                29
Hair Styling Tools & Appliances     29
Oral Care                           18
Foot & Hand Care                    17
Vitamins & Dietary Supplements      16
Massage Tools & Equipment           16
Health Monitors                     14
Women's                             14
Skin Care Tools                     14
Sports Nutrition                    14
Hair Cutting Tools                  13
Bath                                12
Styling Products                    11
Feminine Care                       11
Skin Care                           10
Lip Care                            10
Eyes                                10
Home Tests                           9
Deodorants & Antiperspirants         9
Hair Care                            9
Shampoo & Conditioner                9
Sunscreens & Tan

In [6]:
df["Sub_Category"].value_counts()

Sub_Category
Make Up                          117
Fragrance                         45
Make Up Tool & Accessories        32
Facial Essence                    23
Other Wigs                        21
                                ... 
Rollers&Pens                       1
Panty Liners                       1
Hair Color                         1
Bandages & Bandaging Supplies      1
Breast Enhancement Creams          1
Name: count, Length: 179, dtype: int64

# 
Cleaning the Customer reviews column and Discount rate column using RegX 

In [7]:
df['Customer_reviews'] = df['Customer_reviews'].astype(str).apply(lambda x: re.search(r'\d+', x).group(0) if re.search(r'\d+', x) else '0').astype(float)
df['Discount_rate'] = pd.to_numeric(
    df['Discount_rate'].astype(str)
    .str.replace('%', '', regex=False)
    .str.replace('off', '', regex=False)
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
    .str.replace('-', '', regex=False)
    .str.strip(),
    errors='coerce' 
)
df.head()

Unnamed: 0,Product_name,Rating,Price,Discount_rate,Customer_reviews,Category,Sub_Category
0,4 Pcs Men Perfume Set Perfumes Different Fragr...,4.5,999.0,47,2923.0,Beauty,Fragrance
1,[Promotion] USB Electric Hair Cutting Machine ...,4.4,367.0,54,2782.0,Hair Cutting Tools,Hair Clippers & Trimmers
2,(Clearance Price)RichRipple Vintage T9 Hair cu...,4.5,367.0,81,1445.0,Hair Cutting Tools,Hair Clippers & Trimmers
3,2 In 1 Curling Iron Hair Straightener Flat Iro...,4.4,859.0,70,1775.0,Hair Styling Tools & Appliances,Hair Straighteners
4,2 in 1 Hot Comb Straightener Electric Hair Str...,4.3,898.0,53,1540.0,Hair Styling Tools & Appliances,Hair Combs


# 
Performing EDA

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 718 entries, 0 to 717
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Product_name      718 non-null    object 
 1   Rating            718 non-null    float64
 2   Price             718 non-null    float64
 3   Discount_rate     718 non-null    int64  
 4   Customer_reviews  718 non-null    float64
 5   Category          718 non-null    object 
 6   Sub_Category      718 non-null    object 
dtypes: float64(3), int64(1), object(3)
memory usage: 39.4+ KB


In [9]:
df.describe()

Unnamed: 0,Rating,Price,Discount_rate,Customer_reviews
count,718.0,718.0,718.0,718.0
mean,4.477298,734.718663,57.172702,166.023677
std,0.403541,682.1596,20.600219,324.099987
min,1.9,89.0,1.0,0.0
25%,4.3,329.0,42.0,10.0
50%,4.5,527.5,57.0,53.0
75%,4.7,899.0,72.0,173.25
max,5.0,7139.0,99.0,2923.0


In [10]:
df.Rating.value_counts()

Rating
4.5    138
5.0     92
4.7     74
4.3     64
4.6     63
4.4     63
4.8     58
4.2     47
4.1     36
4.0     28
4.9     16
3.0      7
3.9      6
3.8      4
3.7      3
3.4      3
3.6      3
3.5      3
2.7      3
3.1      2
3.3      2
1.9      1
2.6      1
2.3      1
Name: count, dtype: int64

In [11]:
df.columns

Index(['Product_name', 'Rating', 'Price', 'Discount_rate', 'Customer_reviews',
       'Category', 'Sub_Category'],
      dtype='object')

In [12]:
plt.figure(figsize=(12,6))
fig = px.scatter(
    df,
    x='Customer_reviews',   
    y='Discount_rate',      
    color='Rating',           
    hover_name='Product_name',
    hover_data=['Price', 'Discount_rate', 'Customer_reviews','Category', 'Sub_Category', 'Rating'],
    #opacity= 0.7,
    title='Discount Rates vs. Customer Reviews',
    color_continuous_scale=px.colors.sequential.Viridis 
)

fig.update_layout(
    xaxis_title_font_size=14,
    yaxis_title_font_size=14,
    title_font_size=18,
    legend_title_text='Rating'
)

fig.show()

<Figure size 1200x600 with 0 Axes>

# Text Preprocessing

In [13]:
# Initializing NLTK components
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    if not isinstance(text, str):
        return "" 
    text = text.lower()
    
    
    text = re.sub(r'[^\w\s]', '', text) 

    # 3. Remove Numbers (optional, uncomment if desired)
    # text = re.sub(r'\d+', '', text)
    
    # 4. Remove Extra Whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    
    
    words = text.split() 
    words = [word for word in words if word not in stop_words]
    
    
    words = [lemmatizer.lemmatize(word) for word in words]
    
    
    return ' '.join(words)

print("Applying Text Preprocessing ")

df1 = df.copy()

df1['Product_name'] = df1['Product_name'].apply(preprocess_text)

df1['Category'] = df1['Category'].apply(preprocess_text)

df1['Sub_Category'] = df1['Sub_Category'].apply(preprocess_text)

df1.loc[df1["Sub_Category"] == "make tool accessory", "Sub_Category"] = \
    df1.loc[df1["Sub_Category"] == "make tool accessory", "Sub_Category"].str.replace("make tool accessory", "make up tool accessory")
    
df1.loc[df1["Sub_Category"] == "make", "Sub_Category"] = \
    df1.loc[df1["Sub_Category"] == "make", "Sub_Category"].str.replace("make", "make up")
        
df1.head()

Applying Text Preprocessing 


Unnamed: 0,Product_name,Rating,Price,Discount_rate,Customer_reviews,Category,Sub_Category
0,4 pc men perfume set perfume different fragran...,4.5,999.0,47,2923.0,beauty,fragrance
1,promotion usb electric hair cutting machine re...,4.4,367.0,54,2782.0,hair cutting tool,hair clipper trimmer
2,clearance pricerichripple vintage t9 hair cutt...,4.5,367.0,81,1445.0,hair cutting tool,hair clipper trimmer
3,2 1 curling iron hair straightener flat iron s...,4.4,859.0,70,1775.0,hair styling tool appliance,hair straightener
4,2 1 hot comb straightener electric hair straig...,4.3,898.0,53,1540.0,hair styling tool appliance,hair comb


# Feature Engeering

In [14]:
df1['Popularity_Score'] = (df1['Rating'] * 0.6) + (df1['Customer_reviews'] / df1['Customer_reviews'].max() * 0.4)

# Initializing the TF-IDF Vectorizer for cosine similarity calculation
tfidf_vectorizer = TfidfVectorizer(max_features=5000) 

tfidf_matrix = tfidf_vectorizer.fit_transform(df1['Product_name'])
print(f"TF-IDF matrix created for Product_name_processed. Shape: {tfidf_matrix.shape}")

onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

categories_combined = df1[['Category', 'Sub_Category']]
onehot_matrix = onehot_encoder.fit_transform(categories_combined)

# Add this line here to inspect the shape
print(f"Shape of onehot_matrix immediately after fit_transform: {onehot_matrix.shape}")

# Get feature names for the one-hot encoded columns
onehot_feature_names = onehot_encoder.get_feature_names_out(['Category', 'Sub_Category'])
onehot_df = pd.DataFrame(onehot_matrix, columns=onehot_feature_names, index=df1.index)


#Combining Features for Recommendation
# we start by Convertin popularity score to a sparse matrix 
popularity_sparse = coo_matrix(df1['Popularity_Score'].values.reshape(-1, 1))


if not hasattr(onehot_matrix, 'shape') or not hasattr(onehot_matrix, 'dtype'): 
    onehot_matrix = pd.DataFrame(onehot_matrix).sparse.to_coo()

# Ensuring all inputs to hstack are sparse matrices
if not hasattr(tfidf_matrix, 'shape') or not hasattr(tfidf_matrix, 'dtype'):
    tfidf_matrix = pd.DataFrame(tfidf_matrix).sparse.to_coo()


# Combining all relevant features into a single matrix for a content-based model
combined_features_matrix = hstack([tfidf_matrix, onehot_matrix, popularity_sparse])

# Cosine Similarity Calculation 
cosine_sim_matrix = cosine_similarity(combined_features_matrix)
print(f"\nCosine Similarity Matrix calculated. Shape: {cosine_sim_matrix.shape}")


TF-IDF matrix created for Product_name_processed. Shape: (718, 2483)
Shape of onehot_matrix immediately after fit_transform: (718, 223)

Cosine Similarity Matrix calculated. Shape: (718, 718)


# Building the popularity-based + content-based recommendation system

In [15]:
#  Recommendation System Function 
def get_recommendations(product_name, df, cosine_sim_matrix, top_n=5):
    """
    Generates product recommendations based on cosine similarity from a hybrid model.

    Args:
        product_name (str): The name of the product for which to get recommendations.
        df (pd.DataFrame): Your DataFrame containing product information.
        cosine_sim_matrix (np.array): The pre-calculated cosine similarity matrix.
        top_n (int): The number of top recommendations to return.

    Returns:
        pd.DataFrame: A DataFrame of recommended products.
    """
    # Get the index of the product that matches the name (case-insensitive, partial match)
    product_indices = df[df['Product_name'].str.contains(product_name, case=False, na=False)].index.tolist()

    if not product_indices:
        print(f"Product '{product_name}' not found in the DataFrame. Please check the product name.")
        return pd.DataFrame() # Return empty DataFrame if product not found

    # Use the first matching product's index
    product_idx = product_indices[0]

    # Get the similarity scores for this product with all other products
    sim_scores = list(enumerate(cosine_sim_matrix[product_idx]))

    # Sort the products based on the similarity scores in descending order
    # Exclude the product itself (similarity score of 1 with itself)
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:top_n+1]

    # Get the product indices and similarity scores
    recommended_product_indices = [i[0] for i in sim_scores]
    similar_products_scores = [i[1] for i in sim_scores]

    # Return the recommended products with relevant details
    recommended_df = df.iloc[recommended_product_indices].copy()
    recommended_df['Similarity_Score'] = similar_products_scores
    return recommended_df[['Product_name', 'Category', 'Sub_Category', 'Rating', 'Price', 'Popularity_Score', 'Similarity_Score']]

print("\n--- Recommendation system (get_recommendations function) defined. ---")

# --- Example Usage ---
print("\n--- Example Recommendations ---")

# Replace 'Bluetooth Headphones' with a product name from your dataset for a real test
# You can view df1['Product_name'].tolist() to get exact names.
example_product_category = 'Beauty'
recommendations = get_recommendations(example_product_category, df1, cosine_sim_matrix, top_n=2)

if not recommendations.empty:
    print(f"\nTop 5 Recommendations for '{example_product_category}':")
    print(recommendations)
else:
    print(f"\nNo recommendations found for '{example_product_category}'.")

# Another example
example_product_cat_2 = 'Skin Care'
recommendations_2 = get_recommendations(example_product_cat_2, df1, cosine_sim_matrix, top_n=3)

if not recommendations_2.empty:
    print(f"\nTop 3 Recommendations for '{example_product_cat_2}':")
    print(recommendations_2)
else:
    print(f"\nNo recommendations found for '{example_product_cat_2}'.")


--- Recommendation system (get_recommendations function) defined. ---

--- Example Recommendations ---

Top 5 Recommendations for 'Beauty':
                                          Product_name  \
393  2 1 hair straightener ceramic flat iron straig...   
398  mini hair straightener iron pink ceramic strai...   

                        Category       Sub_Category  Rating   Price  \
393  hair styling tool appliance  hair straightener     4.7  1199.0   
398  hair styling tool appliance  hair straightener     4.3   449.0   

     Popularity_Score  Similarity_Score  
393          2.858590          0.978172  
398          2.630496          0.964583  

Top 3 Recommendations for 'Skin Care':
                                          Product_name   Category  \
15   27pcs skin care set vitamin c face cleanser mo...  skin care   
708  3 1 collagen serum retinol serum eye essence c...  skin care   
267  sadoer 3 1 set acne repair serum cream cleanse...  skin care   

      Sub_Category  Rating 

In [16]:
if 'df1' in locals() and not df1.empty:
    unique_combinations = df1[['Category', 'Sub_Category']].drop_duplicates().sort_values(by=['Category', 'Sub_Category'])

    if not unique_combinations.empty:
        output_filename = 'unique_category_sub_category_combinations.csv'
        unique_combinations.to_csv(output_filename, index=False)
        print(f"\nAll unique combinations saved to '{output_filename}'")
    else:
        print("No unique category-sub_category combinations found. Nothing to save.")
else:
    print("df1 is not loaded or is empty. Please ensure your DataFrame is correctly loaded and processed.")


All unique combinations saved to 'unique_category_sub_category_combinations.csv'


In [17]:
def get_recommendations(product_name_query, df, cosine_sim_matrix, top_n=5):
    """
    This function Generates product recommendations based on cosine similarity from a hybrid model.
    It takes a product name query (partial match allowed) to find a seed product.

    Args:
        product_name_query (str): The name of the product for which to get recommendations.
        df (pd.DataFrame): Your DataFrame containing product information.
        cosine_sim_matrix (np.array): The pre-calculated cosine similarity matrix.
        top_n (int): The number of top recommendations to return.

    Returns:
        pd.DataFrame: A DataFrame of recommended products.
    """
    
    product_indices = df[df['Product_name'].str.contains(product_name_query, case=False, na=False)].index.tolist()

    if not product_indices:
        return pd.DataFrame()

    # Use the first matching product's index as the seed
    product_idx = product_indices[0]

    # Get the similarity scores for this product with all other products
    sim_scores = list(enumerate(cosine_sim_matrix[product_idx]))

    # Sort the products based on the similarity scores in descending order Excluding the product itself 
    
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:top_n+1]

    # Get the product indices and similarity scores
    recommended_product_indices = [i[0] for i in sim_scores]
    similar_products_scores = [i[1] for i in sim_scores]

    # Return the recommended products with relevant details (including original Similarity_Score for internal use)
    recommended_df = df.iloc[recommended_product_indices].copy()
    recommended_df['Similarity_Score'] = similar_products_scores
    return recommended_df[['Product_name', 'Category', 'Sub_Category', 'Rating', 'Price', 'Popularity_Score', 'Similarity_Score', 'Discount_rate', 'Customer_reviews']]



def get_recommendations_by_category(category_name, sub_category_name, df, cosine_sim_matrix, top_n=5):
    """
    This Generates product recommendations by finding the most popular product within a
    specified category and sub-category, then recommending similar products.
    Returns only Product_name, Rating, Discount_rate, and Customer_reviews.

    Args:
        category_name (str): The desired category.
        sub_category_name (str): The desired sub-category.
        df (pd.DataFrame): Your DataFrame containing product information.
        cosine_sim_matrix (np.array): The pre-calculated cosine similarity matrix.
        top_n (int): The number of top recommendations to return.

    Returns:
        pd.DataFrame: A DataFrame of recommended products with specified columns.
    """
    clean_input_category = re.sub(r'\W+', ' ', str(category_name)).lower().strip()
    clean_input_sub_category = re.sub(r'\W+', ' ', str(sub_category_name)).lower().strip()

    filtered_products = df[
        (df['Category'] == clean_input_category) &
        (df['Sub_Category'] == clean_input_sub_category)
    ].copy()

    if filtered_products.empty:
        print(f"No products found for Category: '{category_name}' and Sub-Category: '{sub_category_name}'.")
        return pd.DataFrame()

    most_popular_product = filtered_products.sort_values(by='Popularity_Score', ascending=False).iloc[0]
    seed_product_name = most_popular_product['Product_name']

    recommendations_df = get_recommendations(seed_product_name, df, cosine_sim_matrix, top_n=top_n)

    if recommendations_df.empty:
        print(f"Could not find similar recommendations for '{seed_product_name}'.")
        return pd.DataFrame()

    return recommendations_df[['Product_name', 'Rating', 'Discount_rate', 'Customer_reviews']]




user_category = input("Enter the desired product Category (e.g., Health & Beauty): ")
user_sub_category = input("Enter the desired product Sub-Category (e.g., Skincare): ")
num_recommendations = int(input("Enter the number of recommendations you want (e.g., 5): ") or 5) 

category_recommendations = get_recommendations_by_category(
    user_category,
    user_sub_category,
    df1, 
    cosine_sim_matrix, 
    top_n=num_recommendations
)

if not category_recommendations.empty:
    print(f"\nTop {num_recommendations} Recommendations for Category '{user_category}' and Sub-Category '{user_sub_category}':")
    
    for index, product in category_recommendations.iterrows():
        print(f"\nProduct Name: {product['Product_name']}")
        print(f"Rating: {product['Rating']}")
        print(f"Discount Rate: {product['Discount_rate']}%") 
        print(f"Customer Reviews: {product['Customer_reviews']}")
else:
    print(f"\nNo recommendations found for the specified Category and Sub-Category. Please confirm the possible combinations an try again.")


Top 4 Recommendations for Category 'body' and Sub-Category 'body wash':

Product Name: 3pcs kojie san skin lightening soap original classic kojic acid soap dark spot hyperpigmentation whitening scar beauty coconut tea tree oil fair glowing flawless even skin
Rating: 4.6
Discount Rate: 58%
Customer Reviews: 13.0

Product Name: 3 piece kojie san skin lightening soap original classic kojic acid soap dark spot hyperpigmentation whitening scar beauty bar coconut tea tree oil fair skin
Rating: 4.2
Discount Rate: 69%
Customer Reviews: 176.0

Product Name: kojie san skin lightening soap original classic kojic acid soap dark spot hyperpigmentation whitening scar beauty bar coconut tea tree oil fair glowing flawless even skin tone
Rating: 4.2
Discount Rate: 75%
Customer Reviews: 16.0

Product Name: kojie san skin lightening soap original classic kojic acid soap dark spot hyperpigmentation whitening scar beauty bar coconut tea tree oil fair glowing flawless even skin tone
Rating: 4.1
Discount Ra