In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Import Dependencies

In [2]:
import os
import pandas as pd
import numpy as np
import sklearn
import matplotlib as plt
import seaborn as sns
from IPython import display

## Data Collection & Preprocessing

In [3]:
# File path
train_file_path = '/content/drive/MyDrive/product_info.csv'

# Columns to extract
columns_to_keep = [
    "product_id",
    "product_name",
    "brand_id",
    "brand_name",
    "loves_count",
    "rating",
    "reviews",
    "price_usd",
    "limited_edition",
    "new",
    "highlights",
    "primary_category",
    "secondary_category"
]

# Load the dataset with only the specified columns
try:
    df = pd.read_csv(train_file_path, usecols=columns_to_keep, na_values='?')
    print("Dataset loaded successfully.")
    print("Shape of the filtered dataset:", df.shape)
    print(df.head())  # Display the first few rows of the filtered dataset
except Exception as e:
    print(f"Error loading the dataset: {e}")


Dataset loaded successfully.
Shape of the filtered dataset: (8494, 13)
  product_id               product_name  brand_id brand_name  loves_count  \
0    P473671    Fragrance Discovery Set      6342      19-69         6320   
1    P473668    La Habana Eau de Parfum      6342      19-69         3827   
2    P473662  Rainbow Bar Eau de Parfum      6342      19-69         3253   
3    P473660       Kasbah Eau de Parfum      6342      19-69         3018   
4    P473658  Purple Haze Eau de Parfum      6342      19-69         2691   

   rating  reviews  price_usd  limited_edition  new  \
0  3.6364     11.0       35.0                0    0   
1  4.1538     13.0      195.0                0    0   
2  4.2500     16.0      195.0                0    0   
3  4.4762     21.0      195.0                0    0   
4  3.2308     13.0      195.0                0    0   

                                          highlights primary_category  \
0  ['Unisex/ Genderless Scent', 'Warm &Spicy Scen...        Fr

Number of rows and columns

In [4]:
df.shape

(8494, 13)

Transposes the first 5 rows of the DataFrame

In [5]:
df.head().T

Unnamed: 0,0,1,2,3,4
product_id,P473671,P473668,P473662,P473660,P473658
product_name,Fragrance Discovery Set,La Habana Eau de Parfum,Rainbow Bar Eau de Parfum,Kasbah Eau de Parfum,Purple Haze Eau de Parfum
brand_id,6342,6342,6342,6342,6342
brand_name,19-69,19-69,19-69,19-69,19-69
loves_count,6320,3827,3253,3018,2691
rating,3.6364,4.1538,4.25,4.4762,3.2308
reviews,11.0,13.0,16.0,21.0,13.0
price_usd,35.0,195.0,195.0,195.0,195.0
limited_edition,0,0,0,0,0
new,0,0,0,0,0


Getting some statistical measures about the data

In [6]:
df.describe()

Unnamed: 0,brand_id,loves_count,rating,reviews,price_usd,limited_edition,new
count,8494.0,8494.0,8216.0,8216.0,8494.0,8494.0,8494.0
mean,5422.440546,29179.57,4.194513,448.545521,51.655595,0.070285,0.071698
std,1709.595957,66092.12,0.516694,1101.982529,53.669234,0.255642,0.258002
min,1063.0,0.0,1.0,1.0,3.0,0.0,0.0
25%,5333.0,3758.0,3.981725,26.0,25.0,0.0,0.0
50%,6157.5,9880.0,4.28935,122.0,35.0,0.0,0.0
75%,6328.0,26841.25,4.530525,418.0,58.0,0.0,0.0
max,8020.0,1401068.0,5.0,21281.0,1900.0,1.0,1.0


Checking the unique values

In [7]:
df.nunique()

Unnamed: 0,0
product_id,8494
product_name,8415
brand_id,304
brand_name,304
loves_count,7436
rating,4394
reviews,1556
price_usd,298
limited_edition,2
new,2


Getting some informations about the data

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8494 entries, 0 to 8493
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   product_id          8494 non-null   object 
 1   product_name        8494 non-null   object 
 2   brand_id            8494 non-null   int64  
 3   brand_name          8494 non-null   object 
 4   loves_count         8494 non-null   int64  
 5   rating              8216 non-null   float64
 6   reviews             8216 non-null   float64
 7   price_usd           8494 non-null   float64
 8   limited_edition     8494 non-null   int64  
 9   new                 8494 non-null   int64  
 10  highlights          6287 non-null   object 
 11  primary_category    8494 non-null   object 
 12  secondary_category  8486 non-null   object 
dtypes: float64(3), int64(4), object(6)
memory usage: 862.8+ KB


Check the number of missing values in each column

In [9]:
df.isna().sum()

Unnamed: 0,0
product_id,0
product_name,0
brand_id,0
brand_name,0
loves_count,0
rating,278
reviews,278
price_usd,0
limited_edition,0
new,0


## Handling the Missing Values

In [10]:
from sklearn.impute import SimpleImputer

# Separate numerical and categorical columns
num_cols = df.select_dtypes(include=['float64', 'int64']).columns
cat_cols = df.select_dtypes(include=['object']).columns

# Impute numerical columns with the mean
num_imp = SimpleImputer(strategy='mean')
df_num_imp = pd.DataFrame(num_imp.fit_transform(df[num_cols]), columns=num_cols)

# Impute categorical columns with the most frequent value
cat_imp = SimpleImputer(strategy='most_frequent')
df_cat_imp = pd.DataFrame(cat_imp.fit_transform(df[cat_cols]), columns=cat_cols)

# Combine the imputed numerical and categorical data back into one dataframe
df_imp = pd.concat([df_num_imp, df_cat_imp], axis=1)

# Check the shape of the imputed dataframe
print(df_imp.shape)

# Print the first few rows to verify the changes
print(df_imp.head())

(8494, 13)
   brand_id  loves_count  rating  reviews  price_usd  limited_edition  new  \
0    6342.0       6320.0  3.6364     11.0       35.0              0.0  0.0   
1    6342.0       3827.0  4.1538     13.0      195.0              0.0  0.0   
2    6342.0       3253.0  4.2500     16.0      195.0              0.0  0.0   
3    6342.0       3018.0  4.4762     21.0      195.0              0.0  0.0   
4    6342.0       2691.0  3.2308     13.0      195.0              0.0  0.0   

  product_id               product_name brand_name  \
0    P473671    Fragrance Discovery Set      19-69   
1    P473668    La Habana Eau de Parfum      19-69   
2    P473662  Rainbow Bar Eau de Parfum      19-69   
3    P473660       Kasbah Eau de Parfum      19-69   
4    P473658  Purple Haze Eau de Parfum      19-69   

                                          highlights primary_category  \
0  ['Unisex/ Genderless Scent', 'Warm &Spicy Scen...        Fragrance   
1  ['Unisex/ Genderless Scent', 'Layerable Scent'

In [11]:
print(df_cat_imp.isna().sum())
print(df_num_imp.isna().sum())

product_id            0
product_name          0
brand_name            0
highlights            0
primary_category      0
secondary_category    0
dtype: int64
brand_id           0
loves_count        0
rating             0
reviews            0
price_usd          0
limited_edition    0
new                0
dtype: int64


## Outlier Detection and Handling

In [12]:
# List of numerical columns to check for outliers
numerical_columns = ['rating', 'loves_count', 'reviews']

# Loop through each numerical column to detect outliers
for col in numerical_columns:
    Q1 = df[col].quantile(0.25)  # 25th percentile
    Q3 = df[col].quantile(0.75)  # 75th percentile
    IQR = Q3 - Q1  # Interquartile range

    # Calculate the lower and upper bounds for outliers
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Identify outliers
    outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)]

# Remove outliers for each numerical column
for col in numerical_columns:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Filter out rows where the value is an outlier
    df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]

print("Data after removing outliers:")
print(df.head())

Data after removing outliers:
  product_id               product_name  brand_id brand_name  loves_count  \
0    P473671    Fragrance Discovery Set      6342      19-69         6320   
1    P473668    La Habana Eau de Parfum      6342      19-69         3827   
2    P473662  Rainbow Bar Eau de Parfum      6342      19-69         3253   
3    P473660       Kasbah Eau de Parfum      6342      19-69         3018   
4    P473658  Purple Haze Eau de Parfum      6342      19-69         2691   

   rating  reviews  price_usd  limited_edition  new  \
0  3.6364     11.0       35.0                0    0   
1  4.1538     13.0      195.0                0    0   
2  4.2500     16.0      195.0                0    0   
3  4.4762     21.0      195.0                0    0   
4  3.2308     13.0      195.0                0    0   

                                          highlights primary_category  \
0  ['Unisex/ Genderless Scent', 'Warm &Spicy Scen...        Fragrance   
1  ['Unisex/ Genderless Scent'

## Feature Scaling / Normalization

In [13]:
from sklearn.preprocessing import MinMaxScaler

# Initialize the MinMaxScaler
scaler = MinMaxScaler()

# Apply Min-Max Scaling
df[numerical_columns] = scaler.fit_transform(df[numerical_columns])

# Display the scaled data
print(df.head())

  product_id               product_name  brand_id brand_name  loves_count  \
0    P473671    Fragrance Discovery Set      6342      19-69     0.096675   
1    P473668    La Habana Eau de Parfum      6342      19-69     0.058540   
2    P473662  Rainbow Bar Eau de Parfum      6342      19-69     0.049760   
3    P473660       Kasbah Eau de Parfum      6342      19-69     0.046165   
4    P473658  Purple Haze Eau de Parfum      6342      19-69     0.041163   

     rating   reviews  price_usd  limited_edition  new  \
0  0.258913  0.013717       35.0                0    0   
1  0.540109  0.016461      195.0                0    0   
2  0.592391  0.020576      195.0                0    0   
3  0.715326  0.027435      195.0                0    0   
4  0.038478  0.016461      195.0                0    0   

                                          highlights primary_category  \
0  ['Unisex/ Genderless Scent', 'Warm &Spicy Scen...        Fragrance   
1  ['Unisex/ Genderless Scent', 'Layerable

## Feature Engineering

In [14]:
# Create interaction terms
df['rating_reviews'] = df['rating'] * df['reviews']
df['rating_loves_count_reviews'] = df['rating'] * df['loves_count'] * df['reviews']

# Display the updated DataFrame with interaction terms
print(df[['rating', 'loves_count', 'reviews', 'rating_reviews', 'rating_loves_count_reviews']].head())

     rating  loves_count   reviews  rating_reviews  rating_loves_count_reviews
0  0.258913     0.096675  0.013717        0.003552                    0.000343
1  0.540109     0.058540  0.016461        0.008891                    0.000520
2  0.592391     0.049760  0.020576        0.012189                    0.000607
3  0.715326     0.046165  0.027435        0.019625                    0.000906
4  0.038478     0.041163  0.016461        0.000633                    0.000026


Calculate sentiment score

In [15]:
df['sentiment_score'] = (df['rating'] * 0.6) + (df['loves_count'] * 0.2) + (df['reviews'] * 0.2)
print(df['sentiment_score'])

0       0.177426
1       0.339065
2       0.369502
3       0.443916
4       0.034612
          ...   
8487    0.120585
8488    0.608762
8489    0.423856
8490    0.640891
8491    0.604787
Name: sentiment_score, Length: 6374, dtype: float64


In [19]:
import pandas as pd

# Assuming 'df' is your DataFrame and it contains 'primary_category' and 'sentiment_score'
# Create a dictionary to hold the top 10 entries for each primary_category
category_dict = {}

# Group the data by 'primary_category'
grouped = df.groupby('primary_category')

# Extract the top 10 entries for each category based on 'sentiment_score'
for category, group in grouped:
    category_dict[category] = group.nlargest(10, 'sentiment_score')  # Get top 10 by 'sentiment_score'

# Create a plot for each primary_category
for category, top_10 in category_dict.items():
    # Sort by sentiment_score for better visualization
    top_10_sorted = top_10.sort_values(by='sentiment_score', ascending=False)

    # Example plot (replace with your plotting code)
    print(f"Plotting for category: {category}")
    print(top_10_sorted[['product_name', 'sentiment_score']])  # Example output


Plotting for category: Bath & Body
                                           product_name  sentiment_score
6947                 Resurface+ AHA Renewing Body Cream         0.847858
5369  The Body Serum - With Hyaluronic Acid, Niacina...         0.832778
2790                                   Milk Body Lotion         0.755192
2789                                 Milk Body Cleanser         0.744267
4290                            Almond Milk Concentrate         0.730640
5084                                   Night Body Serum         0.716820
2642                   Ingrown Hair Pads with BHA & AHA         0.707600
5380  The Body Cream - With 5 Ceramides, Colloidal O...         0.701473
2796                          Milk Hydrating Hand Cream         0.687708
5829              Melrose Place Moisturizing Body Cream         0.672937
Plotting for category: Fragrance
                      product_name  sentiment_score
8455         Libre Eau de Toilette         0.782176
1929         Sauvage Eau 