In [37]:
import time
start = time.time()

import pandas as pd
import numpy as np

!pip install pandas plotly
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

import re

from collections import defaultdict
!pip install rapidfuzz
from rapidfuzz import fuzz, process

import xgboost as xgb
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import (
    make_scorer, mean_absolute_error, mean_squared_error, r2_score
)

# Load the full product dataset from the CSV that sits next to the notebook.
df = pd.read_csv('df_full.csv')
# A bare `df.shape` in the middle of a cell is evaluated and thrown away (only a
# cell's last expression is displayed), so print the dimensions explicitly.
print(f"Dataset shape: {df.shape}")

print(f"Time taken: {time.time() - start:.4f} seconds")

Time taken: 7.2203 seconds


In [38]:
# Category -> keywords looked up in product names.
# The rest of the notebook tests each keyword as a case-insensitive *substring* of
# the product name, so every keyword here is written in lower case.
# Key order matters: one feature column is created per key, in this order.
# NOTE(review): very short keywords ('a1', 'b3', 'bp', 'aha', 'bha', 'tone', ...)
# will also match inside longer words under substring matching -- confirm intended.
term_dict = {
    # --- Active ingredients ---
    'Niacinamide': ['niacinamide', 'vitamin b3', 'b3', 'nicotinamide'],
    'Retinol': [
        'retinol', 'vitamin a1', 'a1', 'retinyl', 'retinyl acetate',
        'retinyl palmitate', 'retinaldehyde', 'hydroxypinacolone retinoate',
        'retinoate', 'tretinoin', 'retinoic acid', 'retin-a',
        'all-trans retinoic acid',
    ],
    'Peptides': [
        'peptide', 'peptides', 'oligopeptides', 'polypeptides',
        'copper peptides', 'acetyl hexapeptide-8', 'matrixyl',
    ],
    'Ascorbic Acid': [
        'ascorbic', 'l-ascorbic', 'ascorbic acid', 'vitamin c',
        'l-ascorbic acid', 'magnesium ascorbyl phosphate',
        'sodium ascorbyl phosphate', 'tetrahexyldecyl ascorbate',
    ],
    'Hyaluronic Acid': ['hyaluronic', 'hyaluronic acid'],
    'Glycolic Acid': ['glycolic', 'glycolic acid', 'alpha hydroxy acid (aha)', 'aha'],
    'Salicylic Acid': ['salicylic', 'salicylic acid', 'bha', 'beta hydroxy acid'],
    'Lactic Acid': ['lactic', 'lactic acid'],
    'Vitamin E': ['vitamin e', 'tocopherol', 'alpha tocopherol'],
    'Ceramides': ['ceramide', 'ceramides', 'lipids', 'phytosphingosine'],
    'Benzoyl Peroxide': ['benzoyl peroxide', 'bp'],
    'Collagen': ['collagen', 'hydrolyzed collagen'],
    # --- Claims / benefits ---
    'Hydrate': [
        'hydrate', 'hydration', 'moisturize', 'moisturizing', 'water-infused',
        'quench', 'dewy', 'plump',
    ],
    'Anti-Aging': [
        'firm', 'tighten', 'lifting', 'contouring', 'tone', 'sculpt',
        'tightening', 'anti-aging', 'age-defying', 'youthful',
        'wrinkle-reducing', 'anti-wrinkle', 'rejuvenate', 'restore',
        'age-reversing',
    ],
    'Repair': [
        'nourish', 'repair', 'restore', 'renew', 'regenerate', 'heal',
        'repairing', 'recovery',
    ],
    'Protect': [
        'protect', 'shield', 'defend', 'barrier', 'spf', 'sun protection',
        'anti-pollution', 'blue light protection',
    ],
    'Soothing': [
        'soothing', 'calming', 'relieving', 'cooling', 'comforting',
        'anti-inflammatory',
    ],
    'Balance': [
        'balance', 'oil-control', 'mattifying', 'balancing', 'pore-minimizing',
        'sebum control',
    ],
    # 'brighten' and 'even-tone' used to be listed twice; the repeats were dropped
    # (membership is unchanged, so matching results are identical).
    'Brightening': [
        'brighten', 'even-tone', 'lightening', 'radiant', 'skin-brightening',
        'luminosity', 'glow', 'radiance', 'illuminate', 'luminous', 'shine',
        'light-reflecting',
    ],
    # --- Positioning ---
    'Vegan': ['vegan', 'plant-based', 'cruelty-free', 'no animal testing'],
    'Clean': [
        'clean', 'chemical-free', 'natural', 'organic', 'non-toxic', 'pure',
        'green beauty',
    ],
}

### Análisis Exploratorio

In [41]:
import plotly.express as px
import plotly.graph_objects as go

# Rating distribution: 25-bin histogram with a violin marginal above it.
fig_rating_kde = px.histogram(
    df,
    x='rating',
    nbins=25,
    marginal='violin',
    title='Distribución de Ratings',
    labels={'rating': 'Rating'},
    color_discrete_sequence=['#636EFA'],
    opacity=0.8,
).update_layout(bargap=0.05)

# Price distribution: 30-bin histogram of the USD price.
fig_price = px.histogram(
    df,
    x='price_usd',
    nbins=30,
    title='Distribución de Precios (USD)',
    labels={'price_usd': 'Precio (USD)'},
    color_discrete_sequence=['#EF553B'],
).update_layout(bargap=0.1)

# Interactive scatter of price against rating; hovering a point shows the product name.
fig_scatter = px.scatter(
    df,
    x='price_usd',
    y='rating',
    title='Relación entre Precio y Rating',
    labels={'price_usd': 'Precio (USD)', 'rating': 'Rating'},
    hover_data=['product_name'],
    color_discrete_sequence=['#636EFA'],
)

# Violin of all ratings with an embedded box plot and every point drawn.
fig_violin = px.violin(
    df,
    y='rating',
    box=True,
    points='all',
    title='Distribución General de Ratings',
    labels={'rating': 'Rating'},
    color_discrete_sequence=['#00CC96'],
)

# Render the four figures in the same order they were built.
fig_rating_kde.show()
fig_price.show()
fig_scatter.show()
fig_violin.show()


### Preparación Training DataFrame

In [47]:
start = time.time()

# Training frame: keep only the columns the model needs and drop rows where any
# of them is missing.
df_training = df[['product_name', 'rating', 'price_usd']].dropna()

# Lower-case the names once; the original recomputed this inside the loop for
# every category even though it does not depend on the category.
product_names_lower = df_training['product_name'].str.lower()

# One 0/1 indicator column per term_dict category: 1 when any of the category's
# keywords occurs as a substring of the lower-cased product name.
# NOTE(review): substring matching also flags unrelated words that merely contain
# a keyword (e.g. 'clean' matches 'cleanser') -- confirm this is intended.
for key, terms in term_dict.items():
    df_training[key] = product_names_lower.apply(
        lambda lowered: any(term in lowered for term in terms)
    ).astype(int)

# Quantile-based price bucket (up to 10 bins; duplicate bin edges are dropped),
# used later only as the stratification key for the train/test split.
df_training['price_group'] = pd.qcut(
    df_training['price_usd'], q=10, labels=False, duplicates='drop'
)

print(f"Time taken: {time.time() - start:.4f} seconds")

df_training.head(10)

Time taken: 0.5926 seconds


Unnamed: 0,product_name,rating,price_usd,Niacinamide,Retinol,Peptides,Ascorbic Acid,Hyaluronic Acid,Glycolic Acid,Salicylic Acid,...,Hydrate,Anti-Aging,Repair,Protect,Soothing,Balance,Brightening,Vegan,Clean,price_group
0,GENIUS Sleeping Collagen Moisturizer,4.36,98.0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,7
1,GENIUS Liquid Collagen Serum,3.98,115.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,8
2,Triple Algae Eye Renewal Balm Eye Cream,4.06,68.0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,5
3,GENIUS Liquid Collagen Lip Treatment,3.98,29.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
4,SUBLIME DEFENSE Ultra Lightweight UV Defense F...,4.26,28.0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
5,GENIUS Ultimate Anti-Aging Cream,4.18,112.0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,8
6,GENIUS Ultimate Anti-Aging Melting Cleanser,4.3826,38.0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,3
7,Gentle Rejuvenating Cleanser,4.3808,28.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
8,10 Day Results Kit,3.67,88.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,7
9,Advanced Anti-Aging Repairing Oil,4.457,82.0,0,0,0,0,0,0,0,...,0,1,1,0,0,0,0,0,0,6


In [44]:
# Sanity check: number of products flagged for each term category, largest first.
term_cols = list(term_dict)
df_training.loc[:, term_cols].sum().sort_values(ascending=False)


Unnamed: 0,0
Hydrate,1533
Brightening,1261
Anti-Aging,1248
Clean,1202
Protect,932
Repair,596
Ascorbic Acid,592
Hyaluronic Acid,444
Glycolic Acid,419
Retinol,413


### Entrenamiento XGBoost

In [45]:
start = time.time()

# Target and stratification key come straight from the training frame.
y = df_training['rating']
groups = df_training['price_group']

# Features are everything else: the raw name, the target and the stratification
# helper are removed, leaving price_usd plus the term indicator columns.
X = df_training.drop(columns=['product_name', 'rating', 'price_group'])

# 80/20 hold-out split, stratified on the price group and seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=groups,
)

# Scoring function handed to the hyper-parameter search (higher is better).
def custom_score(y_true, y_pred):
    """Return R² minus penalties for missing the R² and MAE targets.

    - R² below 0.7 is penalised by 10x the shortfall.
    - MAE above 0.1 is penalised by 100x the excess.
    When both targets are met the score is simply R².
    """
    r2 = r2_score(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)

    if r2 >= 0.7:
        r2_penalty = 0
    else:
        r2_penalty = (0.7 - r2) * 10

    if mae <= 0.1:
        mae_penalty = 0
    else:
        mae_penalty = (mae - 0.1) * 100

    return r2 - r2_penalty - mae_penalty

# Candidate hyper-parameters; the randomized search samples 30 combinations from
# this grid and evaluates each with 5-fold CV using custom_score.
params = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 6, 10],
    'learning_rate': [0.01, 0.1, 0.3],
    'subsample': [0.7, 0.9, 1],
    'colsample_bytree': [0.7, 0.9, 1],
}
xg_reg = RandomizedSearchCV(
    xgb.XGBRegressor(random_state=42),
    params,
    n_iter=30,
    cv=5,
    scoring=make_scorer(custom_score, greater_is_better=True),
    random_state=42,
    error_score='raise',  # surface fit/score failures instead of recording NaN
)
xg_reg.fit(X_train, y_train)

# Best estimator found by the search
best_xg_model = xg_reg.best_estimator_


# Standard regression metrics, in the row order used by metrics_df below
def metrics(y_true, y_pred):
    """Return [MAE, MSE, RMSE, R²] for the given true and predicted values."""
    mse = mean_squared_error(y_true, y_pred)
    return [
        mean_absolute_error(y_true, y_pred),
        mse,
        np.sqrt(mse),
        r2_score(y_true, y_pred),
    ]


metrics_df = pd.DataFrame({
    'Metric': ['MAE', 'MSE', 'RMSE', 'R²'],
    'Train': metrics(y_train, best_xg_model.predict(X_train)),
    'Test': metrics(y_test, best_xg_model.predict(X_test)),
})

# Report train vs. test metrics and the elapsed time
print(metrics_df)

print(f"Time taken: {time.time() - start:.4f} seconds")

  Metric     Train      Test
0    MAE  0.168939  0.186393
1    MSE  0.095792  0.106601
2   RMSE  0.309503  0.326498
3     R²  0.657896  0.632417
Time taken: 43.8424 seconds


In [48]:
# Show a few held-out rows with the model's prediction next to the real rating.
y_pred = best_xg_model.predict(X_test)

# Append both rating columns to a copy of the test features, then round
# everything to two decimals for display.
comparison_df = X_test.assign(**{
    'Actual Rating': y_test.values,
    'Predicted Rating': y_pred,
}).round(2)

comparison_df.head(5)


Unnamed: 0,price_usd,Niacinamide,Retinol,Peptides,Ascorbic Acid,Hyaluronic Acid,Glycolic Acid,Salicylic Acid,Lactic Acid,Vitamin E,...,Anti-Aging,Repair,Protect,Soothing,Balance,Brightening,Vegan,Clean,Actual Rating,Predicted Rating
5977,64.94,0,1,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,4.57,4.62
9318,70.37,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,4.43,4.39
5383,172.3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,4.13,3.59
5753,84.71,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,4.01,3.99
6262,25.49,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,4.41,4.4


In [49]:
# Importance scores reported by the fitted model, one per feature column of X
feature_importances = best_xg_model.feature_importances_

# Pair each score with its column name and rank from most to least important
features_df = pd.DataFrame(
    {'Feature': X.columns, 'Importance': feature_importances}
).sort_values(by='Importance', ascending=False)

print(features_df)


             Feature  Importance
13           Hydrate    0.205282
19       Brightening    0.132697
4      Ascorbic Acid    0.101473
14        Anti-Aging    0.101114
2            Retinol    0.081117
16           Protect    0.080546
5    Hyaluronic Acid    0.047461
7     Salicylic Acid    0.044372
21             Clean    0.036839
1        Niacinamide    0.028294
15            Repair    0.023637
3           Peptides    0.023291
6      Glycolic Acid    0.021972
9          Vitamin E    0.017209
11  Benzoyl Peroxide    0.013703
17          Soothing    0.011664
8        Lactic Acid    0.007474
18           Balance    0.005965
12          Collagen    0.005521
10         Ceramides    0.005121
20             Vegan    0.004096
0          price_usd    0.001153


### Prueba

In [51]:
def predict_rating(name, price, *, model=None, feature_columns=None, terms=None):
    """Predict a rating for a product from its name and price.

    The name is lower-cased and each category's keywords are tested as
    substrings of it -- the same rule used to build the training indicator
    columns. (Previously the raw, un-lowered name was tested against
    lower-cased keywords, so any capitalised input matched nothing.)

    Parameters
    ----------
    name : str
        Product name as typed by the user; matching is case-insensitive.
    price : float
        Price written to the 'price_usd' feature when that column exists.
    model : optional
        Object with a ``predict(DataFrame)`` method. Defaults to the notebook's
        ``best_xg_model``.
    feature_columns : optional
        Ordered feature names the model expects. Defaults to ``X.columns``.
    terms : optional
        Mapping ``category -> list of keywords``. Defaults to ``term_dict``.

    Returns
    -------
    The first element of ``model.predict(...)`` for the single-row input.

    Side effects
    ------------
    Prints the matched categories (in ``terms`` order) or "No terms matched!".
    """
    # Fall back to the notebook globals only when no override is supplied.
    if model is None:
        model = best_xg_model
    if feature_columns is None:
        feature_columns = X.columns
    if terms is None:
        terms = term_dict
    columns = list(feature_columns)

    # Case-insensitive substring match, mirroring the training-time detection.
    lowered_name = name.lower()
    matched = [
        category
        for category, keywords in terms.items()
        if any(keyword.lower() in lowered_name for keyword in keywords)
    ]

    # Display matched terms
    if matched:
        print("Matched Terms:")
        for category in matched:
            print(f"- {category}")
    else:
        print("No terms matched!")

    # Start from an all-zero feature row, then switch on the matched categories.
    input_features = {col: 0 for col in columns}
    for category in matched:
        input_features[category] = 1
        interaction = f"{category}_x_rating"
        if interaction in input_features:
            # Optional "<category>_x_rating" feature: when the model expects one,
            # fill it with the mean of its positive values in the notebook's
            # feature matrix X (this path reads the global X).
            input_features[interaction] = X.loc[X[interaction] > 0, interaction].mean()

    # Assign price (0 when the model has no price feature; the key is then
    # dropped by the column selection below).
    input_features['price_usd'] = price if 'price_usd' in columns else 0

    # Single-row frame restricted to, and ordered like, the model's features.
    input_frame = pd.DataFrame([input_features])[columns]

    # Predict and return rating
    return model.predict(input_frame)[0]

# Ask for the product name first, then the price (parsed as a float).
name, price = (
    input("Enter product name: "),
    float(input("Enter price: ")),
)

# Run the prediction and show it with two decimals.
print(f"\nPredicted Rating: {predict_rating(name, price):.2f}")


Enter product name: retinol anti-aging serum
Enter price: 78
Matched Terms:
- Retinol
- Anti-Aging

Predicted Rating: 4.75
