In [5]:
import os
import pickle

import numpy as np
import pandas as pd
from catboost import CatBoostRegressor
from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedKFold, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,TargetEncoder

In [7]:
# Read the preprocessed dataset from the 'data' folder located in the parent of the working directory
project_root = os.path.dirname(os.getcwd())
data_path = os.path.join(project_root, 'data')
clean_data_file = os.path.join(data_path, 'clean_data.csv')

df = pd.read_csv(clean_data_file)

# Preview the first five rows
df.head(5)

Unnamed: 0,product_star_rating,product_num_offers,product_minimum_offer_price,is_fast_shipping,product,discount_rate,has_storage,brand,log_product_num_ratings,log_storage_capacity,product_x_brand
0,4.3,20,72.9,False,Phone,0.111123,1,HONOR,6.948897,4.859812,Phone_x_HONOR
1,4.5,1,129.99,False,Phone,0.235308,1,Samsung,8.368693,4.859812,Phone_x_Samsung
2,4.3,6,71.99,False,Phone,0.078223,1,Samsung,4.564348,4.174387,Phone_x_Samsung
3,4.5,18,143.1,False,Phone,0.493762,1,Samsung,5.826,4.859812,Phone_x_Samsung
4,4.5,19,125.0,False,Phone,0.07915,1,Samsung,7.646831,4.859812,Phone_x_Samsung


In [9]:
# Display the dtype of every column to see which features are numeric, boolean or object typed
df.dtypes

product_star_rating            float64
product_num_offers               int64
product_minimum_offer_price    float64
is_fast_shipping                  bool
product                         object
discount_rate                  float64
has_storage                      int64
brand                           object
log_product_num_ratings        float64
log_storage_capacity           float64
product_x_brand                 object
dtype: object

In [11]:
# Convert the boolean 'is_fast_shipping' flag into 0/1 integers
df['is_fast_shipping'] = df['is_fast_shipping'].astype(int)

In [91]:
# Custom transformer that imputes a column (here 'product_star_rating') with its median, grouped by another column (here the Product Type)
class GroupedMedianImputer(BaseEstimator, TransformerMixin):
    """Fill missing values of one column with the median of the row's group.

    Parameters
    ----------
    grouped : str
        Name of the column whose values define the groups.
    target : str
        Name of the column whose missing values are imputed.

    Attributes
    ----------
    median_ : pandas.Series
        Median of ``target`` for each value of ``grouped``, learned in ``fit``.
    global_median_ : float
        Median of ``target`` over all rows seen in ``fit``. Used as a fallback
        for rows whose group was not seen during ``fit`` or whose group had no
        observed ``target`` values.
    """

    def __init__(self, grouped, target):
        self.grouped = grouped
        self.target = target

    def fit(self, X, y=None):
        """Learn the per-group medians (and the overall median) from ``X``.

        ``y`` is ignored; it is accepted for scikit-learn pipeline compatibility.
        Returns ``self``.
        """
        self.median_ = X.groupby(self.grouped)[self.target].median()
        self.global_median_ = X[self.target].median()
        return self

    def transform(self, X):
        """Return a copy of ``X`` with missing ``target`` values imputed.

        The input frame is not modified.
        """
        X = X.copy()
        # Look up each row's group median in one vectorised pass; groups that
        # were not seen in fit map to NaN here.
        group_medians = X[self.grouped].map(self.median_)
        # First fill from the group median, then fall back to the overall
        # median so unseen / all-missing groups do not leave NaNs behind.
        X[self.target] = X[self.target].fillna(group_medians).fillna(self.global_median_)
        return X


In [93]:
# Build the modelling inputs and hold out a test set.
# - The target 'product_minimum_offer_price' is log1p-transformed to stabilise variance and reduce skewness.
# - 'has_storage' and 'is_fast_shipping' are excluded due to their low feature importance after model evaluation.
dropped_columns = ['product_minimum_offer_price', 'has_storage', 'is_fast_shipping']
X = df.drop(columns=dropped_columns)
y = np.log1p(df['product_minimum_offer_price'])
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.25)

In [88]:
# Features to be processed
numerical_features = ['product_star_rating', 'log_product_num_ratings', 'product_num_offers', 'log_storage_capacity']
categorical_features = ['brand', 'product', 'product_x_brand']

# Impute missing star ratings with the per-product median (wrapped in a Pipeline so it is a named step)
grouped_median_imputer = Pipeline(steps=[('imputer', GroupedMedianImputer('product', 'product_star_rating'))])

# Scale numerical features and target-encode categorical ones; every other column
# (e.g. 'discount_rate') is passed through unchanged.
# random_state pins TargetEncoder's internal cross-fitting so reruns produce the same encodings.
preprocessor = ColumnTransformer(transformers=[('scaler', StandardScaler(), numerical_features),
                                               ('encoder', TargetEncoder(target_type='continuous', cv=20, random_state=42), categorical_features)], remainder='passthrough')

# Construct the pipeline: imputation -> preprocessing -> model
pipeline = Pipeline(steps=[('grouped_median_imputer', grouped_median_imputer),
                           ('preprocessor', preprocessor),
                           ('model', RandomForestRegressor(random_state = 42))])

# Parameter grid for RandomizedSearchCV (keys are prefixed with the pipeline step name 'model')
param_dist = {
    'model__n_estimators': [450, 455, 460],
    'model__max_depth': [10,13, 15],
    'model__min_samples_split': [4, 5],
    'model__min_samples_leaf': [3, 4],
    'model__max_features': ['log2'],
}

# RandomizedSearchCV for hyperparameter tuning.
# random_state makes the sampled candidate set reproducible across runs
# (the CV splitter and the forest are already seeded).
cv = RepeatedKFold(n_splits=5, n_repeats=15, random_state=42)
random_search = RandomizedSearchCV(pipeline, param_distributions=param_dist, n_iter=20,
                                   cv=cv, scoring='r2', n_jobs=-1, verbose=1, random_state=42)

# Fit the model
random_search.fit(X_train, y_train)

# Evaluate the refitted best pipeline on the held-out test set (targets are on the log1p scale)
y_pred = random_search.best_estimator_.predict(X_test)
print("Best R2 score:", r2_score(y_test, y_pred))
# Print the RMSE as a plain number (previously it was wrapped in a set literal)
print("Best Root Mean Squared Error (Log-scale):", np.sqrt(mean_squared_error(y_test, y_pred)))
print("Best Params:", random_search.best_params_)


Fitting 75 folds for each of 20 candidates, totalling 1500 fits
Best R2 score: 0.7280601211493469
Best Root Mean Squared Error (Log-scale): {0.6566183134052632}
Best Params: {'model__n_estimators': 450, 'model__min_samples_split': 4, 'model__min_samples_leaf': 3, 'model__max_features': 'log2', 'model__max_depth': 13}


In [92]:
# Investigate feature importance to guide the feature engineering process
optimal_model = random_search.best_estimator_
rfr = optimal_model.named_steps['model']

# Keep the fitted transformer under its own name. Rebinding the global `preprocessor`
# here would make any pipeline built from `preprocessor` afterwards share (and refit)
# a step that belongs to `optimal_model`.
fitted_preprocessor = optimal_model.named_steps['preprocessor']

# Pair each transformed feature name with the importance the forest assigned to it
feature_importance = pd.DataFrame({'Feature': fitted_preprocessor.get_feature_names_out(),
                                   'Importance': rfr.feature_importances_})

feature_importance.sort_values(by='Importance', ascending=False)


Unnamed: 0,Feature,Importance
3,scaler__log_storage_capacity,0.236765
5,encoder__product,0.225644
6,encoder__product_x_brand,0.205903
1,scaler__log_product_num_ratings,0.134592
4,encoder__brand,0.100413
0,scaler__product_star_rating,0.040645
2,scaler__product_num_offers,0.030621
7,remainder__discount_rate,0.025417


In [97]:
# Explore alternative models - Cat Boost Regressor

# Reuse the same preprocessing configuration as the Random Forest pipeline, but on fresh
# clones: Pipeline.fit fits its steps in place, so sharing the instances would refit
# transformers that other pipelines (including the model saved later) hold.
cat_pipeline = Pipeline(steps=[('grouped_median_imputer', clone(grouped_median_imputer)),
                               ('preprocessor', clone(preprocessor)),
                               ('model', CatBoostRegressor(iterations = 8000, learning_rate=0.15, depth = 10, random_state=42, verbose=0))])


cat_pipeline.fit(X_train, y_train)

# Model Evaluation on the held-out test set (targets are on the log1p scale)
y_pred = cat_pipeline.predict(X_test)
print(f"R2 Score: {r2_score(y_test, y_pred)}")
print(f"Log-scale RMSE: {np.sqrt(mean_squared_error(y_test, y_pred))}")

R2 Score: 0.6899218831925331
Log-scale RMSE: 0.7011518758428291


In [None]:
# Save the top performing model - Random Forest Regressor
model_path = os.path.join(os.path.dirname(os.getcwd()), 'models')
# Create the output directory on first run; open() does not create missing folders
os.makedirs(model_path, exist_ok=True)

# NOTE: the pipeline contains GroupedMedianImputer, which is defined in this notebook.
# pickle stores classes by reference, so the same class definition must be available
# wherever the file is loaded.
with open(os.path.join(model_path,'ecommerce_product_price_regressor.pkl'), 'wb') as f:
    pickle.dump(optimal_model, f)

# Model Building and Evaluation Summary

## Feature Engineering

- **Outliers Removal**: Removed outliers for the `storage_capacity` feature.
- **Log Transformation**: Applied a log transformation to heavily right-skewed numerical features, including `product_num_ratings` and `storage_capacity`.
- **Interaction Feature**: Created an interaction feature between `product` and `brand`, leveraging their high feature importance.
- **Threshold-based Filtering**: Dropped any records where the count of `brand` was less than 10.

## Model Building

- **Log Transformation on Target Variable**: Applied a log transformation to the target variable (`product_minimum_offer_price`) before fitting the model, given how right-skewed the distribution was. The model performed better after applying the transformation.
- **Feature Importance Evaluation**: Evaluated feature importance and dropped `has_storage` and `is_fast_shipping`, as these features demonstrated very minimal impact on the model's performance.
- **Missing Data Imputation**: Created a custom scikit-learn transformer (`GroupedMedianImputer`) to impute missing `product_star_rating` values. Missing values are imputed with the median of the corresponding product type, to account for variations across different product categories.
- **Categorical Encoding**: Used `TargetEncoder` for categorical features, which replaces each distinct category with a smoothed mean of the target variable for that category (using internal cross-fitting during training). This outperformed standard one-hot encoding.
- **Feature Scaling**: Applied StandardScaler to scale numerical features.
- **Model Training and Hyperparameter Tuning**: Trained a Random Forest Regressor model, experimenting with various hyperparameters using **RandomizedSearchCV** to find the model with the best $R^2$ score.
- **Model Comparison**: Trained a CatBoost Regressor, which yielded a lower but comparable $R^2$ score of 0.69 (log-scale RMSE = 0.70). The Random Forest Regressor outperformed it ($R^2$ = 0.73, log-scale RMSE = 0.66) and responded better to hyperparameter tuning and feature engineering.

## Performance Improvement

- **Pre-feature engineering/hyperparameter tuning $R^2$**: 0.56
- **Post-feature engineering/hyperparameter tuning $R^2$**: 0.73

The model now explains about 73% of the variance in the log-transformed target variable on the held-out test set, demonstrating a significant improvement in its ability to predict the product minimum offer price.

## Next Steps

- To further improve the model, more data across all product types would be beneficial to help the model generalise better. Additional features could also be explored to better understand nuanced differences in pricing strategies between products.
