In [33]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, KFold
from sklearn.inspection import permutation_importance

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.tree import DecisionTreeRegressor

from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

from category_encoders import CatBoostEncoder

from sklearn import metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

import optuna as opt

from warnings import filterwarnings
filterwarnings('ignore')

In [3]:
# Define the base directory for the data files
data_dir = 'data'

# Load the dataset
df = pd.read_csv(os.path.join(data_dir, 'df_final.csv'))

df.head()

Unnamed: 0,price,freight_value,product_category_name,product_weight_g,product_length_cm,product_height_cm,product_width_cm,customer_city,customer_state,review_score,...,volume,density,actual_delivery_time,estimated_delivery_time,approval_order_time,distance,purchase_month,purchase_day_of_week,black_friday,christmas
0,29.99,8.72,utilidades_domesticas,500.0,19.0,8.0,13.0,sao paulo,SP,4,...,1976.0,0.253036,8.0,15,0.0,18.566632,10,0,0,0
1,118.7,22.76,perfumaria,400.0,19.0,13.0,19.0,barreiras,BA,4,...,4693.0,0.085233,13.0,19,1.0,847.437333,7,1,0,0
2,159.9,19.22,automotivo,420.0,24.0,19.0,21.0,vianopolis,GO,5,...,9576.0,0.04386,9.0,26,0.0,512.100044,8,2,0,0
3,45.0,27.2,pet_shop,450.0,30.0,10.0,20.0,sao goncalo do amarante,RN,5,...,6000.0,0.075,13.0,26,0.0,1816.085655,11,5,0,0
4,19.9,8.72,papelaria,250.0,51.0,15.0,15.0,santo andre,SP,5,...,11475.0,0.021786,2.0,12,0.0,29.684401,2,1,0,0


In [4]:
# Splitting in X and Y
X = df.drop(columns='freight_value', axis=1)
y = df.freight_value

In [5]:
# splitting in train and test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=42)

In [6]:
# Initializing models
model_XGBoost = XGBRegressor(n_estimators = 1000, max_depth = 8, learning_rate = 1e-3, random_state = 0)
model_LightGBM = LGBMRegressor(n_estimators = 1000, max_depth = 8, num_leaves = 2^8, learning_rate = 1e-3, n_jobs = -1, verbose = -1, random_state = 0)
model_Catboost = CatBoostRegressor(n_estimators = 1000, max_depth = 8, learning_rate = 1e-3, random_state = 0, verbose = 0)
model_DecisionTree = DecisionTreeRegressor(random_state = 0, max_depth = 8, min_samples_split = 2)

In [7]:
# Feature importance

encoder = CatBoostEncoder()
X_train_encoded = X_train.copy()
X_test_encoded = X_test.copy()

for col in X_train_encoded.select_dtypes(include=['object']).columns:
    X_train_encoded[col] = encoder.fit_transform(X_train_encoded[col], y_train)
    X_test_encoded[col] = encoder.transform(X_test_encoded[col])

model_XGBoost.fit(X_train_encoded, y_train)
r = permutation_importance(model_XGBoost, X_test_encoded, y_test, n_repeats=30, random_state=0)

In [8]:
importances = pd.DataFrame({'Feature': X_test_encoded.columns, 'importance': r.importances_mean})
importances = importances.sort_values(by='importance', ascending=False)
importances

Unnamed: 0,Feature,importance
16,distance,0.211973
11,volume,0.198656
2,product_weight_g,0.178174
0,price,0.080163
7,customer_state,0.025644
3,product_length_cm,0.022553
9,seller_city,0.021714
10,seller_state,0.013614
5,product_width_cm,0.004467
12,density,0.003321


In [9]:
less_important_columns = ['purchase_day_of_week', 'approval_order_time', 'estimated_delivery_time', 'christmas', 'black_friday']

X_train = X_train.drop(columns=less_important_columns)
X_test = X_test.drop(columns=less_important_columns)

In [10]:
# Feature importance

encoder = CatBoostEncoder()
X_train_encoded = X_train.copy()
X_test_encoded = X_test.copy()

for col in X_train_encoded.select_dtypes(include=['object']).columns:
    X_train_encoded[col] = encoder.fit_transform(X_train_encoded[col], y_train)
    X_test_encoded[col] = encoder.transform(X_test_encoded[col])

model_XGBoost.fit(X_train_encoded, y_train)
r_v2 = permutation_importance(model_XGBoost, X_test_encoded, y_test, n_repeats=30, random_state=0)

In [11]:
importances_v2 = pd.DataFrame({'Feature': X_test_encoded.columns, 'importance': r_v2.importances_mean})
importances_v2 = importances_v2.sort_values(by='importance', ascending=False)
importances_v2

Unnamed: 0,Feature,importance
14,distance,0.211323
11,volume,0.20385
2,product_weight_g,0.182162
0,price,0.090672
7,customer_state,0.024785
3,product_length_cm,0.024383
9,seller_city,0.022433
10,seller_state,0.012966
5,product_width_cm,0.004509
6,customer_city,0.003811


In [12]:
k = 5

folds = KFold(n_splits=k, shuffle=True, random_state=42)

absolute_errors = list()
squared_errors = list()
r2 = list()

for k, (train_index, test_index) in enumerate(folds.split(X, y)):
    
    print("#"*10 + f" Fold: {k+1} " + "#"*10)
    
    X_train_internal, y_train_internal = X.iloc[train_index, :], y.iloc[train_index]
    X_test_internal, y_test_internal = X.iloc[test_index, :], y.iloc[test_index]

    encoder = CatBoostEncoder()
    
    cat_imputer = SimpleImputer(strategy='most_frequent')
    num_imputer = SimpleImputer(strategy='median')

    cat_pipeline = Pipeline([('encoder', encoder), ('imputer', cat_imputer)])
    num_pipeline = Pipeline([('imputer', num_imputer)])

    cat_cols = X_train_internal.select_dtypes(include=['object']).columns
    num_cols = X_train_internal.select_dtypes(exclude=['object']).columns

    X_train_internal[cat_cols] = cat_pipeline.fit_transform(X_train_internal[cat_cols], y_train_internal)
    X_train_internal[num_cols] = num_pipeline.fit_transform(X_train_internal[num_cols])

    X_test_internal[cat_cols] = cat_pipeline.transform(X_test_internal[cat_cols])
    X_test_internal[num_cols] = num_pipeline.transform(X_test_internal[num_cols])

    model_XGBoost.fit(X_train_internal, y_train_internal)
    y_pred = model_XGBoost.predict(X_test_internal)
    r2score = r2_score(y_test_internal, y_pred)
    mse = mean_squared_error(y_test_internal, y_pred)
    mae = mean_absolute_error(y_test_internal, y_pred)

    absolute_errors.append(mae)
    squared_errors.append(mse)
    r2.append(r2score)

    print(f'MAE: {mae:.3f}')
    print(f'MSE: {mse:.3f}')
    print(f'R2: {r2score:.3f}')


########## Fold: 1 ##########
MAE: 5.044
MSE: 98.518
R2: 0.600
########## Fold: 2 ##########


MAE: 5.188
MSE: 106.660
R2: 0.597
########## Fold: 3 ##########
MAE: 5.146
MSE: 97.921
R2: 0.605
########## Fold: 4 ##########
MAE: 5.078
MSE: 112.953
R2: 0.584
########## Fold: 5 ##########
MAE: 4.990
MSE: 88.761
R2: 0.620


In [13]:
absolute_errors = np.array(absolute_errors)
squared_errors = np.array(squared_errors)
r2 = np.array(r2)

avg_mae = np.mean(absolute_errors)
avg_mse = np.mean(squared_errors)
avg_r2 = np.mean(r2)

std_mae = np.std(absolute_errors)
std_mse = np.std(squared_errors)
std_r2 = np.std(r2)

print("#"*5 + f" Displaying Average of Obtained Metrics : " + "#"*5)
print(f"Average MAE: {avg_mae:.3f} +/- {std_mae:.3f}")
print(f'Average MSE: {avg_mse:.3f} +/- {std_mse:.3f}')
print(f'Average R2: {avg_r2:.3f} +/- {std_r2:.3f}')

##### Displaying Average of Obtained Metrics : #####
Average MAE: 5.089 +/- 0.071
Average MSE: 100.963 +/- 8.250
Average R2: 0.601 +/- 0.012


In [45]:
def cross_validation(X, y, model, k):
    folds = KFold(n_splits=k, shuffle=True, random_state=42)

    absolute_errors = list()
    squared_errors = list()
    r2 = list()

    for k, (train_index, test_index) in enumerate(folds.split(X, y)):
        
        print("#"*10 + f" Fold: {k+1} " + "#"*10)
        
        X_train_internal, y_train_internal = X.iloc[train_index, :], y.iloc[train_index]
        X_test_internal, y_test_internal = X.iloc[test_index, :], y.iloc[test_index]

        encoder = CatBoostEncoder()
        
        cat_imputer = SimpleImputer(strategy='most_frequent')
        num_imputer = SimpleImputer(strategy='median')

        cat_pipeline = Pipeline([('encoder', encoder), ('imputer', cat_imputer)])
        num_pipeline = Pipeline([('imputer', num_imputer)])

        cat_cols = X_train_internal.select_dtypes(include=['object']).columns
        num_cols = X_train_internal.select_dtypes(exclude=['object']).columns

        X_train_internal[cat_cols] = cat_pipeline.fit_transform(X_train_internal[cat_cols], y_train_internal)
        X_train_internal[num_cols] = num_pipeline.fit_transform(X_train_internal[num_cols])

        X_test_internal[cat_cols] = cat_pipeline.transform(X_test_internal[cat_cols])
        X_test_internal[num_cols] = num_pipeline.transform(X_test_internal[num_cols])

        model.fit(X_train_internal, y_train_internal)
        y_pred = model.predict(X_test_internal)

        r2score = r2_score(y_test_internal, y_pred)
        mse = mean_squared_error(y_test_internal, y_pred)
        mae = mean_absolute_error(y_test_internal, y_pred)

        absolute_errors.append(mae)
        squared_errors.append(mse)
        r2.append(r2score)

        print(f'MAE: {mae:.3f}')
        print(f'MSE: {mse:.3f}')
        print(f'R2: {r2score:.3f}')
    
    absolute_errors = np.array(absolute_errors)
    squared_errors = np.array(squared_errors)
    r2 = np.array(r2)

    avg_mae = np.mean(absolute_errors)
    avg_mse = np.mean(squared_errors)
    avg_r2 = np.mean(r2)

    std_mae = np.std(absolute_errors)
    std_mse = np.std(squared_errors)
    std_r2 = np.std(r2)

    print("#"*5 + f" Displaying Average of Obtained Metrics : " + "#"*5)
    print(f"Average MAE: {avg_mae:.3f} +/- {std_mae:.3f}")
    print(f'Average MSE: {avg_mse:.3f} +/- {std_mse:.3f}')
    print(f'Average R2: {avg_r2:.3f} +/- {std_r2:.3f}')

### Modelo XGBoost

In [46]:
# Modelo XGBoost
cross_validation(X, y, model_XGBoost, k=5)

2025/04/16 17:59:01 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'd234fb710552474b817ffa6d0ad4ffd1', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


########## Fold: 1 ##########


2025/04/16 17:59:05 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'b9c8dc103a7441d4a688a4d285252c9d', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


🏃 View run stylish-rook-717 at: http://localhost:5000/#/experiments/535394779431182411/runs/d234fb710552474b817ffa6d0ad4ffd1
🧪 View experiment at: http://localhost:5000/#/experiments/535394779431182411




🏃 View run trusting-slug-864 at: http://localhost:5000/#/experiments/535394779431182411/runs/b9c8dc103a7441d4a688a4d285252c9d
🧪 View experiment at: http://localhost:5000/#/experiments/535394779431182411


2025/04/16 17:59:11 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'd354ced7a6404532a49d0bdc3117ed23', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


MAE: 3.785
MSE: 64.887
R2: 0.736
########## Fold: 2 ##########


2025/04/16 17:59:15 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '69d6b823955b47bdb3077e54d192b895', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


🏃 View run vaunted-wren-574 at: http://localhost:5000/#/experiments/535394779431182411/runs/d354ced7a6404532a49d0bdc3117ed23
🧪 View experiment at: http://localhost:5000/#/experiments/535394779431182411




🏃 View run skittish-rat-681 at: http://localhost:5000/#/experiments/535394779431182411/runs/69d6b823955b47bdb3077e54d192b895
🧪 View experiment at: http://localhost:5000/#/experiments/535394779431182411


2025/04/16 17:59:22 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '24a8d61a1e6b4643a8c90b0126f3862c', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


MAE: 3.925
MSE: 67.987
R2: 0.743
########## Fold: 3 ##########


2025/04/16 17:59:25 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'f74e00750fe64db99a02a5fcbf31f587', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


🏃 View run resilient-robin-357 at: http://localhost:5000/#/experiments/535394779431182411/runs/24a8d61a1e6b4643a8c90b0126f3862c
🧪 View experiment at: http://localhost:5000/#/experiments/535394779431182411




🏃 View run ambitious-donkey-774 at: http://localhost:5000/#/experiments/535394779431182411/runs/f74e00750fe64db99a02a5fcbf31f587
🧪 View experiment at: http://localhost:5000/#/experiments/535394779431182411


2025/04/16 17:59:32 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '4e55bae30f3349ae86fcdce3e1b0a7fd', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


MAE: 3.843
MSE: 62.634
R2: 0.747
########## Fold: 4 ##########


2025/04/16 17:59:36 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'b0f1d0e04ec644b4bb0717d5b7b67826', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


🏃 View run delicate-wolf-115 at: http://localhost:5000/#/experiments/535394779431182411/runs/4e55bae30f3349ae86fcdce3e1b0a7fd
🧪 View experiment at: http://localhost:5000/#/experiments/535394779431182411




🏃 View run stylish-cow-841 at: http://localhost:5000/#/experiments/535394779431182411/runs/b0f1d0e04ec644b4bb0717d5b7b67826
🧪 View experiment at: http://localhost:5000/#/experiments/535394779431182411


2025/04/16 17:59:42 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'b3953979673548d8a404393bf57de845', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


MAE: 3.821
MSE: 72.702
R2: 0.732
########## Fold: 5 ##########


2025/04/16 17:59:45 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '546f2c8ef0394c6693dcb1dc4ac012bb', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


🏃 View run spiffy-shoat-616 at: http://localhost:5000/#/experiments/535394779431182411/runs/b3953979673548d8a404393bf57de845
🧪 View experiment at: http://localhost:5000/#/experiments/535394779431182411




🏃 View run wise-ray-21 at: http://localhost:5000/#/experiments/535394779431182411/runs/546f2c8ef0394c6693dcb1dc4ac012bb
🧪 View experiment at: http://localhost:5000/#/experiments/535394779431182411
MAE: 3.774
MSE: 55.732
R2: 0.761
##### Displaying Average of Obtained Metrics : #####
Average MAE: 3.829 +/- 0.054
Average MSE: 64.788 +/- 5.648
Average R2: 0.744 +/- 0.010


### Modelo Catboost

In [16]:
cross_validation(X, y, model_Catboost, k=5)

########## Fold: 1 ##########
MAE: 5.352
MSE: 110.291
R2: 0.552
########## Fold: 2 ##########
MAE: 5.520
MSE: 117.569
R2: 0.555
########## Fold: 3 ##########
MAE: 5.452
MSE: 108.974
R2: 0.560
########## Fold: 4 ##########
MAE: 5.403
MSE: 127.166
R2: 0.532
########## Fold: 5 ##########
MAE: 5.321
MSE: 100.226
R2: 0.571
##### Displaying Average of Obtained Metrics : #####
Average MAE: 5.410 +/- 0.071
Average MSE: 112.845 +/- 9.035
Average R2: 0.554 +/- 0.013


### Modelo LightGBM

In [17]:
# Modelo LightGBM
cross_validation(X, y, model_LightGBM, k=5)

########## Fold: 1 ##########
MAE: 5.664
MSE: 120.227
R2: 0.511
########## Fold: 2 ##########
MAE: 5.827
MSE: 128.344
R2: 0.515
########## Fold: 3 ##########
MAE: 5.773
MSE: 119.646
R2: 0.517
########## Fold: 4 ##########
MAE: 5.715
MSE: 136.550
R2: 0.497
########## Fold: 5 ##########
MAE: 5.612
MSE: 108.576
R2: 0.535
##### Displaying Average of Obtained Metrics : #####
Average MAE: 5.718 +/- 0.076
Average MSE: 122.669 +/- 9.366
Average R2: 0.515 +/- 0.012


### Modelo Decision Tree

In [18]:
# Modelo Decision Tree
cross_validation(X, y, model_LightGBM, k=5)

########## Fold: 1 ##########
MAE: 5.664
MSE: 120.227
R2: 0.511
########## Fold: 2 ##########
MAE: 5.827
MSE: 128.344
R2: 0.515
########## Fold: 3 ##########
MAE: 5.773
MSE: 119.646
R2: 0.517
########## Fold: 4 ##########
MAE: 5.715
MSE: 136.550
R2: 0.497
########## Fold: 5 ##########
MAE: 5.612
MSE: 108.576
R2: 0.535
##### Displaying Average of Obtained Metrics : #####
Average MAE: 5.718 +/- 0.076
Average MSE: 122.669 +/- 9.366
Average R2: 0.515 +/- 0.012


## Fine-Tuning

In [20]:
?XGBRegressor

[31mInit signature:[39m
XGBRegressor(
    *,
    objective: Union[str, xgboost.sklearn._SklObjWProto, Callable[[Any, Any], Tuple[numpy.ndarray, numpy.ndarray]], NoneType] = [33m'reg:squarederror'[39m,
    **kwargs: Any,
) -> [38;5;28;01mNone[39;00m
[31mDocstring:[39m     
Implementation of the scikit-learn API for XGBoost regression.
See :doc:`/python/sklearn_estimator` for more information.

Parameters
----------

    n_estimators : typing.Optional[int]
        Number of gradient boosted trees.  Equivalent to number of boosting
        rounds.

    max_depth :  typing.Optional[int]

        Maximum tree depth for base learners.

    max_leaves : typing.Optional[int]

        Maximum number of leaves; 0 indicates no limit.

    max_bin : typing.Optional[int]

        If using histogram-based algorithm, maximum number of bins per feature

    grow_policy : typing.Optional[str]

        Tree growing policy.

        - depthwise: Favors splitting at nodes closest to the node,
    

In [22]:
def fine_tuning(trial, k=5):
    # tuning
    learning_rate = trial.suggest_float('learning_rate', 1e-3, 1e-1, log=True)
    max_depth = trial.suggest_int('max_depth', 1, 10)
    subsample = trial.suggest_float('subsample', 0.5, 1, step=0.1)
    colsample_bytree = trial.suggest_float('colsample_bytree', 0.5, 1, step=0.1)
    min_child_weight = trial.suggest_int('min_child_weight', 1, 10)

    folds = KFold(n_splits=k, shuffle=True, random_state=42)

    absolute_errors = list()
    squared_errors = list()
    r2 = list()

    for k, (train_index, test_index) in enumerate(folds.split(X, y)):
        
        print("#"*10 + f" Fold: {k+1} " + "#"*10)
        
        X_train_internal, y_train_internal = X.iloc[train_index, :], y.iloc[train_index]
        X_test_internal, y_test_internal = X.iloc[test_index, :], y.iloc[test_index]

        encoder = CatBoostEncoder()
        
        cat_imputer = SimpleImputer(strategy='most_frequent')
        num_imputer = SimpleImputer(strategy='median')

        cat_pipeline = Pipeline([('encoder', encoder), ('imputer', cat_imputer)])
        num_pipeline = Pipeline([('imputer', num_imputer)])

        cat_cols = X_train_internal.select_dtypes(include=['object']).columns
        num_cols = X_train_internal.select_dtypes(exclude=['object']).columns

        X_train_internal[cat_cols] = cat_pipeline.fit_transform(X_train_internal[cat_cols], y_train_internal)
        X_train_internal[num_cols] = num_pipeline.fit_transform(X_train_internal[num_cols])

        X_test_internal[cat_cols] = cat_pipeline.transform(X_test_internal[cat_cols])
        X_test_internal[num_cols] = num_pipeline.transform(X_test_internal[num_cols])

        model_XGBoost.fit(X_train_internal, y_train_internal)
        y_pred = model_XGBoost.predict(X_test_internal)
        r2score = r2_score(y_test_internal, y_pred)
        mse = mean_squared_error(y_test_internal, y_pred)
        mae = mean_absolute_error(y_test_internal, y_pred)

        absolute_errors.append(mae)
        squared_errors.append(mse)
        r2.append(r2score)

    
    absolute_errors = np.array(absolute_errors)
    squared_errors = np.array(squared_errors)
    r2 = np.array(r2)

    avg_mae = np.mean(absolute_errors)
    avg_mse = np.mean(squared_errors)
    avg_r2 = np.mean(r2)

    std_mae = np.std(absolute_errors)
    std_mse = np.std(squared_errors)
    std_r2 = np.std(r2)

    return avg_mse

study = opt.create_study(direction='minimize')
study.optimize(fine_tuning, n_trials=20)

[I 2025-04-16 16:55:45,010] A new study created in memory with name: no-name-5d095bf9-e599-4aa5-bec4-1e8d3ff7bed7


########## Fold: 1 ##########
########## Fold: 2 ##########
########## Fold: 3 ##########
########## Fold: 4 ##########
########## Fold: 5 ##########


[I 2025-04-16 16:56:30,932] Trial 0 finished with value: 100.96271617963595 and parameters: {'learning_rate': 0.008335068262687652, 'max_depth': 5, 'subsample': 0.6, 'colsample_bytree': 0.9, 'min_child_weight': 4}. Best is trial 0 with value: 100.96271617963595.


########## Fold: 1 ##########
########## Fold: 2 ##########
########## Fold: 3 ##########
########## Fold: 4 ##########
########## Fold: 5 ##########


[I 2025-04-16 16:57:16,785] Trial 1 finished with value: 100.96271617963595 and parameters: {'learning_rate': 0.006119232988146066, 'max_depth': 10, 'subsample': 0.5, 'colsample_bytree': 0.6, 'min_child_weight': 3}. Best is trial 0 with value: 100.96271617963595.


########## Fold: 1 ##########
########## Fold: 2 ##########
########## Fold: 3 ##########
########## Fold: 4 ##########
########## Fold: 5 ##########


[I 2025-04-16 16:58:02,090] Trial 2 finished with value: 100.96271617963595 and parameters: {'learning_rate': 0.0034636305272420635, 'max_depth': 9, 'subsample': 1.0, 'colsample_bytree': 0.5, 'min_child_weight': 4}. Best is trial 0 with value: 100.96271617963595.


########## Fold: 1 ##########
########## Fold: 2 ##########
########## Fold: 3 ##########
########## Fold: 4 ##########
########## Fold: 5 ##########


[I 2025-04-16 16:58:47,717] Trial 3 finished with value: 100.96271617963595 and parameters: {'learning_rate': 0.005697358260035766, 'max_depth': 7, 'subsample': 0.6, 'colsample_bytree': 0.8, 'min_child_weight': 10}. Best is trial 0 with value: 100.96271617963595.


########## Fold: 1 ##########
########## Fold: 2 ##########
########## Fold: 3 ##########
########## Fold: 4 ##########
########## Fold: 5 ##########


[I 2025-04-16 16:59:33,414] Trial 4 finished with value: 100.96271617963595 and parameters: {'learning_rate': 0.003344880924745632, 'max_depth': 10, 'subsample': 0.6, 'colsample_bytree': 0.6, 'min_child_weight': 10}. Best is trial 0 with value: 100.96271617963595.


########## Fold: 1 ##########
########## Fold: 2 ##########
########## Fold: 3 ##########
########## Fold: 4 ##########
########## Fold: 5 ##########


[I 2025-04-16 17:00:18,873] Trial 5 finished with value: 100.96271617963595 and parameters: {'learning_rate': 0.01200696680026771, 'max_depth': 1, 'subsample': 0.7, 'colsample_bytree': 0.8, 'min_child_weight': 8}. Best is trial 0 with value: 100.96271617963595.


########## Fold: 1 ##########
########## Fold: 2 ##########
########## Fold: 3 ##########
########## Fold: 4 ##########
########## Fold: 5 ##########


[I 2025-04-16 17:01:04,625] Trial 6 finished with value: 100.96271617963595 and parameters: {'learning_rate': 0.0030691185221282773, 'max_depth': 5, 'subsample': 0.8, 'colsample_bytree': 0.5, 'min_child_weight': 9}. Best is trial 0 with value: 100.96271617963595.


########## Fold: 1 ##########
########## Fold: 2 ##########
########## Fold: 3 ##########
########## Fold: 4 ##########
########## Fold: 5 ##########


[I 2025-04-16 17:01:50,346] Trial 7 finished with value: 100.96271617963595 and parameters: {'learning_rate': 0.09468144339979574, 'max_depth': 7, 'subsample': 0.7, 'colsample_bytree': 0.8, 'min_child_weight': 7}. Best is trial 0 with value: 100.96271617963595.


########## Fold: 1 ##########
########## Fold: 2 ##########
########## Fold: 3 ##########
########## Fold: 4 ##########
########## Fold: 5 ##########


[I 2025-04-16 17:02:48,160] Trial 8 finished with value: 100.96271617963595 and parameters: {'learning_rate': 0.06958181703505487, 'max_depth': 9, 'subsample': 0.9, 'colsample_bytree': 0.5, 'min_child_weight': 10}. Best is trial 0 with value: 100.96271617963595.


########## Fold: 1 ##########
########## Fold: 2 ##########
########## Fold: 3 ##########
########## Fold: 4 ##########
########## Fold: 5 ##########


[I 2025-04-16 17:03:41,373] Trial 9 finished with value: 100.96271617963595 and parameters: {'learning_rate': 0.007804584388729891, 'max_depth': 6, 'subsample': 0.6, 'colsample_bytree': 0.7, 'min_child_weight': 6}. Best is trial 0 with value: 100.96271617963595.


########## Fold: 1 ##########
########## Fold: 2 ##########
########## Fold: 3 ##########
########## Fold: 4 ##########
########## Fold: 5 ##########


[I 2025-04-16 17:04:27,987] Trial 10 finished with value: 100.96271617963595 and parameters: {'learning_rate': 0.001273247204695463, 'max_depth': 3, 'subsample': 0.5, 'colsample_bytree': 1.0, 'min_child_weight': 2}. Best is trial 0 with value: 100.96271617963595.


########## Fold: 1 ##########
########## Fold: 2 ##########
########## Fold: 3 ##########
########## Fold: 4 ##########
########## Fold: 5 ##########


[I 2025-04-16 17:05:13,992] Trial 11 finished with value: 100.96271617963595 and parameters: {'learning_rate': 0.030150621300125302, 'max_depth': 4, 'subsample': 0.5, 'colsample_bytree': 1.0, 'min_child_weight': 3}. Best is trial 0 with value: 100.96271617963595.


########## Fold: 1 ##########
########## Fold: 2 ##########
########## Fold: 3 ##########
########## Fold: 4 ##########
########## Fold: 5 ##########


[I 2025-04-16 17:06:00,354] Trial 12 finished with value: 100.96271617963595 and parameters: {'learning_rate': 0.01654439614305352, 'max_depth': 3, 'subsample': 0.5, 'colsample_bytree': 0.9, 'min_child_weight': 1}. Best is trial 0 with value: 100.96271617963595.


########## Fold: 1 ##########
########## Fold: 2 ##########
########## Fold: 3 ##########
########## Fold: 4 ##########
########## Fold: 5 ##########


[I 2025-04-16 17:06:48,371] Trial 13 finished with value: 100.96271617963595 and parameters: {'learning_rate': 0.025899880362387308, 'max_depth': 8, 'subsample': 0.6, 'colsample_bytree': 0.7, 'min_child_weight': 4}. Best is trial 0 with value: 100.96271617963595.


########## Fold: 1 ##########
########## Fold: 2 ##########
########## Fold: 3 ##########
########## Fold: 4 ##########
########## Fold: 5 ##########


[I 2025-04-16 17:07:47,710] Trial 14 finished with value: 100.96271617963595 and parameters: {'learning_rate': 0.0010751044269778122, 'max_depth': 5, 'subsample': 0.8, 'colsample_bytree': 0.9, 'min_child_weight': 5}. Best is trial 0 with value: 100.96271617963595.


########## Fold: 1 ##########
########## Fold: 2 ##########
########## Fold: 3 ##########
########## Fold: 4 ##########
########## Fold: 5 ##########


[I 2025-04-16 17:08:43,427] Trial 15 finished with value: 100.96271617963595 and parameters: {'learning_rate': 0.0053961193844775665, 'max_depth': 1, 'subsample': 0.5, 'colsample_bytree': 0.6, 'min_child_weight': 3}. Best is trial 0 with value: 100.96271617963595.


########## Fold: 1 ##########
########## Fold: 2 ##########
########## Fold: 3 ##########
########## Fold: 4 ##########
########## Fold: 5 ##########


[I 2025-04-16 17:09:33,376] Trial 16 finished with value: 100.96271617963595 and parameters: {'learning_rate': 0.01686601742394893, 'max_depth': 10, 'subsample': 0.7, 'colsample_bytree': 0.9, 'min_child_weight': 1}. Best is trial 0 with value: 100.96271617963595.


########## Fold: 1 ##########
########## Fold: 2 ##########
########## Fold: 3 ##########
########## Fold: 4 ##########
########## Fold: 5 ##########


[I 2025-04-16 17:10:30,768] Trial 17 finished with value: 100.96271617963595 and parameters: {'learning_rate': 0.0018858714912222496, 'max_depth': 6, 'subsample': 0.6, 'colsample_bytree': 0.6, 'min_child_weight': 5}. Best is trial 0 with value: 100.96271617963595.


########## Fold: 1 ##########
########## Fold: 2 ##########
########## Fold: 3 ##########
########## Fold: 4 ##########
########## Fold: 5 ##########


[I 2025-04-16 17:11:20,080] Trial 18 finished with value: 100.96271617963595 and parameters: {'learning_rate': 0.008382868491898527, 'max_depth': 4, 'subsample': 0.5, 'colsample_bytree': 0.7, 'min_child_weight': 3}. Best is trial 0 with value: 100.96271617963595.


########## Fold: 1 ##########
########## Fold: 2 ##########
########## Fold: 3 ##########
########## Fold: 4 ##########
########## Fold: 5 ##########


[I 2025-04-16 17:12:06,901] Trial 19 finished with value: 100.96271617963595 and parameters: {'learning_rate': 0.0302625415564257, 'max_depth': 8, 'subsample': 0.7, 'colsample_bytree': 0.9, 'min_child_weight': 6}. Best is trial 0 with value: 100.96271617963595.


In [23]:
params = {'learning_rate': 0.008335068262687652, 'max_depth': 5, 'subsample': 0.6, 'colsample_bytree': 0.9, 'min_child_weight': 4}

In [24]:
model_XGBoost = XGBRegressor(n_estimators=1000, n_jobs=-1, random_state=0, **params)

In [25]:
cross_validation(X, y, model_XGBoost, k=5)

########## Fold: 1 ##########
MAE: 3.785
MSE: 64.887
R2: 0.736
########## Fold: 2 ##########
MAE: 3.925
MSE: 67.987
R2: 0.743
########## Fold: 3 ##########
MAE: 3.843
MSE: 62.634
R2: 0.747
########## Fold: 4 ##########
MAE: 3.821
MSE: 72.702
R2: 0.732
########## Fold: 5 ##########
MAE: 3.774
MSE: 55.732
R2: 0.761
##### Displaying Average of Obtained Metrics : #####
Average MAE: 3.829 +/- 0.054
Average MSE: 64.788 +/- 5.648
Average R2: 0.744 +/- 0.010


In [28]:
def feature_engineering(X_train, X_test):

    X_train['volume'] = X_train['product_length_cm'] * X_train['product_height_cm'] * X_train['product_width_cm']
    X_test['volume'] = X_test['product_length_cm'] * X_test['product_height_cm'] * X_test['product_width_cm']

    # other important features

    return X_train, X_test

In [29]:
def cross_validation(X, y, model, k):
    folds = KFold(n_splits=k, shuffle=True, random_state=42)

    absolute_errors = list()
    squared_errors = list()
    r2 = list()

    for k, (train_index, test_index) in enumerate(folds.split(X, y)):
        
        print("#"*10 + f" Fold: {k+1} " + "#"*10)
        
        X_train_internal, y_train_internal = X.iloc[train_index, :], y.iloc[train_index]
        X_test_internal, y_test_internal = X.iloc[test_index, :], y.iloc[test_index]

        encoder = CatBoostEncoder()
        
        cat_imputer = SimpleImputer(strategy='most_frequent')
        num_imputer = SimpleImputer(strategy='median')

        cat_pipeline = Pipeline([('encoder', encoder), ('imputer', cat_imputer)])
        num_pipeline = Pipeline([('imputer', num_imputer)])

        X_train_internal, X_test_internal = feature_engineering(X_train_internal, X_test_internal)

        cat_cols = X_train_internal.select_dtypes(include=['object']).columns
        num_cols = X_train_internal.select_dtypes(exclude=['object']).columns

        X_train_internal[cat_cols] = cat_pipeline.fit_transform(X_train_internal[cat_cols], y_train_internal)
        X_train_internal[num_cols] = num_pipeline.fit_transform(X_train_internal[num_cols])

        X_test_internal[cat_cols] = cat_pipeline.transform(X_test_internal[cat_cols])
        X_test_internal[num_cols] = num_pipeline.transform(X_test_internal[num_cols])

        model.fit(X_train_internal, y_train_internal)
        y_pred = model.predict(X_test_internal)
        r2score = r2_score(y_test_internal, y_pred)
        mse = mean_squared_error(y_test_internal, y_pred)
        mae = mean_absolute_error(y_test_internal, y_pred)

        absolute_errors.append(mae)
        squared_errors.append(mse)
        r2.append(r2score)

        print(f'MAE: {mae:.3f}')
        print(f'MSE: {mse:.3f}')
        print(f'R2: {r2score:.3f}')
    
    absolute_errors = np.array(absolute_errors)
    squared_errors = np.array(squared_errors)
    r2 = np.array(r2)

    avg_mae = np.mean(absolute_errors)
    avg_mse = np.mean(squared_errors)
    avg_r2 = np.mean(r2)

    std_mae = np.std(absolute_errors)
    std_mse = np.std(squared_errors)
    std_r2 = np.std(r2)

    print("#"*5 + f" Displaying Average of Obtained Metrics : " + "#"*5)
    print(f"Average MAE: {avg_mae:.3f} +/- {std_mae:.3f}")
    print(f'Average MSE: {avg_mse:.3f} +/- {std_mse:.3f}')
    print(f'Average R2: {avg_r2:.3f} +/- {std_r2:.3f}')

In [30]:
cross_validation(X, y, model_LightGBM, k=5)

########## Fold: 1 ##########
MAE: 5.664
MSE: 120.227
R2: 0.511
########## Fold: 2 ##########
MAE: 5.827
MSE: 128.344
R2: 0.515
########## Fold: 3 ##########
MAE: 5.773
MSE: 119.646
R2: 0.517
########## Fold: 4 ##########
MAE: 5.715
MSE: 136.550
R2: 0.497
########## Fold: 5 ##########
MAE: 5.612
MSE: 108.576
R2: 0.535
##### Displaying Average of Obtained Metrics : #####
Average MAE: 5.718 +/- 0.076
Average MSE: 122.669 +/- 9.366
Average R2: 0.515 +/- 0.012
