# Freight Value Prediction Model
This notebook focuses on building and evaluating machine learning models to predict freight values using the Olist dataset. The steps include data preparation, feature engineering, model training, evaluation, and fine-tuning.

## 1. Importing Required Libraries
We start by importing the necessary libraries for data manipulation, visualization, and machine learning.

In [29]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, KFold
from sklearn.inspection import permutation_importance

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.tree import DecisionTreeRegressor

from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

from category_encoders import CatBoostEncoder

from sklearn import metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

import optuna as opt
import math

from warnings import filterwarnings
filterwarnings('ignore')

## 2. Loading the Dataset
We load the preprocessed dataset `df_final.csv` from the `data` directory. This dataset contains features and the target variable `freight_value`.

In [3]:
# Define the base directory for the data files
data_dir = 'data'

# Load the dataset
df = pd.read_csv(os.path.join(data_dir, 'df_final.csv'))

df.head()

Unnamed: 0,price,freight_value,product_category_name,product_weight_g,product_length_cm,product_height_cm,product_width_cm,customer_city,customer_state,review_score,...,volume,density,actual_delivery_time,estimated_delivery_time,approval_order_time,distance,purchase_month,purchase_day_of_week,black_friday,christmas
0,29.99,8.72,utilidades_domesticas,500.0,19.0,8.0,13.0,sao paulo,SP,4,...,1976.0,0.253036,8.0,15,0.0,18.566632,10,0,0,0
1,118.7,22.76,perfumaria,400.0,19.0,13.0,19.0,barreiras,BA,4,...,4693.0,0.085233,13.0,19,1.0,847.437333,7,1,0,0
2,159.9,19.22,automotivo,420.0,24.0,19.0,21.0,vianopolis,GO,5,...,9576.0,0.04386,9.0,26,0.0,512.100044,8,2,0,0
3,45.0,27.2,pet_shop,450.0,30.0,10.0,20.0,sao goncalo do amarante,RN,5,...,6000.0,0.075,13.0,26,0.0,1816.085655,11,5,0,0
4,19.9,8.72,papelaria,250.0,51.0,15.0,15.0,santo andre,SP,5,...,11475.0,0.021786,2.0,12,0.0,29.684401,2,1,0,0


## 3. Splitting Data into Features and Target
We separate the dataset into features (`X`) and the target variable (`y`), which is `freight_value`.

In [4]:
# Splitting in X and Y
X = df.drop(columns='freight_value', axis=1)
y = df.freight_value

## 4. Train-Test Split
We split the data into training and testing sets using an 80-20 split to evaluate the model's performance on unseen data.

In [5]:
# splitting in train and test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=42)

## 5. Initializing Models
We initialize multiple regression models, including XGBoost, LightGBM, CatBoost, and Decision Tree, with default or predefined hyperparameters.

In [6]:
# Initializing models
model_XGBoost = XGBRegressor(n_estimators = 1000, max_depth = 8, learning_rate = 1e-3, random_state = 0)
model_LightGBM = LGBMRegressor(n_estimators = 1000, max_depth = 8, num_leaves = 2^8, learning_rate = 1e-3, n_jobs = -1, verbose = -1, random_state = 0)
model_DecisionTree = DecisionTreeRegressor(random_state = 0, max_depth = 8, min_samples_split = 2)

## 6. Feature Encoding and Importance
We encode categorical features using CatBoostEncoder and calculate feature importance using permutation importance with the XGBoost model.

In [7]:
# Feature importance

encoder = CatBoostEncoder()
X_train_encoded = X_train.copy()
X_test_encoded = X_test.copy()

for col in X_train_encoded.select_dtypes(include=['object']).columns:
    X_train_encoded[col] = encoder.fit_transform(X_train_encoded[col], y_train)
    X_test_encoded[col] = encoder.transform(X_test_encoded[col])

model_XGBoost.fit(X_train_encoded, y_train)
r = permutation_importance(model_XGBoost, X_test_encoded, y_test, n_repeats=30, random_state=0)

In [8]:
importances = pd.DataFrame({'Feature': X_test_encoded.columns, 'importance': r.importances_mean})
importances = importances.sort_values(by='importance', ascending=False)
importances

Unnamed: 0,Feature,importance
16,distance,0.211973
11,volume,0.198656
2,product_weight_g,0.178174
0,price,0.080163
7,customer_state,0.025644
3,product_length_cm,0.022553
9,seller_city,0.021714
10,seller_state,0.013614
5,product_width_cm,0.004467
12,density,0.003321


## 7. Feature Selection
Based on feature importance, we drop less important features to simplify the model and improve performance.

In [9]:
less_important_columns = ['purchase_day_of_week', 'approval_order_time', 'estimated_delivery_time', 'christmas', 'black_friday']

X_train = X_train.drop(columns=less_important_columns)
X_test = X_test.drop(columns=less_important_columns)

In [10]:
# Feature importance

encoder = CatBoostEncoder()
X_train_encoded = X_train.copy()
X_test_encoded = X_test.copy()

for col in X_train_encoded.select_dtypes(include=['object']).columns:
    X_train_encoded[col] = encoder.fit_transform(X_train_encoded[col], y_train)
    X_test_encoded[col] = encoder.transform(X_test_encoded[col])

model_XGBoost.fit(X_train_encoded, y_train)
r_v2 = permutation_importance(model_XGBoost, X_test_encoded, y_test, n_repeats=30, random_state=0)

In [11]:
importances_v2 = pd.DataFrame({'Feature': X_test_encoded.columns, 'importance': r_v2.importances_mean})
importances_v2 = importances_v2.sort_values(by='importance', ascending=False)
importances_v2

Unnamed: 0,Feature,importance
14,distance,0.211323
11,volume,0.20385
2,product_weight_g,0.182162
0,price,0.090672
7,customer_state,0.024785
3,product_length_cm,0.024383
9,seller_city,0.022433
10,seller_state,0.012966
5,product_width_cm,0.004509
6,customer_city,0.003811


## 8. Cross-Validation
We perform k-fold cross-validation to evaluate the model's performance across multiple splits of the data.

In [12]:
k = 5

folds = KFold(n_splits=k, shuffle=True, random_state=42)

absolute_errors = list()
squared_errors = list()
r2 = list()

for k, (train_index, test_index) in enumerate(folds.split(X, y)):
    
    print("#"*10 + f" Fold: {k+1} " + "#"*10)
    
    X_train_internal, y_train_internal = X.iloc[train_index, :], y.iloc[train_index]
    X_test_internal, y_test_internal = X.iloc[test_index, :], y.iloc[test_index]

    encoder = CatBoostEncoder()
    
    cat_imputer = SimpleImputer(strategy='most_frequent')
    num_imputer = SimpleImputer(strategy='median')

    cat_pipeline = Pipeline([('encoder', encoder), ('imputer', cat_imputer)])
    num_pipeline = Pipeline([('imputer', num_imputer)])

    cat_cols = X_train_internal.select_dtypes(include=['object']).columns
    num_cols = X_train_internal.select_dtypes(exclude=['object']).columns

    X_train_internal[cat_cols] = cat_pipeline.fit_transform(X_train_internal[cat_cols], y_train_internal)
    X_train_internal[num_cols] = num_pipeline.fit_transform(X_train_internal[num_cols])

    X_test_internal[cat_cols] = cat_pipeline.transform(X_test_internal[cat_cols])
    X_test_internal[num_cols] = num_pipeline.transform(X_test_internal[num_cols])

    model_XGBoost.fit(X_train_internal, y_train_internal)
    y_pred = model_XGBoost.predict(X_test_internal)
    r2score = r2_score(y_test_internal, y_pred)
    mse = mean_squared_error(y_test_internal, y_pred)
    mae = mean_absolute_error(y_test_internal, y_pred)

    absolute_errors.append(mae)
    squared_errors.append(mse)
    r2.append(r2score)

    print(f'MAE: {mae:.3f}')
    print(f'MSE: {mse:.3f}')
    print(f'R2: {r2score:.3f}')


########## Fold: 1 ##########


MAE: 5.044
MSE: 98.518
R2: 0.600
########## Fold: 2 ##########
MAE: 5.188
MSE: 106.660
R2: 0.597
########## Fold: 3 ##########
MAE: 5.146
MSE: 97.921
R2: 0.605
########## Fold: 4 ##########
MAE: 5.078
MSE: 112.953
R2: 0.584
########## Fold: 5 ##########
MAE: 4.990
MSE: 88.761
R2: 0.620


In [13]:
absolute_errors = np.array(absolute_errors)
squared_errors = np.array(squared_errors)
r2 = np.array(r2)

avg_mae = np.mean(absolute_errors)
avg_mse = np.mean(squared_errors)
avg_r2 = np.mean(r2)

std_mae = np.std(absolute_errors)
std_mse = np.std(squared_errors)
std_r2 = np.std(r2)

print("#"*5 + f" Displaying Average of Obtained Metrics : " + "#"*5)
print(f"Average MAE: {avg_mae:.3f} +/- {std_mae:.3f}")
print(f'Average MSE: {avg_mse:.3f} +/- {std_mse:.3f}')
print(f'Average R2: {avg_r2:.3f} +/- {std_r2:.3f}')

##### Displaying Average of Obtained Metrics : #####
Average MAE: 5.089 +/- 0.071
Average MSE: 100.963 +/- 8.250
Average R2: 0.601 +/- 0.012


In [14]:
def cross_validation(X, y, model, k):
    folds = KFold(n_splits=k, shuffle=True, random_state=42)

    absolute_errors = list()
    squared_errors = list()
    r2 = list()

    for k, (train_index, test_index) in enumerate(folds.split(X, y)):
        
        print("#"*10 + f" Fold: {k+1} " + "#"*10)
        
        X_train_internal, y_train_internal = X.iloc[train_index, :], y.iloc[train_index]
        X_test_internal, y_test_internal = X.iloc[test_index, :], y.iloc[test_index]

        encoder = CatBoostEncoder()
        
        cat_imputer = SimpleImputer(strategy='most_frequent')
        num_imputer = SimpleImputer(strategy='median')

        cat_pipeline = Pipeline([('encoder', encoder), ('imputer', cat_imputer)])
        num_pipeline = Pipeline([('imputer', num_imputer)])

        cat_cols = X_train_internal.select_dtypes(include=['object']).columns
        num_cols = X_train_internal.select_dtypes(exclude=['object']).columns

        X_train_internal[cat_cols] = cat_pipeline.fit_transform(X_train_internal[cat_cols], y_train_internal)
        X_train_internal[num_cols] = num_pipeline.fit_transform(X_train_internal[num_cols])

        X_test_internal[cat_cols] = cat_pipeline.transform(X_test_internal[cat_cols])
        X_test_internal[num_cols] = num_pipeline.transform(X_test_internal[num_cols])

        model.fit(X_train_internal, y_train_internal)
        y_pred = model.predict(X_test_internal)

        r2score = r2_score(y_test_internal, y_pred)
        mse = mean_squared_error(y_test_internal, y_pred)
        mae = mean_absolute_error(y_test_internal, y_pred)

        absolute_errors.append(mae)
        squared_errors.append(mse)
        r2.append(r2score)

        print(f'MAE: {mae:.3f}')
        print(f'MSE: {mse:.3f}')
        print(f'R2: {r2score:.3f}')
    
    absolute_errors = np.array(absolute_errors)
    squared_errors = np.array(squared_errors)
    r2 = np.array(r2)

    avg_mae = np.mean(absolute_errors)
    avg_mse = np.mean(squared_errors)
    avg_r2 = np.mean(r2)

    std_mae = np.std(absolute_errors)
    std_mse = np.std(squared_errors)
    std_r2 = np.std(r2)

    print("#"*5 + f" Displaying Average of Obtained Metrics : " + "#"*5)
    print(f"Average MAE: {avg_mae:.3f} +/- {std_mae:.3f}")
    print(f'Average MSE: {avg_mse:.3f} +/- {std_mse:.3f}')
    print(f'Average R2: {avg_r2:.3f} +/- {std_r2:.3f}')

### Modelo XGBoost

In [16]:
# Modelo XGBoost
# cross_validation(X, y, model_XGBoost, k=5)

In [17]:
cross_validation_with_mlflow(X, y, model_XGBoost, k=5, model_name="XGBoost")



########## Fold: 1 ##########
MAE: 5.044
MSE: 98.518
R2: 0.600
########## Fold: 2 ##########
MAE: 5.188
MSE: 106.660
R2: 0.597
########## Fold: 3 ##########
MAE: 5.146
MSE: 97.921
R2: 0.605
########## Fold: 4 ##########
MAE: 5.078
MSE: 112.953
R2: 0.584
########## Fold: 5 ##########
MAE: 4.990
MSE: 88.761
R2: 0.620
##### Displaying Average of Obtained Metrics : #####
Average MAE: 5.089 +/- 0.071
Average MSE: 100.963 +/- 8.250
Average R2: 0.601 +/- 0.012




🏃 View run XGBoost Cross-Validation at: http://localhost:5000/#/experiments/535394779431182411/runs/0959333936e34847a2765182e7305baa
🧪 View experiment at: http://localhost:5000/#/experiments/535394779431182411


### Modelo LightGBM

In [None]:
# Modelo LightGBM
cross_validation(X, y, model_LightGBM, k=5)

########## Fold: 1 ##########
MAE: 5.664
MSE: 120.227
R2: 0.511
########## Fold: 2 ##########
MAE: 5.827
MSE: 128.344
R2: 0.515
########## Fold: 3 ##########
MAE: 5.773
MSE: 119.646
R2: 0.517
########## Fold: 4 ##########
MAE: 5.715
MSE: 136.550
R2: 0.497
########## Fold: 5 ##########
MAE: 5.612
MSE: 108.576
R2: 0.535
##### Displaying Average of Obtained Metrics : #####
Average MAE: 5.718 +/- 0.076
Average MSE: 122.669 +/- 9.366
Average R2: 0.515 +/- 0.012




🏃 View run LightGBM Cross-Validation at: http://localhost:5000/#/experiments/535394779431182411/runs/56c657c45fb04961af4f9b58ff06c9d1
🧪 View experiment at: http://localhost:5000/#/experiments/535394779431182411


### Modelo Decision Tree

In [None]:
# Modelo Decision Tree
cross_validation(X, y, model_DecisionTree, k=5)

## 9. Fine-Tuning
We use Optuna to perform hyperparameter optimization for the XGBoost model to improve its performance.

In [21]:
?XGBRegressor

[31mInit signature:[39m
XGBRegressor(
    *,
    objective: Union[str, xgboost.sklearn._SklObjWProto, Callable[[Any, Any], Tuple[numpy.ndarray, numpy.ndarray]], NoneType] = [33m'reg:squarederror'[39m,
    **kwargs: Any,
) -> [38;5;28;01mNone[39;00m
[31mDocstring:[39m     
Implementation of the scikit-learn API for XGBoost regression.
See :doc:`/python/sklearn_estimator` for more information.

Parameters
----------

    n_estimators : typing.Optional[int]
        Number of gradient boosted trees.  Equivalent to number of boosting
        rounds.

    max_depth :  typing.Optional[int]

        Maximum tree depth for base learners.

    max_leaves : typing.Optional[int]

        Maximum number of leaves; 0 indicates no limit.

    max_bin : typing.Optional[int]

        If using histogram-based algorithm, maximum number of bins per feature

    grow_policy : typing.Optional[str]

        Tree growing policy.

        - depthwise: Favors splitting at nodes closest to the node,
    

In [25]:
def fine_tuning(trial, k=5):
    # tuning
    learning_rate = trial.suggest_float('learning_rate', 1e-3, 1e-1, log=True)
    max_depth = trial.suggest_int('max_depth', 1, 10)
    subsample = trial.suggest_float('subsample', 0.5, 1, step=0.1)
    colsample_bytree = trial.suggest_float('colsample_bytree', 0.5, 1, step=0.1)
    min_child_weight = trial.suggest_int('min_child_weight', 1, 10)

    folds = KFold(n_splits=k, shuffle=True, random_state=42)

    absolute_errors = list()
    squared_errors = list()
    r2 = list()

    for k, (train_index, test_index) in enumerate(folds.split(X, y)):
        
        print("#"*10 + f" Fold: {k+1} " + "#"*10)
        
        X_train_internal, y_train_internal = X.iloc[train_index, :], y.iloc[train_index]
        X_test_internal, y_test_internal = X.iloc[test_index, :], y.iloc[test_index]

        encoder = CatBoostEncoder()
        
        cat_imputer = SimpleImputer(strategy='most_frequent')
        num_imputer = SimpleImputer(strategy='median')

        cat_pipeline = Pipeline([('encoder', encoder), ('imputer', cat_imputer)])
        num_pipeline = Pipeline([('imputer', num_imputer)])

        cat_cols = X_train_internal.select_dtypes(include=['object']).columns
        num_cols = X_train_internal.select_dtypes(exclude=['object']).columns

        X_train_internal[cat_cols] = cat_pipeline.fit_transform(X_train_internal[cat_cols], y_train_internal)
        X_train_internal[num_cols] = num_pipeline.fit_transform(X_train_internal[num_cols])

        X_test_internal[cat_cols] = cat_pipeline.transform(X_test_internal[cat_cols])
        X_test_internal[num_cols] = num_pipeline.transform(X_test_internal[num_cols])

        model_XGBoost.fit(X_train_internal, y_train_internal)
        y_pred = model_XGBoost.predict(X_test_internal)
        r2score = r2_score(y_test_internal, y_pred)
        mse = mean_squared_error(y_test_internal, y_pred)
        mae = mean_absolute_error(y_test_internal, y_pred)

        absolute_errors.append(mae)
        squared_errors.append(mse)
        r2.append(r2score)

    
    absolute_errors = np.array(absolute_errors)
    squared_errors = np.array(squared_errors)
    r2 = np.array(r2)

    avg_mae = np.mean(absolute_errors)
    avg_mse = np.mean(squared_errors)
    avg_r2 = np.mean(r2)

    std_mae = np.std(absolute_errors)
    std_mse = np.std(squared_errors)
    std_r2 = np.std(r2)

    return avg_mse

study = opt.create_study(direction='minimize')
study.optimize(fine_tuning, n_trials=20)

[I 2025-04-18 15:17:22,694] A new study created in memory with name: no-name-0c3bb4f5-7a75-4b8f-8360-5542ee3aeb87
2025/04/18 15:17:22 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '328795bf6cf744a0987713588a0e3bfa', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


########## Fold: 1 ##########


2025/04/18 15:17:26 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'f66aac0007c84634865853e865e96a1f', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


🏃 View run debonair-mouse-545 at: http://localhost:5000/#/experiments/535394779431182411/runs/328795bf6cf744a0987713588a0e3bfa
🧪 View experiment at: http://localhost:5000/#/experiments/535394779431182411




🏃 View run valuable-bass-654 at: http://localhost:5000/#/experiments/535394779431182411/runs/f66aac0007c84634865853e865e96a1f
🧪 View experiment at: http://localhost:5000/#/experiments/535394779431182411


2025/04/18 15:17:29 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '8524486f365941aca9b32e90567d2dfe', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current xgboost workflow


🏃 View run sedate-fly-98 at: http://localhost:5000/#/experiments/535394779431182411/runs/8524486f365941aca9b32e90567d2dfe
🧪 View experiment at: http://localhost:5000/#/experiments/535394779431182411


2025/04/18 15:17:43 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '957e7168f7864e989ee03bdf5ea6a85c', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


########## Fold: 2 ##########


2025/04/18 15:17:46 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'e2c683a6329841bc89590a24ef2ec685', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


🏃 View run bustling-horse-116 at: http://localhost:5000/#/experiments/535394779431182411/runs/957e7168f7864e989ee03bdf5ea6a85c
🧪 View experiment at: http://localhost:5000/#/experiments/535394779431182411




🏃 View run mysterious-conch-600 at: http://localhost:5000/#/experiments/535394779431182411/runs/e2c683a6329841bc89590a24ef2ec685
🧪 View experiment at: http://localhost:5000/#/experiments/535394779431182411


2025/04/18 15:17:50 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'a1c36534d82248a4a669254eec26ee1c', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current xgboost workflow


🏃 View run traveling-hare-891 at: http://localhost:5000/#/experiments/535394779431182411/runs/a1c36534d82248a4a669254eec26ee1c
🧪 View experiment at: http://localhost:5000/#/experiments/535394779431182411


2025/04/18 15:18:03 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '5b5382df27d34d43a7d71580e19210f7', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


########## Fold: 3 ##########


2025/04/18 15:18:07 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '6c19d3d0efb745db84b80a31b50407de', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


🏃 View run sedate-fly-826 at: http://localhost:5000/#/experiments/535394779431182411/runs/5b5382df27d34d43a7d71580e19210f7
🧪 View experiment at: http://localhost:5000/#/experiments/535394779431182411




🏃 View run painted-bug-375 at: http://localhost:5000/#/experiments/535394779431182411/runs/6c19d3d0efb745db84b80a31b50407de
🧪 View experiment at: http://localhost:5000/#/experiments/535394779431182411


2025/04/18 15:18:10 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'cf95535fa0c842eb9e00090babf9c044', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current xgboost workflow


🏃 View run carefree-swan-788 at: http://localhost:5000/#/experiments/535394779431182411/runs/cf95535fa0c842eb9e00090babf9c044
🧪 View experiment at: http://localhost:5000/#/experiments/535394779431182411


2025/04/18 15:18:23 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'd25df08390b241ac88033e22202b00d1', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


########## Fold: 4 ##########


2025/04/18 15:18:27 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '28648ad3ebe74152b2bbee18c2fdc19e', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


🏃 View run able-newt-806 at: http://localhost:5000/#/experiments/535394779431182411/runs/d25df08390b241ac88033e22202b00d1
🧪 View experiment at: http://localhost:5000/#/experiments/535394779431182411




🏃 View run indecisive-wren-794 at: http://localhost:5000/#/experiments/535394779431182411/runs/28648ad3ebe74152b2bbee18c2fdc19e
🧪 View experiment at: http://localhost:5000/#/experiments/535394779431182411


2025/04/18 15:18:30 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'd7876829d72a43ceb31b5dfbd5bf8e53', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current xgboost workflow


🏃 View run overjoyed-sheep-309 at: http://localhost:5000/#/experiments/535394779431182411/runs/d7876829d72a43ceb31b5dfbd5bf8e53
🧪 View experiment at: http://localhost:5000/#/experiments/535394779431182411


2025/04/18 15:18:45 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'a829cba16f764bcd8cf7def70d60ee62', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


########## Fold: 5 ##########


2025/04/18 15:18:48 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'e26e6b38968749319694acbc5d676dd3', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


🏃 View run agreeable-calf-168 at: http://localhost:5000/#/experiments/535394779431182411/runs/a829cba16f764bcd8cf7def70d60ee62
🧪 View experiment at: http://localhost:5000/#/experiments/535394779431182411




🏃 View run funny-sow-515 at: http://localhost:5000/#/experiments/535394779431182411/runs/e26e6b38968749319694acbc5d676dd3
🧪 View experiment at: http://localhost:5000/#/experiments/535394779431182411


2025/04/18 15:18:52 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '3a1d051ffe5948adb4f68d4a92cd87d0', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current xgboost workflow


🏃 View run dashing-newt-487 at: http://localhost:5000/#/experiments/535394779431182411/runs/3a1d051ffe5948adb4f68d4a92cd87d0
🧪 View experiment at: http://localhost:5000/#/experiments/535394779431182411


[I 2025-04-18 15:19:05,468] Trial 0 finished with value: 100.96271617963595 and parameters: {'learning_rate': 0.002827571420648799, 'max_depth': 2, 'subsample': 0.9, 'colsample_bytree': 0.8, 'min_child_weight': 6}. Best is trial 0 with value: 100.96271617963595.
2025/04/18 15:19:05 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'c04bd9b11ed64580a2390b54dc6518d0', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


########## Fold: 1 ##########


2025/04/18 15:19:09 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '16ed098cf59443dd900def64574b27d4', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


🏃 View run sedate-hare-187 at: http://localhost:5000/#/experiments/535394779431182411/runs/c04bd9b11ed64580a2390b54dc6518d0
🧪 View experiment at: http://localhost:5000/#/experiments/535394779431182411




🏃 View run crawling-rat-57 at: http://localhost:5000/#/experiments/535394779431182411/runs/16ed098cf59443dd900def64574b27d4
🧪 View experiment at: http://localhost:5000/#/experiments/535394779431182411


2025/04/18 15:19:12 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '0e0e00b0852b4f67ad1801e2de5bc530', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current xgboost workflow


🏃 View run polite-ray-995 at: http://localhost:5000/#/experiments/535394779431182411/runs/0e0e00b0852b4f67ad1801e2de5bc530
🧪 View experiment at: http://localhost:5000/#/experiments/535394779431182411


2025/04/18 15:19:25 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'efdb0705c3ac44ec9a7896a01a873066', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


########## Fold: 2 ##########


2025/04/18 15:19:29 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '0649b59d914749bcae1073bdff1bef6d', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


🏃 View run brawny-robin-391 at: http://localhost:5000/#/experiments/535394779431182411/runs/efdb0705c3ac44ec9a7896a01a873066
🧪 View experiment at: http://localhost:5000/#/experiments/535394779431182411




🏃 View run bouncy-donkey-991 at: http://localhost:5000/#/experiments/535394779431182411/runs/0649b59d914749bcae1073bdff1bef6d
🧪 View experiment at: http://localhost:5000/#/experiments/535394779431182411


2025/04/18 15:19:33 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '01d5367ed2894e5ba7c3a7d3b7353aae', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current xgboost workflow


🏃 View run nebulous-hare-43 at: http://localhost:5000/#/experiments/535394779431182411/runs/01d5367ed2894e5ba7c3a7d3b7353aae
🧪 View experiment at: http://localhost:5000/#/experiments/535394779431182411


2025/04/18 15:19:46 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '8a23a210b8c74b728ed1e380e0edbcbd', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


########## Fold: 3 ##########


2025/04/18 15:19:50 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '811a7ade2c9044ec8c0a993d20af8fe1', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


🏃 View run youthful-shrew-214 at: http://localhost:5000/#/experiments/535394779431182411/runs/8a23a210b8c74b728ed1e380e0edbcbd
🧪 View experiment at: http://localhost:5000/#/experiments/535394779431182411


[W 2025-04-18 15:19:51,149] Trial 1 failed with parameters: {'learning_rate': 0.050914069782080844, 'max_depth': 3, 'subsample': 0.9, 'colsample_bytree': 0.5, 'min_child_weight': 6} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/opt/conda/envs/olist-freight/lib/python3.13/site-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
  File "/tmp/ipykernel_7471/2029287172.py", line 34, in fine_tuning
    X_train_internal[num_cols] = num_pipeline.fit_transform(X_train_internal[num_cols])
                                 ~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/envs/olist-freight/lib/python3.13/site-packages/mlflow/utils/autologging_utils/safety.py", line 483, in safe_patch_function
    patch_function(call_original, *args, **kwargs)
    ~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/envs/olist-freight/lib/python3.13/site-packages/mlflow/utils/autologg

🏃 View run chill-ray-267 at: http://localhost:5000/#/experiments/535394779431182411/runs/811a7ade2c9044ec8c0a993d20af8fe1
🧪 View experiment at: http://localhost:5000/#/experiments/535394779431182411


KeyboardInterrupt: 

In [None]:
opt.logging.set_verbosity(opt.logging.ERROR)

def champion_callback(study, frozen_trial):
    winner = study.user_attrs.get("winner", None)

    if study.best_value and winner != study.best_value:
      study.set_user_attr("winner", study.best_value)
      if winner:
          improvement_percent = (abs(winner - study.best_value) / study.best_value) * 100
          print(
              f"Trial {frozen_trial.number} achieved value: {frozen_trial.value} with "
              f"{improvement_percent: .4f}% improvement"
          )
      else:
          print(f"Initial trial {frozen_trial.number} achieved value: {frozen_trial.value}")


def fine_tuning_with_mlflow(trial, k=5):
    with mlflow.start_run(nested=True):
        # tuning
        params = {
            "learning_rate": trial.suggest_float('learning_rate', 1e-3, 1e-1, log=True),
            "max_depth": trial.suggest_int('max_depth', 1, 10),
            "subsample": trial.suggest_float('subsample', 0.5, 1, step=0.1),
            "colsample_bytree": trial.suggest_float('colsample_bytree', 0.5, 1, step=0.1),
            "min_child_weight": trial.suggest_int('min_child_weight', 1, 10)
        }

        folds = KFold(n_splits=k, shuffle=True, random_state=42)

        absolute_errors = list()
        squared_errors = list()
        r2 = list()

        mlflow.xgboost.autolog()
        
        for k, (train_index, test_index) in enumerate(folds.split(X, y)):
            
            print("#"*10 + f" Fold: {k+1} " + "#"*10)
            
            X_train_internal, y_train_internal = X.iloc[train_index, :], y.iloc[train_index]
            X_test_internal, y_test_internal = X.iloc[test_index, :], y.iloc[test_index]

            encoder = CatBoostEncoder()
            
            cat_imputer = SimpleImputer(strategy='most_frequent')
            num_imputer = SimpleImputer(strategy='median')

            cat_pipeline = Pipeline([('encoder', encoder), ('imputer', cat_imputer)])
            num_pipeline = Pipeline([('imputer', num_imputer)])

            cat_cols = X_train_internal.select_dtypes(include=['object']).columns
            num_cols = X_train_internal.select_dtypes(exclude=['object']).columns

            X_train_internal[cat_cols] = cat_pipeline.fit_transform(X_train_internal[cat_cols], y_train_internal)
            X_train_internal[num_cols] = num_pipeline.fit_transform(X_train_internal[num_cols])

            X_test_internal[cat_cols] = cat_pipeline.transform(X_test_internal[cat_cols])
            X_test_internal[num_cols] = num_pipeline.transform(X_test_internal[num_cols])

            model_XGBoost.fit(X_train_internal, y_train_internal)
            y_pred = model_XGBoost.predict(X_test_internal)
            
            r2score = r2_score(y_test_internal, y_pred)
            mse = mean_squared_error(y_test_internal, y_pred)
            mae = mean_absolute_error(y_test_internal, y_pred)

            absolute_errors.append(mae)
            squared_errors.append(mse)
            r2.append(r2score)
            
            # Registrar métricas do fold no MLflow
            mlflow.log_metric(f"Fold_{k + 1}_MAE", mae)
            mlflow.log_metric(f"Fold_{k + 1}_MSE", mse)
            mlflow.log_metric(f"Fold_{k + 1}_R2", r2score)

        
        absolute_errors = np.array(absolute_errors)
        squared_errors = np.array(squared_errors)
        r2 = np.array(r2)

        avg_mae = np.mean(absolute_errors)
        avg_mse = np.mean(squared_errors)
        avg_r2 = np.mean(r2)

        std_mae = np.std(absolute_errors)
        std_mse = np.std(squared_errors)
        std_r2 = np.std(r2)
        
        # Registrar métricas médias no MLflow
        mlflow.log_metric("Average_MAE", avg_mae)
        mlflow.log_metric("Average_MSE", avg_mse)
        mlflow.log_metric("Average_R2", avg_r2)

        mlflow.log_metric("Std_MAE", std_mae)
        mlflow.log_metric("Std_MSE", std_mse)
        mlflow.log_metric("Std_R2", std_r2)
        
        mlflow.log_params(params)
        

    return avg_mse

In [30]:
run_name = 'Optuna Fine-Tuning'

# Initiate the parent run and call the hyperparameter tuning child run logic
with mlflow.start_run(run_name=run_name, nested=True):
  # Initialize the Optuna study
  study = opt.create_study(direction="minimize")
  
  # Execute the hyperparameter optimization trials.
  # Note the addition of the `champion_callback` inclusion to control our logging
  study.optimize(fine_tuning_with_mlflow, n_trials=10, callbacks=[champion_callback])
  
  mlflow.log_params(study.best_params)
  mlflow.log_metric("best_mse", study.best_value)
  mlflow.log_metric("best_rmse", math.sqrt(study.best_value))

  # Log tags
  mlflow.set_tags(
      tags={
          "project": "Freight Value Prediction",
          "optimizer_engine": "optuna",
          "model_family": "xgboost",
          "feature_set_version": 1,
      }
  )
  
  # Log a fit model instance
  model = XGBRegressor(
        n_estimators=1000,
        n_jobs=-1,
        random_state=0,
        **study.best_params
    )
  
  # Log a fit model instance
  model = model.train(study.best_params, X_train)
  
  artifact_path = "model"

  mlflow.xgboost.log_model(
      xgb_model=model,
      artifact_path=artifact_path,
      input_example=X_train[[0]],
      model_format="ubj",
      metadata={"model_data_version": 1},
  )

  # Get the logged model uri so that we can load it from the artifact store
  model_uri = mlflow.get_artifact_uri(artifact_path)



########## Fold: 1 ##########




########## Fold: 2 ##########




########## Fold: 3 ##########




########## Fold: 4 ##########




🏃 View run dashing-stag-211 at: http://localhost:5000/#/experiments/535394779431182411/runs/b419428ac4074559ac3b9cad2c570fa9
🧪 View experiment at: http://localhost:5000/#/experiments/535394779431182411
🏃 View run Optuna Fine-Tuning at: http://localhost:5000/#/experiments/535394779431182411/runs/db3195d711154d10a9fe83c60dc4550e
🧪 View experiment at: http://localhost:5000/#/experiments/535394779431182411


KeyboardInterrupt: 

In [None]:
params = {'learning_rate': 0.008335068262687652, 'max_depth': 5, 'subsample': 0.6, 'colsample_bytree': 0.9, 'min_child_weight': 4}

In [None]:
model_XGBoost = XGBRegressor(n_estimators=1000, n_jobs=-1, random_state=0, **params)

In [None]:
cross_validation_with_mlflow(X, y, model_XGBoost, k=5, model_name='XGBoost Fine-Tuned')

########## Fold: 1 ##########
MAE: 3.785
MSE: 64.887
R2: 0.736
########## Fold: 2 ##########
MAE: 3.925
MSE: 67.987
R2: 0.743
########## Fold: 3 ##########
MAE: 3.843
MSE: 62.634
R2: 0.747
########## Fold: 4 ##########
MAE: 3.821
MSE: 72.702
R2: 0.732
########## Fold: 5 ##########
MAE: 3.774
MSE: 55.732
R2: 0.761
##### Displaying Average of Obtained Metrics : #####
Average MAE: 3.829 +/- 0.054
Average MSE: 64.788 +/- 5.648
Average R2: 0.744 +/- 0.010


In [None]:
# def feature_engineering(X_train, X_test):

#     X_train['volume'] = X_train['product_length_cm'] * X_train['product_height_cm'] * X_train['product_width_cm']
#     X_test['volume'] = X_test['product_length_cm'] * X_test['product_height_cm'] * X_test['product_width_cm']

#     # other important features

#     return X_train, X_test

In [None]:
# def cross_validation(X, y, model, k):
#     folds = KFold(n_splits=k, shuffle=True, random_state=42)

#     absolute_errors = list()
#     squared_errors = list()
#     r2 = list()

#     for k, (train_index, test_index) in enumerate(folds.split(X, y)):
        
#         print("#"*10 + f" Fold: {k+1} " + "#"*10)
        
#         X_train_internal, y_train_internal = X.iloc[train_index, :], y.iloc[train_index]
#         X_test_internal, y_test_internal = X.iloc[test_index, :], y.iloc[test_index]

#         encoder = CatBoostEncoder()
        
#         cat_imputer = SimpleImputer(strategy='most_frequent')
#         num_imputer = SimpleImputer(strategy='median')

#         cat_pipeline = Pipeline([('encoder', encoder), ('imputer', cat_imputer)])
#         num_pipeline = Pipeline([('imputer', num_imputer)])

#         X_train_internal, X_test_internal = feature_engineering(X_train_internal, X_test_internal)

#         cat_cols = X_train_internal.select_dtypes(include=['object']).columns
#         num_cols = X_train_internal.select_dtypes(exclude=['object']).columns

#         X_train_internal[cat_cols] = cat_pipeline.fit_transform(X_train_internal[cat_cols], y_train_internal)
#         X_train_internal[num_cols] = num_pipeline.fit_transform(X_train_internal[num_cols])

#         X_test_internal[cat_cols] = cat_pipeline.transform(X_test_internal[cat_cols])
#         X_test_internal[num_cols] = num_pipeline.transform(X_test_internal[num_cols])

#         model.fit(X_train_internal, y_train_internal)
#         y_pred = model.predict(X_test_internal)
#         r2score = r2_score(y_test_internal, y_pred)
#         mse = mean_squared_error(y_test_internal, y_pred)
#         mae = mean_absolute_error(y_test_internal, y_pred)

#         absolute_errors.append(mae)
#         squared_errors.append(mse)
#         r2.append(r2score)

#         print(f'MAE: {mae:.3f}')
#         print(f'MSE: {mse:.3f}')
#         print(f'R2: {r2score:.3f}')
    
#     absolute_errors = np.array(absolute_errors)
#     squared_errors = np.array(squared_errors)
#     r2 = np.array(r2)

#     avg_mae = np.mean(absolute_errors)
#     avg_mse = np.mean(squared_errors)
#     avg_r2 = np.mean(r2)

#     std_mae = np.std(absolute_errors)
#     std_mse = np.std(squared_errors)
#     std_r2 = np.std(r2)

#     print("#"*5 + f" Displaying Average of Obtained Metrics : " + "#"*5)
#     print(f"Average MAE: {avg_mae:.3f} +/- {std_mae:.3f}")
#     print(f'Average MSE: {avg_mse:.3f} +/- {std_mse:.3f}')
#     print(f'Average R2: {avg_r2:.3f} +/- {std_r2:.3f}')

In [None]:
# cross_validation(X, y, model_LightGBM, k=5)

########## Fold: 1 ##########
MAE: 5.664
MSE: 120.227
R2: 0.511
########## Fold: 2 ##########
MAE: 5.827
MSE: 128.344
R2: 0.515
########## Fold: 3 ##########
MAE: 5.773
MSE: 119.646
R2: 0.517
########## Fold: 4 ##########
MAE: 5.715
MSE: 136.550
R2: 0.497
########## Fold: 5 ##########
MAE: 5.612
MSE: 108.576
R2: 0.535
##### Displaying Average of Obtained Metrics : #####
Average MAE: 5.718 +/- 0.076
Average MSE: 122.669 +/- 9.366
Average R2: 0.515 +/- 0.012
