# Freight Value Prediction Model
This notebook focuses on building and evaluating machine learning models to predict freight values using the Olist dataset. The steps include data preparation, feature engineering, model training, evaluation, and fine-tuning.

## 1. Importing Required Libraries
We start by importing the necessary libraries for data manipulation, visualization, and machine learning.

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, KFold
from sklearn.inspection import permutation_importance

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.tree import DecisionTreeRegressor

from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

from category_encoders import CatBoostEncoder

from sklearn import metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

import optuna as opt

from warnings import filterwarnings
filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [23]:
import mlflow
import mlflow.sklearn
import mlflow.xgboost
import mlflow.lightgbm
import mlflow.catboost

mlflow.set_tracking_uri("http://localhost:5000/")
mlflow.set_experiment(experiment_id=535394779431182411)

<Experiment: artifact_location='mlflow-artifacts:/535394779431182411', creation_time=1744772924702, experiment_id='535394779431182411', last_update_time=1744772924702, lifecycle_stage='active', name='freight-prediction', tags={}>

## 2. Loading the Dataset
We load the preprocessed dataset `df_final.csv` from the `data` directory. This dataset contains features and the target variable `freight_value`.

In [3]:
# Define the base directory for the data files
data_dir = 'data'

# Load the dataset
df = pd.read_csv(os.path.join(data_dir, 'df_final.csv'))

df.head()

Unnamed: 0,price,freight_value,product_category_name,product_weight_g,product_length_cm,product_height_cm,product_width_cm,customer_city,customer_state,review_score,...,volume,density,actual_delivery_time,estimated_delivery_time,approval_order_time,distance,purchase_month,purchase_day_of_week,black_friday,christmas
0,29.99,8.72,utilidades_domesticas,500.0,19.0,8.0,13.0,sao paulo,SP,4,...,1976.0,0.253036,8.0,15,0.0,18.566632,10,0,0,0
1,118.7,22.76,perfumaria,400.0,19.0,13.0,19.0,barreiras,BA,4,...,4693.0,0.085233,13.0,19,1.0,847.437333,7,1,0,0
2,159.9,19.22,automotivo,420.0,24.0,19.0,21.0,vianopolis,GO,5,...,9576.0,0.04386,9.0,26,0.0,512.100044,8,2,0,0
3,45.0,27.2,pet_shop,450.0,30.0,10.0,20.0,sao goncalo do amarante,RN,5,...,6000.0,0.075,13.0,26,0.0,1816.085655,11,5,0,0
4,19.9,8.72,papelaria,250.0,51.0,15.0,15.0,santo andre,SP,5,...,11475.0,0.021786,2.0,12,0.0,29.684401,2,1,0,0


## 3. Splitting Data into Features and Target
We separate the dataset into features (`X`) and the target variable (`y`), which is `freight_value`.

In [4]:
# Splitting in X and Y
X = df.drop(columns='freight_value', axis=1)
y = df.freight_value

## 4. Train-Test Split
We split the data into training and testing sets using an 80-20 split to evaluate the model's performance on unseen data.

In [5]:
# splitting in train and test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=42)

## 5. Initializing Models
We initialize multiple regression models, including XGBoost, LightGBM, CatBoost, and Decision Tree, with default or predefined hyperparameters.

In [6]:
# Initializing models
model_XGBoost = XGBRegressor(n_estimators = 1000, max_depth = 8, learning_rate = 1e-3, random_state = 0)
model_LightGBM = LGBMRegressor(n_estimators = 1000, max_depth = 8, num_leaves = 2^8, learning_rate = 1e-3, n_jobs = -1, verbose = -1, random_state = 0)
model_Catboost = CatBoostRegressor(n_estimators = 1000, max_depth = 8, learning_rate = 1e-3, random_state = 0, verbose = 0)
model_DecisionTree = DecisionTreeRegressor(random_state = 0, max_depth = 8, min_samples_split = 2)

## 6. Feature Encoding and Importance
We encode categorical features using CatBoostEncoder and calculate feature importance using permutation importance with the XGBoost model.

In [7]:
# Feature importance

encoder = CatBoostEncoder()
X_train_encoded = X_train.copy()
X_test_encoded = X_test.copy()

for col in X_train_encoded.select_dtypes(include=['object']).columns:
    X_train_encoded[col] = encoder.fit_transform(X_train_encoded[col], y_train)
    X_test_encoded[col] = encoder.transform(X_test_encoded[col])

model_XGBoost.fit(X_train_encoded, y_train)
r = permutation_importance(model_XGBoost, X_test_encoded, y_test, n_repeats=30, random_state=0)

In [8]:
importances = pd.DataFrame({'Feature': X_test_encoded.columns, 'importance': r.importances_mean})
importances = importances.sort_values(by='importance', ascending=False)
importances

Unnamed: 0,Feature,importance
16,distance,0.211973
11,volume,0.198656
2,product_weight_g,0.178174
0,price,0.080163
7,customer_state,0.025644
3,product_length_cm,0.022553
9,seller_city,0.021714
10,seller_state,0.013614
5,product_width_cm,0.004467
12,density,0.003321


## 7. Feature Selection
Based on feature importance, we drop less important features to simplify the model and improve performance.

In [9]:
less_important_columns = ['purchase_day_of_week', 'approval_order_time', 'estimated_delivery_time', 'christmas', 'black_friday']

X_train = X_train.drop(columns=less_important_columns)
X_test = X_test.drop(columns=less_important_columns)

In [10]:
# Feature importance

encoder = CatBoostEncoder()
X_train_encoded = X_train.copy()
X_test_encoded = X_test.copy()

for col in X_train_encoded.select_dtypes(include=['object']).columns:
    X_train_encoded[col] = encoder.fit_transform(X_train_encoded[col], y_train)
    X_test_encoded[col] = encoder.transform(X_test_encoded[col])

model_XGBoost.fit(X_train_encoded, y_train)
r_v2 = permutation_importance(model_XGBoost, X_test_encoded, y_test, n_repeats=30, random_state=0)

In [11]:
importances_v2 = pd.DataFrame({'Feature': X_test_encoded.columns, 'importance': r_v2.importances_mean})
importances_v2 = importances_v2.sort_values(by='importance', ascending=False)
importances_v2

Unnamed: 0,Feature,importance
14,distance,0.211323
11,volume,0.20385
2,product_weight_g,0.182162
0,price,0.090672
7,customer_state,0.024785
3,product_length_cm,0.024383
9,seller_city,0.022433
10,seller_state,0.012966
5,product_width_cm,0.004509
6,customer_city,0.003811


## 8. Cross-Validation
We perform k-fold cross-validation to evaluate the model's performance across multiple splits of the data.

In [12]:
k = 5

folds = KFold(n_splits=k, shuffle=True, random_state=42)

absolute_errors = list()
squared_errors = list()
r2 = list()

for k, (train_index, test_index) in enumerate(folds.split(X, y)):
    
    print("#"*10 + f" Fold: {k+1} " + "#"*10)
    
    X_train_internal, y_train_internal = X.iloc[train_index, :], y.iloc[train_index]
    X_test_internal, y_test_internal = X.iloc[test_index, :], y.iloc[test_index]

    encoder = CatBoostEncoder()
    
    cat_imputer = SimpleImputer(strategy='most_frequent')
    num_imputer = SimpleImputer(strategy='median')

    cat_pipeline = Pipeline([('encoder', encoder), ('imputer', cat_imputer)])
    num_pipeline = Pipeline([('imputer', num_imputer)])

    cat_cols = X_train_internal.select_dtypes(include=['object']).columns
    num_cols = X_train_internal.select_dtypes(exclude=['object']).columns

    X_train_internal[cat_cols] = cat_pipeline.fit_transform(X_train_internal[cat_cols], y_train_internal)
    X_train_internal[num_cols] = num_pipeline.fit_transform(X_train_internal[num_cols])

    X_test_internal[cat_cols] = cat_pipeline.transform(X_test_internal[cat_cols])
    X_test_internal[num_cols] = num_pipeline.transform(X_test_internal[num_cols])

    model_XGBoost.fit(X_train_internal, y_train_internal)
    y_pred = model_XGBoost.predict(X_test_internal)
    r2score = r2_score(y_test_internal, y_pred)
    mse = mean_squared_error(y_test_internal, y_pred)
    mae = mean_absolute_error(y_test_internal, y_pred)

    absolute_errors.append(mae)
    squared_errors.append(mse)
    r2.append(r2score)

    print(f'MAE: {mae:.3f}')
    print(f'MSE: {mse:.3f}')
    print(f'R2: {r2score:.3f}')


########## Fold: 1 ##########


MAE: 5.044
MSE: 98.518
R2: 0.600
########## Fold: 2 ##########
MAE: 5.188
MSE: 106.660
R2: 0.597
########## Fold: 3 ##########
MAE: 5.146
MSE: 97.921
R2: 0.605
########## Fold: 4 ##########
MAE: 5.078
MSE: 112.953
R2: 0.584
########## Fold: 5 ##########
MAE: 4.990
MSE: 88.761
R2: 0.620


In [13]:
absolute_errors = np.array(absolute_errors)
squared_errors = np.array(squared_errors)
r2 = np.array(r2)

avg_mae = np.mean(absolute_errors)
avg_mse = np.mean(squared_errors)
avg_r2 = np.mean(r2)

std_mae = np.std(absolute_errors)
std_mse = np.std(squared_errors)
std_r2 = np.std(r2)

print("#"*5 + f" Displaying Average of Obtained Metrics : " + "#"*5)
print(f"Average MAE: {avg_mae:.3f} +/- {std_mae:.3f}")
print(f'Average MSE: {avg_mse:.3f} +/- {std_mse:.3f}')
print(f'Average R2: {avg_r2:.3f} +/- {std_r2:.3f}')

##### Displaying Average of Obtained Metrics : #####
Average MAE: 5.089 +/- 0.071
Average MSE: 100.963 +/- 8.250
Average R2: 0.601 +/- 0.012


In [14]:
def cross_validation(X, y, model, k):
    folds = KFold(n_splits=k, shuffle=True, random_state=42)

    absolute_errors = list()
    squared_errors = list()
    r2 = list()

    for k, (train_index, test_index) in enumerate(folds.split(X, y)):
        
        print("#"*10 + f" Fold: {k+1} " + "#"*10)
        
        X_train_internal, y_train_internal = X.iloc[train_index, :], y.iloc[train_index]
        X_test_internal, y_test_internal = X.iloc[test_index, :], y.iloc[test_index]

        encoder = CatBoostEncoder()
        
        cat_imputer = SimpleImputer(strategy='most_frequent')
        num_imputer = SimpleImputer(strategy='median')

        cat_pipeline = Pipeline([('encoder', encoder), ('imputer', cat_imputer)])
        num_pipeline = Pipeline([('imputer', num_imputer)])

        cat_cols = X_train_internal.select_dtypes(include=['object']).columns
        num_cols = X_train_internal.select_dtypes(exclude=['object']).columns

        X_train_internal[cat_cols] = cat_pipeline.fit_transform(X_train_internal[cat_cols], y_train_internal)
        X_train_internal[num_cols] = num_pipeline.fit_transform(X_train_internal[num_cols])

        X_test_internal[cat_cols] = cat_pipeline.transform(X_test_internal[cat_cols])
        X_test_internal[num_cols] = num_pipeline.transform(X_test_internal[num_cols])

        model.fit(X_train_internal, y_train_internal)
        y_pred = model.predict(X_test_internal)

        r2score = r2_score(y_test_internal, y_pred)
        mse = mean_squared_error(y_test_internal, y_pred)
        mae = mean_absolute_error(y_test_internal, y_pred)

        absolute_errors.append(mae)
        squared_errors.append(mse)
        r2.append(r2score)

        print(f'MAE: {mae:.3f}')
        print(f'MSE: {mse:.3f}')
        print(f'R2: {r2score:.3f}')
    
    absolute_errors = np.array(absolute_errors)
    squared_errors = np.array(squared_errors)
    r2 = np.array(r2)

    avg_mae = np.mean(absolute_errors)
    avg_mse = np.mean(squared_errors)
    avg_r2 = np.mean(r2)

    std_mae = np.std(absolute_errors)
    std_mse = np.std(squared_errors)
    std_r2 = np.std(r2)

    print("#"*5 + f" Displaying Average of Obtained Metrics : " + "#"*5)
    print(f"Average MAE: {avg_mae:.3f} +/- {std_mae:.3f}")
    print(f'Average MSE: {avg_mse:.3f} +/- {std_mse:.3f}')
    print(f'Average R2: {avg_r2:.3f} +/- {std_r2:.3f}')

In [24]:
def cross_validation_with_mlflow(X, y, model, k, model_name='Model'):
    """
    Perform k-fold cross-validation with MLflow integration.
    """
    folds = KFold(n_splits=k, shuffle=True, random_state=42)

    absolute_errors = list()
    squared_errors = list()
    r2 = list()
    
    # Ativar o autolog apropriado com base no modelo
    if isinstance(model, XGBRegressor):
        mlflow.xgboost.autolog()
    elif isinstance(model, LGBMRegressor):
        mlflow.lightgbm.autolog()
    elif isinstance(model, CatBoostRegressor):
        mlflow.catboost.autolog()
    elif isinstance(model, DecisionTreeRegressor):
        mlflow.sklearn.autolog()
    else:
        raise ValueError("Unsupported model type for autologging.")

    with mlflow.start_run(run_name=f"{model_name} Cross-Validation"):
        
        for k, (train_index, test_index) in enumerate(folds.split(X, y)):
            
            print("#"*10 + f" Fold: {k+1} " + "#"*10)
            
            X_train_internal, y_train_internal = X.iloc[train_index, :], y.iloc[train_index]
            X_test_internal, y_test_internal = X.iloc[test_index, :], y.iloc[test_index]

            encoder = CatBoostEncoder()
            
            cat_imputer = SimpleImputer(strategy='most_frequent')
            num_imputer = SimpleImputer(strategy='median')

            cat_pipeline = Pipeline([('encoder', encoder), ('imputer', cat_imputer)])
            num_pipeline = Pipeline([('imputer', num_imputer)])

            cat_cols = X_train_internal.select_dtypes(include=['object']).columns
            num_cols = X_train_internal.select_dtypes(exclude=['object']).columns

            X_train_internal[cat_cols] = cat_pipeline.fit_transform(X_train_internal[cat_cols], y_train_internal)
            X_train_internal[num_cols] = num_pipeline.fit_transform(X_train_internal[num_cols])

            X_test_internal[cat_cols] = cat_pipeline.transform(X_test_internal[cat_cols])
            X_test_internal[num_cols] = num_pipeline.transform(X_test_internal[num_cols])

            model.fit(X_train_internal, y_train_internal)
            y_pred = model.predict(X_test_internal)

            r2score = r2_score(y_test_internal, y_pred)
            mse = mean_squared_error(y_test_internal, y_pred)
            mae = mean_absolute_error(y_test_internal, y_pred)

            absolute_errors.append(mae)
            squared_errors.append(mse)
            r2.append(r2score)

            print(f'MAE: {mae:.3f}')
            print(f'MSE: {mse:.3f}')
            print(f'R2: {r2score:.3f}')
            
            # Registrar métricas do fold no MLflow
            mlflow.log_metric(f"Fold_{k + 1}_MAE", mae)
            mlflow.log_metric(f"Fold_{k + 1}_MSE", mse)
            mlflow.log_metric(f"Fold_{k + 1}_R2", r2score)
    
        absolute_errors = np.array(absolute_errors)
        squared_errors = np.array(squared_errors)
        r2 = np.array(r2)

        avg_mae = np.mean(absolute_errors)
        avg_mse = np.mean(squared_errors)
        avg_r2 = np.mean(r2)

        std_mae = np.std(absolute_errors)
        std_mse = np.std(squared_errors)
        std_r2 = np.std(r2)

        print("#"*5 + f" Displaying Average of Obtained Metrics : " + "#"*5)
        print(f"Average MAE: {avg_mae:.3f} +/- {std_mae:.3f}")
        print(f'Average MSE: {avg_mse:.3f} +/- {std_mse:.3f}')
        print(f'Average R2: {avg_r2:.3f} +/- {std_r2:.3f}')
    
        # Registrar métricas médias no MLflow
        mlflow.log_metric("Average_MAE", avg_mae)
        mlflow.log_metric("Average_MSE", avg_mse)
        mlflow.log_metric("Average_R2", avg_r2)

        mlflow.log_metric("Std_MAE", std_mae)
        mlflow.log_metric("Std_MSE", std_mse)
        mlflow.log_metric("Std_R2", std_r2)

        # Registrar o modelo no MLflow
        mlflow.sklearn.log_model(model, f"{model_name}_model")

### Modelo XGBoost

In [16]:
# Modelo XGBoost
cross_validation(X, y, model_XGBoost, k=5)

########## Fold: 1 ##########


MAE: 5.044
MSE: 98.518
R2: 0.600
########## Fold: 2 ##########
MAE: 5.188
MSE: 106.660
R2: 0.597
########## Fold: 3 ##########
MAE: 5.146
MSE: 97.921
R2: 0.605
########## Fold: 4 ##########
MAE: 5.078
MSE: 112.953
R2: 0.584
########## Fold: 5 ##########
MAE: 4.990
MSE: 88.761
R2: 0.620
##### Displaying Average of Obtained Metrics : #####
Average MAE: 5.089 +/- 0.071
Average MSE: 100.963 +/- 8.250
Average R2: 0.601 +/- 0.012


In [25]:
cross_validation_with_mlflow(X, y, model_XGBoost, k=5, model_name="XGBoost")



########## Fold: 1 ##########
MAE: 5.044
MSE: 98.518
R2: 0.600
########## Fold: 2 ##########
MAE: 5.188
MSE: 106.660
R2: 0.597
########## Fold: 3 ##########
MAE: 5.146
MSE: 97.921
R2: 0.605
########## Fold: 4 ##########
MAE: 5.078
MSE: 112.953
R2: 0.584
########## Fold: 5 ##########
MAE: 4.990
MSE: 88.761
R2: 0.620
##### Displaying Average of Obtained Metrics : #####
Average MAE: 5.089 +/- 0.071
Average MSE: 100.963 +/- 8.250
Average R2: 0.601 +/- 0.012




🏃 View run XGBoost Cross-Validation at: http://localhost:5000/#/experiments/535394779431182411/runs/46ff3eb9e8d740ca8259a216758c036e
🧪 View experiment at: http://localhost:5000/#/experiments/535394779431182411


### Modelo Catboost

In [17]:
cross_validation(X, y, model_Catboost, k=5)

########## Fold: 1 ##########


MAE: 5.352
MSE: 110.291
R2: 0.552
########## Fold: 2 ##########
MAE: 5.520
MSE: 117.569
R2: 0.555
########## Fold: 3 ##########
MAE: 5.452
MSE: 108.974
R2: 0.560
########## Fold: 4 ##########
MAE: 5.403
MSE: 127.166
R2: 0.532
########## Fold: 5 ##########
MAE: 5.321
MSE: 100.226
R2: 0.571
##### Displaying Average of Obtained Metrics : #####
Average MAE: 5.410 +/- 0.071
Average MSE: 112.845 +/- 9.035
Average R2: 0.554 +/- 0.013


In [26]:
cross_validation_with_mlflow(X, y, model_Catboost, k=5, model_name="CatBoost")

AttributeError: module 'mlflow.catboost' has no attribute 'autolog'

### Modelo LightGBM

In [27]:
# Modelo LightGBM
cross_validation_with_mlflow(X, y, model_LightGBM, k=5, model_name="LightGBM")

########## Fold: 1 ##########


MAE: 5.664
MSE: 120.227
R2: 0.511
########## Fold: 2 ##########
MAE: 5.827
MSE: 128.344
R2: 0.515
########## Fold: 3 ##########
MAE: 5.773
MSE: 119.646
R2: 0.517
########## Fold: 4 ##########
MAE: 5.715
MSE: 136.550
R2: 0.497
########## Fold: 5 ##########
MAE: 5.612
MSE: 108.576
R2: 0.535
##### Displaying Average of Obtained Metrics : #####
Average MAE: 5.718 +/- 0.076
Average MSE: 122.669 +/- 9.366
Average R2: 0.515 +/- 0.012




🏃 View run LightGBM Cross-Validation at: http://localhost:5000/#/experiments/535394779431182411/runs/ac8eec71f07c434fab146f0562960587
🧪 View experiment at: http://localhost:5000/#/experiments/535394779431182411


### Modelo Decision Tree

In [None]:
# Modelo Decision Tree
cross_validation(X, y, model_DecisionTree, k=5)

########## Fold: 1 ##########
MAE: 5.664
MSE: 120.227
R2: 0.511
########## Fold: 2 ##########
MAE: 5.827
MSE: 128.344
R2: 0.515
########## Fold: 3 ##########
MAE: 5.773
MSE: 119.646
R2: 0.517
########## Fold: 4 ##########
MAE: 5.715
MSE: 136.550
R2: 0.497
########## Fold: 5 ##########
MAE: 5.612
MSE: 108.576
R2: 0.535
##### Displaying Average of Obtained Metrics : #####
Average MAE: 5.718 +/- 0.076
Average MSE: 122.669 +/- 9.366
Average R2: 0.515 +/- 0.012


In [30]:
cross_validation_with_mlflow(X, y, model_DecisionTree, k=5, model_name="Decision Tree")

########## Fold: 1 ##########




MAE: 4.370
MSE: 88.438
R2: 0.641
########## Fold: 2 ##########




MAE: 4.468
MSE: 92.460
R2: 0.650
########## Fold: 3 ##########




MAE: 4.393
MSE: 85.635
R2: 0.654
########## Fold: 4 ##########




MAE: 4.371
MSE: 100.245
R2: 0.631
########## Fold: 5 ##########




MAE: 4.331
MSE: 85.238
R2: 0.635
##### Displaying Average of Obtained Metrics : #####
Average MAE: 4.387 +/- 0.045
Average MSE: 90.404 +/- 5.556
Average R2: 0.642 +/- 0.009




🏃 View run Decision Tree Cross-Validation at: http://localhost:5000/#/experiments/535394779431182411/runs/1d24d0a15eb6472c9ffd20a767aab1e6
🧪 View experiment at: http://localhost:5000/#/experiments/535394779431182411


## 9. Fine-Tuning
We use Optuna to perform hyperparameter optimization for the XGBoost model to improve its performance.

In [20]:
?XGBRegressor

[31mInit signature:[39m
XGBRegressor(
    *,
    objective: Union[str, xgboost.sklearn._SklObjWProto, Callable[[Any, Any], Tuple[numpy.ndarray, numpy.ndarray]], NoneType] = [33m'reg:squarederror'[39m,
    **kwargs: Any,
) -> [38;5;28;01mNone[39;00m
[31mDocstring:[39m     
Implementation of the scikit-learn API for XGBoost regression.
See :doc:`/python/sklearn_estimator` for more information.

Parameters
----------

    n_estimators : typing.Optional[int]
        Number of gradient boosted trees.  Equivalent to number of boosting
        rounds.

    max_depth :  typing.Optional[int]

        Maximum tree depth for base learners.

    max_leaves : typing.Optional[int]

        Maximum number of leaves; 0 indicates no limit.

    max_bin : typing.Optional[int]

        If using histogram-based algorithm, maximum number of bins per feature

    grow_policy : typing.Optional[str]

        Tree growing policy.

        - depthwise: Favors splitting at nodes closest to the node,
    

In [21]:
def fine_tuning(trial, k=5):
    # tuning
    learning_rate = trial.suggest_float('learning_rate', 1e-3, 1e-1, log=True)
    max_depth = trial.suggest_int('max_depth', 1, 10)
    subsample = trial.suggest_float('subsample', 0.5, 1, step=0.1)
    colsample_bytree = trial.suggest_float('colsample_bytree', 0.5, 1, step=0.1)
    min_child_weight = trial.suggest_int('min_child_weight', 1, 10)

    folds = KFold(n_splits=k, shuffle=True, random_state=42)

    absolute_errors = list()
    squared_errors = list()
    r2 = list()

    for k, (train_index, test_index) in enumerate(folds.split(X, y)):
        
        print("#"*10 + f" Fold: {k+1} " + "#"*10)
        
        X_train_internal, y_train_internal = X.iloc[train_index, :], y.iloc[train_index]
        X_test_internal, y_test_internal = X.iloc[test_index, :], y.iloc[test_index]

        encoder = CatBoostEncoder()
        
        cat_imputer = SimpleImputer(strategy='most_frequent')
        num_imputer = SimpleImputer(strategy='median')

        cat_pipeline = Pipeline([('encoder', encoder), ('imputer', cat_imputer)])
        num_pipeline = Pipeline([('imputer', num_imputer)])

        cat_cols = X_train_internal.select_dtypes(include=['object']).columns
        num_cols = X_train_internal.select_dtypes(exclude=['object']).columns

        X_train_internal[cat_cols] = cat_pipeline.fit_transform(X_train_internal[cat_cols], y_train_internal)
        X_train_internal[num_cols] = num_pipeline.fit_transform(X_train_internal[num_cols])

        X_test_internal[cat_cols] = cat_pipeline.transform(X_test_internal[cat_cols])
        X_test_internal[num_cols] = num_pipeline.transform(X_test_internal[num_cols])

        model_XGBoost.fit(X_train_internal, y_train_internal)
        y_pred = model_XGBoost.predict(X_test_internal)
        r2score = r2_score(y_test_internal, y_pred)
        mse = mean_squared_error(y_test_internal, y_pred)
        mae = mean_absolute_error(y_test_internal, y_pred)

        absolute_errors.append(mae)
        squared_errors.append(mse)
        r2.append(r2score)

    
    absolute_errors = np.array(absolute_errors)
    squared_errors = np.array(squared_errors)
    r2 = np.array(r2)

    avg_mae = np.mean(absolute_errors)
    avg_mse = np.mean(squared_errors)
    avg_r2 = np.mean(r2)

    std_mae = np.std(absolute_errors)
    std_mse = np.std(squared_errors)
    std_r2 = np.std(r2)

    return avg_mse

study = opt.create_study(direction='minimize')
study.optimize(fine_tuning, n_trials=20)

[I 2025-04-17 17:48:39,203] A new study created in memory with name: no-name-2766b846-7baa-4183-ab42-8eb032c77802


########## Fold: 1 ##########
########## Fold: 2 ##########
########## Fold: 3 ##########
########## Fold: 4 ##########
########## Fold: 5 ##########


[I 2025-04-17 17:49:26,556] Trial 0 finished with value: 100.96271617963595 and parameters: {'learning_rate': 0.0014385451992090926, 'max_depth': 6, 'subsample': 1.0, 'colsample_bytree': 0.8, 'min_child_weight': 1}. Best is trial 0 with value: 100.96271617963595.


########## Fold: 1 ##########
########## Fold: 2 ##########
########## Fold: 3 ##########
########## Fold: 4 ##########
########## Fold: 5 ##########


[I 2025-04-17 17:50:12,368] Trial 1 finished with value: 100.96271617963595 and parameters: {'learning_rate': 0.019413921538283543, 'max_depth': 3, 'subsample': 0.6, 'colsample_bytree': 0.9, 'min_child_weight': 1}. Best is trial 0 with value: 100.96271617963595.


########## Fold: 1 ##########
########## Fold: 2 ##########
########## Fold: 3 ##########
########## Fold: 4 ##########
########## Fold: 5 ##########


[I 2025-04-17 17:50:58,873] Trial 2 finished with value: 100.96271617963595 and parameters: {'learning_rate': 0.017738500853710506, 'max_depth': 6, 'subsample': 0.6, 'colsample_bytree': 0.9, 'min_child_weight': 5}. Best is trial 0 with value: 100.96271617963595.


########## Fold: 1 ##########
########## Fold: 2 ##########
########## Fold: 3 ##########
########## Fold: 4 ##########
########## Fold: 5 ##########


[I 2025-04-17 17:51:44,811] Trial 3 finished with value: 100.96271617963595 and parameters: {'learning_rate': 0.047404271450566984, 'max_depth': 2, 'subsample': 0.5, 'colsample_bytree': 0.6, 'min_child_weight': 7}. Best is trial 0 with value: 100.96271617963595.


########## Fold: 1 ##########
########## Fold: 2 ##########
########## Fold: 3 ##########
########## Fold: 4 ##########
########## Fold: 5 ##########


[I 2025-04-17 17:52:30,385] Trial 4 finished with value: 100.96271617963595 and parameters: {'learning_rate': 0.05920617625648391, 'max_depth': 9, 'subsample': 0.6, 'colsample_bytree': 0.8, 'min_child_weight': 9}. Best is trial 0 with value: 100.96271617963595.


########## Fold: 1 ##########
########## Fold: 2 ##########
########## Fold: 3 ##########


[W 2025-04-17 17:53:00,159] Trial 5 failed with parameters: {'learning_rate': 0.003384630250152554, 'max_depth': 8, 'subsample': 0.8, 'colsample_bytree': 1.0, 'min_child_weight': 6} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/opt/conda/envs/olist-freight/lib/python3.13/site-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
  File "/tmp/ipykernel_46436/2029287172.py", line 40, in fine_tuning
    y_pred = model_XGBoost.predict(X_test_internal)
  File "/opt/conda/envs/olist-freight/lib/python3.13/site-packages/xgboost/core.py", line 729, in inner_f
    return func(**kwargs)
  File "/opt/conda/envs/olist-freight/lib/python3.13/site-packages/xgboost/sklearn.py", line 1327, in predict
    predts = self.get_booster().inplace_predict(
        data=X,
    ...<4 lines>...
        validate_features=validate_features,
    )
  File "/opt/conda/envs/olist-freight/lib/python3.13/site-packages/xgboost/cor

KeyboardInterrupt: 

In [None]:
def fine_tuning(trial, k=5):
    # tuning
    learning_rate = trial.suggest_float('learning_rate', 1e-3, 1e-1, log=True)
    max_depth = trial.suggest_int('max_depth', 1, 10)
    subsample = trial.suggest_float('subsample', 0.5, 1, step=0.1)
    colsample_bytree = trial.suggest_float('colsample_bytree', 0.5, 1, step=0.1)
    min_child_weight = trial.suggest_int('min_child_weight', 1, 10)

    folds = KFold(n_splits=k, shuffle=True, random_state=42)

    absolute_errors = list()
    squared_errors = list()
    r2 = list()

    mlflow.xgboost.autolog()

    # Iniciar um run no MLflow
    with mlflow.start_run(run_name="Optuna Fine-Tuning"):
        
        # Registrar os hiperparâmetros no MLflow
        mlflow.log_param("learning_rate", learning_rate)
        mlflow.log_param("max_depth", max_depth)
        mlflow.log_param("subsample", subsample)
        mlflow.log_param("colsample_bytree", colsample_bytree)
        mlflow.log_param("min_child_weight", min_child_weight)
    
        for k, (train_index, test_index) in enumerate(folds.split(X, y)):
            
            print("#"*10 + f" Fold: {k+1} " + "#"*10)
            
            X_train_internal, y_train_internal = X.iloc[train_index, :], y.iloc[train_index]
            X_test_internal, y_test_internal = X.iloc[test_index, :], y.iloc[test_index]

            encoder = CatBoostEncoder()
            
            cat_imputer = SimpleImputer(strategy='most_frequent')
            num_imputer = SimpleImputer(strategy='median')

            cat_pipeline = Pipeline([('encoder', encoder), ('imputer', cat_imputer)])
            num_pipeline = Pipeline([('imputer', num_imputer)])

            cat_cols = X_train_internal.select_dtypes(include=['object']).columns
            num_cols = X_train_internal.select_dtypes(exclude=['object']).columns

            X_train_internal[cat_cols] = cat_pipeline.fit_transform(X_train_internal[cat_cols], y_train_internal)
            X_train_internal[num_cols] = num_pipeline.fit_transform(X_train_internal[num_cols])

            X_test_internal[cat_cols] = cat_pipeline.transform(X_test_internal[cat_cols])
            X_test_internal[num_cols] = num_pipeline.transform(X_test_internal[num_cols])

            model_XGBoost.fit(X_train_internal, y_train_internal)
            y_pred = model_XGBoost.predict(X_test_internal)
            
            r2score = r2_score(y_test_internal, y_pred)
            mse = mean_squared_error(y_test_internal, y_pred)
            mae = mean_absolute_error(y_test_internal, y_pred)

            absolute_errors.append(mae)
            squared_errors.append(mse)
            r2.append(r2score)
            
            # Registrar métricas do fold no MLflow
            mlflow.log_metric(f"Fold_{k + 1}_MAE", mae)
            mlflow.log_metric(f"Fold_{k + 1}_MSE", mse)
            mlflow.log_metric(f"Fold_{k + 1}_R2", r2score)

        
        absolute_errors = np.array(absolute_errors)
        squared_errors = np.array(squared_errors)
        r2 = np.array(r2)

        avg_mae = np.mean(absolute_errors)
        avg_mse = np.mean(squared_errors)
        avg_r2 = np.mean(r2)

        std_mae = np.std(absolute_errors)
        std_mse = np.std(squared_errors)
        std_r2 = np.std(r2)
        
        # Registrar métricas médias no MLflow
        mlflow.log_metric("Average_MAE", avg_mae)
        mlflow.log_metric("Average_MSE", avg_mse)
        mlflow.log_metric("Average_R2", avg_r2)

        mlflow.log_metric("Std_MAE", std_mae)
        mlflow.log_metric("Std_MSE", std_mse)
        mlflow.log_metric("Std_R2", std_r2)
        
        # Registrar o modelo no MLflow
        mlflow.xgboost.log_model(model_XGBoost, "XGBoost_Model")

    return avg_mse

study = opt.create_study(direction='minimize')
study.optimize(fine_tuning, n_trials=20)

[I 2025-04-17 18:57:36,786] A new study created in memory with name: no-name-fd47ce24-d8c8-4de4-ace3-c39eb2146a51


########## Fold: 1 ##########




In [None]:
params = {'learning_rate': 0.008335068262687652, 'max_depth': 5, 'subsample': 0.6, 'colsample_bytree': 0.9, 'min_child_weight': 4}

In [None]:
model_XGBoost = XGBRegressor(n_estimators=1000, n_jobs=-1, random_state=0, **params)

In [None]:
cross_validation_with_mlflow(X, y, model_XGBoost, k=5, model_name='XGBoost Fine-Tuned')

########## Fold: 1 ##########
MAE: 3.785
MSE: 64.887
R2: 0.736
########## Fold: 2 ##########
MAE: 3.925
MSE: 67.987
R2: 0.743
########## Fold: 3 ##########
MAE: 3.843
MSE: 62.634
R2: 0.747
########## Fold: 4 ##########
MAE: 3.821
MSE: 72.702
R2: 0.732
########## Fold: 5 ##########
MAE: 3.774
MSE: 55.732
R2: 0.761
##### Displaying Average of Obtained Metrics : #####
Average MAE: 3.829 +/- 0.054
Average MSE: 64.788 +/- 5.648
Average R2: 0.744 +/- 0.010


In [None]:
# def feature_engineering(X_train, X_test):

#     X_train['volume'] = X_train['product_length_cm'] * X_train['product_height_cm'] * X_train['product_width_cm']
#     X_test['volume'] = X_test['product_length_cm'] * X_test['product_height_cm'] * X_test['product_width_cm']

#     # other important features

#     return X_train, X_test

In [None]:
# def cross_validation(X, y, model, k):
#     folds = KFold(n_splits=k, shuffle=True, random_state=42)

#     absolute_errors = list()
#     squared_errors = list()
#     r2 = list()

#     for k, (train_index, test_index) in enumerate(folds.split(X, y)):
        
#         print("#"*10 + f" Fold: {k+1} " + "#"*10)
        
#         X_train_internal, y_train_internal = X.iloc[train_index, :], y.iloc[train_index]
#         X_test_internal, y_test_internal = X.iloc[test_index, :], y.iloc[test_index]

#         encoder = CatBoostEncoder()
        
#         cat_imputer = SimpleImputer(strategy='most_frequent')
#         num_imputer = SimpleImputer(strategy='median')

#         cat_pipeline = Pipeline([('encoder', encoder), ('imputer', cat_imputer)])
#         num_pipeline = Pipeline([('imputer', num_imputer)])

#         X_train_internal, X_test_internal = feature_engineering(X_train_internal, X_test_internal)

#         cat_cols = X_train_internal.select_dtypes(include=['object']).columns
#         num_cols = X_train_internal.select_dtypes(exclude=['object']).columns

#         X_train_internal[cat_cols] = cat_pipeline.fit_transform(X_train_internal[cat_cols], y_train_internal)
#         X_train_internal[num_cols] = num_pipeline.fit_transform(X_train_internal[num_cols])

#         X_test_internal[cat_cols] = cat_pipeline.transform(X_test_internal[cat_cols])
#         X_test_internal[num_cols] = num_pipeline.transform(X_test_internal[num_cols])

#         model.fit(X_train_internal, y_train_internal)
#         y_pred = model.predict(X_test_internal)
#         r2score = r2_score(y_test_internal, y_pred)
#         mse = mean_squared_error(y_test_internal, y_pred)
#         mae = mean_absolute_error(y_test_internal, y_pred)

#         absolute_errors.append(mae)
#         squared_errors.append(mse)
#         r2.append(r2score)

#         print(f'MAE: {mae:.3f}')
#         print(f'MSE: {mse:.3f}')
#         print(f'R2: {r2score:.3f}')
    
#     absolute_errors = np.array(absolute_errors)
#     squared_errors = np.array(squared_errors)
#     r2 = np.array(r2)

#     avg_mae = np.mean(absolute_errors)
#     avg_mse = np.mean(squared_errors)
#     avg_r2 = np.mean(r2)

#     std_mae = np.std(absolute_errors)
#     std_mse = np.std(squared_errors)
#     std_r2 = np.std(r2)

#     print("#"*5 + f" Displaying Average of Obtained Metrics : " + "#"*5)
#     print(f"Average MAE: {avg_mae:.3f} +/- {std_mae:.3f}")
#     print(f'Average MSE: {avg_mse:.3f} +/- {std_mse:.3f}')
#     print(f'Average R2: {avg_r2:.3f} +/- {std_r2:.3f}')

In [None]:
# cross_validation(X, y, model_LightGBM, k=5)

########## Fold: 1 ##########
MAE: 5.664
MSE: 120.227
R2: 0.511
########## Fold: 2 ##########
MAE: 5.827
MSE: 128.344
R2: 0.515
########## Fold: 3 ##########
MAE: 5.773
MSE: 119.646
R2: 0.517
########## Fold: 4 ##########
MAE: 5.715
MSE: 136.550
R2: 0.497
########## Fold: 5 ##########
MAE: 5.612
MSE: 108.576
R2: 0.535
##### Displaying Average of Obtained Metrics : #####
Average MAE: 5.718 +/- 0.076
Average MSE: 122.669 +/- 9.366
Average R2: 0.515 +/- 0.012
