# Freight Value Prediction Model
This notebook focuses on building and evaluating machine learning models to predict freight values using the Olist dataset. The steps include data preparation, feature engineering, model training, evaluation, and fine-tuning.

## 1. Importing Required Libraries
We start by importing the necessary libraries for data manipulation, visualization, and machine learning.

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, KFold, cross_validate
from sklearn.inspection import permutation_importance

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.tree import DecisionTreeRegressor

from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from category_encoders import CatBoostEncoder

from sklearn import metrics
from sklearn.metrics import make_scorer, mean_absolute_error, mean_squared_error, r2_score

import optuna as opt
import math
import scikitplot as skplt

from warnings import filterwarnings
filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


## 2. Loading the Dataset
We load the preprocessed dataset `df_final.csv` from the `data` directory. This dataset contains features and the target variable `freight_value`.

In [2]:
# Define the base directory for the data files
data_dir = 'data'

# Load the dataset
df = pd.read_csv(os.path.join(data_dir, 'df_final.csv'))

df.head()

Unnamed: 0,price,freight_value,product_category_name,product_weight_g,product_length_cm,product_height_cm,product_width_cm,customer_city,customer_state,review_score,...,volume,density,actual_delivery_time,estimated_delivery_time,approval_order_time,distance,purchase_month,purchase_day_of_week,black_friday,christmas
0,29.99,8.72,utilidades_domesticas,500.0,19.0,8.0,13.0,sao paulo,SP,4,...,1976.0,0.253036,8.0,15,0.0,18.566632,10,0,0,0
1,118.7,22.76,perfumaria,400.0,19.0,13.0,19.0,barreiras,BA,4,...,4693.0,0.085233,13.0,19,1.0,847.437333,7,1,0,0
2,159.9,19.22,automotivo,420.0,24.0,19.0,21.0,vianopolis,GO,5,...,9576.0,0.04386,9.0,26,0.0,512.100044,8,2,0,0
3,45.0,27.2,pet_shop,450.0,30.0,10.0,20.0,sao goncalo do amarante,RN,5,...,6000.0,0.075,13.0,26,0.0,1816.085655,11,5,0,0
4,19.9,8.72,papelaria,250.0,51.0,15.0,15.0,santo andre,SP,5,...,11475.0,0.021786,2.0,12,0.0,29.684401,2,1,0,0


## 3. Splitting Data into Features and Target
We separate the dataset into features (`X`) and the target variable (`y`), which is `freight_value`.

In [3]:
# Splitting in X and Y
X = df.drop(columns='freight_value', axis=1)
y = df.freight_value

## 4. Train-Test Split
We split the data into training and testing sets using an 80-20 split to evaluate the model's performance on unseen data.

In [4]:
# splitting in train and test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=42)

## 5. Initializing Models
We initialize multiple regression models, including XGBoost, LightGBM, CatBoost, and Decision Tree, with default or predefined hyperparameters.

In [5]:
# Initializing models
model_XGBoost = XGBRegressor(n_estimators = 1000, max_depth = 8, learning_rate = 1e-3, random_state = 0)
model_LightGBM = LGBMRegressor(n_estimators = 1000, max_depth = 8, num_leaves = 2^8, learning_rate = 1e-3, n_jobs = -1, verbose = -1, random_state = 0)
model_DecisionTree = DecisionTreeRegressor(random_state = 0, max_depth = 8, min_samples_split = 2)

## 6. Feature Encoding and Importance
We encode categorical features using CatBoostEncoder and calculate feature importance using permutation importance with the XGBoost model.

In [7]:
# Feature importance

encoder = CatBoostEncoder()
X_train_encoded = X_train.copy()
X_test_encoded = X_test.copy()

for col in X_train_encoded.select_dtypes(include=['object']).columns:
    X_train_encoded[col] = encoder.fit_transform(X_train_encoded[col], y_train)
    X_test_encoded[col] = encoder.transform(X_test_encoded[col])

model_XGBoost.fit(X_train_encoded, y_train)
r = permutation_importance(model_XGBoost, X_test_encoded, y_test, n_repeats=30, random_state=0, n_jobs=-1)

In [8]:
importances = pd.DataFrame({'Feature': X_test_encoded.columns, 'importance': r.importances_mean})
importances = importances.sort_values(by='importance', ascending=False)
importances

Unnamed: 0,Feature,importance
16,distance,0.211973
11,volume,0.198656
2,product_weight_g,0.178174
0,price,0.080163
7,customer_state,0.025644
3,product_length_cm,0.022553
9,seller_city,0.021714
10,seller_state,0.013614
5,product_width_cm,0.004467
12,density,0.003321


## 7. Feature Selection
Based on feature importance, we drop less important features to simplify the model and improve performance.

In [9]:
less_important_columns = ['purchase_day_of_week', 'approval_order_time', 'estimated_delivery_time', 'christmas', 'black_friday']

X_train = X_train.drop(columns=less_important_columns)
X_test = X_test.drop(columns=less_important_columns)

In [10]:
# Feature importance

encoder = CatBoostEncoder()
X_train_encoded = X_train.copy()
X_test_encoded = X_test.copy()

for col in X_train_encoded.select_dtypes(include=['object']).columns:
    X_train_encoded[col] = encoder.fit_transform(X_train_encoded[col], y_train)
    X_test_encoded[col] = encoder.transform(X_test_encoded[col])

model_XGBoost.fit(X_train_encoded, y_train)
r_v2 = permutation_importance(model_XGBoost, X_test_encoded, y_test, n_repeats=30, random_state=0)

In [11]:
importances_v2 = pd.DataFrame({'Feature': X_test_encoded.columns, 'importance': r_v2.importances_mean})
importances_v2 = importances_v2.sort_values(by='importance', ascending=False)
importances_v2

Unnamed: 0,Feature,importance
14,distance,0.211323
11,volume,0.20385
2,product_weight_g,0.182162
0,price,0.090672
7,customer_state,0.024785
3,product_length_cm,0.024383
9,seller_city,0.022433
10,seller_state,0.012966
5,product_width_cm,0.004509
6,customer_city,0.003811


## 8. Cross-Validation
We perform k-fold cross-validation to evaluate the model's performance across multiple splits of the data.

In [12]:
k=5
# 1) Identifique colunas
cat_cols = X.select_dtypes(include=['object']).columns.tolist()
num_cols = X.select_dtypes(exclude=['object']).columns.tolist()

# 2) Crie o pré-processador
preprocessor = ColumnTransformer([
    ("cat", Pipeline([
        ("encoder", CatBoostEncoder()),
        ("imputer", SimpleImputer(strategy="most_frequent"))
    ]), cat_cols),
    ("num", SimpleImputer(strategy="median"), num_cols)
])

# 3) Monte o pipeline completo
pipeline = Pipeline([
    ("preproc", preprocessor),
    ("model", model_XGBoost)
])

# 4) Defina o CV e as métricas
cv = KFold(n_splits=k, shuffle=True, random_state=42)
scoring = {
    "MAE": make_scorer(mean_absolute_error),
    "MSE": make_scorer(mean_squared_error),
    "R2" : make_scorer(r2_score)
}

# 5) Rode cross_validate em paralelo
scores = cross_validate(
    pipeline, X, y,
    cv=cv,
    scoring=scoring,
    n_jobs=-1,            # usa todos os núcleos disponíveis
    return_train_score=False
)

In [14]:
def cross_validation(X, y, model, k):
    cat_cols = X.select_dtypes(include=['object']).columns.tolist()
    num_cols = X.select_dtypes(exclude=['object']).columns.tolist()
    
    # Preprocessing
    preprocessor = ColumnTransformer([
        ('cat', Pipeline([
            ('encoder', CatBoostEncoder()),
            ('imputer', SimpleImputer(strategy='most_frequent'))
        ]), cat_cols),
        ('num', SimpleImputer(strategy='median'), num_cols)
    ])
    
    
    # Pipeline
    pipeline = Pipeline([
        ('preproc', preprocessor),
        ('model', model)
    ])
    
    # Metrics
    scoring = {
        'MAE': make_scorer(mean_absolute_error),
        'MSE': make_scorer(mean_squared_error),
        'R2': make_scorer(r2_score)        
    }
    
    # Cross-Validation
    cv = KFold(n_splits=k, shuffle=True, random_state=42)
    scores = cross_validate(
        pipeline, X, y,
        cv=cv,
        scoring=scoring,
        n_jobs=-1,
        return_train_score=False
    )
    
    # Impressão por fold
    for i in range(k):
        print('#' * 10 + f' Fold: {i+1} ' + '#' * 10)
        print(f"MAE: {scores['test_MAE'][i]:.3f}")
        print(f"MSE: {scores['test_MSE'][i]:.3f}")
        print(f"R2: {scores['test_R2'][i]:.3f}")
    
    # Estatísticas agregadas
    print("#" * 5 + " Displaying Average of Obtained Metrics : " + "#" * 5)
    print(f"Average MAE: {np.mean(scores['test_MAE']):.3f} +/- {np.std(scores['test_MAE']):.3f}")
    print(f"Average MSE: {np.mean(scores['test_MSE']):.3f} +/- {np.std(scores['test_MSE']):.3f}")
    print(f"Average R2:  {np.mean(scores['test_R2']):.3f} +/- {np.std(scores['test_R2']):.3f}")

### Modelo XGBoost

In [15]:
# Modelo XGBoost
cross_validation(X_train, y_train, model_XGBoost, k=5)

########## Fold: 1 ##########
MAE: 5.135
MSE: 105.310
R2: 0.606
########## Fold: 2 ##########
MAE: 5.134
MSE: 110.527
R2: 0.580
########## Fold: 3 ##########
MAE: 5.117
MSE: 99.207
R2: 0.612
########## Fold: 4 ##########
MAE: 5.110
MSE: 105.206
R2: 0.588
########## Fold: 5 ##########
MAE: 5.057
MSE: 88.463
R2: 0.601
##### Displaying Average of Obtained Metrics : #####
Average MAE: 5.110 +/- 0.029
Average MSE: 101.743 +/- 7.545
Average R2:  0.597 +/- 0.012


### Modelo LightGBM

In [16]:
# Modelo LightGBM
cross_validation(X_train, y_train, model_LightGBM, k=5)



########## Fold: 1 ##########
MAE: 5.746
MSE: 128.903
R2: 0.517
########## Fold: 2 ##########
MAE: 5.769
MSE: 136.637
R2: 0.480
########## Fold: 3 ##########
MAE: 5.757
MSE: 121.284
R2: 0.525
########## Fold: 4 ##########
MAE: 5.719
MSE: 122.145
R2: 0.522
########## Fold: 5 ##########
MAE: 5.628
MSE: 102.618
R2: 0.537
##### Displaying Average of Obtained Metrics : #####
Average MAE: 5.724 +/- 0.051
Average MSE: 122.317 +/- 11.292
Average R2:  0.516 +/- 0.019


### Modelo Decision Tree

In [17]:
# Modelo Decision Tree
cross_validation(X_train, y_train, model_DecisionTree, k=5)

########## Fold: 1 ##########
MAE: 4.435
MSE: 92.206
R2: 0.655
########## Fold: 2 ##########
MAE: 4.474
MSE: 102.957
R2: 0.609
########## Fold: 3 ##########
MAE: 4.405
MSE: 94.683
R2: 0.629
########## Fold: 4 ##########
MAE: 4.478
MSE: 99.613
R2: 0.610
########## Fold: 5 ##########
MAE: 4.439
MSE: 87.759
R2: 0.604
##### Displaying Average of Obtained Metrics : #####
Average MAE: 4.446 +/- 0.027
Average MSE: 95.444 +/- 5.365
Average R2:  0.621 +/- 0.019


## 9. Fine-Tuning
We use Optuna to perform hyperparameter optimization for the XGBoost model to improve its performance.

In [18]:
cat_cols = X_train.select_dtypes(include=['object']).columns.tolist()
num_cols = X_train.select_dtypes(exclude=['object']).columns.tolist()

preprocessor = ColumnTransformer([
    ('cat', Pipeline([
        ('encoder', CatBoostEncoder()),
        ('imputer', SimpleImputer(strategy='most_frequent'))
    ]), cat_cols),
    ('num', SimpleImputer(strategy='median'), num_cols)
])

In [19]:
def fine_tuning(trial):
    # defining params
    params = {
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 1e-1, log=True),
        'max_depth': trial.suggest_int('max_depth', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0, step=0.1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0, step=0.1),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10)
    }
    
    # build pipeline
    model = XGBRegressor(**params, random_state=42, verbosity=0)
    pipeline = Pipeline([
        ('preproc', preprocessor),
        ('model', model)
    ])
    
    cv = KFold(n_splits=5, shuffle=True, random_state=42)
    mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)
    
    results = cross_validate(
        pipeline, X_train, y_train,
        cv=cv,
        scoring={'MSE': mse_scorer},
        n_jobs=-1,
        return_train_score=False
    )
    
    mean_mse = - results['test_MSE'].mean()
    
    return mean_mse

In [20]:
study = opt.create_study(direction='minimize')
study.optimize(fine_tuning, n_trials=20)

[I 2025-05-09 00:25:07,481] A new study created in memory with name: no-name-c5424a58-ae73-4836-a4d9-14e357e76763
[I 2025-05-09 00:25:09,312] Trial 0 finished with value: 223.40376082504312 and parameters: {'learning_rate': 0.0013354558034301739, 'max_depth': 4, 'subsample': 0.5, 'colsample_bytree': 0.5, 'min_child_weight': 8}. Best is trial 0 with value: 223.40376082504312.
[I 2025-05-09 00:25:10,853] Trial 1 finished with value: 89.47262721616752 and parameters: {'learning_rate': 0.025568418948590264, 'max_depth': 3, 'subsample': 1.0, 'colsample_bytree': 0.8, 'min_child_weight': 5}. Best is trial 1 with value: 89.47262721616752.
[I 2025-05-09 00:25:12,276] Trial 2 finished with value: 188.10634412807852 and parameters: {'learning_rate': 0.00905691680493906, 'max_depth': 1, 'subsample': 0.5, 'colsample_bytree': 0.7, 'min_child_weight': 5}. Best is trial 1 with value: 89.47262721616752.
[I 2025-05-09 00:25:16,968] Trial 3 finished with value: 209.59255277157436 and parameters: {'learni

In [21]:
study.best_params

{'learning_rate': 0.09080343055629318,
 'max_depth': 6,
 'subsample': 0.7,
 'colsample_bytree': 1.0,
 'min_child_weight': 3}

In [22]:
best_XGBoost = XGBRegressor(n_estimators=1000, n_jobs=-1, random_state=0, **study.best_params)

cross_validation(X_train, y_train, best_XGBoost, k=5)

########## Fold: 1 ##########
MAE: 3.612
MSE: 62.080
R2: 0.768
########## Fold: 2 ##########
MAE: 3.619
MSE: 70.907
R2: 0.730
########## Fold: 3 ##########
MAE: 3.559
MSE: 59.698
R2: 0.766
########## Fold: 4 ##########
MAE: 3.637
MSE: 64.413
R2: 0.748
########## Fold: 5 ##########
MAE: 3.543
MSE: 54.606
R2: 0.754
##### Displaying Average of Obtained Metrics : #####
Average MAE: 3.594 +/- 0.036
Average MSE: 62.341 +/- 5.377
Average R2:  0.753 +/- 0.014


In [23]:
final_pipeline = Pipeline([
    ("preproc", preprocessor),
    ("model", best_XGBoost)
])

final_pipeline.fit(X_train, y_train)

In [25]:
y_test_predict = final_pipeline.predict(X_test)

test_mse = mean_squared_error(y_test, y_test_predict)
test_mae = mean_absolute_error(y_test, y_test_predict)

print(f'Test MSE: {test_mse}')
print(f'Test MAE: {test_mae}')

Test MSE: 59.392380752817814
Test MAE: 3.501378664906357


In [27]:
final_pipeline.feature_names_in_

array(['price', 'product_category_name', 'product_weight_g',
       'product_length_cm', 'product_height_cm', 'product_width_cm',
       'customer_city', 'customer_state', 'review_score', 'seller_city',
       'seller_state', 'volume', 'density', 'actual_delivery_time',
       'distance', 'purchase_month'], dtype=object)

In [28]:
model_s = pd.Series({
    'model': final_pipeline,
    'features': final_pipeline.feature_names_in_,
    'test_mse': test_mse,
    'test_mae': test_mae
})

model_s

model       (ColumnTransformer(transformers=[('cat',\n    ...
features    [price, product_category_name, product_weight_...
test_mse                                            59.392381
test_mae                                             3.501379
dtype: object

In [30]:
model_s.to_pickle('model/xgb.pkl')

## Old

In [None]:
# def cross_validation(X, y, model, k):
#     folds = KFold(n_splits=k, shuffle=True, random_state=42)

#     absolute_errors = list()
#     squared_errors = list()
#     r2 = list()

#     for k, (train_index, test_index) in enumerate(folds.split(X, y)):
        
#         print("#"*10 + f" Fold: {k+1} " + "#"*10)
        
#         X_train_internal, y_train_internal = X.iloc[train_index, :], y.iloc[train_index]
#         X_test_internal, y_test_internal = X.iloc[test_index, :], y.iloc[test_index]

#         encoder = CatBoostEncoder()
        
#         cat_imputer = SimpleImputer(strategy='most_frequent')
#         num_imputer = SimpleImputer(strategy='median')

#         cat_pipeline = Pipeline([('encoder', encoder), ('imputer', cat_imputer)])
#         num_pipeline = Pipeline([('imputer', num_imputer)])

#         cat_cols = X_train_internal.select_dtypes(include=['object']).columns
#         num_cols = X_train_internal.select_dtypes(exclude=['object']).columns

#         X_train_internal[cat_cols] = cat_pipeline.fit_transform(X_train_internal[cat_cols], y_train_internal)
#         X_train_internal[num_cols] = num_pipeline.fit_transform(X_train_internal[num_cols])

#         X_test_internal[cat_cols] = cat_pipeline.transform(X_test_internal[cat_cols])
#         X_test_internal[num_cols] = num_pipeline.transform(X_test_internal[num_cols])

#         model.fit(X_train_internal, y_train_internal)
#         y_pred = model.predict(X_test_internal)

#         r2score = r2_score(y_test_internal, y_pred)
#         mse = mean_squared_error(y_test_internal, y_pred)
#         mae = mean_absolute_error(y_test_internal, y_pred)

#         absolute_errors.append(mae)
#         squared_errors.append(mse)
#         r2.append(r2score)

#         print(f'MAE: {mae:.3f}')
#         print(f'MSE: {mse:.3f}')
#         print(f'R2: {r2score:.3f}')
    
#     absolute_errors = np.array(absolute_errors)
#     squared_errors = np.array(squared_errors)
#     r2 = np.array(r2)

#     avg_mae = np.mean(absolute_errors)
#     avg_mse = np.mean(squared_errors)
#     avg_r2 = np.mean(r2)

#     std_mae = np.std(absolute_errors)
#     std_mse = np.std(squared_errors)
#     std_r2 = np.std(r2)

#     print("#"*5 + f" Displaying Average of Obtained Metrics : " + "#"*5)
#     print(f"Average MAE: {avg_mae:.3f} +/- {std_mae:.3f}")
#     print(f'Average MSE: {avg_mse:.3f} +/- {std_mse:.3f}')
#     print(f'Average R2: {avg_r2:.3f} +/- {std_r2:.3f}')

In [None]:
# def fine_tuning(trial, k=5):
#     # tuning
#     learning_rate = trial.suggest_float('learning_rate', 1e-3, 1e-1, log=True)
#     max_depth = trial.suggest_int('max_depth', 1, 10)
#     subsample = trial.suggest_float('subsample', 0.5, 1, step=0.1)
#     colsample_bytree = trial.suggest_float('colsample_bytree', 0.5, 1, step=0.1)
#     min_child_weight = trial.suggest_int('min_child_weight', 1, 10)

#     folds = KFold(n_splits=k, shuffle=True, random_state=42)

#     absolute_errors = list()
#     squared_errors = list()
#     r2 = list()

#     for k, (train_index, test_index) in enumerate(folds.split(X, y)):
        
#         print("#"*10 + f" Fold: {k+1} " + "#"*10)
        
#         X_train_internal, y_train_internal = X.iloc[train_index, :], y.iloc[train_index]
#         X_test_internal, y_test_internal = X.iloc[test_index, :], y.iloc[test_index]

#         encoder = CatBoostEncoder()
        
#         cat_imputer = SimpleImputer(strategy='most_frequent')
#         num_imputer = SimpleImputer(strategy='median')

#         cat_pipeline = Pipeline([('encoder', encoder), ('imputer', cat_imputer)])
#         num_pipeline = Pipeline([('imputer', num_imputer)])

#         cat_cols = X_train_internal.select_dtypes(include=['object']).columns
#         num_cols = X_train_internal.select_dtypes(exclude=['object']).columns

#         X_train_internal[cat_cols] = cat_pipeline.fit_transform(X_train_internal[cat_cols], y_train_internal)
#         X_train_internal[num_cols] = num_pipeline.fit_transform(X_train_internal[num_cols])

#         X_test_internal[cat_cols] = cat_pipeline.transform(X_test_internal[cat_cols])
#         X_test_internal[num_cols] = num_pipeline.transform(X_test_internal[num_cols])

#         model_XGBoost.fit(X_train_internal, y_train_internal)
#         y_pred = model_XGBoost.predict(X_test_internal)
#         r2score = r2_score(y_test_internal, y_pred)
#         mse = mean_squared_error(y_test_internal, y_pred)
#         mae = mean_absolute_error(y_test_internal, y_pred)

#         absolute_errors.append(mae)
#         squared_errors.append(mse)
#         r2.append(r2score)

    
#     absolute_errors = np.array(absolute_errors)
#     squared_errors = np.array(squared_errors)
#     r2 = np.array(r2)

#     avg_mae = np.mean(absolute_errors)
#     avg_mse = np.mean(squared_errors)
#     avg_r2 = np.mean(r2)

#     std_mae = np.std(absolute_errors)
#     std_mse = np.std(squared_errors)
#     std_r2 = np.std(r2)

#     return avg_mse

# study = opt.create_study(direction='minimize')
# study.optimize(fine_tuning, n_trials=20)