In [16]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, KFold
from sklearn.inspection import permutation_importance

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.tree import DecisionTreeRegressor

from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

from category_encoders import CatBoostEncoder

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from warnings import filterwarnings
filterwarnings('ignore')

In [None]:
import mlflow

# Point the MLflow client at the tracking server running on localhost:5000.
mlflow.set_tracking_uri("http://localhost:5000/")
# MLflow documents experiment IDs as strings, so the ID is passed as a string
# instead of a bare integer literal.
mlflow.set_experiment(experiment_id="535394779431182411")

In [2]:
# Relative directory that holds the input files
data_dir = 'data'

# Read the prepared dataset from <data_dir>/df_final.csv
dataset_path = os.path.join(data_dir, 'df_final.csv')
df = pd.read_csv(dataset_path)

# Preview the first rows
df.head()

Unnamed: 0,price,freight_value,product_category_name,product_weight_g,product_length_cm,product_height_cm,product_width_cm,customer_city,customer_state,review_score,...,volume,density,actual_delivery_time,estimated_delivery_time,approval_order_time,distance,purchase_month,purchase_day_of_week,black_friday,christmas
0,29.99,8.72,utilidades_domesticas,500.0,19.0,8.0,13.0,sao paulo,SP,4,...,1976.0,0.253036,8.0,15,0.0,18.566632,10,0,0,0
1,118.7,22.76,perfumaria,400.0,19.0,13.0,19.0,barreiras,BA,4,...,4693.0,0.085233,13.0,19,1.0,847.437333,7,1,0,0
2,159.9,19.22,automotivo,420.0,24.0,19.0,21.0,vianopolis,GO,5,...,9576.0,0.04386,9.0,26,0.0,512.100044,8,2,0,0
3,45.0,27.2,pet_shop,450.0,30.0,10.0,20.0,sao goncalo do amarante,RN,5,...,6000.0,0.075,13.0,26,0.0,1816.085655,11,5,0,0
4,19.9,8.72,papelaria,250.0,51.0,15.0,15.0,santo andre,SP,5,...,11475.0,0.021786,2.0,12,0.0,29.684401,2,1,0,0


In [3]:
# Separate the regression target (freight_value) from the remaining columns
y = df['freight_value']
X = df.drop(columns=['freight_value'])

In [4]:
# Hold out 30% of the rows as a test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [5]:
# Initializing models
# The three boosted ensembles share 1000 estimators, depth 8, learning rate 1e-3 and seed 0.
model_XGBoost = XGBRegressor(n_estimators=1000, max_depth=8, learning_rate=1e-3, random_state=0)
# NOTE: `^` is bitwise XOR in Python, so `2^8` evaluates to 10. The leaf budget
# matching max_depth=8 is 2 ** 8 = 256, which is what is passed here.
model_LightGBM = LGBMRegressor(
    n_estimators=1000,
    max_depth=8,
    num_leaves=2 ** 8,
    learning_rate=1e-3,
    n_jobs=-1,
    verbose=-1,
    random_state=0,
)
model_Catboost = CatBoostRegressor(n_estimators=1000, max_depth=8, learning_rate=1e-3, random_state=0, verbose=0)
model_DecisionTree = DecisionTreeRegressor(random_state=0, max_depth=8, min_samples_split=2)

In [6]:
# Feature importance
# Encode the object-typed columns with CatBoostEncoder (fitted on the training
# split and its target), fit XGBoost, then compute permutation importance on
# the test split.

encoder = CatBoostEncoder()
X_train_encoded, X_test_encoded = X_train.copy(), X_test.copy()

object_columns = X_train_encoded.select_dtypes(include=['object']).columns
for column in object_columns:
    # The same encoder object is refit for every column, so the test column is
    # transformed immediately after its training counterpart.
    X_train_encoded[column] = encoder.fit_transform(X_train_encoded[column], y_train)
    X_test_encoded[column] = encoder.transform(X_test_encoded[column])

model_XGBoost.fit(X_train_encoded, y_train)
r = permutation_importance(
    model_XGBoost,
    X_test_encoded,
    y_test,
    n_repeats=30,
    random_state=0,
)

In [7]:
# Rank the features by mean permutation importance, highest first
importances = (
    pd.DataFrame({'Feature': X_test_encoded.columns, 'importance': r.importances_mean})
    .sort_values(by='importance', ascending=False)
)
importances

Unnamed: 0,Feature,importance
16,distance,0.211973
11,volume,0.198656
2,product_weight_g,0.178174
0,price,0.080163
7,customer_state,0.025644
3,product_length_cm,0.022553
9,seller_city,0.021714
10,seller_state,0.013614
5,product_width_cm,0.004467
12,density,0.003321


In [8]:
# Columns removed from the train/test splits (presumably chosen from the
# permutation-importance ranking computed earlier -- confirm).
less_important_columns = ['purchase_day_of_week', 'approval_order_time', 'estimated_delivery_time', 'christmas', 'black_friday']

# Validate the names against the untouched feature frame X so that a typo still
# fails loudly even though the drop below tolerates missing columns.
unknown_columns = sorted(set(less_important_columns) - set(X.columns))
assert not unknown_columns, f"Unknown columns: {unknown_columns}"

# errors='ignore' makes this cell idempotent: re-running it after the columns
# are already gone no longer raises a KeyError.
# NOTE(review): only X_train / X_test are reduced; X itself keeps every column,
# and the cross-validation cells further down are called with X.
X_train = X_train.drop(columns=less_important_columns, errors='ignore')
X_test = X_test.drop(columns=less_important_columns, errors='ignore')

In [10]:
# Feature importance
# Same procedure as the first importance pass, run on the current X_train /
# X_test: encode object columns, refit XGBoost, compute permutation importance.

encoder = CatBoostEncoder()
X_train_encoded, X_test_encoded = X_train.copy(), X_test.copy()

object_columns = X_train_encoded.select_dtypes(include=['object']).columns
for column in object_columns:
    # Fit the encoder on the training column (with the target), then apply it
    # to the matching test column before moving to the next one.
    X_train_encoded[column] = encoder.fit_transform(X_train_encoded[column], y_train)
    X_test_encoded[column] = encoder.transform(X_test_encoded[column])

model_XGBoost.fit(X_train_encoded, y_train)
r_v2 = permutation_importance(
    model_XGBoost,
    X_test_encoded,
    y_test,
    n_repeats=30,
    random_state=0,
)

In [11]:
# Rank the features of the second importance pass, highest first
importances_v2 = (
    pd.DataFrame({'Feature': X_test_encoded.columns, 'importance': r_v2.importances_mean})
    .sort_values(by='importance', ascending=False)
)
importances_v2

Unnamed: 0,Feature,importance
14,distance,0.211323
11,volume,0.20385
2,product_weight_g,0.182162
0,price,0.090672
7,customer_state,0.024785
3,product_length_cm,0.024383
9,seller_city,0.022433
10,seller_state,0.012966
5,product_width_cm,0.004509
6,customer_city,0.003811


In [None]:
# Manual 5-fold cross-validation of the XGBoost model on the full (X, y) data.
k = 5

folds = KFold(n_splits=k, shuffle=True, random_state=42)

# Per-fold metrics, aggregated in a later cell
absolute_errors = list()
squared_errors = list()
r2 = list()

# A dedicated fold counter is used so the fold count `k` is not overwritten
# by the loop variable.
for fold_number, (train_index, test_index) in enumerate(folds.split(X, y), start=1):

    print("#"*10 + f" Fold: {fold_number} " + "#"*10)

    # Explicit copies: the feature frames are assigned into below, and writing
    # into an un-copied slice of X triggers pandas' SettingWithCopyWarning.
    X_train_internal, y_train_internal = X.iloc[train_index, :].copy(), y.iloc[train_index]
    X_test_internal, y_test_internal = X.iloc[test_index, :].copy(), y.iloc[test_index]

    # Fresh preprocessing objects per fold so nothing fitted on one fold leaks
    # into the next.
    encoder = CatBoostEncoder()

    cat_imputer = SimpleImputer(strategy='most_frequent')
    num_imputer = SimpleImputer(strategy='median')

    cat_pipeline = Pipeline([('encoder', encoder), ('imputer', cat_imputer)])
    num_pipeline = Pipeline([('imputer', num_imputer)])

    cat_cols = X_train_internal.select_dtypes(include=['object']).columns
    num_cols = X_train_internal.select_dtypes(exclude=['object']).columns

    # Fit the preprocessing on the training part only ...
    X_train_internal[cat_cols] = cat_pipeline.fit_transform(X_train_internal[cat_cols], y_train_internal)
    X_train_internal[num_cols] = num_pipeline.fit_transform(X_train_internal[num_cols])

    # ... and apply the fitted transformers to the validation part.
    X_test_internal[cat_cols] = cat_pipeline.transform(X_test_internal[cat_cols])
    X_test_internal[num_cols] = num_pipeline.transform(X_test_internal[num_cols])

    model_XGBoost.fit(X_train_internal, y_train_internal)
    y_pred = model_XGBoost.predict(X_test_internal)
    r2score = r2_score(y_test_internal, y_pred)
    mse = mean_squared_error(y_test_internal, y_pred)
    mae = mean_absolute_error(y_test_internal, y_pred)

    absolute_errors.append(mae)
    squared_errors.append(mse)
    r2.append(r2score)

    print(f'MAE: {mae:.3f}')
    print(f'MSE: {mse:.3f}')
    print(f'R2: {r2score:.3f}')


########## Fold: 1 ##########
MAE: 5.043531
MSE: 98.518149
R2: 0.599536
########## Fold: 2 ##########
MAE: 5.187921
MSE: 106.659645
R2: 0.596607
########## Fold: 3 ##########
MAE: 5.146126
MSE: 97.921411
R2: 0.604912
########## Fold: 4 ##########
MAE: 5.078323
MSE: 112.953040
R2: 0.584078
########## Fold: 5 ##########
MAE: 4.989949
MSE: 88.761336
R2: 0.619984


In [20]:
# Turn the per-fold metric lists into arrays so they can be aggregated
absolute_errors = np.array(absolute_errors)
squared_errors = np.array(squared_errors)
r2 = np.array(r2)

# Mean and standard deviation (NumPy default, ddof=0) of each metric across folds
avg_mae, std_mae = absolute_errors.mean(), absolute_errors.std()
avg_mse, std_mse = squared_errors.mean(), squared_errors.std()
avg_r2, std_r2 = r2.mean(), r2.std()

banner = "#" * 5
print(banner + " Displaying Average of Obtained Metrics : " + banner)
print(f"Average MAE: {avg_mae:.3f} +/- {std_mae:.3f}")
print(f"Average MSE: {avg_mse:.3f} +/- {std_mse:.3f}")
print(f"Average R2: {avg_r2:.3f} +/- {std_r2:.3f}")

##### Displaying Average of Obtained Metrics : #####
Average MAE: 5.089 +/- 0.071
Average MSE: 100.963 +/- 8.250
Average R2: 0.601 +/- 0.012


In [21]:
def cross_validation(X, y, model, k):
    """Run shuffled k-fold cross-validation of ``model`` and print its metrics.

    For every fold the object-typed columns are passed through a
    ``CatBoostEncoder`` followed by a most-frequent imputer, and the remaining
    columns through a median imputer. Both pipelines are fitted on the
    training part of the fold only and then applied to the validation part.
    The model is refitted on each fold, and MAE, MSE and R2 are printed per
    fold, followed by their mean and standard deviation across folds.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame; it is sliced with ``.iloc`` and is not modified.
    y : pandas.Series
        Target values aligned with ``X`` (sliced with ``.iloc``).
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is refitted in
        place on every fold.
    k : int
        Number of folds passed to ``KFold(n_splits=k)``.

    Returns
    -------
    None
        Results are only printed.
    """
    folds = KFold(n_splits=k, shuffle=True, random_state=42)

    absolute_errors = list()
    squared_errors = list()
    r2 = list()

    # A dedicated fold counter is used so the `k` parameter is not shadowed
    # by the loop variable.
    for fold_number, (train_index, test_index) in enumerate(folds.split(X, y), start=1):

        print("#"*10 + f" Fold: {fold_number} " + "#"*10)

        # Explicit copies: the feature frames are assigned into below, and
        # writing into an un-copied slice of X triggers pandas'
        # SettingWithCopyWarning.
        X_train_internal, y_train_internal = X.iloc[train_index, :].copy(), y.iloc[train_index]
        X_test_internal, y_test_internal = X.iloc[test_index, :].copy(), y.iloc[test_index]

        # Fresh preprocessing objects per fold so no fitted state is reused.
        encoder = CatBoostEncoder()

        cat_imputer = SimpleImputer(strategy='most_frequent')
        num_imputer = SimpleImputer(strategy='median')

        cat_pipeline = Pipeline([('encoder', encoder), ('imputer', cat_imputer)])
        num_pipeline = Pipeline([('imputer', num_imputer)])

        cat_cols = X_train_internal.select_dtypes(include=['object']).columns
        num_cols = X_train_internal.select_dtypes(exclude=['object']).columns

        # Fit on the training part of the fold ...
        X_train_internal[cat_cols] = cat_pipeline.fit_transform(X_train_internal[cat_cols], y_train_internal)
        X_train_internal[num_cols] = num_pipeline.fit_transform(X_train_internal[num_cols])

        # ... and only transform the validation part.
        X_test_internal[cat_cols] = cat_pipeline.transform(X_test_internal[cat_cols])
        X_test_internal[num_cols] = num_pipeline.transform(X_test_internal[num_cols])

        model.fit(X_train_internal, y_train_internal)
        y_pred = model.predict(X_test_internal)
        r2score = r2_score(y_test_internal, y_pred)
        mse = mean_squared_error(y_test_internal, y_pred)
        mae = mean_absolute_error(y_test_internal, y_pred)

        absolute_errors.append(mae)
        squared_errors.append(mse)
        r2.append(r2score)

        print(f'MAE: {mae:.3f}')
        print(f'MSE: {mse:.3f}')
        print(f'R2: {r2score:.3f}')

    absolute_errors = np.array(absolute_errors)
    squared_errors = np.array(squared_errors)
    r2 = np.array(r2)

    avg_mae = np.mean(absolute_errors)
    avg_mse = np.mean(squared_errors)
    avg_r2 = np.mean(r2)

    # np.std default (ddof=0): spread of the metric across the k folds
    std_mae = np.std(absolute_errors)
    std_mse = np.std(squared_errors)
    std_r2 = np.std(r2)

    print("#"*5 + f" Displaying Average of Obtained Metrics : " + "#"*5)
    print(f"Average MAE: {avg_mae:.3f} +/- {std_mae:.3f}")
    print(f'Average MSE: {avg_mse:.3f} +/- {std_mse:.3f}')
    print(f'Average R2: {avg_r2:.3f} +/- {std_r2:.3f}')

### Modelo XGBoost

In [22]:
# XGBoost model: 5-fold cross-validation on the full feature frame
cross_validation(X=X, y=y, model=model_XGBoost, k=5)

########## Fold: 1 ##########
MAE: 5.044
MSE: 98.518
R2: 0.600
########## Fold: 2 ##########
MAE: 5.188
MSE: 106.660
R2: 0.597
########## Fold: 3 ##########
MAE: 5.146
MSE: 97.921
R2: 0.605
########## Fold: 4 ##########
MAE: 5.078
MSE: 112.953
R2: 0.584
########## Fold: 5 ##########
MAE: 4.990
MSE: 88.761
R2: 0.620
##### Displaying Average of Obtained Metrics : #####
Average MAE: 5.089 +/- 0.071
Average MSE: 100.963 +/- 8.250
Average R2: 0.601 +/- 0.012


### Modelo LightGBM

In [None]:
# LightGBM model: 5-fold cross-validation on the full feature frame
cross_validation(X=X, y=y, model=model_LightGBM, k=5)

In [None]:
# Decision Tree model
# Pass model_DecisionTree (not model_LightGBM) so this cell evaluates the
# decision tree it is labelled for.
cross_validation(X, y, model_DecisionTree, k=5)