In [15]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import GridSearchCV
from math import sqrt
import xgboost as xgb
import lightgbm as lgb
import warnings
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeRegressor
warnings.filterwarnings('ignore')

In [16]:
# Load data function
def load_data(split=None):
    folder = f"../data/processed/{split}"
    X_train = pd.read_csv(f"{folder}/X_train.csv")
    X_test = pd.read_csv(f"{folder}/X_test.csv")
    y_train = pd.read_csv(f"{folder}/y_train.csv")
    y_test = pd.read_csv(f"{folder}/y_test.csv")

    if 'date' in X_train.columns:
        X_train.drop('date', axis=1, inplace=True)
        X_test.drop('date', axis=1, inplace=True)

    # if 'itemid' in X_train.columns:
    #     X_train.drop('itemid', axis=1, inplace=True)
    #     X_test.drop('itemid', axis=1, inplace=True)
        
    # X_train.drop('label_1_count', axis=1, inplace=True)
    # X_test.drop('label_1_count', axis=1, inplace=True)
    return X_train, X_test, y_train, y_test

In [17]:
# Models and parameters
models = {
    "Decision Tree Regressor": (DecisionTreeRegressor(), {
        'max_depth': [None, 10, 20, 30], 
        'min_samples_split': [2, 5, 10], 
        'min_samples_leaf': [1, 2, 4]
    })
}


In [18]:
# Highlight functions
def highlight_max(s):
    is_max = s == s.max()
    return ['background-color: green' if v else '' for v in is_max]

def highlight_min(s):
    is_min = s == s.min()
    return ['background-color: green' if v else '' for v in is_min]

def sMAPE(y_test, y_pred):
    return 100 * np.mean(np.abs(y_pred - y_test) / (1 + np.abs(y_pred)))

In [19]:
# Train model function
def trainModel(split=None, scaleX=False, scaleY=False):
    # Load data
    X_train, X_test, y_train, y_test = load_data(split)

    # Scaling
    if scaleX:
        scaler = MinMaxScaler()
        X_train = pd.DataFrame(scaler.fit_transform(X_train))
        X_test =  pd.DataFrame(scaler.transform(X_test))

    scaler_y = None
    if scaleY:
        scaler_y = MinMaxScaler()
        y_train =  pd.DataFrame(scaler_y.fit_transform(y_train.values.reshape(-1, 1)))

    print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
    print(y_test.describe())

    results = {}

    for name, (model, param_grid) in models.items():
        print(name, scaleX, scaleY)
        grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring='neg_mean_squared_error', cv=3)
        grid_search.fit(X_train, y_train.values.ravel())

        best_model = grid_search.best_estimator_
        y_pred = best_model.predict(X_test)

        if scaler_y:
            y_pred = scaler_y.inverse_transform(y_pred.reshape(1, -1)).flatten()

        y_test_np = y_test.values.ravel()
        y_pred_np = y_pred.ravel()

        mse = mean_squared_error(y_test_np, y_pred_np)
        rmse = sqrt(mse)
        r2 = r2_score(y_test_np, y_pred_np)
        mae = mean_absolute_error(y_test_np, y_pred_np)
        smape = sMAPE(y_test_np, y_pred_np)

        results[name] = {
            "Best Params": grid_search.best_params_,
            "MSE": mse,
            "RMSE": rmse,
            "R^2": r2,
            "MAE": mae,
            "sMAPE": smape
        }
    
    results_df = pd.DataFrame(results).T
    styled_df = results_df.style.apply(highlight_max, subset=['R^2']).apply(highlight_min, subset=['MSE', 'RMSE', 'MAE', 'sMAPE'])
    return styled_df, results_df



In [20]:
# Run training for different configurations
results_list = []

for data in ['all', 'all_normalize', 'importance', 'importance_normalize']:
    for scaleX in [False, True]:
        for scaleY in [False, True]:
            styled_df, results_df = trainModel(data, scaleX, scaleY)
            results_list.append((f"Data: {data}, Scale X: {scaleX}, Scale Y: {scaleY}", styled_df, results_df))



(73320, 43) (15275, 43) (73320, 1) (15275, 1)
       sales_predict_day
count       15275.000000
mean            4.527660
std             9.754365
min             0.000000
25%             0.000000
50%             1.000000
75%             4.000000
max            86.000000
Decision Tree Regressor False False
(73320, 43) (15275, 43) (73320, 1) (15275, 1)
       sales_predict_day
count       15275.000000
mean            4.527660
std             9.754365
min             0.000000
25%             0.000000
50%             1.000000
75%             4.000000
max            86.000000
Decision Tree Regressor False True
(73320, 43) (15275, 43) (73320, 1) (15275, 1)
       sales_predict_day
count       15275.000000
mean            4.527660
std             9.754365
min             0.000000
25%             0.000000
50%             1.000000
75%             4.000000
max            86.000000
Decision Tree Regressor True False
(73320, 43) (15275, 43) (73320, 1) (15275, 1)
       sales_predict_day
count     

In [21]:
# Combine all results into one DataFrame
combined_results = pd.concat([df for _, _, df in results_list], keys=[desc for desc, _, _ in results_list])
combined_results.to_csv('tree_keeplabel1.csv')
print("Combined results saved to 'combined_results.csv'")


Combined results saved to 'combined_results.csv'


In [22]:
# Display the styled results in a notebook or to save them
for desc, styled_df, results_df in results_list:
    print(desc)
    display(styled_df)


Data: all, Scale X: False, Scale Y: False


Unnamed: 0,Best Params,MSE,RMSE,R^2,MAE,sMAPE
Decision Tree Regressor,"{'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10}",48.566494,6.968967,0.489534,2.980311,42.174604


Data: all, Scale X: False, Scale Y: True


Unnamed: 0,Best Params,MSE,RMSE,R^2,MAE,sMAPE
Decision Tree Regressor,"{'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10}",48.65,6.974955,0.488656,2.976622,42.173681


Data: all, Scale X: True, Scale Y: False


Unnamed: 0,Best Params,MSE,RMSE,R^2,MAE,sMAPE
Decision Tree Regressor,"{'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10}",48.659048,6.975604,0.488561,2.980347,42.13892


Data: all, Scale X: True, Scale Y: True


Unnamed: 0,Best Params,MSE,RMSE,R^2,MAE,sMAPE
Decision Tree Regressor,"{'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10}",48.562511,6.968681,0.489575,2.978532,42.147676


Data: all_normalize, Scale X: False, Scale Y: False


Unnamed: 0,Best Params,MSE,RMSE,R^2,MAE,sMAPE
Decision Tree Regressor,"{'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 2}",50.131526,7.080362,0.473084,2.995934,42.330626


Data: all_normalize, Scale X: False, Scale Y: True


Unnamed: 0,Best Params,MSE,RMSE,R^2,MAE,sMAPE
Decision Tree Regressor,"{'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10}",48.811141,6.986497,0.486962,2.981065,42.141685


Data: all_normalize, Scale X: True, Scale Y: False


Unnamed: 0,Best Params,MSE,RMSE,R^2,MAE,sMAPE
Decision Tree Regressor,"{'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 2}",49.255456,7.018223,0.482292,2.979533,42.185857


Data: all_normalize, Scale X: True, Scale Y: True


Unnamed: 0,Best Params,MSE,RMSE,R^2,MAE,sMAPE
Decision Tree Regressor,"{'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10}",49.23261,7.016595,0.482532,2.991665,42.172832


Data: importance, Scale X: False, Scale Y: False


Unnamed: 0,Best Params,MSE,RMSE,R^2,MAE,sMAPE
Decision Tree Regressor,"{'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10}",32.244336,5.67841,0.66109,2.390857,40.430353


Data: importance, Scale X: False, Scale Y: True


Unnamed: 0,Best Params,MSE,RMSE,R^2,MAE,sMAPE
Decision Tree Regressor,"{'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10}",32.223046,5.676535,0.661314,2.390924,40.448876


Data: importance, Scale X: True, Scale Y: False


Unnamed: 0,Best Params,MSE,RMSE,R^2,MAE,sMAPE
Decision Tree Regressor,"{'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10}",32.16085,5.671054,0.661968,2.389053,40.446408


Data: importance, Scale X: True, Scale Y: True


Unnamed: 0,Best Params,MSE,RMSE,R^2,MAE,sMAPE
Decision Tree Regressor,"{'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10}",32.225019,5.676708,0.661293,2.391107,40.444292


Data: importance_normalize, Scale X: False, Scale Y: False


Unnamed: 0,Best Params,MSE,RMSE,R^2,MAE,sMAPE
Decision Tree Regressor,"{'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10}",31.36298,5.600266,0.670354,2.37066,40.203631


Data: importance_normalize, Scale X: False, Scale Y: True


Unnamed: 0,Best Params,MSE,RMSE,R^2,MAE,sMAPE
Decision Tree Regressor,"{'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10}",31.85959,5.64443,0.665134,2.376896,40.235717


Data: importance_normalize, Scale X: True, Scale Y: False


Unnamed: 0,Best Params,MSE,RMSE,R^2,MAE,sMAPE
Decision Tree Regressor,"{'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10}",31.609455,5.622229,0.667763,2.37185,40.215802


Data: importance_normalize, Scale X: True, Scale Y: True


Unnamed: 0,Best Params,MSE,RMSE,R^2,MAE,sMAPE
Decision Tree Regressor,"{'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10}",31.855972,5.644109,0.665172,2.377079,40.245834
