In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/kltn-final/processed/all/y_test.csv
/kaggle/input/kltn-final/processed/all/X_test.csv
/kaggle/input/kltn-final/processed/all/y_train.csv
/kaggle/input/kltn-final/processed/all/data_for_model.csv
/kaggle/input/kltn-final/processed/all/X_train.csv
/kaggle/input/kltn-final/processed/all_normalize/y_test.csv
/kaggle/input/kltn-final/processed/all_normalize/X_test.csv
/kaggle/input/kltn-final/processed/all_normalize/y_train.csv
/kaggle/input/kltn-final/processed/all_normalize/data_for_model.csv
/kaggle/input/kltn-final/processed/all_normalize/X_train.csv
/kaggle/input/kltn-final/processed/importance/y_test.csv
/kaggle/input/kltn-final/processed/importance/X_test.csv
/kaggle/input/kltn-final/processed/importance/y_train.csv
/kaggle/input/kltn-final/processed/importance/data_for_model.csv
/kaggle/input/kltn-final/processed/importance/X_train.csv
/kaggle/input/kltn-final/processed/importance_normalize/y_test.csv
/kaggle/input/kltn-final/processed/importance_normalize/X_test.csv
/

In [2]:
import pandas as pd
import numpy as np
import cudf
from cuml.linear_model import LinearRegression
from cuml.ensemble import RandomForestRegressor
from cuml.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from cuml.model_selection import GridSearchCV
from math import sqrt
import xgboost as xgb
import lightgbm as lgb
import warnings
from sklearn.preprocessing import MinMaxScaler
warnings.filterwarnings('ignore')


In [3]:
def load_data(split=None):
    folder = f"/kaggle/input/kltn-final/processed/{split}"
    X_train = pd.read_csv(f"{folder}/X_train.csv")
    X_test = pd.read_csv(f"{folder}/X_test.csv")
    y_train = pd.read_csv(f"{folder}/y_train.csv")
    y_test = pd.read_csv(f"{folder}/y_test.csv")

    if 'date' in X_train.columns:
        X_train.drop('date', axis=1, inplace=True)
        X_test.drop('date', axis=1, inplace=True)
    if 'itemid' in X_train.columns:
        X_train.drop('itemid', axis=1, inplace=True)
        X_test.drop('itemid', axis=1, inplace=True)

    return X_train, X_test, y_train, y_test


In [4]:
models = {
    'LightGBM': (lgb.LGBMRegressor(), {
        'n_estimators': [100, 200, 300], 
        'learning_rate': [0.01, 0.1, 0.2], 
        'num_leaves': [31, 63, 127]
    }),
    "Linear Regression": (LinearRegression(fit_intercept=True), {}),
    "Random Forest": (RandomForestRegressor(), {
        'n_estimators': [100, 200, 300], 
        'max_depth': [10, 20, 30], 
        'min_samples_split': [2, 5, 10]
    }),
    "XGBoost Regressor": (xgb.XGBRegressor(objective='reg:squarederror', tree_method='gpu_hist', device='cuda'), {
        'n_estimators': [100, 200, 300], 
        'max_depth': [3, 6, 9], 
        'learning_rate': [0.01, 0.1, 0.2]
    }),
    "K-Neighbors Regressor": (KNeighborsRegressor(), {
        'n_neighbors': [3, 5, 7, 9, 11, 75], 
        'weights': ['uniform', 'distance'], 
        'metric': ['euclidean', 'manhattan', 'minkowski'], 
        'algorithm': ['auto', 'ball_tree', 'kd_tree']
    })
}

In [5]:
def highlight_max(s):
    is_max = s == s.max()
    return ['background-color: green' if v else '' for v in is_max]

def highlight_min(s):
    is_min = s == s.min()
    return ['background-color: green' if v else '' for v in is_min]

def sMAPE(y_test, y_pred):
    return 100 * np.mean(np.abs(y_pred - y_test) / (1 + np.abs(y_pred)))


In [6]:
def trainModel(split=None, scaleX=False, scaleY=False):
    # Load data
    X_train, X_test, y_train, y_test = load_data(split)

    # Scaling
    if scaleX:
        scaler = MinMaxScaler()
        X_train = pd.DataFrame(scaler.fit_transform(X_train))
        X_test =  pd.DataFrame(scaler.transform(X_test))

    scaler_y = None
    if scaleY:
        scaler_y = MinMaxScaler()
        y_train =  pd.DataFrame(scaler_y.fit_transform(y_train.values.reshape(-1, 1)))

    print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
    print(y_test.describe())

    results = {}

    for name, (model, param_grid) in models.items():
        print(name, scaleX, scaleY)
        grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring='neg_mean_squared_error', cv=3)

        if name == 'LightGBM':
            X_train_cudf = X_train.astype('float32')
            y_train_cudf = y_train.astype('float32')
            X_test_cudf = X_test.astype('float32')
            y_test_cudf = y_test.astype('float32')
        else:
            X_train_cudf = cudf.DataFrame.from_pandas(X_train.astype('float32'))
            X_test_cudf = cudf.DataFrame.from_pandas(X_test.astype('float32'))
            y_train_cudf = y_train.values.ravel().astype('float32')
            y_test_cudf = y_test.values.ravel().astype('float32')
            
        grid_search.fit(X_train_cudf, y_train_cudf)

        best_model = grid_search.best_estimator_
        y_pred = best_model.predict(X_test_cudf)

        if hasattr(y_pred, 'to_numpy'):
            y_pred_np = y_pred.to_numpy()
        else:
            y_pred_np = np.array(y_pred)

        if scaler_y:
            y_pred = scaler_y.inverse_transform(y_pred_np.reshape(1, -1)).flatten()

        if hasattr(y_test_cudf, 'to_numpy'):
            y_test_np = y_test_cudf.to_numpy()
        else:
            y_test_np = np.array(y_test_cudf)

        mse = mean_squared_error(y_test_np, y_pred_np)
        rmse = sqrt(mse)
        r2 = r2_score(y_test_np, y_pred_np)
        mae = mean_absolute_error(y_test_np, y_pred_np)
        smape = sMAPE(y_test_np, y_pred_np)

        results[name] = {
            "Best Params": grid_search.best_params_,
            "MSE": mse,
            "RMSE": rmse,
            "R^2": r2,
            "MAE": mae,
            "sMAPE": smape
        }
    
    results_df = pd.DataFrame(results).T
    styled_df = results_df.style.apply(highlight_max, subset=['R^2']).apply(highlight_min, subset=['MSE', 'RMSE', 'MAE', 'sMAPE'])
    return styled_df, results_df


In [7]:
results_list = []

for data in ['all', 'all_normalize', 'importance', 'importance_normalize']:
    for scaleX in [False, True]:
        for scaleY in [False, True]:
            styled_df, results_df = trainModel(data, scaleX, scaleY)
            results_list.append((f"Data: {data}, Scale X: {scaleX}, Scale Y: {scaleY}", styled_df, results_df))


(73320, 43) (15275, 43) (73320, 1) (15275, 1)
       sales_predict_day
count       15275.000000
mean            4.527660
std             9.754365
min             0.000000
25%             0.000000
50%             1.000000
75%             4.000000
max            86.000000
LightGBM False False
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.033476 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6256
[LightGBM] [Info] Number of data points in the train set: 48880, number of used features: 43
[LightGBM] [Info] Start training from score 3.378417
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.029514 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6305
[LightGBM] [Info] Number of data points in the train set: 48880, number of used features: 43
[LightGBM] [Info] Start training from score 3.459493
[LightGBM] [Info] Auto-c

In [8]:
# To combine all results into one DataFrame
combined_results = pd.concat([df for _, _, df in results_list], keys=[desc for desc, _, _ in results_list])
combined_results.to_csv('combined_results.csv')
print("Combined results saved to 'combined_results.csv'")

Combined results saved to 'combined_results.csv'


In [9]:
# To display the styled results in a notebook or to save them
for desc, styled_df, results_df in results_list:
    print(desc)
    display(styled_df)


Data: all, Scale X: False, Scale Y: False


Unnamed: 0,Best Params,MSE,RMSE,R^2,MAE,sMAPE
LightGBM,"{'learning_rate': 0.1, 'n_estimators': 100, 'num_leaves': 31}",27.703377,5.2634,0.708819,2.439685,244.489418
Linear Regression,{},76.941719,8.771643,0.191291,4.378832,89.960217
Random Forest,"{'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 100}",28.91597,5.377357,0.696074,2.340047,39.435568
XGBoost Regressor,"{'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 100}",34.530357,5.876254,0.637063,2.73337,36.762157
K-Neighbors Regressor,"{'algorithm': 'auto', 'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'uniform'}",100.022141,10.001107,-0.0513,4.971675,141.112292


Data: all, Scale X: False, Scale Y: True


Unnamed: 0,Best Params,MSE,RMSE,R^2,MAE,sMAPE
LightGBM,"{'learning_rate': 0.1, 'n_estimators': 100, 'num_leaves': 31}",113.243008,10.64157,-0.19026,4.473409,430.630241
Linear Regression,{},114.94828,10.721394,-0.208184,4.515626,420.907021
Random Forest,"{'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 100}",113.77668,10.666615,-0.195869,4.485559,377.002716
XGBoost Regressor,"{'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 100}",113.042076,10.632125,-0.188148,4.468737,354.990554
K-Neighbors Regressor,"{'algorithm': 'auto', 'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'uniform'}",114.838364,10.716266,-0.207028,4.511597,417.22064


Data: all, Scale X: True, Scale Y: False


Unnamed: 0,Best Params,MSE,RMSE,R^2,MAE,sMAPE
LightGBM,"{'learning_rate': 0.1, 'n_estimators': 100, 'num_leaves': 31}",28.278002,5.317707,0.702779,2.453648,249.935168
Linear Regression,{},26.066845,5.10557,0.72602,2.267398,38.692513
Random Forest,"{'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 100}",28.91597,5.377357,0.696074,2.340047,39.435568
XGBoost Regressor,"{'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 100}",34.530357,5.876254,0.637063,2.73337,36.762157
K-Neighbors Regressor,"{'algorithm': 'auto', 'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'uniform'}",31.096512,5.576425,0.673155,2.387234,46.720797


Data: all, Scale X: True, Scale Y: True


Unnamed: 0,Best Params,MSE,RMSE,R^2,MAE,sMAPE
LightGBM,"{'learning_rate': 0.1, 'n_estimators': 100, 'num_leaves': 31}",113.226442,10.640791,-0.190086,4.472912,430.653542
Linear Regression,{},113.734444,10.664635,-0.195425,4.484908,375.483704
Random Forest,"{'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 100}",113.77668,10.666615,-0.195869,4.485559,377.002716
XGBoost Regressor,"{'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 100}",113.042076,10.632125,-0.188148,4.468737,354.990554
K-Neighbors Regressor,"{'algorithm': 'auto', 'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'uniform'}",113.882851,10.671591,-0.196985,4.48972,381.785059


Data: all_normalize, Scale X: False, Scale Y: False


Unnamed: 0,Best Params,MSE,RMSE,R^2,MAE,sMAPE
LightGBM,"{'learning_rate': 0.1, 'n_estimators': 100, 'num_leaves': 31}",28.025935,5.293953,0.705429,2.44839,246.129225
Linear Regression,{},31.728058,5.632766,0.666517,2.651118,50.003546
Random Forest,"{'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 100}",28.933826,5.379017,0.695886,2.342284,39.43302
XGBoost Regressor,"{'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 100}",32.085922,5.664444,0.662755,2.6424,36.117044
K-Neighbors Regressor,"{'algorithm': 'auto', 'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'uniform'}",32.296776,5.683025,0.660539,2.414424,46.533373


Data: all_normalize, Scale X: False, Scale Y: True


Unnamed: 0,Best Params,MSE,RMSE,R^2,MAE,sMAPE
LightGBM,"{'learning_rate': 0.1, 'n_estimators': 100, 'num_leaves': 31}",113.207041,10.63988,-0.189882,4.472881,430.600604
Linear Regression,{},113.86441,10.670727,-0.196791,4.489192,379.741359
Random Forest,"{'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 100}",113.776413,10.666603,-0.195866,4.485551,377.000976
XGBoost Regressor,"{'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 100}",113.145546,10.63699,-0.189236,4.470571,357.666183
K-Neighbors Regressor,"{'algorithm': 'auto', 'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'uniform'}",113.911713,10.672943,-0.197288,4.488983,381.913018


Data: all_normalize, Scale X: True, Scale Y: False


Unnamed: 0,Best Params,MSE,RMSE,R^2,MAE,sMAPE
LightGBM,"{'learning_rate': 0.1, 'n_estimators': 100, 'num_leaves': 31}",28.28471,5.318337,0.702709,2.432207,247.663295
Linear Regression,{},25.940691,5.0932,0.727346,2.258664,38.53828
Random Forest,"{'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 100}",28.933846,5.379019,0.695886,2.34228,39.433894
XGBoost Regressor,"{'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 100}",32.085922,5.664444,0.662755,2.6424,36.117044
K-Neighbors Regressor,"{'algorithm': 'auto', 'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'uniform'}",32.042118,5.660576,0.663216,2.415232,46.82751


Data: all_normalize, Scale X: True, Scale Y: True


Unnamed: 0,Best Params,MSE,RMSE,R^2,MAE,sMAPE
LightGBM,"{'learning_rate': 0.1, 'n_estimators': 100, 'num_leaves': 31}",113.193082,10.639224,-0.189735,4.47282,430.653224
Linear Regression,{},113.72496,10.664191,-0.195326,4.484673,375.17972
Random Forest,"{'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 100}",113.776413,10.666603,-0.195866,4.485551,377.000928
XGBoost Regressor,"{'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 100}",113.145546,10.63699,-0.189236,4.470571,357.666183
K-Neighbors Regressor,"{'algorithm': 'auto', 'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'uniform'}",113.877304,10.671331,-0.196927,4.489502,381.690407


Data: importance, Scale X: False, Scale Y: False


Unnamed: 0,Best Params,MSE,RMSE,R^2,MAE,sMAPE
LightGBM,"{'learning_rate': 0.1, 'n_estimators': 100, 'num_leaves': 31}",27.441993,5.238511,0.711566,2.254594,256.4294
Linear Regression,{},65.597008,8.099198,0.310531,3.807243,80.085105
Random Forest,"{'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 100}",27.328369,5.227654,0.712761,2.256646,38.26268
XGBoost Regressor,"{'learning_rate': 0.01, 'max_depth': 6, 'n_estimators': 300}",27.745369,5.267387,0.708378,2.300165,41.180873
K-Neighbors Regressor,"{'algorithm': 'auto', 'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'uniform'}",100.279541,10.013967,-0.054005,4.960262,142.735946


Data: importance, Scale X: False, Scale Y: True


Unnamed: 0,Best Params,MSE,RMSE,R^2,MAE,sMAPE
LightGBM,"{'learning_rate': 0.1, 'n_estimators': 100, 'num_leaves': 31}",113.764267,10.666033,-0.195739,4.485579,434.863103
Linear Regression,{},114.719353,10.710712,-0.205777,4.506675,411.01613
Random Forest,"{'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 100}",113.800423,10.667728,-0.196119,4.485981,377.403855
XGBoost Regressor,"{'learning_rate': 0.01, 'max_depth': 6, 'n_estimators': 300}",113.869667,10.670973,-0.196847,4.488097,379.689598
K-Neighbors Regressor,"{'algorithm': 'auto', 'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'uniform'}",114.850838,10.716848,-0.207159,4.511626,417.644739


Data: importance, Scale X: True, Scale Y: False


Unnamed: 0,Best Params,MSE,RMSE,R^2,MAE,sMAPE
LightGBM,"{'learning_rate': 0.1, 'n_estimators': 100, 'num_leaves': 31}",26.894289,5.18597,0.717323,2.239881,256.217631
Linear Regression,{},25.82678,5.082006,0.728543,2.230955,38.016519
Random Forest,"{'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 100}",27.328369,5.227654,0.712761,2.256646,38.26268
XGBoost Regressor,"{'learning_rate': 0.01, 'max_depth': 6, 'n_estimators': 300}",27.745369,5.267387,0.708378,2.300165,41.180873
K-Neighbors Regressor,"{'algorithm': 'auto', 'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'uniform'}",31.563404,5.618132,0.668247,2.438123,45.413816


Data: importance, Scale X: True, Scale Y: True


Unnamed: 0,Best Params,MSE,RMSE,R^2,MAE,sMAPE
LightGBM,"{'learning_rate': 0.1, 'n_estimators': 100, 'num_leaves': 31}",113.767751,10.666197,-0.195775,4.485733,434.89369
Linear Regression,{},113.695915,10.662829,-0.19502,4.484474,374.531388
Random Forest,"{'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 100}",113.800423,10.667728,-0.196119,4.485981,377.403855
XGBoost Regressor,"{'learning_rate': 0.01, 'max_depth': 6, 'n_estimators': 300}",113.869667,10.670973,-0.196847,4.488097,379.689598
K-Neighbors Regressor,"{'algorithm': 'auto', 'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'uniform'}",113.828651,10.669051,-0.196415,4.48728,379.410195


Data: importance_normalize, Scale X: False, Scale Y: False


Unnamed: 0,Best Params,MSE,RMSE,R^2,MAE,sMAPE
LightGBM,"{'learning_rate': 0.1, 'n_estimators': 100, 'num_leaves': 31}",27.489826,5.243074,0.711064,2.251027,256.29866
Linear Regression,{},27.081417,5.203981,0.715356,2.360151,43.080261
Random Forest,"{'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 100}",27.068531,5.202743,0.715492,2.24283,38.107073
XGBoost Regressor,"{'learning_rate': 0.01, 'max_depth': 6, 'n_estimators': 300}",28.075428,5.298625,0.704908,2.311099,41.121554
K-Neighbors Regressor,"{'algorithm': 'auto', 'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'uniform'}",31.889738,5.6471,0.664817,2.409689,45.335826


Data: importance_normalize, Scale X: False, Scale Y: True


Unnamed: 0,Best Params,MSE,RMSE,R^2,MAE,sMAPE
LightGBM,"{'learning_rate': 0.1, 'n_estimators': 100, 'num_leaves': 31}",113.794089,10.667431,-0.196052,4.48602,434.959762
Linear Regression,{},113.800591,10.667736,-0.196121,4.487869,378.00107
Random Forest,"{'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 100}",113.802933,10.667846,-0.196145,4.485953,377.411366
XGBoost Regressor,"{'learning_rate': 0.01, 'max_depth': 6, 'n_estimators': 300}",113.875038,10.671225,-0.196903,4.488061,379.805803
K-Neighbors Regressor,"{'algorithm': 'auto', 'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'uniform'}",113.893074,10.67207,-0.197093,4.488342,381.170154


Data: importance_normalize, Scale X: True, Scale Y: False


Unnamed: 0,Best Params,MSE,RMSE,R^2,MAE,sMAPE
LightGBM,"{'learning_rate': 0.1, 'n_estimators': 100, 'num_leaves': 31}",27.604443,5.253993,0.709859,2.258488,255.81027
Linear Regression,{},26.023556,5.101329,0.726475,2.255879,39.517286
Random Forest,"{'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 100}",27.068501,5.20274,0.715492,2.242835,38.10541
XGBoost Regressor,"{'learning_rate': 0.01, 'max_depth': 6, 'n_estimators': 300}",28.076588,5.298735,0.704896,2.311115,41.121751
K-Neighbors Regressor,"{'algorithm': 'auto', 'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'uniform'}",30.499727,5.522656,0.679427,2.382368,44.35862


Data: importance_normalize, Scale X: True, Scale Y: True


Unnamed: 0,Best Params,MSE,RMSE,R^2,MAE,sMAPE
LightGBM,"{'learning_rate': 0.1, 'n_estimators': 100, 'num_leaves': 31}",113.785253,10.667017,-0.195959,4.485851,434.913205
Linear Regression,{},113.720322,10.663973,-0.195277,4.485246,375.324368
Random Forest,"{'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 100}",113.802933,10.667846,-0.196145,4.485953,377.411342
XGBoost Regressor,"{'learning_rate': 0.01, 'max_depth': 6, 'n_estimators': 300}",113.875099,10.671228,-0.196904,4.488061,379.807377
K-Neighbors Regressor,"{'algorithm': 'auto', 'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'uniform'}",113.826775,10.668963,-0.196396,4.48746,379.347086
