Using LightGBM (lgb)

In [19]:
#Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns

In [20]:
#load dataset
data_path = 'data/preprocessed_data.csv'
print("Loading data from:", data_path)
preprocessed_data = pd.read_csv(data_path)
print(preprocessed_data)

Loading data from: data/preprocessed_data.csv
        Store  DayOfWeek        Date  Sales  Customers  Open  Promo  \
0           1          5  2015-07-31   5263        555     1      1   
1           2          5  2015-07-31   6064        625     1      1   
2           3          5  2015-07-31   8314        821     1      1   
3           4          5  2015-07-31  13995       1498     1      1   
4           5          5  2015-07-31   4822        559     1      1   
...       ...        ...         ...    ...        ...   ...    ...   
844333    682          2  2013-01-01   3375        566     1      0   
844334    733          2  2013-01-01  10765       2377     1      0   
844335    769          2  2013-01-01   5035       1248     1      0   
844336    948          2  2013-01-01   4491       1039     1      0   
844337   1097          2  2013-01-01   5961       1405     1      0   

        StateHoliday  SchoolHoliday  StoreType  Assortment  \
0                  0              1    

In [21]:
# Handle any other object columns if they exist
object_columns = preprocessed_data.select_dtypes(include=['object']).columns.tolist()
if object_columns:
    print(f"  Additional object columns found: {object_columns}")
    preprocessed_data = pd.get_dummies(
        preprocessed_data, 
        columns=object_columns, 
        drop_first=False
    )
    # Sanitize the column names
    preprocessed_data.columns = preprocessed_data.columns.str.replace('[^a-zA-Z0-9_]', '_', regex=True)

  Additional object columns found: ['Date', 'PromoInterval']


In [22]:
#Preprocess the data
#Define features (X) and target (y)
X = preprocessed_data.drop(["Sales", "Date", "PromoInterval"], axis=1, errors='ignore')
y = preprocessed_data["Sales"]
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=50)

In [23]:
#setup data for model
train_data = lgb.Dataset(X_train, label=y_train)
val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

In [24]:
#init hyperparameter
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'n_estimators': 500,
    'learning_rate': 1,
    'num_leaves': 50,
    'max_depth': 5,
    'random_state': 50,
    'device': 'gpu'
}

In [None]:
#Tranin LightGBM
lgb_model = lgb.train(
    params,
    train_data,
    valid_sets=[val_data],
    num_boost_round=10000,
    callbacks=[lgb.early_stopping(stopping_rounds=50), lgb.log_evaluation(100)]
)


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 2676
[LightGBM] [Info] Number of data points in the train set: 675470, number of used features: 921
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 3050 Laptop GPU, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 13 dense feature groups (10.31 MB) transferred to GPU in 0.013106 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 6961.526139
Training until validation scores don't improve for 50 rounds
[100]	valid_0's rmse: 630.475
[200]	valid_0's rmse: 541.342
[300]	valid_0's rmse: 497.903
[400]	valid_0's rmse: 478.958
[500]	valid_0's rmse: 469.617
Did not meet early stopping. Best iteration is:
[500]	valid_0's rmse: 469.617


In [26]:
#Define RMSPE function (Root Mean Square Percentage Error)
y_pred_lgb = lgb_model.predict(X_val)
rmse_lgb = np.sqrt(mean_squared_error(y_val, y_pred_lgb))
rmspe_lgb = np.sqrt(np.mean(((y_val - y_pred_lgb) / y_val) ** 2))
print(f"RMSE: {rmse_lgb:.2f}, RMSPE: {rmspe_lgb:.4f}")

RMSE: 469.62, RMSPE: 0.0838


In [27]:
#3 Feature importance
feature_importance = lgb_model.feature_importance()
features = X.columns

importance_df = pd.DataFrame({'Feature': features, 'Importance': feature_importance})
importance_df = importance_df.sort_values('Importance', ascending=False).head(3)
    
print("\n  Top 3 important features:")
for i, row in importance_df.iterrows():
    print(f"  {row['Feature']}: {row['Importance']:.6f}")


  Top 3 important features:
  Customers: 3208.000000
  CompetitionDistance: 1737.000000
  Store: 1388.000000


In [None]:
import pandas as pd

lgb_results = pd.DataFrame({
    'y_val': y_val,
    'y_pred_lgb': y_pred_lgb
})
lgb_results.to_csv('prediction/lgb_predictions.csv', index=False)