In [1]:
import lightgbm as lgb
import numpy as np
import joblib
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, mean_poisson_deviance

In [2]:
ORDINAL_COLUMNS = [
    "item_id",
    "is_weekday",
    "is_weekend",
    "is_holiday",
    "price_category",
    "7dl_price_category",
    "event_name_1",
    "event_type_1",
    "event_name_2",
    "event_type_2",
    "7dl_event_name_1",
    "7dl_event_name_2",
    "7dl_event_type_1",
    "7dl_event_type_2",
    "snap_TX"
]

In [3]:
COLUMNS_TO_DROP = ["wm_yr_wk", "date", "weekday"]

In [4]:
n_splits = 5
tscv = TimeSeriesSplit(n_splits=n_splits)

In [5]:
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'num_leaves':  120,
    'learning_rate': 0.3,
    'feature_fraction': 0.9
}

In [6]:
sales_train = pd.read_pickle("../data/fulling_connected_feature_eng_train_data.pkl")
sales_test = pd.read_pickle("../data/fulling_connected_feature_eng_test_data.pkl")

In [7]:
sales_train = sales_train.drop(COLUMNS_TO_DROP, axis=1)
sales_test = sales_test.drop(COLUMNS_TO_DROP, axis=1)

In [8]:
sales_train[ORDINAL_COLUMNS] = sales_train[ORDINAL_COLUMNS].astype("category")
sales_test[ORDINAL_COLUMNS] = sales_test[ORDINAL_COLUMNS].astype("category")

In [9]:
outcome_col = "sales_amount"
predictor_cols = [col for col in sales_train.columns if col !=  outcome_col]
train_predictors = sales_train[predictor_cols]
train_outcome = sales_train[outcome_col]
test_predictors = sales_test[predictor_cols]
test_outcome = sales_test[outcome_col]

In [10]:
lgb_train = lgb.Dataset(train_predictors, label=train_outcome, categorical_feature=ORDINAL_COLUMNS, free_raw_data = False)

In [11]:
cv_results = lgb.cv(
    params,
    lgb_train,
    num_boost_round=1000,
    folds=tscv.split(train_predictors),
)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011201 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2622
[LightGBM] [Info] Number of data points in the train set: 262404, number of used features: 29
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.021651 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2622
[LightGBM] [Info] Number of data points in the train set: 524803, number of used features: 29
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.031260 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2622
[LightGBM] [Info] Number of data

In [12]:
best_num_boost_round = len(cv_results["valid rmse-mean"])
final_model = lgb.train(params, lgb_train, num_boost_round=best_num_boost_round)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.062431 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2622
[LightGBM] [Info] Number of data points in the train set: 1574399, number of used features: 29
[LightGBM] [Info] Start training from score 1.903873


In [13]:
train_predictions = final_model.predict(train_predictors)
mae_error = mean_absolute_error(train_outcome, train_predictions)
mse_error = mean_squared_error(train_outcome, train_predictions)
rmse_error = mean_squared_error(train_outcome, train_predictions, squared=False)
mape_error = mean_absolute_percentage_error(train_outcome, train_predictions)
print(f"MAE error was: {mae_error}")
print(f"MSE error was: {mse_error}")
print(f"RMSE error was: {rmse_error}")
print(f"MAPE error was: {mape_error}")

MAE error was: 0.8372347134344229
MSE error was: 2.563568364246365
RMSE error was: 1.6011147255104379
MAPE error was: 1202230778314987.0


In [14]:
test_predictions = np.round(final_model.predict(test_predictors))
mae_error = mean_absolute_error(test_outcome, test_predictions)
mse_error = mean_squared_error(test_outcome, test_predictions)
rmse_error = mean_squared_error(test_outcome, test_predictions, squared=False)
mape_error = mean_absolute_percentage_error(test_outcome, test_predictions)
print(f"MAE error was: {mae_error}")
print(f"MSE error was: {mse_error}")
print(f"RMSE error was: {rmse_error}")
print(f"MAPE error was: {mape_error}")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[col] = data[col].cat.set_categories(category)


MAE error was: 1.4074379448012497
MSE error was: 9.24557368512411
RMSE error was: 3.0406534963925287
MAPE error was: 1359835367438114.5


In [15]:
joblib.dump(final_model, 'lgbm_poisson_model.pkl')

['lgbm_poisson_model.pkl']

### Reloading Model to ensure saved correctly

In [16]:
model = joblib.load('lgbm_poisson_model.pkl')
test_predictions = model.predict(test_predictors)
mae_error = mean_absolute_error(test_outcome, test_predictions)
mse_error = mean_squared_error(test_outcome, test_predictions)
rmse_error = mean_squared_error(test_outcome, test_predictions, squared=False)
mape_error = mean_absolute_percentage_error(test_outcome, test_predictions)
print(f"MAE error was: {mae_error}")
print(f"MSE error was: {mse_error}")
print(f"RMSE error was: {rmse_error}")
print(f"MAPE error was: {mape_error}")

MAE error was: 1.4482807172600267
MSE error was: 9.156756556304211
RMSE error was: 3.026013310662101
MAPE error was: 1518903883475891.8


In [17]:
sales_test["Predictions"] = np.round(test_predictions)

In [18]:
cols = pd.Series(
    [
    "d_1914",
    "d_1915",
    "d_1916",
    "d_1917",
    "d_1918",
    "d_1919",
    "d_1920",
    "d_1921",
    "d_1922",
    "d_1923",
    "d_1924",
    "d_1925",
    "d_1926",
    "d_1927",
    "d_1928",
    "d_1929",
    "d_1930",
    "d_1931",
    "d_1932",
    "d_1933",
    "d_1934",
    "d_1935",
    "d_1936",
    "d_1937",
    "d_1938",
    "d_1939",
    "d_1940",
    "d_1941"
    ]
)

In [19]:
sales_test["date"] = sales_test.apply(lambda x: pd.to_datetime(f"{x.day}-{x.month}-{x.year}"), axis=1)

  sales_test["date"] = sales_test.apply(lambda x: pd.to_datetime(f"{x.day}-{x.month}-{x.year}"), axis=1)


In [20]:
sales_test_copy = sales_test[['date', 'item_id', 'Predictions']].copy()

In [21]:
sales_pivoted =  sales_test.pivot_table('Predictions', ['item_id'], 'date')

In [22]:
sales_pivoted.columns = cols

In [23]:
sales_pivoted.reset_index().to_csv("LightGBM_REGRESSION_PREDICTIONS.csv")