In [None]:
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.ensemble import GradientBoostingRegressor
import lightgbm as lgb
import joblib

In [69]:
from sklearn.model_selection._split import _BaseKFold, indexable, _num_samples
from sklearn.utils.validation import _deprecate_positional_args

# https://github.com/getgaurav2/scikit-learn/blob/d4a3af5cc9da3a76f0266932644b884c99724c57/sklearn/model_selection/_split.py#L2243
class GroupTimeSeriesSplit(_BaseKFold):
    """Time-series cross-validator that never splits a group across train/test.

    Groups are ordered by the position of their first sample in ``groups``.
    Each split tests on a contiguous run of groups and trains on every group
    that comes before that run, so the training set grows from split to split.

    Parameters
    ----------
    n_splits : int, default=5
        Number of splits to generate. ``n_splits + 1`` must not exceed the
        number of distinct groups.
    max_train_size : int, default=None
        If set, only the last ``max_train_size`` training indices of each
        split are kept.
    """

    @_deprecate_positional_args
    def __init__(self,
                 n_splits=5,
                 *,
                 max_train_size=None
                 ):
        super().__init__(n_splits, shuffle=False, random_state=None)
        self.max_train_size = max_train_size

    def split(self, X, y=None, groups=None):
        """Generate (train, test) index lists for each split.

        Parameters
        ----------
        X : array-like
            Data to split; only its length is used.
        y : array-like, default=None
            Unused by the splitting logic; length-checked together with X.
        groups : array-like
            Group label of every sample. Required.

        Yields
        ------
        train : list of int
            Sorted positional indices of the training samples.
        test : list of int
            Sorted positional indices of the test samples.

        Raises
        ------
        ValueError
            If ``groups`` is None, or if ``n_splits + 1`` exceeds the number
            of distinct groups.
        """
        if groups is None:
            raise ValueError(
                "The 'groups' parameter should not be None")
        X, y, groups = indexable(X, y, groups)
        # Use a plain ndarray so every lookup below is positional. A pandas
        # Series with a non-default index would otherwise be indexed by label.
        groups = np.asarray(groups)
        n_splits = self.n_splits
        n_folds = n_splits + 1

        unique_groups, first_seen, inverse = np.unique(
            groups, return_index=True, return_inverse=True)
        n_groups = _num_samples(unique_groups)
        if n_folds > n_groups:
            raise ValueError(
                ("Cannot have number of folds={0} greater than"
                 " the number of groups={1}").format(n_folds,
                                                     n_groups))

        # Rank every distinct group by first appearance, then map each sample
        # to the rank of its group. Membership tests become simple comparisons
        # instead of repeatedly concatenating per-group index lists.
        appearance_order = np.argsort(first_seen)
        group_rank = np.empty(n_groups, dtype=np.intp)
        group_rank[appearance_order] = np.arange(n_groups)
        sample_rank = group_rank[np.ravel(inverse)]

        group_test_size = n_groups // n_folds
        group_test_starts = range(n_groups - n_splits * group_test_size,
                                  n_groups, group_test_size)
        for group_test_start in group_test_starts:
            # flatnonzero returns ascending positions, so both arrays are
            # already sorted and free of duplicates.
            train_array = np.flatnonzero(sample_rank < group_test_start)
            train_end = train_array.size
            if self.max_train_size and self.max_train_size < train_end:
                train_array = train_array[train_end -
                                          self.max_train_size:train_end]
            test_array = np.flatnonzero(
                (sample_rank >= group_test_start)
                & (sample_rank < group_test_start + group_test_size))
            yield train_array.tolist(), test_array.tolist()

In [None]:
# Load the feature-engineered train and test tables from disk.
# NOTE(review): unpickling executes code embedded in the file, so only load
# pickles produced by this project.
sales_train, sales_test = map(
    pd.read_pickle,
    (
        "../data/fulling_connected_feature_eng_train_data.pkl",
        "../data/fulling_connected_feature_eng_test_data.pkl",
    ),
)

In [None]:
# Columns that the column transformer drops before the model sees the data.
COLUMNS_TO_DROP = [
    "wm_yr_wk",
    "date",
    "weekday",
]

In [None]:
# Target column; every other column of the training table is a predictor.
outcome_col = "sales_amount"
predictor_cols = sales_train.columns[sales_train.columns != outcome_col].tolist()

In [None]:
# Split the training table into the feature matrix and the target series.
train_predictors = sales_train.loc[:, predictor_cols]
train_outcome = sales_train.loc[:, outcome_col]

In [None]:
# Split the held-out table the same way, using the columns derived from the
# training table.
test_predictors = sales_test.loc[:, predictor_cols]
test_outcome = sales_test.loc[:, outcome_col]

In [None]:
# Columns routed to the categorical encoder of the column transformer.
# NOTE(review): despite the name, the transformer cell applies OneHotEncoder
# (not OrdinalEncoder) to these columns — confirm which encoding is intended.
# The "7dl_" prefix presumably marks 7-day-lagged variants — TODO confirm
# against the feature-engineering step.
ORDINAL_COLUMNS = [
    "item_id",
    "is_weekday",
    "is_weekend",
    "is_holiday",
    "price_category",
    "7dl_price_category",
    "event_name_1",
    "event_type_1",
    "event_name_2",
    "event_type_2",
    "7dl_event_name_1",
    "7dl_event_name_2",
    "7dl_event_type_1",
    "7dl_event_type_2",
    "snap_TX"
]

In [None]:
# LightGBM regressor created with library defaults; it becomes the "model"
# step of the pipeline, and the randomized search overrides its
# hyper-parameters through the "model__*" keys.
model = lgb.LGBMRegressor()

In [None]:
# Preprocessing: one-hot encode the categorical columns, drop the raw
# calendar columns, and pass every remaining column through untouched.
# handle_unknown="ignore" encodes a category that was absent during fit as an
# all-zero row instead of raising. Without it, a time-ordered CV fold or the
# test set containing a new item or event would fail in transform.
columns_transforms = make_column_transformer(
        (OneHotEncoder(handle_unknown="ignore"), ORDINAL_COLUMNS),
        ("drop", COLUMNS_TO_DROP),
        remainder='passthrough'
    )

In [None]:
# Two-step pipeline: column preprocessing followed by the regressor. The step
# names are the prefixes used by the hyper-parameter search space.
model_pipeline = Pipeline(
    steps=[
        ("column_transformation", columns_transforms),
        ("model", model),
    ],
)

In [None]:
# Candidate values sampled by the randomized search. The "model__" prefix
# routes each setting to the pipeline step named "model".
tuning_parameters = dict(
    model__max_depth=[20, 50, 100, 200],
    model__num_leaves=[20, 40, 100, 120],
    model__learning_rate=[0.01, 0.05, 0.1, 0.2, 0.3],
    model__n_estimators=[100, 500, 700, 1000],
    model__colsample_bytree=[0.3, 0.5, 0.7, 1],
)

In [None]:
# Randomized hyper-parameter search for the LightGBM pipeline.
# TimeSeriesSplit cuts folds by row position, so this assumes the training
# rows are in chronological order — TODO confirm against the data prep step.
cv_split = TimeSeriesSplit(n_splits=5)
grid_search = RandomizedSearchCV(
    model_pipeline, tuning_parameters,
    cv=cv_split,
    scoring=["neg_mean_squared_error", "neg_mean_absolute_error"],
    # With several scorers, refit names the one used to pick the best candidate.
    refit="neg_mean_absolute_error",
    n_jobs=5,
    # Fixed seed so the sampled candidates are the same on every run.
    random_state=42,
)
grid_search.fit(train_predictors, train_outcome)
lgbm_model = grid_search.best_estimator_
# Persist the refitted best pipeline so later cells can reload it.
joblib.dump(lgbm_model, 'lgbm_model.joblib')

In [None]:
# Second candidate model: scikit-learn's GradientBoostingRegressor behind the
# same preprocessing.
# NOTE(review): the "xgboost" variable and file names are historical; no
# XGBoost model is involved. They are kept so existing references still work.
# This cell also rebinds model_pipeline, tuning_parameters and cv_split.
model_pipeline = Pipeline(
        [
            ("column_transformation", columns_transforms),
            # Seeded so repeated fits of the same candidate give the same model.
            ("model", GradientBoostingRegressor(random_state=42)),
        ]
    )

tuning_parameters = {
    "model__max_depth": [20, 50, 100, 200],
    "model__learning_rate": [0.01, 0.05, 0.1, 0.2, 0.3],
    "model__n_estimators": [100, 500, 700, 1000],
}

# Folds are cut by row position; assumes chronologically ordered rows.
cv_split = TimeSeriesSplit(n_splits=5)

grid_search_xgboost = RandomizedSearchCV(
    model_pipeline, tuning_parameters,
    cv=cv_split,
    scoring=["neg_mean_squared_error", "neg_mean_absolute_error"],
    refit="neg_mean_absolute_error",
    n_jobs=5,
    # Fixed seed so the sampled candidates are the same on every run.
    random_state=42,
)
grid_search_xgboost.fit(train_predictors, train_outcome)
xgboost_model = grid_search_xgboost.best_estimator_
joblib.dump(xgboost_model, 'xgboost_model.joblib')

In [95]:
# Reload the persisted LightGBM pipeline from disk; this rebinds lgbm_model.
# NOTE(review): presumably done so evaluation can run without repeating the
# search — the file must already exist from a previous run of the search cell.
lgbm_model = joblib.load('lgbm_model.joblib')

In [96]:
# Score the fitted pipeline on the training data.
train_predictions = lgbm_model.predict(train_predictors)
mae_error = mean_absolute_error(train_outcome, train_predictions)
mse_error = mean_squared_error(train_outcome, train_predictions)
# RMSE is the square root of the MSE computed above. This avoids the
# `squared=False` argument, which newer scikit-learn releases deprecate and
# then remove, and it skips a second pass over the data.
rmse_error = np.sqrt(mse_error)
print(f"MAE error was: {mae_error}")
print(f"MSE error was: {mse_error}")
print(f"RMSE error was: {rmse_error}")

MAE error was: 1.4645787446348453
MSE error was: 11.542862484675576
RMSE error was: 3.3974788424176503


In [97]:
# Score the fitted pipeline on the held-out test data.
test_predictions = lgbm_model.predict(test_predictors)
mae_error = mean_absolute_error(test_outcome, test_predictions)
mse_error = mean_squared_error(test_outcome, test_predictions)
# RMSE is the square root of the MSE computed above. This avoids the
# `squared=False` argument, which newer scikit-learn releases deprecate and
# then remove, and it skips a second pass over the data.
rmse_error = np.sqrt(mse_error)
print(f"MAE error was: {mae_error}")
print(f"MSE error was: {mse_error}")
print(f"RMSE error was: {rmse_error}")

MAE error was: 1.7420153260332747
MSE error was: 11.03325752696739
RMSE error was: 3.321634767244495


In [None]:
# Preview only the first few predictions instead of echoing the whole array.
train_predictions[:10]

In [90]:
# Materialise only the first train/validation split for inspection.
# TimeSeriesSplit uses just the number of rows: it ignores any `groups`
# argument and the frame's index, so neither is passed here.
N_CV_SPLITS = 5
tscv = TimeSeriesSplit(
    n_splits=N_CV_SPLITS,
    # One fold's worth of rows when the data is cut into n_splits + 1 folds.
    test_size=train_predictors.shape[0] // (N_CV_SPLITS + 1),
)
train_index, test_index = next(tscv.split(train_predictors))
train_split_1 = train_predictors.iloc[train_index, :]
vals_split_1 = train_predictors.iloc[test_index, :]


In [93]:
# (rows, columns) of the validation part of the first split.
vals_split_1.shape

(262399, 31)

In [94]:
# (rows, columns) of the training part of the first split.
train_split_1.shape

(262404, 31)