In [257]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import optuna
from statsmodels.tsa.deterministic import CalendarFourier, DeterministicProcess

In [258]:
# df_train = pd.read_csv(r"./data/train.csv", index_col=0)
# df_test = pd.read_csv(r"./data/test.csv", index_col=0)
# df_transactions = pd.read_csv(r"./data/transactions.csv")
# df_stores = pd.read_csv(r"./data/stores.csv")
# df_holidays = pd.read_csv(r"./data/holidays_events.csv")
# df_oil = pd.read_csv(r"./data/oil.csv")

In [259]:
# df_oil['date'] = df_oil['date'].astype("datetime64")
# df_date = pd.DataFrame(pd.date_range("2013-01-01", "2017-08-31"), columns=["date"])
# df_oil = df_date.merge(df_oil, left_on='date', right_on='date', how='left').fillna(method='ffill').fillna(method='bfill')
# df = pd.concat([df_train, df_test], ignore_index=True)
# df = df.merge(df_stores, left_on="store_nbr", right_on="store_nbr", how="left")
# df_holidays = df_holidays.rename(columns={"type": "Holiday_type"})
# df = df.merge(df_holidays, left_on="date", right_on="date", how="left")
# df = df.merge(df_transactions, left_on=["date", "store_nbr"], right_on=["date", "store_nbr"], how="left")
# df["date"] = df["date"].astype("datetime64")
# df = df.merge(df_oil, left_on="date", right_on="date", how="left")
# df = df.replace(",", "_", regex=True)
# df = df.rename(columns={"date": "Date"})
# df.to_parquet('./data/merged_sales.parquet')

In [260]:
# Load the pre-merged sales table from parquet.
# NOTE(review): presumably the file written by the commented-out preparation
# cell above (train + test rows joined with stores, holidays, transactions and
# oil) - confirm the file on disk matches that recipe.
df = pd.read_parquet(r'./data/merged_sales.parquet')

In [261]:
# Row offsets passed to lag_features(); each produces one `sales_lag_<n>` column.
# NOTE(review): the smallest lag is 16, presumably so that every lag stays
# available across the whole forecast window (test rows are those dated after
# 2017-08-15) - confirm.
LAGS = [16,17,18,19,20,21,22,30,31,90,180,365]

In [262]:
def lag_features(dataframe, lags, noise_scale=2.0, seed=None):
    """Add noisy lagged-sales columns to ``dataframe``, one per entry in ``lags``.

    For every ``lag`` a column named ``sales_lag_<lag>`` is written. It holds
    the ``sales`` value found ``lag`` rows earlier within the same
    ``(store_nbr, family)`` group, plus zero-mean Gaussian noise. Rows that
    have no value ``lag`` rows earlier in their group get NaN.

    The shift is positional (row-based), not calendar-based.
    NOTE(review): this only equals "sales ``lag`` days ago" if each group is
    sorted by date with exactly one row per day - confirm for the merged
    table, since a left-join on holidays can duplicate dates.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain ``store_nbr``, ``family`` and ``sales`` columns.
        It is modified in place.
    lags : iterable of int
        Row offsets to shift by.
    noise_scale : float, default 2.0
        Standard deviation of the Gaussian noise added to every lag value.
        Use ``0.0`` to obtain the exact lagged values.
    seed : int or None, default None
        Seed for a dedicated random generator, making the noise reproducible.
        With ``None`` the global NumPy random state is used, as before.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with the new columns added.
    """
    # A dedicated generator keeps the noise reproducible without touching the
    # global NumPy state; without a seed fall back to the legacy global RNG.
    rng = np.random if seed is None else np.random.default_rng(seed)

    # Build the grouping once; GroupBy.shift is the vectorised equivalent of
    # transform(lambda x: x.shift(lag)) and keeps the original row index.
    sales_by_series = dataframe.groupby(["store_nbr", "family"])["sales"]
    n_rows = len(dataframe)

    for lag in lags:
        noise = rng.normal(scale=noise_scale, size=n_rows)
        dataframe["sales_lag_" + str(lag)] = sales_by_series.shift(lag) + noise
    return dataframe

In [263]:
# Add one `sales_lag_<n>` column per entry in LAGS. lag_features() writes the
# columns onto the frame it receives and returns that same frame.
df = lag_features(df, LAGS)

In [264]:
# Replace missing entries with np.nan.
# NOTE(review): presumably meant to turn `None` in object columns into NaN
# before the dtype conversion below - confirm it is still needed.
df = df.fillna(value=np.nan)

In [265]:
# Partition the columns by their current dtype: object columns are treated as
# categoricals, 64-bit integer/float columns as plain numerics.
categorical = [name for name, dtype in df.dtypes.items() if dtype == 'object']
numerical = [name for name, dtype in df.dtypes.items()
             if dtype == 'int64' or dtype == 'float64']

# Object columns become pandas categoricals.
for col in categorical:
    df[col] = df[col].astype('category')

# Numeric columns: missing values are replaced by 0, then the column is
# downcast to float32. This also applies to `sales` and the lag columns.
for col in numerical:
    df[col] = df[col].fillna(0).astype('float32')

In [266]:
# Show the last five rows of the prepared frame, including the new lag columns.
df.tail(5)

Unnamed: 0,Date,store_nbr,family,sales,onpromotion,city,state,type,cluster,Holiday_type,...,sales_lag_18,sales_lag_19,sales_lag_20,sales_lag_21,sales_lag_22,sales_lag_30,sales_lag_31,sales_lag_90,sales_lag_180,sales_lag_365
3082855,2017-08-31,9.0,POULTRY,0.0,1.0,Quito,Pichincha,B,6.0,,...,412.621979,383.601837,526.204956,291.465424,333.740814,571.590759,467.541138,549.392456,285.779358,412.798706
3082856,2017-08-31,9.0,PREPARED FOODS,0.0,0.0,Quito,Pichincha,B,6.0,,...,101.605942,130.403656,114.118416,113.447815,122.472748,49.822735,62.255978,76.171776,95.72908,126.463234
3082857,2017-08-31,9.0,PRODUCE,0.0,1.0,Quito,Pichincha,B,6.0,,...,1693.480469,1423.101807,1453.528442,1035.672485,1312.949585,2472.445557,1517.024902,1684.491333,1149.684937,1352.996094
3082858,2017-08-31,9.0,SCHOOL AND OFFICE SUPPLIES,0.0,9.0,Quito,Pichincha,B,6.0,,...,202.265106,138.830063,142.898117,147.715851,84.997009,202.249176,52.959568,5.911297,7.716478,86.596542
3082859,2017-08-31,9.0,SEAFOOD,0.0,0.0,Quito,Pichincha,B,6.0,,...,20.364407,16.720655,21.403603,12.425954,16.506483,21.945946,13.819348,28.511127,-2.854146,21.447567


In [267]:
# Chronological split on `Date`:
#   train      : 2015-01-01 up to (not including) 2017-08-01
#   validation : 2017-08-01 through 2017-08-15
#   test       : everything after 2017-08-15
train = df.loc[(df['Date'] >= '2015-01-01') & (df['Date'] < '2017-08-01')]
validation = df.loc[(df['Date'] >= '2017-08-01') & (df['Date'] <= '2017-08-15')]
test = df.loc[df['Date'] > '2017-08-15']

# `Date` is removed from df only after the split; the three subsets keep it.
df = df.drop(columns=['Date'])

In [268]:
# Column the model predicts.
TARGET = 'sales'
# Every other column still present in df (so `Date` is excluded once it has
# been dropped), in the order Index.difference returns them.
FEATURES = df.columns.difference([TARGET]).tolist()

In [269]:
# Feature matrix / target vector pairs for each of the three splits.
X_train, y_train = train[FEATURES], train[TARGET]
X_validation, y_validation = validation[FEATURES], validation[TARGET]
X_test, y_test = test[FEATURES], test[TARGET]

In [270]:
def _as_dmatrix(frame):
    """Wrap the FEATURES / TARGET columns of ``frame`` in an xgboost DMatrix."""
    return xgb.DMatrix(data=frame[FEATURES], label=frame[TARGET], enable_categorical=True)


# One DMatrix per split, for the native xgboost API.
d_train = _as_dmatrix(train)
d_validation = _as_dmatrix(validation)
d_test = _as_dmatrix(test)

In [271]:
# scikit-learn style regressor: squared-error objective, MAE as the evaluation
# metric, early stopping after 25 rounds. Its fit() call in the next cell is
# commented out, and `reg` is rebound to a natively trained booster further down.
reg = xgb.XGBRegressor(
    objective='reg:squarederror',
    eval_metric='mae',
    early_stopping_rounds=25,
)

In [272]:
# reg.fit(X_train, y_train,
#         eval_set=[(X_train, y_train), (X_validation, y_validation)],
#         verbose=False);

In [273]:
def objective(n_trials):
    params = {
        'verbosity': 0,
        'objective': 'reg:squarederror',
        'max_depth' : n_trials.suggest_int('max_depth', 1, 18),
        'n_estimators' : n_trials.suggest_int('n_estimators', 10, 1000, step=10),
        'learning_rate': n_trials.suggest_float('learning_rate', 0.005, 0.5),
        'colsample_bytree': n_trials.suggest_float('colsample_bytree', 0.1, 0.6),
        'min_child_weight' : n_trials.suggest_int('min_child_weight', 0, 10),
        'subsample': n_trials.suggest_float('subsample', 0.1, 1),
        'reg_alpha' : n_trials.suggest_int('reg_alpha', 40, 180),
        'reg_lambda' : n_trials.suggest_int('reg_lambda', 0, 1),
        'gamma': n_trials.suggest_float('lambda', 1, 10),
        'n_iter_no_change' : 20
    }
    d_train = xgb.DMatrix(data = train[FEATURES], label = train[TARGET], enable_categorical=True)
    d_val = xgb.DMatrix(data = validation[FEATURES], label = validation[TARGET], enable_categorical=True)
    reg = xgb.train(params, d_train)
    y_pred = reg.predict(d_val)
    rmse = mean_squared_error(y_validation, y_pred, squared=False)
    return rmse

In [274]:
NUM_TRIALS = 20

In [275]:
optuna.logging.set_verbosity(optuna.logging.WARNING)
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=NUM_TRIALS)

In [279]:
# reg.set_params(**study.best_params)
# reg.fit(X_train, y_train,
#         eval_set=[(X_train, y_train), (X_validation, y_validation)],
#         verbose=False);
reg = xgb.train(study.best_params, d_train)
preds = reg.predict(d_validation).tolist()

In [280]:
# Display the validation predictions.
# NOTE(review): this renders the entire list; consider showing a slice such as
# preds[:10] to keep the notebook output readable.
preds

[4.630495071411133,
 1.6553395986557007,
 2.6352293491363525,
 2105.058837890625,
 2.6913437843322754,
 286.4288635253906,
 21.260726928710938,
 720.7799072265625,
 658.9365844726562,
 105.5392074584961,
 97.23818969726562,
 103.30287170410156,
 2761.746337890625,
 16.06580924987793,
 3.261514186859131,
 35.046016693115234,
 35.32766342163086,
 2.6913437843322754,
 201.7512969970703,
 9.567584037780762,
 22.36026382446289,
 4.062515735626221,
 76.98600769042969,
 6.236669540405273,
 218.37693786621094,
 117.73334503173828,
 6.236669540405273,
 8.540648460388184,
 298.6396789550781,
 56.494873046875,
 2173.418701171875,
 1.1766709089279175,
 20.099626541137695,
 0.07008248567581177,
 0.07008248567581177,
 1.441254734992981,
 1330.3846435546875,
 1.1060863733291626,
 113.63734436035156,
 2.326282024383545,
 639.906982421875,
 301.7595520019531,
 248.23806762695312,
 37.33476638793945,
 34.67513656616211,
 2820.04296875,
 2.4772586822509766,
 1.1060863733291626,
 8.552496910095215,
 6.755