In [6]:
import click
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from time import time
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae

from typing import Tuple

In [3]:
def rmspe(y_pr, y_valid):
    """Root mean squared percentage error (RMSPE) of predictions.

    Computes ``sqrt(mean(((y_pr - y_valid) / y_valid) ** 2))``.

    Args:
        y_pr: Predicted values (array-like).
        y_valid: True target values (array-like of the same length). The
            two inputs are compared position by position.

    Returns:
        The RMSPE as a float. Rows whose true value is 0 are left out because
        the relative error is undefined there; if no non-zero target remains
        the result is ``nan``.
    """
    predicted = np.asarray(y_pr, dtype=float)
    actual = np.asarray(y_valid, dtype=float)

    # A zero target would divide by zero and turn the whole metric into inf/nan.
    nonzero = actual != 0
    if not nonzero.any():
        return float("nan")

    relative_error = (predicted[nonzero] - actual[nonzero]) / actual[nonzero]
    # The square root is what makes this a *root* mean squared percentage error.
    return float(np.sqrt(np.mean(relative_error ** 2)))

In [14]:
def read_data(path: str) -> pd.core.frame.DataFrame:
    """Load the CSV at ``path`` and report its shape and column dtypes.

    The ``Unnamed: 0`` column becomes the index and the ``Date`` column is
    converted to datetime before the frame is returned.
    """
    frame = pd.read_csv(path, index_col='Unnamed: 0', low_memory=False).assign(
        Date=lambda raw: pd.to_datetime(raw["Date"])
    )
    click.echo(f"Dataset size: {frame.shape}")
    print(frame.dtypes)
    return frame

In [41]:
def train_valid_split_ts(
    dataset: pd.core.frame.DataFrame,
    target: str,
    valid_weeks: int,
    n_stores: int = 1115,
) -> Tuple[
    pd.core.frame.DataFrame,
    pd.core.frame.DataFrame,
    pd.core.series.Series,
    pd.core.series.Series,
]:
    """Split a frame into train/validation parts by holding out its last rows.

    The last ``n_stores * 7 * valid_weeks`` rows form the validation part and
    everything before them the training part. In both parts only rows with
    ``Open == 1`` are kept, and the ``Date`` and ``Open`` columns are dropped.

    NOTE(review): the split is purely positional, so it presumably expects
    ``dataset`` to be sorted by date with ``n_stores`` rows per day — confirm
    against the data, since the hold-out is not aligned to ``Date`` values.

    Args:
        dataset: Frame containing at least ``Date``, ``Open`` and ``target``.
        target: Name of the target column.
        valid_weeks: Number of 7-day blocks to hold out (0 means no hold-out).
        n_stores: Rows counted per day when sizing the hold-out.

    Returns:
        ``(X_train, X_valid, y_train, y_valid)``.

    Raises:
        ValueError: If ``valid_weeks`` is negative or ``n_stores`` is not
            positive.
    """
    if valid_weeks < 0:
        raise ValueError(f"valid_weeks must be non-negative, got {valid_weeks}")
    if n_stores <= 0:
        raise ValueError(f"n_stores must be positive, got {n_stores}")

    n_valid_rows = n_stores * 7 * valid_weeks
    # Use an explicit non-negative split position: slicing with ``-0`` would
    # put every row into validation when valid_weeks == 0.
    split_at = max(len(dataset) - n_valid_rows, 0)

    parts = []
    for part in (dataset.iloc[:split_at, :], dataset.iloc[split_at:, :]):
        part = part.query('Open == 1').drop(['Date', 'Open'], axis=1)
        parts.append((part.drop(target, axis=1), part[target]))

    (X_train, y_train), (X_valid, y_valid) = parts
    return X_train, X_valid, y_train, y_valid

In [8]:
def te_map(data: pd.core.frame.DataFrame, target_col: str, feature_col: str) -> dict:
    """Build a target-encoding lookup for one feature column.

    Args:
        data: Frame containing both ``feature_col`` and ``target_col``.
        target_col: Column whose mean is computed per group.
        feature_col: Column whose distinct values become the dictionary keys.

    Returns:
        A dict mapping each distinct value of ``feature_col`` to the mean of
        ``target_col`` over the rows holding that value.
    """
    # Select the target by label and convert in one step; this avoids a
    # row-by-row iloc loop and integer-key access on a label-indexed Series.
    return data.groupby(feature_col)[target_col].mean().to_dict()

In [9]:
def target_encoding(
    data: pd.core.frame.DataFrame, target_col: str
) -> pd.core.frame.DataFrame:
    """Replace every object-dtype column with the per-value mean of the target.

    Each object column is mapped through the lookup returned by ``te_map``.
    The frame is modified in place and the same object is returned.

    Args:
        data: Frame to encode; its object columns are overwritten.
        target_col: Column whose per-group mean replaces each category value.

    Returns:
        ``data`` itself, with the object columns encoded.
    """
    # Fix the column list up front so the dtype changes made inside the loop
    # cannot affect which columns are visited.
    object_columns = [
        column for column, dtype in data.dtypes.items() if dtype == "object"
    ]
    for obj_col in object_columns:
        # Assign the encoded column back explicitly: an in-place replace on
        # ``data[obj_col]`` acts on an intermediate object and is not
        # guaranteed to update ``data`` (it does not under Copy-on-Write).
        data[obj_col] = data[obj_col].map(te_map(data, target_col, obj_col))
    return data

# Train

In [31]:
# CSV file passed to read_data when the training frame is loaded.
data_path = "train_lagged.csv"

In [49]:
# Row count expressed in blocks of 1115 rows x 7 days (1115 is presumably the
# number of stores — confirm). A non-integer result means the row count is not
# an exact multiple of 1115 * 7.
# NOTE(review): `data` is assigned by the read_data cell further down, so on a
# fresh top-to-bottom run this cell would raise NameError — move it below that
# cell.
data.shape[0] / 1115 / 7

130.327866752082

In [44]:
# Load the training CSV and target-encode its object columns against 'Sales'.
data = read_data(data_path)
data_encoded = target_encoding(data, 'Sales')
# NOTE(review): the encoding means are computed over every row, including the
# weeks held out below, so validation targets leak into the encoded features.

# Hold out the last 6 weeks for validation. Pass the encoded frame explicitly
# instead of relying on target_encoding having modified `data` in place.
X_train, X_valid, y_train, y_valid = train_valid_split_ts(data_encoded, 'Sales', 6)

  mask |= (ar1 == a)


Dataset size: (1017209, 37)
Store                                 int64
DayOfWeek                             int64
Date                         datetime64[ns]
Sales                                 int64
Open                                  int64
Promo                                 int64
StateHoliday                         object
SchoolHoliday                         int64
Year                                  int64
Month                                 int64
DayOfMonth                            int64
WeekOfYear                            int64
StoreType                            object
Assortment                           object
CompetitionDistance                 float64
CompetitionOpenSinceMonth           float64
CompetitionOpenSinceYear            float64
Promo2                                int64
Promo2SinceWeek                     float64
Promo2SinceYear                     float64
PromoInterval                        object
Current-OpenComp                    float64
Curr

In [45]:
# Display the training feature matrix returned by train_valid_split_ts.
X_train

Unnamed: 0,Store,DayOfWeek,Promo,StateHoliday,SchoolHoliday,Year,Month,DayOfMonth,WeekOfYear,StoreType,...,t-5,t-6,t-7,t-8,t-9,t-10,t-11,t-12,t-13,t-14
77677,85,2,0,290.735686,1,2013,1,1,1,10058.837334,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
235513,259,2,0,290.735686,1,2013,1,1,1,10058.837334,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
238339,262,2,0,290.735686,1,2013,1,1,1,10058.837334,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
249459,274,2,0,290.735686,1,2013,1,1,1,10058.837334,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
305081,335,2,0,290.735686,1,2013,1,1,1,10058.837334,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1012541,1111,5,1,5947.483893,0,2015,6,19,25,5738.179710,...,0.0,2840.0,3814.0,3678.0,3731.0,4375.0,4567.0,0.0,2716.0,5661.0
1013483,1112,5,1,5947.483893,0,2015,6,19,25,5723.629246,...,0.0,8026.0,7865.0,7606.0,8194.0,7342.0,7878.0,0.0,8314.0,12702.0
1014425,1113,5,1,5947.483893,0,2015,6,19,25,5738.179710,...,0.0,5106.0,5450.0,5951.0,5265.0,5335.0,5939.0,0.0,5533.0,6050.0
1015367,1114,5,1,5947.483893,0,2015,6,19,25,5738.179710,...,0.0,22490.0,18707.0,19420.0,19162.0,18372.0,20368.0,0.0,22017.0,26869.0
