In [None]:
import os, re, gc, sys, time, random, warnings, functools
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
from IPython.display import display
from typing import Union
from tqdm.notebook import tqdm

import optuna
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold, TimeSeriesSplit, GroupKFold, GroupShuffleSplit
from optuna.visualization import plot_optimization_history

!pip install tensorflow_addons==0.9.1
import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow.keras import backend as K
from tensorflow.keras import losses, models, optimizers
from tensorflow.keras.layers import *
from tensorflow.keras.losses import categorical_crossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import Callback, LearningRateScheduler, EarlyStopping
from keras import regularizers
from keras.utils.np_utils import to_categorical

pd.set_option('display.max_columns', 300)
pd.set_option('display.max_rows', 300)
warnings.simplefilter('ignore')
warnings.filterwarnings('ignore')

In [None]:
def seed_everything(seed=42):
    """Seed every random source this notebook uses with the same value.

    Sets the ``PYTHONHASHSEED`` environment variable and seeds Python's
    ``random`` module, NumPy's global generator and TensorFlow.

    Args:
        seed: integer seed applied to all of the above (default 42).
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed, tf.random.set_seed):
        seeder(seed)
    
def reduce_mem_usage(df, verbose=False, y=[]):
    """Downcast numeric columns of ``df`` (in place) to the narrowest dtype that fits.

    Args:
        df: DataFrame to shrink; its columns are overwritten in place.
        verbose: when True, print the resulting memory usage and the reduction.
        y: column names that must be left untouched.

    Returns:
        The same ``df`` object, with downcast columns.
    """
    numerics    = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_types   = (np.int8, np.int16, np.int32, np.int64)
    float_types = (np.float16, np.float32)
    megabyte    = 1024**2

    start_mem = df.memory_usage().sum() / megabyte
    for col in df.columns:
        col_type = df[col].dtypes
        if col in y or col_type not in numerics:
            continue
        lo, hi = df[col].min(), df[col].max()
        if str(col_type).startswith('int'):
            # Take the first integer type whose open range strictly contains [lo, hi].
            for candidate in int_types:
                limits = np.iinfo(candidate)
                if limits.min < lo and hi < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # Same idea for floats; anything that fits nowhere (incl. NaN bounds) is float64.
            for candidate in float_types:
                limits = np.finfo(candidate)
                if limits.min < lo and hi < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / megabyte

    if verbose:
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

def show_cols(show_list, show_num=50, col=True):
    """Display a sorted list of names as a table with ``show_num`` entries per row.

    The names are sorted once, cut into rows of ``show_num`` items (the last
    row is padded with ``None``) and rendered with ``display``. If there is
    nothing to show, "No features" is printed instead.

    Args:
        show_list: iterable of sortable items (e.g. column names).
        show_num: maximum number of items per table row.
        col: if True display the table as built, otherwise display its transpose.
    """
    ordered = sorted(show_list)  # sort once instead of once per chunk
    if show_num < len(ordered):
        reshaped_list = []
        # Stepping only up to len(ordered) avoids the spurious all-None row that
        # appeared when the length was an exact multiple of show_num.
        for start in range(0, len(ordered), show_num):
            chunk = ordered[start:start + show_num]
            reshaped_list.append(chunk + [None] * (show_num - len(chunk)))
    else:
        reshaped_list = [ordered]
    df_show_col = pd.DataFrame(reshaped_list)
    if 0 < df_show_col.shape[1]:
        display(df_show_col) if col else display(df_show_col.T)
    else:
        print("No features")
        
def show_plot(df_sub, df_train, id):
    """Plot one submission row against the last 28 training columns of the same series.

    Args:
        df_sub: submission frame; every column after the first is plotted.
        df_train: wide training frame; its last 28 columns are plotted.
        id: series id containing either "validation" or "evaluation"; the
            "validation" form is looked up in ``df_sub`` and the "evaluation"
            form in ``df_train``.
    """
    plt.figure(figsize=(15,3))
    if "validation" in id:
        v_id, e_id = id, id.replace("validation","evaluation")
    else:
        v_id, e_id = id.replace("evaluation","validation"), id
    submitted = df_sub.query("id==@v_id").iloc[:,1:]
    observed  = df_train.query("id==@e_id").iloc[:, -28:]
    # Share the column labels so both lines use the same x axis.
    observed.columns = submitted.columns
    plt.plot(submitted.T)
    plt.plot(observed.T)

In [None]:
## evaluation metric
## from https://www.kaggle.com/c/m5-forecasting-accuracy/discussion/133834 and edited to get scores at all levels
class WRMSSEEvaluator(object):
    """Weighted RMSSE scorer over the 12 aggregation levels listed in ``group_ids``.

    For every level ``i`` (1-based) the constructor stores four dynamically
    named attributes: ``lv{i}_scale``, ``lv{i}_train_df``, ``lv{i}_valid_df``
    and ``lv{i}_weight``. ``score`` then returns one weighted score per level.
    """

    def __init__(self, train_df: pd.DataFrame, valid_df: pd.DataFrame, calendar: pd.DataFrame, prices: pd.DataFrame):
        """Pre-compute per-level scales, aggregated targets and weights.

        Args:
            train_df: wide frame with id columns and ``d_*`` day columns.
                NOTE: an ``all_id`` column is added to this object.
            valid_df: wide frame with the ``d_*`` columns to score against; the
                id columns are prepended from ``train_df`` if they are missing.
            calendar: frame with at least ``d`` and ``wm_yr_wk`` columns.
            prices: frame merged on ``item_id``, ``store_id``, ``wm_yr_wk``;
                must provide ``sell_price``.
        """
        train_y = train_df.loc[:, train_df.columns.str.startswith('d_')]
        train_target_columns = train_y.columns.tolist()
        # Weights are computed from the last 28 training day columns only.
        weight_columns = train_y.iloc[:, -28:].columns.tolist()

        train_df['all_id'] = 0  # for lv1 aggregation

        id_columns = train_df.loc[:, ~train_df.columns.str.startswith('d_')].columns.tolist()
        valid_target_columns = valid_df.loc[:, valid_df.columns.str.startswith('d_')].columns.tolist()

        # Attach the id columns positionally when valid_df carries only day columns.
        if not all([c in valid_df.columns for c in id_columns]):
            valid_df = pd.concat([train_df[id_columns], valid_df], axis=1, sort=False)

        self.train_df = train_df
        self.valid_df = valid_df
        self.calendar = calendar
        self.prices = prices

        self.weight_columns = weight_columns
        self.id_columns = id_columns
        self.valid_target_columns = valid_target_columns

        weight_df = self.get_weight_df()

        # Aggregation levels; position i in this tuple is level i + 1.
        self.group_ids = (
            'all_id',
            'cat_id',
            'state_id',
            'dept_id',
            'store_id',
            'item_id',
            ['state_id', 'cat_id'],
            ['state_id', 'dept_id'],
            ['store_id', 'cat_id'],
            ['store_id', 'dept_id'],
            ['item_id', 'state_id'],
            ['item_id', 'store_id']
        )

        for i, group_id in enumerate(tqdm(self.group_ids)):
            train_y = train_df.groupby(group_id)[train_target_columns].sum()
            scale = []
            for _, row in train_y.iterrows():
                # Drop leading zeros, then use the mean squared day-to-day difference.
                series = row.values[np.argmax(row.values != 0):]
                scale.append(((series[1:] - series[:-1]) ** 2).mean())
            setattr(self, f'lv{i + 1}_scale', np.array(scale))
            setattr(self, f'lv{i + 1}_train_df', train_y)
            setattr(self, f'lv{i + 1}_valid_df', valid_df.groupby(group_id)[valid_target_columns].sum())

            # Weight of a group = its share of the summed weight_df values at this level.
            lv_weight = weight_df.groupby(group_id)[weight_columns].sum().sum(axis=1)
            setattr(self, f'lv{i + 1}_weight', lv_weight / lv_weight.sum())

    def get_weight_df(self) -> pd.DataFrame:
        """Return the id columns plus sales * sell_price for each weight column.

        Rows are re-ordered to follow ``self.train_df`` (by item_id/store_id)
        before the id columns are concatenated positionally.
        """
        day_to_week = self.calendar.set_index('d')['wm_yr_wk'].to_dict()
        weight_df = self.train_df[['item_id', 'store_id'] + self.weight_columns].set_index(['item_id', 'store_id'])
        # Wide -> long so that every (item, store, day) row can be joined to its weekly price.
        weight_df = weight_df.stack().reset_index().rename(columns={'level_2': 'd', 0: 'value'})
        weight_df['wm_yr_wk'] = weight_df['d'].map(day_to_week)

        weight_df = weight_df.merge(self.prices, how='left', on=['item_id', 'store_id', 'wm_yr_wk'])
        weight_df['value'] = weight_df['value'] * weight_df['sell_price']
        # Long -> wide again, one column per day.
        weight_df = weight_df.set_index(['item_id', 'store_id', 'd']).unstack(level=2)['value']
        weight_df = weight_df.loc[zip(self.train_df.item_id, self.train_df.store_id), :].reset_index(drop=True)
        weight_df = pd.concat([self.train_df[self.id_columns], weight_df], axis=1, sort=False)
        return weight_df

    def rmsse(self, valid_preds: pd.DataFrame, lv: int) -> pd.Series:
        """Return sqrt(mean squared error / scale) per group for level ``lv``.

        ``valid_preds`` must already be aggregated to level ``lv``.
        NOTE(review): ``scale`` is a plain array in the train groupby order, so
        it is applied positionally — confirm group order matches; a zero scale
        gives inf/NaN.
        """
        valid_y = getattr(self, f'lv{lv}_valid_df')
        score = ((valid_y - valid_preds) ** 2).mean(axis=1)
        scale = getattr(self, f'lv{lv}_scale')
        return (score / scale).map(np.sqrt)

    def score(self, valid_preds: Union[pd.DataFrame, np.ndarray]):
        """Score item-level predictions at every aggregation level.

        Args:
            valid_preds: predictions with the same shape as the validation
                target columns; an ndarray is wrapped with those column names.

        Returns:
            ``(group_ids, all_scores)``: the level keys and, for each level,
            the sum over groups of weight * RMSSE. The levels are not averaged here.
        """
        assert self.valid_df[self.valid_target_columns].shape == valid_preds.shape

        if isinstance(valid_preds, np.ndarray):
            valid_preds = pd.DataFrame(valid_preds, columns=self.valid_target_columns)

        # Attach id columns (positionally) so predictions can be grouped per level.
        valid_preds = pd.concat([self.valid_df[self.id_columns], valid_preds], axis=1, sort=False)

        group_ids = []
        all_scores = []
        for i, group_id in enumerate(self.group_ids):
            lv_scores = self.rmsse(valid_preds.groupby(group_id)[self.valid_target_columns].sum(), i + 1)
            weight = getattr(self, f'lv{i + 1}_weight')
            # Row-wise product of weight and RMSSE, aligned on the group index.
            lv_scores = pd.concat([weight, lv_scores], axis=1, sort=False).prod(axis=1)
            group_ids.append(group_id)
            all_scores.append(lv_scores.sum())

        return group_ids, all_scores
    
## public LB rank
def get_lb_rank(score):
    """
    Return the public-LB rank a score would get (LB snapshot as of 2020-05-31 23:59:59).

    The rank is one plus the number of leaderboard entries whose ``Score``
    is less than or equal to ``score``.
    """
    leaderboard = pd.read_csv("../input/m5-accuracy-final-public-lb/m5-forecasting-accuracy-publicleaderboard-rank.csv")
    at_least_as_good = leaderboard.Score <= score
    return at_least_as_good.sum() + 1

In [None]:
def get_mean_std(df, g="id", col="sales"):
    """Return one row per group holding the group's mean and std of ``col``.

    Args:
        df: source frame containing columns ``g`` and ``col``.
        g: grouping column name.
        col: value column name.

    Returns:
        Frame with columns ``"id"``, ``"<g>_<col>_mean"`` and
        ``"<g>_<col>_std"`` (the key column is always named ``"id"``), in
        first-appearance order with a fresh index.
    """
    per_group = df.groupby(g)[col]
    stats = pd.concat(
        [df[[g]], per_group.transform('mean'), per_group.transform('std')],
        axis=1,
    )
    # Every row of a group carries identical stats, so de-duplication leaves one row each.
    stats = stats.drop_duplicates()
    stats.columns = ["id", g+"_"+col+"_mean", g+"_"+col+"_std"]
    return stats.reset_index(drop=True)

def split_data(df, min_d):
    """Split a long frame by day number ``d``.

    Args:
        df: frame with a numeric ``d`` column.
        min_d: number of days before day 1942 to keep in the second frame.

    Returns:
        ``(df_train, df_test)``: rows with ``d <= 1913`` and rows with
        ``d >= 1942 - min_d``, each with a fresh index.
    """
    train_mask = df["d"] <= 1913
    test_mask  = (1942 - min_d) <= df["d"]
    return df[train_mask].reset_index(drop=True), df[test_mask].reset_index(drop=True)

def add_grid(df):
    """Append 28 future days (with NaN sales) for every distinct series in ``df``.

    The unique combinations of the id/release columns are repeated for days
    ``max(d) + 1`` through ``max(d) + 28`` and stacked under the original rows.

    Returns:
        A new frame with a fresh index; ``df`` itself is not modified.
    """
    last_day = df.d.max()
    key_cols = ['id','item_id','dept_id','cat_id','store_id','state_id',"release"]
    series_keys = df[key_cols].drop_duplicates()
    future = [series_keys.assign(d=last_day + step, sales=np.nan) for step in range(1, 29)]
    return pd.concat([df, *future]).reset_index(drop=True)

def relabel(df_train, df_test, cols):
    """Integer-encode categorical columns consistently across train and test.

    A fresh ``LabelEncoder`` is fitted per column on the training values and
    then applied to the test values, so both frames share one mapping.
    Both frames are modified in place and also returned.

    Args:
        df_train: frame the encoders are fitted on.
        df_test: frame encoded with the fitted encoders; ``LabelEncoder.transform``
            raises ``ValueError`` for labels it has not seen during fitting.
        cols: column names to encode; may be empty, in which case nothing happens.

    Returns:
        ``(df_train, df_test)``.
    """
    for c in cols:
        le = LabelEncoder()
        df_train[c] = le.fit_transform(df_train[c].values)
        df_test[c]  = le.transform(df_test[c].values)
    # No `del le` here: it raised UnboundLocalError when `cols` was empty, and the
    # local is released on return anyway.
    return df_train, df_test

def prepare_2ary_train(df, g):
    """Trim every series so its row count is a whole multiple of ``g``.

    For each id, with ``cnt`` rows and ``d_max`` the largest day in ``df``,
    only rows with ``d > d_max - (cnt // g) * g`` are kept. Ids with fewer
    than ``g`` rows keep nothing and are reported on stdout.

    Args:
        df: long frame with at least ``id`` and ``d`` columns.
        g: group (sequence) length.

    Returns:
        The kept rows, with their original index labels, concatenated in
        threshold order. An empty DataFrame is returned when nothing is kept.
    """
    # rename_axis/reset_index(name=...) yields ["id", "cnt"] regardless of how the
    # installed pandas names the value_counts() result.
    df_id_cnt = df.id.value_counts().rename_axis("id").reset_index(name="cnt")
    df_id_cnt["d_max"] = df.d.max()
    df_id_cnt["d_min"] = df_id_cnt.d_max - df_id_cnt.cnt
    df_id_cnt["group"] = (df_id_cnt.d_max - df_id_cnt.d_min) // g
    df_id_cnt["thr"]   = df_id_cnt.d_max - df_id_cnt.group * g
    not_use_ids = df_id_cnt.query("thr==d_max").id.unique()
    if 0 < len(not_use_ids):
        print("These ids will not be used on training.")
        print(not_use_ids)

    # Collect the pieces and concatenate once: DataFrame.append was removed in
    # pandas 2.0 and re-copied the accumulated frame on every iteration.
    parts = []
    for t in df_id_cnt.thr.unique():
        ids   = set(df_id_cnt[df_id_cnt.thr==t].id.tolist())
        df_id = df[df.id.isin(ids)]
        parts.append(df_id[df_id.d > t])
    if not parts:
        return pd.DataFrame()
    return pd.concat(parts)

def get_new_years(df):
    """Make the ``year`` value constant inside every block of ``GROUP_SIZE`` rows.

    Blocks whose first and last year agree are kept as they are; otherwise
    the whole block takes the year found at its middle row.
    Assumes ``len(df)`` is a multiple of ``GROUP_SIZE`` (otherwise indexing
    the last row of the final block raises ``IndexError``).

    Returns:
        A list with one year per row of ``df``.
    """
    years = list(df.year)
    new_years = []
    for start in range(0, len(df.year), GROUP_SIZE):
        stop = start + GROUP_SIZE
        if years[start] == years[stop - 1]:
            new_years.extend(years[start:stop])
        else:
            middle = int((start + stop) / 2)
            new_years.extend([years[middle]] * GROUP_SIZE)
    return new_years

def get_group(df):
    """Build an alternating 0/1 label per block of ``GROUP_SIZE`` rows.

    The labels are ``GROUP_SIZE`` zeros followed by ``GROUP_SIZE`` ones,
    repeated for as many full pairs of blocks as fit in ``df``. If that does
    not cover every row, one more block of zeros is appended.

    Returns:
        1-D NumPy array of labels.
    """
    n_rows  = df.shape[0]
    n_pairs = int((n_rows / GROUP_SIZE) // 2)
    pair    = [0] * GROUP_SIZE + [1] * GROUP_SIZE
    group   = np.array([pair] * n_pairs).flatten()
    if group.shape[0] != n_rows:
        group = np.append(group, np.array([0] * GROUP_SIZE))
    return group

def reset_and_sort(df):
    """Return ``df`` ordered by ``id`` then ``d`` with a fresh 0..n-1 index."""
    ordered = df.sort_values(by=["id","d"])
    return ordered.reset_index(drop=True)

In [None]:
def create_price_features(df, df_calender):
    """Derive per-(store, item) price features from the weekly price table.

    ``sell_price`` is replaced by its z-score within each (store_id, item_id)
    pair plus 1; the other features are computed from that shifted value.

    Args:
        df: price frame with ``store_id``, ``item_id``, ``wm_yr_wk`` and
            ``sell_price`` columns. It is copied, not modified.
        df_calender: calendar frame providing ``wm_yr_wk``, ``month`` and
            ``year`` (the parameter name keeps its original spelling for
            backward compatibility).

    Returns:
        A new frame with the standardized ``sell_price`` plus ``price_std``,
        ``price_mean``, ``price_max``, ``price_min``, ``price_norm``,
        ``price_nunique``, ``item_nunique``, ``price_momentum``,
        ``price_momentum_m`` and ``price_momentum_y``.
    """
    df         = df.copy()
    group_cols = ['store_id','item_id']

    # Standardize sell_price; a zero or undefined std is replaced by 1 to avoid dividing by 0/NaN
    df['price_std']  = df.groupby(group_cols)['sell_price'].transform('std')
    df['price_std']  = df['price_std'].where(df['price_std'] > 0, 1)
    df['price_mean'] = df.groupby(group_cols)['sell_price'].transform('mean')
    df["sell_price"] = (df.sell_price - df.price_mean) / df.price_std

    # Add 1 to sell_price in order to avoid "sales * sell_price = 0"
    # This means "Mean = 1" and "Std = 1"
    df["sell_price"] = df.sell_price + 1

    # Normalize price
    df['price_max']  = df.groupby(group_cols)['sell_price'].transform('max')
    df['price_min']  = df.groupby(group_cols)['sell_price'].transform('min')
    df['price_norm'] = (df['sell_price'] / df['price_max']).fillna(0)

    # Some items can be inflation dependent and some items are very "stable"
    df['price_nunique'] = df.groupby(group_cols)['sell_price'].transform('nunique')
    df['item_nunique']  = df.groupby(group_cols)['item_id'].transform('nunique')

    # Rolling aggregations with months and years.
    # Use the calendar passed as an argument: the original body read the global
    # `df_calendar`, which only worked while that global happened to exist.
    calendar_prices = df_calender[['wm_yr_wk','month','year']]
    calendar_prices = calendar_prices.drop_duplicates(subset=['wm_yr_wk'])
    df = df.merge(calendar_prices, on=['wm_yr_wk'], how='left')
    previous_price = df.groupby(group_cols)['sell_price'].shift(1)
    df['price_momentum']   = (df['sell_price'] / previous_price).fillna(0)
    df['price_momentum_m'] = (df['sell_price'] / df.groupby(group_cols+['month'])['sell_price'].transform('mean')).fillna(0)
    df['price_momentum_y'] = (df['sell_price'] / df.groupby(group_cols+['year'])['sell_price'].transform('mean')).fillna(0)
    return df.drop(["year","month"], axis=1)

def create_date_features(df):
    """Add event and calendar-derived features to a copy of the calendar frame.

    Args:
        df: calendar frame with ``d`` (strings like ``"d_1"``), ``date``,
            ``event_type_1`` and ``event_type_2`` columns.

    Returns:
        A new frame where ``d`` is an integer day number, ``date`` is a
        datetime, and these columns are added:
        ``event`` (1 if either event type is present, else 0),
        ``tm_w`` (ISO week number), ``tm_wm`` (week of month, ceil(day / 7))
        and ``tm_w_end`` (1 for Saturday/Sunday).
    """
    df      = df.copy()
    df['d'] = df['d'].apply(lambda x: int(x.replace("d_","")))

    # Make an event feature: a day counts as an event when either type is set.
    # notna() handles every missing-value representation, unlike an `is np.nan` identity test.
    df["event"] = df[["event_type_1","event_type_2"]].notna().any(axis=1).astype(np.int64)

    # Make some features from date
    df['date'] = pd.to_datetime(df.date)
    dates      = df['date'].dt
    # Series.dt.week was removed in pandas 2.0; fall back to it only on versions
    # that predate dt.isocalendar().
    week       = dates.isocalendar().week if hasattr(dates, 'isocalendar') else dates.week
    day_of_week = dates.dayofweek

    df['tm_w']     = week.astype(np.int8)
    df['tm_wm']    = np.ceil(dates.day / 7).astype(np.int8)
    df['tm_w_end'] = (day_of_week >= 5).astype(np.int8)
    return df

def create_rolling_features(df, col, periods, windows):
    """Add lag and rolling-mean features of ``col`` per (item_id, store_id) series.

    Args:
        df: long frame containing ``item_id``, ``store_id`` and ``col``; it is copied.
        col: name of the column to lag.
        periods: lag distances; each adds ``<col>_lag_t<p>`` (missing values -> 0).
        windows: rolling window sizes; each adds ``<col>_mean_t<w>``, the mean
            of the previous ``w`` values (the current row is excluded).

    Returns:
        The copy, also holding ``<col>_lag_mean`` (average of all lags) and
        ``<col>_lag_decay`` (sum of the lags weighted by ``0.9 ** p``).
    """
    df = df.copy()
    series_keys = ["item_id","store_id"]
    mean_name   = col + "_lag_mean"
    decay_name  = col + "_lag_decay"

    # Lag features, accumulated into a plain and a decayed sum as they are built
    df[mean_name]  = 0
    df[decay_name] = 0
    for lag, weight in zip(periods, np.power(0.9, periods)):
        lag_name = col + "_lag_t" + str(lag)
        df[lag_name]    = df.groupby(series_keys)[col].shift(periods=lag).fillna(0)
        df[mean_name]  += df[lag_name]
        df[decay_name] += df[lag_name]*weight
    df[mean_name] /= len(periods)

    # Rolling means over the previous rows only (shift by one first)
    for w in windows:
        df[col + '_mean_t' + str(w)] = df.groupby(series_keys)[col].transform(
            lambda values: values.shift(1).rolling(w).mean()
        )

    return df

def create_sales_fetatures(df_train, df_test, g, col, agg_cols, add_col_name=""):
    """Attach per-group aggregates of ``col`` to copies of the train and test frames.

    The aggregates are computed over the distinct non-null
    ``["id"] + g + [col]`` rows of both frames together, grouped by
    ``["id"] + g``.

    Args:
        df_train, df_test: frames containing ``id``, the columns in ``g`` and ``col``.
        g: extra grouping columns (list).
        col: column to aggregate.
        agg_cols: aggregation names passed to ``.agg`` (e.g. ``["mean", "std"]``).
        add_col_name: suffix appended to every new column name.

    Returns:
        ``(df_train, df_test)`` copies with columns ``<col>_<agg><suffix>``
        added; missing aggregates are filled with 0.
    """
    df_train, df_test = df_train.copy(), df_test.copy()

    # Column bookkeeping
    group_cols = ["id"] + g
    use_cols   = group_cols + [col]
    new_cols   = [col+"_"+a+add_col_name for a in agg_cols]

    # Aggregate over the distinct, fully observed rows of both frames
    observed = pd.concat([df_train[use_cols], df_test[use_cols]]).dropna().drop_duplicates()
    df_agg   = observed.groupby(group_cols, as_index=False).agg({col:agg_cols}).reset_index(drop=True)
    df_agg.columns = group_cols + new_cols

    # Left-join the aggregates onto each frame and zero-fill the gaps
    merged = []
    for frame in (df_train, df_test):
        frame = frame.merge(df_agg, on=group_cols, how="left")
        for c in new_cols:
            frame[c] = frame[c].fillna(0)
        merged.append(frame)

    return merged[0], merged[1]

# Load data

In [None]:
# Load the sample submission and preview its shape, first row and last row.
sample_submission = pd.read_csv('../input/m5-forecasting-accuracy/sample_submission.csv')
print(sample_submission.shape)
display(sample_submission.head(1), sample_submission.tail(1))

In [None]:
# Read the three raw competition tables.
df_train    = pd.read_csv('../input/m5-forecasting-accuracy/sales_train_evaluation.csv')
df_prices   = pd.read_csv('../input/m5-forecasting-accuracy/sell_prices.csv')
df_calendar = pd.read_csv('../input/m5-forecasting-accuracy/calendar.csv')
# Build the evaluator from the RAW calendar/prices, before they are replaced by
# their feature-engineered versions below. The last 28 columns of df_train act
# as the validation targets; everything before them is the training part.
evaluator   = WRMSSEEvaluator(df_train.iloc[:, :-28], df_train.iloc[:, -28:], df_calendar, df_prices)

# First week (wm_yr_wk) in which each (store, item) pair has a price entry.
df_release  = df_prices.groupby(['store_id','item_id'])['wm_yr_wk'].agg(['min']).reset_index()
df_release.columns = ['store_id','item_id','release']
# From here on df_calendar / df_prices hold engineered features (d becomes an int).
df_calendar = reduce_mem_usage(create_date_features(df_calendar))
df_prices   = reduce_mem_usage(create_price_features(df_prices, df_calendar))

print(df_train.shape, df_prices.shape, df_calendar.shape)
display(df_train.head(1))
display(df_prices.head(1))
display(df_calendar.head(1))

In [None]:
#df_sub = pd.read_csv('../input/m5-result/submission.csv')
#show_plot(df_sub, df_train, "HOBBIES_1_009_CA_1_validation")

# Feature engineering

In [None]:
# Main parameters
TARGET_STATES = ["CA","TX","WI"]  # Choose from CA, TX or WI; each state is processed separately
GROUP_SIZE    = 30                # Rows per sequence block (also used as WaveNet "batch_size")
USE_MIN_DATE  = 365*3             # Training rows with d below this day number are dropped

# Feature engineering parameters
periods = [7,14,21,28]  # Lag distances (in days) for the shifted sales features
windows = [7]           # Rolling-mean window sizes (in days)

In [None]:
%%time
# Build the train / validation / test frames for every state in TARGET_STATES and
# store them in dict_df[state]. The steps below are order-dependent: sales are
# standardized per id, split, extended with a 28-day future grid, joined with
# calendar/price features, multiplied by the standardized price, and only then
# turned into lag/rolling/aggregate features.
print("Original shapes are %s" % str(df_train.shape))

dict_df = {}
for state in TARGET_STATES:
    print("=======================")
    df_train_state = df_train.query("state_id == @state").copy()
    print("The state, %s, shapes are %s " % (state, str(df_train_state.shape)))
    
    print("Start processing of wide2long")
    # One row per (id, day); the former d_* column name becomes the integer column d.
    df_train_long  = pd.melt(df_train_state,
                             id_vars=["id","item_id","dept_id","cat_id","store_id","state_id"],
                             value_name="sales")
    df_train_long["d"] = df_train_long.variable.apply(lambda x: int(x.replace("d_","")))
    df_train_long.drop("variable", axis=1, inplace=True)
    df_train_long = reduce_mem_usage(df_train_long)
    print(" Current train data shapes are %s" % str(df_train_long.shape))

    print("Delete some data because some items had not been released yet")
    # Keep only rows whose week is at or after the item's first priced week.
    df_train_long = df_train_long.merge(df_release, on=["store_id","item_id"], how="left")
    df_train_long = df_train_long.merge(df_calendar[['wm_yr_wk','d']], on=["d"], how="left")
    df_train_long = df_train_long.query("release<=wm_yr_wk").drop("wm_yr_wk", axis=1)
    print(" Current train data shapes are %s" % str(df_train_long.shape))
    
    print("Standardize sales for each id")
    # df_ms is kept in dict_df (key "mean_std") alongside the standardized data.
    # NOTE(review): an id whose sales never vary has std 0, giving inf/NaN here - confirm.
    df_ms = get_mean_std(df_train_long)
    df_train_long = df_train_long.merge(df_ms, on="id", how="left")
    df_train_long.sales = (df_train_long.sales.fillna(0)-df_train_long.id_sales_mean)/df_train_long.id_sales_std
    df_train_long = df_train_long.drop(["id_sales_mean","id_sales_std"], axis=1).reset_index(drop=True)
    
    print("Split data in train and test")
    # The test frame keeps min_d days of history: the longest lag/window plus one block.
    min_d = max(max(windows), max(periods)) + GROUP_SIZE
    df_train_long, df_test_long = split_data(df_train_long, min_d)
    print(" Current train data shapes are %s" % str(df_train_long.shape))
    print(" Current test  data shapes are %s" % str(df_test_long.shape))
    
    print("Add grid for future prediction")
    # Appends 28 future days with NaN sales to each frame.
    df_train_long = add_grid(df_train_long)
    df_test_long  = add_grid(df_test_long)
    print(" Current train data shapes are %s" % str(df_train_long.shape))
    print(" Current test  data shapes are %s" % str(df_test_long.shape))
    
    print("Add price and calendar features")
    calendar_cols = ['wm_yr_wk','d','year',"wday","month","event","snap_"+state,"tm_w","tm_wm","tm_w_end"]
    df_train_long = df_train_long.merge(df_calendar[calendar_cols], on=["d"], how="left")
    df_train_long = df_train_long.merge(df_prices, on=['store_id','item_id','wm_yr_wk'], how='left')
    df_test_long  = df_test_long .merge(df_calendar[calendar_cols], on=["d"], how="left")
    df_test_long  = df_test_long .merge(df_prices, on=['store_id','item_id','wm_yr_wk'], how='left')
    print(" Current train data shapes are %s" % str(df_train_long.shape))
    print(" Current test  data shapes are %s" % str(df_test_long.shape))
    
    print("Multiply sales and sell_price because WRMSSE metric uses that.")
    # sell_price here is the standardized-plus-one value from create_price_features.
    df_train_long.sales = df_train_long.sales * df_train_long.sell_price
    df_test_long.sales  = df_test_long.sales  * df_test_long.sell_price
    
    print("Relabel some features")
    relabel_cols  = ["item_id", "dept_id", "cat_id", "store_id"]
    df_train_long, df_test_long = relabel(df_train_long, df_test_long, relabel_cols)
    df_train_long = reduce_mem_usage(df_train_long)
    df_test_long  = reduce_mem_usage(df_test_long)
    
    print("Add rolling features")
    df_train_long = create_rolling_features(df_train_long, "sales", periods, windows)
    df_test_long  = create_rolling_features(df_test_long,  "sales", periods, windows)
    print(" Current train data shapes are %s" % str(df_train_long.shape))
    print(" Current test  data shapes are %s" % str(df_test_long.shape))
    
    print("Add sales features")
    agg_cols   = ["median","mean","std"]
    group_cols = ["month","wday"]
    df_train_long, df_test_long = create_sales_fetatures(df_train_long, df_test_long, group_cols, "sales", agg_cols, "_wday")
    group_cols = ["month","tm_w_end"]
    df_train_long, df_test_long = create_sales_fetatures(df_train_long, df_test_long, group_cols, "sales", agg_cols, "_wend")
    print(" Current train data shapes are %s" % str(df_train_long.shape))
    print(" Current test  data shapes are %s" % str(df_test_long.shape))
    
    print("Adjust train and test data, and create validation data")
    # Validation/test keep the last GROUP_SIZE-sized window (including the NaN grid rows);
    # dropna() then removes every row with any NaN from the training frame.
    df_valid_long = df_train_long.query("1914-@GROUP_SIZE < d").copy()
    df_train_long = df_train_long.dropna().copy()
    df_test_long  = df_test_long .query("1942-@GROUP_SIZE < d").copy()
    print(" Current train data shapes are %s" % str(df_train_long.shape))
    print(" Current test  data shapes are %s" % str(df_test_long.shape))
    print(" Current valid data shapes are %s" % str(df_valid_long.shape))
    
    print("Delete old data for saving memory")
    df_train_long = df_train_long.query("@USE_MIN_DATE <= d").copy()
    print(" Current train data shapes are %s" % str(df_train_long.shape))
    
    print("Delete some data for WaveNet")
    # Trim each id to a multiple of GROUP_SIZE rows, then label the blocks.
    df_train_long = prepare_2ary_train(df_train_long, GROUP_SIZE)
    df_train_long = reset_and_sort(df_train_long)
    df_train_long.year     = get_new_years(df_train_long)
    df_train_long["group"] = get_group(df_train_long)
    print(" Current train data shapes are %s" % str(df_train_long.shape))
    
    print("Get sell_price to use in postprocessing")
    # Prices of the forecast rows only, in (id, d) order.
    valid_sell_price = np.array(reset_and_sort(df_valid_long.query("1913 < d")).sell_price)
    test_sell_price  = np.array(reset_and_sort(df_test_long .query("1941 < d")).sell_price)
    
    dict_df[state] = {"train_data"  : df_train_long,
                      "valid_data"  : reset_and_sort(df_valid_long),
                      "test_data"   : reset_and_sort(df_test_long),
                      "mean_std"    : df_ms,
                      "valid_price" : valid_sell_price,
                      "test_price"  : test_sell_price}
    # Drop the per-state temporaries before processing the next state.
    del df_train_state, df_train_long, df_valid_long, df_test_long, df_ms
    gc.collect()

In [None]:
# Print the shape (and number of GROUP_SIZE blocks) of everything stored per state;
# DataFrames additionally get a two-row preview.
for state in TARGET_STATES:
    print("============================")
    print(state)
    for key, df in dict_df[state].items():
        print(key, df.shape, df.shape[0]/GROUP_SIZE)
        if type(df) is np.ndarray:
            continue
        display(df.head(2))

In [None]:
# Release the raw tables and the leftover loop variable; dict_df holds what is needed next.
del df_train, df_prices
del df_calendar, df_release
del df
gc.collect()

In [None]:
class BaseModel(object):
    """
    Base Model Class: runs cross-validated training in the constructor.

    Subclasses must implement ``train_model``, ``get_params`` and
    ``convert_dataset`` (and, for the ``nn_regression`` task,
    ``get_pred_base`` and ``predict_future``).

    train_df         : train pandas dataframe
    test_df          : test pandas dataframe (may be None)
    target           : target column name (str)
    features         : list of feature names (the group column is removed from this list in place)
    valid_df         : optional validation pandas dataframe; an empty frame is treated as None
    categoricals     : list of categorical feature names (copied, the caller's list is untouched)
    n_splits         : K in KFold (default is 3)
    cv_method        : options are .. KFold, StratifiedKFold, TimeSeriesSplit, GroupKFold, or GroupShuffleSplit
    group            : group feature name when GroupKFold or GroupShuffleSplit are used
    task             : options are .. regression, nn_regression, multiclass, nn_multiclass, or binary
    params           : dict of parameters; when None, ``get_params()`` supplies them
    parameter_tuning : bool, only for LGB
    seed             : seed (int)
    verbose          : bool

    After construction the instance exposes ``y_pred``, ``y_valid``, ``score``,
    ``model`` (from the last fold), ``oof``, ``y_val`` and ``fi_df``.
    """

    def __init__(self, train_df, test_df, target, features, 
                 valid_df=None, categoricals=[], 
                 n_splits=3, cv_method="KFold", group=None,
                 task="regression", params=None, parameter_tuning=False,
                 seed=42, verbose=True):
        self.train_df     = train_df
        if valid_df is not None and valid_df.shape[0]==0:
            self.valid_df = None            
        else:
            self.valid_df = valid_df
        self.test_df      = test_df
        self.target       = target
        self.features     = features
        self.n_splits     = n_splits
        # Copy: fit() removes the group column from this list, which would otherwise
        # mutate the shared default argument (or the caller's list).
        self.categoricals = [] if categoricals is None else list(categoricals)
        self.cv_method    = cv_method
        self.group        = group
        self.task         = task
        self.parameter_tuning = parameter_tuning
        self.seed    = seed
        self.cv      = self.get_cv()
        self.verbose = verbose
        if params is None:
            self.params = self.get_params()
        else:
            self.params = params
        self.y_pred, self.y_valid, self.score, self.model, self.oof, self.y_val, self.fi_df = self.fit()

    def train_model(self, train_set, val_set):
        """Train on one fold; must return ``(model, feature_importance)``."""
        raise NotImplementedError

    def get_params(self):
        """Return the default parameter dict for the concrete model."""
        raise NotImplementedError

    def convert_dataset(self, x_train, y_train, x_val, y_val):
        """Convert fold data into the ``(train_set, val_set)`` the model consumes."""
        raise NotImplementedError

    def calc_metric(self, y_true, y_pred): # this may need to be changed based on the metric of interest
        """Macro F1 for classification tasks, RMSE for regression tasks.

        Returns None for an unrecognized ``task``.
        """
        if self.task in ("multiclass","nn_multiclass","binary"):
            # f1_score is not imported at the top of the notebook; import it where it is needed.
            from sklearn.metrics import f1_score
            if self.task == "binary":
                return f1_score(y_true, y_pred, average='macro')
            # Class probabilities are reduced to labels unless labels were passed already.
            preds = np.argmax(y_pred, axis=1) if y_true.shape != y_pred.shape else y_pred
            return f1_score(y_true, preds, average='macro')
        if self.task in ("regression","nn_regression"):
            return np.sqrt(mean_squared_error(y_true, y_pred))

    def get_cv(self):
        """Return the fold iterator for ``cv_method`` (None if the method is unknown).

        The returned object is a one-shot generator consumed by ``fit``.
        For the group-based splitters the group column is removed from
        ``self.features`` in place.
        """
        if self.cv_method == "KFold":
            cv = KFold(n_splits=self.n_splits, shuffle=True, random_state=self.seed)
            return cv.split(self.train_df)
        if self.cv_method == "StratifiedKFold":
            cv = StratifiedKFold(n_splits=self.n_splits, shuffle=True, random_state=self.seed)
            return cv.split(self.train_df, self.train_df[self.target])
        if self.cv_method == "TimeSeriesSplit":
            cv = TimeSeriesSplit(max_train_size=None, n_splits=self.n_splits)
            return cv.split(self.train_df)
        if self.cv_method == "GroupKFold":
            if self.group in self.features:
                self.features.remove(self.group)
            cv = GroupKFold(n_splits=self.n_splits)
            return cv.split(self.train_df[self.features], self.train_df[self.target], self.train_df[self.group])
        if self.cv_method == "GroupShuffleSplit":
            if self.group in self.features:
                self.features.remove(self.group)
            cv = GroupShuffleSplit(n_splits=self.n_splits, random_state=self.seed)
            return cv.split(self.train_df[self.features], self.train_df[self.target], self.train_df[self.group])

    def fit(self):
        """Train one model per fold and average its predictions.

        Returns:
            ``(y_pred, y_valid, loss_score, model, oof_pred, y_vals, fi_df)``:
            averaged test and validation predictions (None when the matching
            frame is absent), the out-of-fold metric, the model of the last
            fold, out-of-fold predictions, the matching true targets and a
            per-fold feature-importance frame.

        Side effect: ``self.valid_df`` and ``self.test_df`` are deleted to free memory.
        """
        # Initialize
        y_vals = np.zeros((self.train_df.shape[0], ))
        if self.task in ("multiclass","nn_multiclass"):
            n_target_classes = self.train_df[self.target].nunique()
            oof_pred = np.zeros((self.train_df.shape[0], n_target_classes))
            y_pred   = np.zeros((self.test_df.shape[0],  n_target_classes)) if self.test_df  is not None else None
            y_valid  = np.zeros((self.valid_df.shape[0], n_target_classes)) if self.valid_df is not None else None
        else:
            oof_pred = np.zeros((self.train_df.shape[0], ))
            if self.task == "nn_regression":
                y_pred   = self.get_pred_base(self.test_df)  if self.test_df  is not None else None
                y_valid  = self.get_pred_base(self.valid_df) if self.valid_df is not None else None
            else:
                y_pred   = np.zeros((self.test_df.shape[0], ))  if self.test_df  is not None else None
                y_valid  = np.zeros((self.valid_df.shape[0], )) if self.valid_df is not None else None
            
        # The grouping column must not be used as a model input.
        if self.group is not None:
            if self.group in self.features:
                self.features.remove(self.group)
            if self.group in self.categoricals:
                self.categoricals.remove(self.group)
                
        fi      = np.zeros((self.n_splits, len(self.features)))
        # Full frames (not only self.features) are kept for prediction.
        x_test  = self.test_df.copy()  if y_pred  is not None else None
        x_valid = self.valid_df.copy() if y_valid is not None else None
        del self.valid_df, self.test_df
        gc.collect()

        # Fitting with out of fold
        for fold, (train_idx, val_idx) in enumerate(self.cv):
            # Prepare train and test dataset
            x_train, x_val     = self.train_df.loc[train_idx, self.features], self.train_df.loc[val_idx, self.features]
            y_train, y_val     = self.train_df.loc[train_idx, self.target],   self.train_df.loc[val_idx, self.target]
            train_set, val_set = self.convert_dataset(x_train, y_train, x_val, y_val)
            del x_train, y_train
            gc.collect()
            
            # Fit model
            model, importance = self.train_model(train_set, val_set)
            fi[fold, :]       = importance
            y_vals[val_idx]   = y_val
            del train_set
            gc.collect()
            
            # Get some scores
            if   self.task == "binary":
                oof_pred[val_idx] = model.predict(x_val).reshape(oof_pred[val_idx].shape)
                # Guarded like the other tasks so that test_df=None does not crash.
                if y_pred is not None:
                    y_pred += model.predict(x_test).reshape(y_pred.shape) / self.n_splits
                
            elif self.task == "regression":
                oof_pred[val_idx] = model.predict(x_val).reshape(oof_pred[val_idx].shape)
                if y_valid is not None:
                    y_valid += model.predict(x_valid).reshape(y_valid.shape) / self.n_splits
                if y_pred is not None:
                    y_pred  += model.predict(x_test).reshape(y_pred.shape) / self.n_splits
                
            elif self.task == "nn_regression":
                oof_pred[val_idx] = model.predict(val_set[0]).reshape(oof_pred[val_idx].shape)
                if y_valid is not None:
                    y_valid.sales += self.predict_future(model, x_valid) / self.n_splits
                if y_pred is not None:
                    y_pred.sales  += self.predict_future(model, x_test) / self.n_splits
                
            elif self.task == "multiclass":
                oof_pred[val_idx] = model.predict(x_val)
                if y_valid is not None:
                    y_valid += model.predict(x_valid).reshape(y_valid.shape) / self.n_splits
                if y_pred is not None:
                    y_pred  += model.predict(x_test).reshape(y_pred.shape) / self.n_splits
                
            elif self.task == "nn_multiclass":
                # The class count comes from the out-of-fold buffer; the original
                # referenced an undefined name `preds` here.
                n_classes = oof_pred.shape[-1]
                oof_pred[val_idx] = model.predict(val_set[0]).reshape(-1, n_classes)
                if y_valid is not None:
                    y_valid += model.predict(self.convert_dataset(x_valid)).reshape(-1, n_classes) / self.n_splits
                if y_pred is not None:
                    y_pred  += model.predict(self.convert_dataset(x_test)).reshape(-1, n_classes) / self.n_splits
                
            print('Partial score of fold {} is: {}'.format(fold, self.calc_metric(y_val, oof_pred[val_idx])))
        
        # Create feature importance data frame (one block of rows per fold, concatenated once)
        fold_frames = [
            pd.DataFrame({"features": self.features, "importance": fi[n, :], "fold": n})
            for n in np.arange(self.n_splits)
        ]
        fi_df = pd.concat(fold_frames, ignore_index=True)
        gfi   = fi_df[["features", "importance"]].groupby(["features"]).mean().reset_index()
        fi_df = fi_df.merge(gfi, on="features", how="left", suffixes=('', '_mean'))
        
        # Calculate oof score
        loss_score = self.calc_metric(y_vals, oof_pred)
        print('Our oof loss score is: ', loss_score)
        
        return y_pred, y_valid, loss_score, model, oof_pred, y_vals, fi_df

    def plot_feature_importance(self, rank_range=[1, 50]):
        """Bar-plot per-fold importances for features ranked ``rank_range[0]``..``rank_range[1]``.

        Returns the importance frame sorted by mean importance (descending).
        """
        fig, ax   = plt.subplots(1, 1, figsize=(10, 20))
        sorted_df = self.fi_df.sort_values(by="importance_mean", ascending=False).reset_index()
        # Each feature occupies n_splits consecutive rows, hence the scaling of the slice.
        sns.barplot(data=sorted_df.iloc[self.n_splits*(rank_range[0]-1) : self.n_splits*rank_range[1]],
                    x="importance", y="features", orient='h')
        ax.set_xlabel("feature importance")
        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)
        return sorted_df
    
class WaveNet(BaseModel):
    """
    WaveNet wrapper.

    Supplies the hyper-parameters, the Keras network, the training routine
    and the (n_sequences, batch_size, n_features) dataset conversion that
    the base class hooks call. Sequences have a fixed length of GROUP_SIZE
    time steps (stored in ``params["batch_size"]``).
    """
    def get_params(self):
        """Return (and display) the hyper-parameter dictionary of the network."""
        params = {
            "batch_size"    : GROUP_SIZE,   # time steps per sequence, not the Keras batch size
            "learning_rate" : 0.0015,
            "nn_epochs"     : 15,
            "nn_batch_size" : 256,          # Keras mini-batch size (number of sequences)
            "patience"      : 10
        }
        if self.task == "nn_regression":
            params["num_classes"] = 1
        else:
            params["num_classes"] = len(self.train_df[self.target].unique())
        display(params)
        return params

    def get_model(self, shape_):
        """
        Build and compile the WaveNet-style network.

        Parameters
        ----------
        shape_ : input shape without the batch axis, i.e. (time_steps, n_features).

        Returns
        -------
        A compiled Keras model. The loss follows the task: RMSE for
        "nn_regression", categorical cross-entropy for the softmax head.
        """
        def root_mean_squared_error(y_true, y_pred):
            return K.sqrt(K.mean(K.square(y_pred - y_true)))

        def cr(x, out_layer, kernel, stride, dilation):
            # Conv1D followed by ReLU.
            x = Conv1D(out_layer, kernel_size=kernel, dilation_rate=dilation, strides=stride, padding="same")(x)
            x = Activation("relu")(x)
            return x

        def wave_block(x, filters, kernel_size, n):
            # n gated (tanh * sigmoid) dilated convolutions with dilation
            # rates 1, 2, 4, ..., 2**(n-1); every stage is added to a running
            # residual sum that becomes the block output.
            dilation_rates = [2**i for i in range(n)]
            x     = Conv1D(filters=filters, kernel_size=1, padding='same')(x)
            res_x = x
            for dilation_rate in dilation_rates:
                tanh_out = Conv1D(filters=filters, kernel_size=kernel_size, padding='same', activation='tanh',    dilation_rate=dilation_rate)(x)
                sigm_out = Conv1D(filters=filters, kernel_size=kernel_size, padding='same', activation='sigmoid', dilation_rate=dilation_rate)(x)
                x     = Multiply()([tanh_out, sigm_out])
                x     = Conv1D(filters=filters, kernel_size=1, padding='same')(x)
                res_x = Add()([res_x, x])
            return res_x

        inp = Input(shape=(shape_))

        x   = cr(inp, 64, 7, 1, 1)
        x   = wave_block(x, 16, 3, 12)
        x   = wave_block(x, 32, 3, 8)
        x   = cr(x, 32, 7, 1, 1)
        x   = wave_block(x, 64, 3, 1)
        x   = cr(x, 32, 7, 1, 1)
        x   = Dropout(0.2)(x)
        if self.task == "nn_regression":
            out  = Conv1D(1, 1, padding='same', name='out')(x)
            loss = root_mean_squared_error
        else:
            out  = Dense(self.params["num_classes"], activation='softmax', name='out')(x)
            # A softmax head must be trained with a classification loss;
            # RMSE on class probabilities was used here before.
            loss = categorical_crossentropy

        model = models.Model(inputs=inp, outputs=out)
        # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
        opt   = Adam(learning_rate=self.params["learning_rate"])
        # NOTE(review): no assign_average_vars() call is visible in this class,
        # so the SWA-averaged weights may never be copied into the model -- confirm.
        opt   = tfa.optimizers.SWA(opt)
        model.compile(loss=loss, optimizer=opt, metrics=[tf.keras.metrics.RootMeanSquaredError()])
        return model

    def lr_schedule(self, epoch):
        """Step-wise learning-rate decay: base rate, then /3 from epoch 11, then /5 from epoch 13."""
        if   epoch < 11: return self.params["learning_rate"]
        elif epoch < 13: return self.params["learning_rate"] / 3
        else:            return self.params["learning_rate"] / 5

    def train_model(self, train_set, val_set):
        """
        Fit a fresh network on one fold.

        Parameters
        ----------
        train_set, val_set : [x, y] pairs as produced by ``convert_dataset``.

        Returns
        -------
        (model, None) -- the second element is always None
        (presumably the feature-importance slot of the base class; verify against caller).
        """
        # Prepare WaveNet model
        K.clear_session()
        config = tf.compat.v1.ConfigProto(intra_op_parallelism_threads=1,inter_op_parallelism_threads=1)
        sess   = tf.compat.v1.Session(graph=tf.compat.v1.get_default_graph(), config=config)
        tf.compat.v1.keras.backend.set_session(sess)
        model  = self.get_model((None, len(self.features)))
        # Define callbacks
        cb_es  = EarlyStopping(monitor='val_loss', patience=self.params["patience"], verbose=1, mode='auto')
        cb_lr  = LearningRateScheduler(self.lr_schedule)
        # Start training
        model.fit(train_set[0], train_set[1],
                  epochs          = self.params["nn_epochs"],
                  batch_size      = self.params["nn_batch_size"],
                  validation_data = (val_set[0], val_set[1]),
                  callbacks       = [cb_lr, cb_es],
                  verbose         = 2)
        return model, None

    def _reshape_features(self, x):
        """Reshape a flat feature table into (n_sequences, batch_size, n_features)."""
        return np.array(x).reshape(-1, self.params["batch_size"], len(self.features))

    def _reshape_target(self, y):
        """Reshape (and one-hot encode for classification) a target vector into (n_sequences, batch_size, num_classes)."""
        if self.task == "nn_regression":
            y_ary = np.array(y)
        else:
            y_ary = to_categorical(y, num_classes=self.params["num_classes"])
        return y_ary.reshape(-1, self.params["batch_size"], self.params["num_classes"])

    def convert_dataset(self, x_train, y_train=None, x_val=None, y_val=None):
        """
        Convert flat tables into the sequence arrays the network consumes.

        Returns
        -------
        [x]                    when only ``x_train`` is given,
        [x, y]                 when ``y_train`` is given,
        ([x, y], [x_val, y_val]) when a validation table is given as well.
        """
        train_set = [self._reshape_features(x_train)]
        if y_train is None:
            return train_set

        train_set.append(self._reshape_target(y_train))

        if x_val is not None:
            val_set = [self._reshape_features(x_val), self._reshape_target(y_val)]
            return train_set, val_set

        return train_set

    def get_pred_base(self, df):
        """Return an (id, sales=0) frame, sorted by id, for every row after the warm-up window."""
        df = df.copy()
        # Rows with d <= thr_base only provide history for the first window.
        thr_base = df.d.min() + GROUP_SIZE - 2
        df_repd_base = df.query("@thr_base < d")[["id"]].sort_values(by="id")
        df_repd_base["sales"] = 0
        return df_repd_base.reset_index(drop=True)

    def predict_future(self, model, df, horizon=28):
        """
        Predict ``horizon`` days recursively, one day per iteration.

        Each iteration feeds a GROUP_SIZE-day window to the model, writes the
        prediction of the window's last day into ``sales`` and recomputes the
        rolling features so that the next window sees the new value.

        Parameters
        ----------
        model   : fitted Keras model.
        df      : frame with the warm-up days followed by the days to predict.
        horizon : number of days to predict (default 28, the original constant).

        Returns
        -------
        1-D array with the ``sales`` of all rows after the warm-up window.
        """
        df = df.copy()
        thr_base = df.d.min() + GROUP_SIZE - 2
        df_base  = df[df.d<=thr_base].copy()
        for i in tqdm(range(horizon)):
            d_range_min = df.d.min() + i
            d_range_max = d_range_min + GROUP_SIZE
            df_t = df[(d_range_min<=df.d)&(df.d<d_range_max)][self.features]
            res  = model.predict(self._reshape_features(df_t))
            # Keep only the prediction of the last time step of every window.
            df.loc[df.d==d_range_max-1, "sales"] = res[:,self.params["batch_size"]-1,0]
            df_pred = create_rolling_features(df, "sales", periods, windows)
            # Warm-up rows stay untouched; only the future part is refreshed.
            df = pd.concat([df_base, df_pred[thr_base<df_pred.d]], sort=False)
            df = reset_and_sort(df)
        return np.array(df[thr_base<df.d].sales)

In [None]:
def standardize(train_data, valid_data, test_data, features, skip_words=["sales","price"]):
    """
    Z-score the given feature columns using statistics of the training set.

    The mean and population standard deviation (ddof=0) are computed on
    ``train_data`` and applied to all three frames, which are modified in
    place and then passed through ``reduce_mem_usage``.

    Parameters
    ----------
    train_data, valid_data, test_data : DataFrames sharing the feature columns.
    features   : column names to consider.
    skip_words : a feature is left untouched when its name contains any of
                 these substrings (or when it is missing from ``train_data``).

    Returns
    -------
    (train_data, valid_data, test_data) after scaling and memory reduction.

    Notes
    -----
    A zero-variance column is only centred (scale falls back to 1), which
    avoids the division by zero that used to produce inf/NaN values.
    """
    print("-------------------")
    print("Start standardizing")
    skipped = []
    for col in features:
        # any() also copes with an empty skip_words, where max([]) used to raise.
        if col not in train_data.columns or any(word in col for word in skip_words):
            skipped.append(col)
            continue
        # Statistics are computed in float64 regardless of the stored dtype.
        train_values = train_data[col].to_numpy(dtype="float64")
        train_mean = np.mean(train_values)
        train_std  = np.std (train_values)
        print("The %s mean is %s, and std is %s" % (col, train_mean, train_std))
        scale = train_std if train_std != 0 else 1.0
        train_data[col] = (train_data[col].astype("float64") - train_mean) / scale
        valid_data[col] = (valid_data[col].astype("float64") - train_mean) / scale
        test_data [col] = (test_data [col].astype("float64") - train_mean) / scale

    if 0 < len(skipped):
        print("These features were skipped")
        print(skipped)
    train_data = reduce_mem_usage(train_data)
    valid_data = reduce_mem_usage(valid_data)
    test_data  = reduce_mem_usage(test_data)
    print("-------------------")
    return train_data, valid_data, test_data

# Create WaveNet

In [None]:
# Training configuration handed to the WaveNet wrapper below
target = "sales"            # column the model is fitted on
fold = "GroupKFold"         # cross-validation method
group = "group"             # column used to form the CV groups
task_wn = "nn_regression"   # task type of the WaveNet model
k = 2                       # number of CV splits

In [None]:
%%time
seed_everything()
# Columns removed from the frames before training
drop_cols    = ["state_id","wm_yr_wk","item_nunique",
                "price_max","price_min","price_norm","release"]
# Columns that stay in the frames but are not model inputs
not_use_cols = ["id","d","item_id","store_id","sales","year","group"]

dict_results = {}
for state in TARGET_STATES:
    print("=============================")
    print(state)
    # Work on copies so the per-state source entry can be released below
    train_data  = dict_df[state]["train_data"].copy()
    valid_data  = dict_df[state]["valid_data"].copy()
    test_data   = dict_df[state]["test_data"].copy()
    mean_std    = dict_df[state]["mean_std"].copy()
    valid_price = dict_df[state]["valid_price"].copy()
    test_price  = dict_df[state]["test_price"].copy()

    # Define the features that will be used for training
    excluded_cols = not_use_cols + drop_cols
    features = [col for col in train_data.columns if col not in excluded_cols]
    print("The number of training features is %s," % len(features))
    show_cols(features)

    train_data = train_data.drop(columns=drop_cols)
    valid_data = valid_data.drop(columns=drop_cols)
    test_data  = test_data.drop(columns=drop_cols)
    # Free the source frames of this state; the copies above are all we need
    del dict_df[state]
    gc.collect()

    train_data, valid_data, test_data = standardize(train_data, valid_data, test_data, features)
    print(train_data.shape, valid_data.shape, test_data.shape)
    display(train_data.head(1))
    display(valid_data.head(1))
    display(test_data.head(1))

    # Start training (no separate validation frame is passed to the model)
    wn = WaveNet(train_data, test_data,
                 target, features,
                 valid_df=None, task=task_wn,
                 cv_method=fold, n_splits=k, group=group,
                 verbose=False)

    # Keep predictions together with what is needed to undo the scaling later
    dict_results[state] = dict(valid_result = wn.y_valid,
                               test_result  = wn.y_pred,
                               mean_std     = mean_std,
                               valid_price  = valid_price,
                               test_price   = test_price)
    del wn, train_data, valid_data, test_data, mean_std, valid_price, test_price
    gc.collect()

In [None]:
def _reverse_standardize(df_result, price, mean_std):
    """
    Map standardized sales predictions back to the original scale.

    The steps are: divide ``sales`` by ``price``, join the per-id statistics
    from ``mean_std`` on "id", apply ``sales * id_sales_std + id_sales_mean``,
    take the absolute value and drop the two helper columns again.
    The input frame is not modified.
    """
    df_out = df_result.copy()
    df_out["sales"] = df_out["sales"] / price
    df_out = df_out.merge(mean_std, on="id")
    df_out["sales"] = (df_out["sales"] * df_out["id_sales_std"] + df_out["id_sales_mean"]).abs()
    return df_out.drop(columns=["id_sales_mean", "id_sales_std"])


# Collect the per-state frames first and concatenate once: DataFrame.append
# is deprecated (removed in pandas 2.0) and copies all data on every call.
valid_frames = []
test_frames  = []
for state in TARGET_STATES:
    state_result = dict_results[state]

    if state_result["valid_result"] is not None:
        valid_frames.append(_reverse_standardize(state_result["valid_result"],
                                                 state_result["valid_price"],
                                                 state_result["mean_std"]))

    if state_result["test_result"] is not None:
        test_frames.append(_reverse_standardize(state_result["test_result"],
                                                state_result["test_price"],
                                                state_result["mean_std"]))

# An empty frame is kept when a state set produced no predictions, because
# the following cells test ``shape[0]`` to decide whether results exist.
df_valid_result = pd.concat(valid_frames, ignore_index=True) if valid_frames else pd.DataFrame()
df_test_result  = pd.concat(test_frames,  ignore_index=True) if test_frames  else pd.DataFrame()

In [None]:
# Pivot the long validation predictions into one row per id (28 day columns)
if df_valid_result.shape[0] > 0:
    ids = df_valid_result["id"].unique()
    valid_day_cols = ["d_%d" % day for day in range(1914, 1914 + 28)]
    # Relies on the rows being ordered as 28 consecutive days per id
    df_valid_result_wide = pd.DataFrame(np.array(df_valid_result["sales"]).reshape(-1, 28),
                                        columns=valid_day_cols)
    df_valid_result_wide.insert(0, "id", ids)
    # Re-order the rows like the sample submission
    df_valid_result_wide = sample_submission[["id"]].merge(df_valid_result_wide, on="id")

In [None]:
# Score the validation predictions and report the per-group and overall results
if df_valid_result.shape[0] > 0:
    groups, scores    = evaluator.score(df_valid_result_wide.drop(columns="id"))
    score_public_lb   = np.mean(scores)
    score_public_rank = get_lb_rank(score_public_lb)
    for i in range(len(groups)):
        group_line = (groups[i], round(scores[i], 5))
        print("Score for group %s: %s" % group_line)
    print("Public LB Score: %s" % round(score_public_lb, 5))
    print("Public LB Rank : %s" % score_public_rank)

In [None]:
# Pivot the long test predictions into one row per id (28 day columns)
if df_test_result.shape[0] > 0:
    ids = df_test_result["id"].unique()
    test_day_cols = ["d_%d" % day for day in range(1942, 1942 + 28)]
    # Relies on the rows being ordered as 28 consecutive days per id
    df_test_result_wide = pd.DataFrame(np.array(df_test_result["sales"]).reshape(-1, 28),
                                       columns=test_day_cols)
    df_test_result_wide.insert(0, "id", ids)
    # Re-order the rows like the sample submission
    df_test_result_wide = sample_submission[["id"]].merge(df_test_result_wide, on="id")

In [None]:
# Bring both result frames into the submission layout: id, F1 ... F28.
# When a part has no predictions, the matching sample-submission rows are used.
if df_valid_result.shape[0] > 0:
    df_valid_result_wide.columns = ["id"] + ["F%d" % n for n in range(1, 29)]
    df_valid_result_wide["id"]   = df_valid_result_wide["id"].map(
        lambda row_id: row_id.replace("evaluation", "validation"))
else:
    is_validation_row    = sample_submission.id.str.contains("validation")
    df_valid_result_wide = sample_submission[is_validation_row]

if df_test_result.shape[0] > 0:
    df_test_result_wide.columns = ["id"] + ["F%d" % n for n in range(1, 29)]
else:
    is_evaluation_row   = sample_submission.id.str.contains("evaluation")
    df_test_result_wide = sample_submission[is_evaluation_row]

In [None]:
# Stack the validation rows on top of the evaluation rows with a fresh index
submission = pd.concat([df_valid_result_wide, df_test_result_wide],
                       sort=False, ignore_index=True)

print(submission.shape)
display(submission.head())
display(submission.tail())

In [None]:
# Write the submission frame to ./submission.csv without the row index
submission.to_csv("./submission.csv", index=False)