In [1]:
import pandas as pd
import numpy as np
import ipynb.fs.full.utils as utils
import importlib
importlib.reload(utils)
from keras.models import Sequential, Model, load_model
from keras.layers.core import Dense, Dropout, Reshape, Activation, Lambda, Flatten
from keras.layers import TimeDistributed, BatchNormalization, Input, Conv1D
from keras.layers.merge import dot
from keras.optimizers import Adam
from keras.callbacks import TensorBoard
from tqdm import tqdm_notebook
import keras
import os
import matplotlib.pyplot as pltё
import seaborn as sns
import lightgbm as lgb
from sklearn import preprocessing, metrics
import pyspark.sql.functions as SF
import pyspark.sql.types as ST

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
%matplotlib inline

In [3]:
utils.setup_context()

In [35]:
calendar = pd.read_csv("calendar.csv")
calendar.head(1)

Unnamed: 0,date,wm_yr_wk,weekday,wday,month,year,d,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,2011-01-29,11101,Saturday,1,1,2011,d_1,,,,,0,0,0


In [36]:
def transform_event(event):
    if event is None:
        return 0
    return 1

calendar = calendar.drop(columns=["event_name_1", "event_name_2"])
calendar["event_type_1"] = calendar["event_type_1"].apply(transform_event)
calendar["event_type_2"] = calendar["event_type_2"].apply(transform_event)

In [37]:
prices = pd.read_csv("sell_prices.csv")
prices.head(1)

Unnamed: 0,store_id,item_id,wm_yr_wk,sell_price
0,CA_1,HOBBIES_1_001,11325,9.58


In [38]:
sales = pd.read_csv("sales_train_validation.csv")
sales.head(1)

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,d_1904,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,3,0,1,1,1,3,0,1,1


In [39]:
prediction_columns = [f"d_{i}" for i in range(1914, 1914 + 28)]
for column in prediction_columns:
    sales[column] = None

In [40]:
sales = sales \
    .pipe(utils.encode_categorical, cols=["item_id", "dept_id", "cat_id"]) \
    .pipe(utils.reduce_mem_usage)

prices = prices \
    .pipe(utils.encode_categorical, cols=["item_id"]) \
    .pipe(utils.reduce_mem_usage)

calendar = calendar.pipe(utils.reduce_mem_usage)

Mem. usage decreased to 230.06 Mb (49.2% reduction)
Mem. usage decreased to 130.48 Mb (37.5% reduction)
Mem. usage decreased to  0.11 Mb (37.5% reduction)


In [41]:
sales = pd.melt(sales, id_vars=["id", "item_id", "dept_id", "cat_id", "store_id", "state_id"], var_name="day", value_name="demand")

In [42]:
sales.dtypes

id          object
item_id      int32
dept_id      int32
cat_id       int32
store_id    object
state_id    object
day         object
demand      object
dtype: object

In [43]:
sales.head(1)

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,day,demand
0,HOBBIES_1_001_CA_1_validation,1437,3,1,CA_1,CA,d_1,0


In [44]:
sales.shape

(59181090, 8)

In [45]:
calendar.drop(columns=['weekday', 'wday', 'month', 'year'], inplace = True)

In [46]:
sales = sales.merge(calendar, left_on="day", right_on="d", how="left")

In [47]:
sales = sales.merge(prices, on=["store_id", "item_id", "wm_yr_wk"], how="left")

In [48]:
sales.drop(columns=["d"], inplace=True)

In [49]:
sales = sales[~sales.sell_price.isna()].reset_index(drop=True)

In [50]:
sales["day"] = sales["day"].apply(lambda s: s[2:]).apply(int)

In [51]:
sales.tail(2)

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,day,demand,date,wm_yr_wk,event_type_1,event_type_2,snap_CA,snap_TX,snap_WI,sell_price
46881675,FOODS_3_826_WI_3_validation,1435,2,0,WI_3,WI,1941,,2016-05-22,11617,1,1,0,0,0,1.28
46881676,FOODS_3_827_WI_3_validation,1436,2,0,WI_3,WI,1941,,2016-05-22,11617,1,1,0,0,0,1.0


In [52]:
sales.to_parquet("/mnt/m5_train_data.parquet", index=False)

In [4]:
from pyspark.sql.window import Window
from itertools import product
import os

window = Window()

shift = 0

def get_last_k_values(df, columns, k, group_columns = ["item_id"], agregate = False, agg_function = SF.sum):
    window = Window().partitionBy(group_columns).orderBy("day")
    
    agg_addition_name = ""
    
    agg_df = df
    if agregate:
        agg_addition_name = f"agg_{agg_function.__name__}_"
        agg_df = agg_df \
            .groupby(group_columns + ["day"]) \
            .agg(*[agg_function(column).alias(column) for column in columns])
    
    agg_df = agg_df \
        .select(*(group_columns + ["day"] + 
                  [SF.lag(column, i + shift) \
                       .over(window) \
                       .alias(f"previous_{column}_by_{'&'.join(group_columns)}_at_{i + shift}{agg_addition_name}")
                   for column, i in product(columns, range(1, k))
        ]))
    
    df = df \
        .join(agg_df, on=group_columns + ["day"], how="left")
    
    return df


def group_stats(df, group_columns, columns_for_stats, k_arr, functions = [SF.mean, SF.min, SF.max, SF.stddev]):
    if type(group_columns) is str:
        group_columns = [group_columns]
        
    if type(k_arr) is int:
        k_arr = [k_arr]
            
    if len(group_columns) == 2 and group_columns[0] == "item_id" and group_columns[1] == "store_id":
        agg_df = df
    else:
        agg_df = df \
            .groupby(group_columns + ["day"]) \
            .agg(*[SF.sum(column).alias(column) for column in columns_for_stats])
    
    select_columns = group_columns + columns_for_stats + ["day"]
    for k in k_arr:
        window = Window().partitionBy(group_columns).orderBy("day").rowsBetween(-k - shift, -shift)
        agg_df = agg_df \
            .select(*(select_columns + [
                func(column) \
                    .over(window) \
                    .alias(f"{func.__name__}_by_{''.join(group_columns)}_on_{column}_at_{k}")
                for func, column in product(functions, columns_for_stats)
        ]))
        select_columns = agg_df.columns
    
    agg_df = agg_df.drop(*columns_for_stats)
    return df \
            .join(agg_df, on=["day"] + group_columns, how="left")


def get_date_features(df):
    return df \
        .withColumn("day_of_month", SF.dayofmonth("date")) \
        .withColumn("day_of_week", SF.dayofweek("date")) \
        .withColumn("dat_of_year", SF.dayofyear("date")) \
        .withColumn("month", SF.month("date")) \
        .withColumn("year", SF.year("date")) \
        .withColumn("quarter", SF.quarter("date")) \
        .withColumn("week", SF.weekofyear("date"))
    

def get_features(df, cache_prefix = ""):
    cache_prefix = "/mnt/m5_all_train_data/" + cache_prefix
    initial_columns = df.columns
    columns_for_stats = ["sell_price", "demand"]
    
    group_columns = [["item_id", "store_id"], "dept_id", "cat_id", "store_id", "state_id", ["dept_id", "store_id"], ["cat_id", "store_id"]]
    stat_days = [7, 14, 30, 60, 90]
    
    cache_path = f'{cache_prefix}shift={shift}_initial.parquet'
    
    if utils.check_path_exists(cache_path, is_spark_dir=True):
        df = utils.read_parquet(cache_path).cache()
    else:    
        df = df.withColumn("date", SF.to_timestamp("date"))
        df.cache()
        print(df.count())

        df = get_last_k_values(df, columns_for_stats, k=10, group_columns=["item_id", "store_id"])
        print(df.count())
        df = get_date_features(df)
        print(f"save cache to {cache_path} with columns={len(df.columns)}")
        utils.write_parquet(df, cache_path)
        df = utils.read_parquet(cache_path).cache()
        print(df.count())
    
    print(df.columns)
    for i, column in enumerate(group_columns):
        cache_path = f'{cache_prefix}shift={shift}_i={i}.parquet'
        if utils.check_path_exists(cache_path, is_spark_dir=True):
            df = utils.read_parquet(cache_path)
            continue
            
        if type(column) is str:
            grp_col = [column]
        else:
            grp_col = column
          
        df = group_stats(df, grp_col, columns_for_stats, k_arr=stat_days)
        print(df.columns)
        print(f"save cache to {cache_path} with columns={len(df.columns)}")
        utils.write_parquet(df, cache_path)
        df = utils.read_parquet(cache_path).cache()
        print(df.count())
        
    print(f"generate {len(df.columns) - len(initial_columns)} features")
    return df
        

In [4]:
from sklearn.metrics import mean_squared_error

stores = ['CA_1', 'CA_2', 'CA_3', 'CA_4', 'TX_1', 'TX_2', 'TX_3', 'WI_1', 'WI_2', 'WI_3']
train_data_dir = "/mnt/m5_all_train_data"
os.makedirs(train_data_dir, exist_ok=True)

meta_columns = ["store_id", "cat_id", "dept_id", "item_id", "id", "state_id", "date"]
cat_features = ["cat_id", "dept_id", "store_id"]
non_feature_columns = ["id", "date", "item_id", "day", "state_id"]
feature_meta_columns = ["sell_price", "wm_yr_wk", "event_type_1", "event_type_2", "snap_CA", "snap_TX", "snap_WI"]

In [6]:
inference_data = utils.read_parquet("/mnt/m5_train_data.parquet")

In [16]:
for_eval = inference_data.where(SF.col("day") >= 1914).drop("demand")
prediction = utils.read_csv("melt_mlp_predictions.csv")
for_eval = for_eval.join(prediction, on=["id", "day"], how="left")

inference_data = inference_data \
    .where(SF.col("day") > 1120 - 560) \
    .where(SF.col("day") <= 1913)

inference_data = utils.concatenate([inference_data, for_eval])

features = get_features(
    inference_data,
    cache_prefix="ex1_"
)
utils.write_parquet(features, f"{train_data_dir}/ex1_train_data.parquet")

save cache to /mnt/m5_all_train_data/ex1_shift=0_initial.parquet with columns=41
36732524
['item_id', 'store_id', 'day', 'id', 'dept_id', 'cat_id', 'state_id', 'demand', 'date', 'wm_yr_wk', 'event_type_1', 'event_type_2', 'snap_CA', 'snap_TX', 'snap_WI', 'sell_price', 'previous_sell_price_by_item_id&store_id_at_1', 'previous_sell_price_by_item_id&store_id_at_2', 'previous_sell_price_by_item_id&store_id_at_3', 'previous_sell_price_by_item_id&store_id_at_4', 'previous_sell_price_by_item_id&store_id_at_5', 'previous_sell_price_by_item_id&store_id_at_6', 'previous_sell_price_by_item_id&store_id_at_7', 'previous_sell_price_by_item_id&store_id_at_8', 'previous_sell_price_by_item_id&store_id_at_9', 'previous_demand_by_item_id&store_id_at_1', 'previous_demand_by_item_id&store_id_at_2', 'previous_demand_by_item_id&store_id_at_3', 'previous_demand_by_item_id&store_id_at_4', 'previous_demand_by_item_id&store_id_at_5', 'previous_demand_by_item_id&store_id_at_6', 'previous_demand_by_item_id&store_i

['day', 'cat_id', 'dept_id', 'item_id', 'store_id', 'id', 'state_id', 'demand', 'date', 'wm_yr_wk', 'event_type_1', 'event_type_2', 'snap_CA', 'snap_TX', 'snap_WI', 'sell_price', 'previous_sell_price_by_item_id&store_id_at_1', 'previous_sell_price_by_item_id&store_id_at_2', 'previous_sell_price_by_item_id&store_id_at_3', 'previous_sell_price_by_item_id&store_id_at_4', 'previous_sell_price_by_item_id&store_id_at_5', 'previous_sell_price_by_item_id&store_id_at_6', 'previous_sell_price_by_item_id&store_id_at_7', 'previous_sell_price_by_item_id&store_id_at_8', 'previous_sell_price_by_item_id&store_id_at_9', 'previous_demand_by_item_id&store_id_at_1', 'previous_demand_by_item_id&store_id_at_2', 'previous_demand_by_item_id&store_id_at_3', 'previous_demand_by_item_id&store_id_at_4', 'previous_demand_by_item_id&store_id_at_5', 'previous_demand_by_item_id&store_id_at_6', 'previous_demand_by_item_id&store_id_at_7', 'previous_demand_by_item_id&store_id_at_8', 'previous_demand_by_item_id&store_id_

['day', 'state_id', 'store_id', 'cat_id', 'dept_id', 'item_id', 'id', 'demand', 'date', 'wm_yr_wk', 'event_type_1', 'event_type_2', 'snap_CA', 'snap_TX', 'snap_WI', 'sell_price', 'previous_sell_price_by_item_id&store_id_at_1', 'previous_sell_price_by_item_id&store_id_at_2', 'previous_sell_price_by_item_id&store_id_at_3', 'previous_sell_price_by_item_id&store_id_at_4', 'previous_sell_price_by_item_id&store_id_at_5', 'previous_sell_price_by_item_id&store_id_at_6', 'previous_sell_price_by_item_id&store_id_at_7', 'previous_sell_price_by_item_id&store_id_at_8', 'previous_sell_price_by_item_id&store_id_at_9', 'previous_demand_by_item_id&store_id_at_1', 'previous_demand_by_item_id&store_id_at_2', 'previous_demand_by_item_id&store_id_at_3', 'previous_demand_by_item_id&store_id_at_4', 'previous_demand_by_item_id&store_id_at_5', 'previous_demand_by_item_id&store_id_at_6', 'previous_demand_by_item_id&store_id_at_7', 'previous_demand_by_item_id&store_id_at_8', 'previous_demand_by_item_id&store_id_

['day', 'dept_id', 'store_id', 'state_id', 'cat_id', 'item_id', 'id', 'demand', 'date', 'wm_yr_wk', 'event_type_1', 'event_type_2', 'snap_CA', 'snap_TX', 'snap_WI', 'sell_price', 'previous_sell_price_by_item_id&store_id_at_1', 'previous_sell_price_by_item_id&store_id_at_2', 'previous_sell_price_by_item_id&store_id_at_3', 'previous_sell_price_by_item_id&store_id_at_4', 'previous_sell_price_by_item_id&store_id_at_5', 'previous_sell_price_by_item_id&store_id_at_6', 'previous_sell_price_by_item_id&store_id_at_7', 'previous_sell_price_by_item_id&store_id_at_8', 'previous_sell_price_by_item_id&store_id_at_9', 'previous_demand_by_item_id&store_id_at_1', 'previous_demand_by_item_id&store_id_at_2', 'previous_demand_by_item_id&store_id_at_3', 'previous_demand_by_item_id&store_id_at_4', 'previous_demand_by_item_id&store_id_at_5', 'previous_demand_by_item_id&store_id_at_6', 'previous_demand_by_item_id&store_id_at_7', 'previous_demand_by_item_id&store_id_at_8', 'previous_demand_by_item_id&store_id_

['day', 'cat_id', 'store_id', 'dept_id', 'state_id', 'item_id', 'id', 'demand', 'date', 'wm_yr_wk', 'event_type_1', 'event_type_2', 'snap_CA', 'snap_TX', 'snap_WI', 'sell_price', 'previous_sell_price_by_item_id&store_id_at_1', 'previous_sell_price_by_item_id&store_id_at_2', 'previous_sell_price_by_item_id&store_id_at_3', 'previous_sell_price_by_item_id&store_id_at_4', 'previous_sell_price_by_item_id&store_id_at_5', 'previous_sell_price_by_item_id&store_id_at_6', 'previous_sell_price_by_item_id&store_id_at_7', 'previous_sell_price_by_item_id&store_id_at_8', 'previous_sell_price_by_item_id&store_id_at_9', 'previous_demand_by_item_id&store_id_at_1', 'previous_demand_by_item_id&store_id_at_2', 'previous_demand_by_item_id&store_id_at_3', 'previous_demand_by_item_id&store_id_at_4', 'previous_demand_by_item_id&store_id_at_5', 'previous_demand_by_item_id&store_id_at_6', 'previous_demand_by_item_id&store_id_at_7', 'previous_demand_by_item_id&store_id_at_8', 'previous_demand_by_item_id&store_id_

generate 305 features


In [7]:
inference_data = inference_data \
    .where(SF.col("day") > 1913 - 100) \

for_eval = inference_data.where(SF.col("day") >= 1914).drop("demand")
prediction = utils.read_csv("melt_mlp_predictions.csv")
for_eval = for_eval.join(prediction, on=["id", "day"], how="left")

inference_data = inference_data.where(SF.col("day") <= 1913)
inference_data = utils.concatenate([inference_data, for_eval])

In [8]:
features = get_features(
    inference_data,
    cache_prefix="ex1_inference_"
).where(SF.col("day") >= 1914)
utils.write_parquet(features, f"{train_data_dir}/ex1_inference_data.parquet")

3902440
3902440
save cache to /mnt/m5_all_train_data/ex1_inference_shift=0_initial.parquet with columns=41
3902440
['item_id', 'store_id', 'day', 'id', 'dept_id', 'cat_id', 'state_id', 'demand', 'date', 'wm_yr_wk', 'event_type_1', 'event_type_2', 'snap_CA', 'snap_TX', 'snap_WI', 'sell_price', 'previous_sell_price_by_item_id&store_id_at_1', 'previous_sell_price_by_item_id&store_id_at_2', 'previous_sell_price_by_item_id&store_id_at_3', 'previous_sell_price_by_item_id&store_id_at_4', 'previous_sell_price_by_item_id&store_id_at_5', 'previous_sell_price_by_item_id&store_id_at_6', 'previous_sell_price_by_item_id&store_id_at_7', 'previous_sell_price_by_item_id&store_id_at_8', 'previous_sell_price_by_item_id&store_id_at_9', 'previous_demand_by_item_id&store_id_at_1', 'previous_demand_by_item_id&store_id_at_2', 'previous_demand_by_item_id&store_id_at_3', 'previous_demand_by_item_id&store_id_at_4', 'previous_demand_by_item_id&store_id_at_5', 'previous_demand_by_item_id&store_id_at_6', 'previous_

3902440
['day', 'cat_id', 'dept_id', 'item_id', 'store_id', 'id', 'state_id', 'demand', 'date', 'wm_yr_wk', 'event_type_1', 'event_type_2', 'snap_CA', 'snap_TX', 'snap_WI', 'sell_price', 'previous_sell_price_by_item_id&store_id_at_1', 'previous_sell_price_by_item_id&store_id_at_2', 'previous_sell_price_by_item_id&store_id_at_3', 'previous_sell_price_by_item_id&store_id_at_4', 'previous_sell_price_by_item_id&store_id_at_5', 'previous_sell_price_by_item_id&store_id_at_6', 'previous_sell_price_by_item_id&store_id_at_7', 'previous_sell_price_by_item_id&store_id_at_8', 'previous_sell_price_by_item_id&store_id_at_9', 'previous_demand_by_item_id&store_id_at_1', 'previous_demand_by_item_id&store_id_at_2', 'previous_demand_by_item_id&store_id_at_3', 'previous_demand_by_item_id&store_id_at_4', 'previous_demand_by_item_id&store_id_at_5', 'previous_demand_by_item_id&store_id_at_6', 'previous_demand_by_item_id&store_id_at_7', 'previous_demand_by_item_id&store_id_at_8', 'previous_demand_by_item_id&s

3902440
['day', 'state_id', 'store_id', 'cat_id', 'dept_id', 'item_id', 'id', 'demand', 'date', 'wm_yr_wk', 'event_type_1', 'event_type_2', 'snap_CA', 'snap_TX', 'snap_WI', 'sell_price', 'previous_sell_price_by_item_id&store_id_at_1', 'previous_sell_price_by_item_id&store_id_at_2', 'previous_sell_price_by_item_id&store_id_at_3', 'previous_sell_price_by_item_id&store_id_at_4', 'previous_sell_price_by_item_id&store_id_at_5', 'previous_sell_price_by_item_id&store_id_at_6', 'previous_sell_price_by_item_id&store_id_at_7', 'previous_sell_price_by_item_id&store_id_at_8', 'previous_sell_price_by_item_id&store_id_at_9', 'previous_demand_by_item_id&store_id_at_1', 'previous_demand_by_item_id&store_id_at_2', 'previous_demand_by_item_id&store_id_at_3', 'previous_demand_by_item_id&store_id_at_4', 'previous_demand_by_item_id&store_id_at_5', 'previous_demand_by_item_id&store_id_at_6', 'previous_demand_by_item_id&store_id_at_7', 'previous_demand_by_item_id&store_id_at_8', 'previous_demand_by_item_id&s

3902440
['day', 'dept_id', 'store_id', 'state_id', 'cat_id', 'item_id', 'id', 'demand', 'date', 'wm_yr_wk', 'event_type_1', 'event_type_2', 'snap_CA', 'snap_TX', 'snap_WI', 'sell_price', 'previous_sell_price_by_item_id&store_id_at_1', 'previous_sell_price_by_item_id&store_id_at_2', 'previous_sell_price_by_item_id&store_id_at_3', 'previous_sell_price_by_item_id&store_id_at_4', 'previous_sell_price_by_item_id&store_id_at_5', 'previous_sell_price_by_item_id&store_id_at_6', 'previous_sell_price_by_item_id&store_id_at_7', 'previous_sell_price_by_item_id&store_id_at_8', 'previous_sell_price_by_item_id&store_id_at_9', 'previous_demand_by_item_id&store_id_at_1', 'previous_demand_by_item_id&store_id_at_2', 'previous_demand_by_item_id&store_id_at_3', 'previous_demand_by_item_id&store_id_at_4', 'previous_demand_by_item_id&store_id_at_5', 'previous_demand_by_item_id&store_id_at_6', 'previous_demand_by_item_id&store_id_at_7', 'previous_demand_by_item_id&store_id_at_8', 'previous_demand_by_item_id&s

3902440
['day', 'cat_id', 'store_id', 'dept_id', 'state_id', 'item_id', 'id', 'demand', 'date', 'wm_yr_wk', 'event_type_1', 'event_type_2', 'snap_CA', 'snap_TX', 'snap_WI', 'sell_price', 'previous_sell_price_by_item_id&store_id_at_1', 'previous_sell_price_by_item_id&store_id_at_2', 'previous_sell_price_by_item_id&store_id_at_3', 'previous_sell_price_by_item_id&store_id_at_4', 'previous_sell_price_by_item_id&store_id_at_5', 'previous_sell_price_by_item_id&store_id_at_6', 'previous_sell_price_by_item_id&store_id_at_7', 'previous_sell_price_by_item_id&store_id_at_8', 'previous_sell_price_by_item_id&store_id_at_9', 'previous_demand_by_item_id&store_id_at_1', 'previous_demand_by_item_id&store_id_at_2', 'previous_demand_by_item_id&store_id_at_3', 'previous_demand_by_item_id&store_id_at_4', 'previous_demand_by_item_id&store_id_at_5', 'previous_demand_by_item_id&store_id_at_6', 'previous_demand_by_item_id&store_id_at_7', 'previous_demand_by_item_id&store_id_at_8', 'previous_demand_by_item_id&s

3902440
generate 305 features


In [10]:
train_features = utils \
    .read_parquet(f"{train_data_dir}/ex1_train_data.parquet") \
    .where(SF.col("day") <= 1913)

inference_features = utils \
    .read_parquet(f"{train_data_dir}/ex1_inference_data.parquet")

features = utils.concatenate([inference_features, train_features])
schema = features.schema

cast_columns = []
for field in schema.fields:
    if type(field.dataType) is ST.DoubleType:
        cast_columns.append(field.name)
        
for column in cast_columns:
    features = features \
        .withColumn(column, SF.col(column).cast(ST.FloatType()))
    
utils.write_parquet(features, f"{train_data_dir}/ex1_all_train_data_repartition.parquet", single=True)

In [14]:
params = {
        'boosting_type': 'gbdt',
        'objective': 'tweedie',
        'tweedie_variance_power': 1.1,
        'metric': 'rmse',
        'subsample': 0.5,
        'subsample_freq': 1,
        'learning_rate': 0.03,
        'num_leaves': 2**11-1,
        'min_data_in_leaf': 2**12-1,
        'feature_fraction': 0.5,
        'max_bin': 100,
        'boost_from_average': False,
}

In [15]:
# define custom loss function
def custom_asymmetric_train(y_pred, y_true):
    y_true = y_true.get_label()
    residual = (y_true - y_pred).astype("float")
    grad = np.where(residual < 0, -2 * residual, -2 * residual * 1.15)
    hess = np.where(residual < 0, 2, 2 * 1.15)
    return grad, hess

# define custom evaluation metric
def custom_asymmetric_valid(y_pred, y_true):
    y_true = y_true.get_label()
    residual = (y_true - y_pred).astype("float")
    loss = np.where(residual < 0, (residual ** 2) , (residual ** 2) * 1.15) 
    return "custom_asymmetric_eval", np.mean(loss), False


In [9]:
from sklearn.metrics import mean_squared_error

train_datas = []
evaluate_datas = []

for store_id in stores:
    print(f"processing {store_id}")
    features = utils.read_parquet(f"{train_data_dir}/{store_id}_data.parquet")
    train_datas.append(features \
        .where(SF.col("day") <= 1913) \
        .where(SF.col("day") > 1120))

    evaluate_datas.append(features \
        .where(SF.col("day") > 1913) \
        .where(SF.col("day") <= 1913 + 28))
    
utils.write_parquet(utils.concatenate(train_datas), "/mnt/m5_all_train_data/all_train_features.parquet", single=True)
utils.write_parquet(utils.concatenate(evaluate_datas), "/mnt/m5_all_train_data/all_eval_features.parquet", single=True)

processing CA_1
processing CA_2
processing CA_3
processing CA_4
processing TX_1
processing TX_2
processing TX_3
processing WI_1
processing WI_2
processing WI_3


In [7]:
train_data = utils.read_pandas_parquet_from_spark(f"/mnt/m5_all_train_data/ex1_all_train_data_repartition.parquet/part-00000-2359a11b-2e74-4b5d-af61-adcf6edd274c-c000.snappy.parquet")

In [8]:
train_data.shape

(37586244, 321)

In [9]:
train_data["store_id"] = train_data["store_id"].apply(stores.index)

In [10]:
target_length = 28
evaluate_day = 1914
test_start_day = evaluate_day - target_length * 10
train_start_day = test_start_day - target_length * 16

evaluate_data = train_data[train_data.day == evaluate_day]

test_data = train_data[(train_data.day >= test_start_day) & (train_data.day < evaluate_day)]

train_data = train_data[(train_data.day < test_start_day) & (train_data.day > train_start_day)]

train_target = train_data[["id", "day", "demand"]]
train_data = train_data[train_data.day < test_start_day - target_length]

test_target = test_data[["id", "day", "demand"]]
test_data = test_data[test_data.day < evaluate_day - target_length]

In [11]:
def make_shift_target(df, target, k):
    df = df[["id", "day"]]
    target = target[["id", "day", "demand"]]
    target.day = target.day - (k - 1) # because initial shift=1
    df = df.merge(target, on=["id", "day"], how="left")
    
    print(f"have nans {df.demand.isna().sum()} in {df.shape[0]} samples({float(df.demand.isna().sum()) / df.shape[0]}%)")
    
#     df_shape = df.shape[0]
#     df = df.dropna(subset=["demand"])
#     if df_shape != df.shape[0]:
#         print(f"пиздец, должно быть {df_shape} а получилось {df.shape[0]}, иди чини еблан")
        
    return df["demand"].fillna(0)

In [12]:
# if without lags
drop_columns = []

for column in train_data.columns:
    if column.startswith("previous_demand"):
        drop_columns.append(column)
        
train_data.drop(columns=drop_columns, inplace=True)
evaluate_data.drop(columns=drop_columns, inplace=True)
test_data.drop(columns=drop_columns, inplace=True)

In [12]:
feature_names = list(train_data.columns)
for column in non_feature_columns + ['demand']:
    feature_names.remove(column)

In [16]:
for i in tqdm_notebook(range(2, 29)):
    train_set = lgb.Dataset(train_data[feature_names],
        categorical_feature=cat_features, feature_name=feature_names)

    valid_set = lgb.Dataset(test_data[feature_names],
    categorical_feature=cat_features, feature_name=feature_names)
    
    print("make shifting..")
    fold_train_target = make_shift_target(train_data, train_target, i)
    fold_test_target = make_shift_target(test_data, test_target, i)

    print("setting labels..")
    train_set.set_label(fold_train_target)
    valid_set.set_label(fold_test_target)

    print("start training..")
    model = lgb.train(
        params,
        train_set,
        valid_sets = [train_set, valid_set],
        verbose_eval = 50,
        num_boost_round = 2000,
        early_stopping_rounds = 50, 
        feval = custom_asymmetric_valid
    )
    utils.save_pickle(f"fold_{i}.pkl", model)
    
#     train_data[f"prediction_{i}"] = model.predict(train_data[feature_names])
#     test_data[f"prediction_{i}"] = model.predict(test_data[feature_names])
#     feature_names.append(f"prediction_{i}")


HBox(children=(IntProgress(value=0, max=27), HTML(value='')))

make shifting..
have nans 0 in 12277556 samples(0.0%)
have nans 0 in 7672008 samples(0.0%)
setting labels..
start training..




Training until validation scores don't improve for 50 rounds
[50]	training's rmse: 2.3123	training's custom_asymmetric_eval: 6.04329	valid_1's rmse: 2.26409	valid_1's custom_asymmetric_eval: 5.78968
[100]	training's rmse: 2.11117	training's custom_asymmetric_eval: 4.95997	valid_1's rmse: 2.07475	valid_1's custom_asymmetric_eval: 4.78909
[150]	training's rmse: 2.069	training's custom_asymmetric_eval: 4.74016	valid_1's rmse: 2.04283	valid_1's custom_asymmetric_eval: 4.62489
[200]	training's rmse: 2.04256	training's custom_asymmetric_eval: 4.61298	valid_1's rmse: 2.02783	valid_1's custom_asymmetric_eval: 4.55641
[250]	training's rmse: 2.02697	training's custom_asymmetric_eval: 4.54081	valid_1's rmse: 2.021	valid_1's custom_asymmetric_eval: 4.52718
[300]	training's rmse: 2.01595	training's custom_asymmetric_eval: 4.49054	valid_1's rmse: 2.01648	valid_1's custom_asymmetric_eval: 4.50864
[350]	training's rmse: 2.0067	training's custom_asymmetric_eval: 4.44902	valid_1's rmse: 2.01283	valid_1'

Early stopping, best iteration is:
[373]	training's rmse: 2.10983	training's custom_asymmetric_eval: 4.91583	valid_1's rmse: 2.1171	valid_1's custom_asymmetric_eval: 4.97669
make shifting..
have nans 0 in 12277556 samples(0.0%)
have nans 0 in 7672008 samples(0.0%)
setting labels..
start training..
Training until validation scores don't improve for 50 rounds
[50]	training's rmse: 2.41857	training's custom_asymmetric_eval: 6.60366	valid_1's rmse: 2.35357	valid_1's custom_asymmetric_eval: 6.25555
[100]	training's rmse: 2.24198	training's custom_asymmetric_eval: 5.5893	valid_1's rmse: 2.17795	valid_1's custom_asymmetric_eval: 5.28311
[150]	training's rmse: 2.20004	training's custom_asymmetric_eval: 5.35694	valid_1's rmse: 2.15071	valid_1's custom_asymmetric_eval: 5.13464
[200]	training's rmse: 2.17245	training's custom_asymmetric_eval: 5.21633	valid_1's rmse: 2.13905	valid_1's custom_asymmetric_eval: 5.07584
[250]	training's rmse: 2.15444	training's custom_asymmetric_eval: 5.12745	valid_1'

[400]	training's rmse: 2.17328	training's custom_asymmetric_eval: 5.21734	valid_1's rmse: 2.20909	valid_1's custom_asymmetric_eval: 5.40299
Early stopping, best iteration is:
[387]	training's rmse: 2.17595	training's custom_asymmetric_eval: 5.23014	valid_1's rmse: 2.20883	valid_1's custom_asymmetric_eval: 5.40097
make shifting..
have nans 0 in 12277556 samples(0.0%)
have nans 0 in 7672008 samples(0.0%)
setting labels..
start training..
Training until validation scores don't improve for 50 rounds
[50]	training's rmse: 2.49389	training's custom_asymmetric_eval: 7.01602	valid_1's rmse: 2.42831	valid_1's custom_asymmetric_eval: 6.65705
[100]	training's rmse: 2.32058	training's custom_asymmetric_eval: 5.98588	valid_1's rmse: 2.25691	valid_1's custom_asymmetric_eval: 5.66136
[150]	training's rmse: 2.27126	training's custom_asymmetric_eval: 5.70886	valid_1's rmse: 2.2358	valid_1's custom_asymmetric_eval: 5.49084
[200]	training's rmse: 2.23895	training's custom_asymmetric_eval: 5.54041	valid_1

[250]	training's rmse: 2.24836	training's custom_asymmetric_eval: 5.58449	valid_1's rmse: 2.26933	valid_1's custom_asymmetric_eval: 5.68823
[300]	training's rmse: 2.23135	training's custom_asymmetric_eval: 5.49984	valid_1's rmse: 2.26746	valid_1's custom_asymmetric_eval: 5.68002
[350]	training's rmse: 2.21479	training's custom_asymmetric_eval: 5.41847	valid_1's rmse: 2.26662	valid_1's custom_asymmetric_eval: 5.67777
Early stopping, best iteration is:
[328]	training's rmse: 2.2215	training's custom_asymmetric_eval: 5.45108	valid_1's rmse: 2.26638	valid_1's custom_asymmetric_eval: 5.67471
make shifting..
have nans 0 in 12277556 samples(0.0%)
have nans 0 in 7672008 samples(0.0%)
setting labels..
start training..
Training until validation scores don't improve for 50 rounds
[50]	training's rmse: 2.53307	training's custom_asymmetric_eval: 7.23429	valid_1's rmse: 2.46637	valid_1's custom_asymmetric_eval: 6.86193
[100]	training's rmse: 2.36094	training's custom_asymmetric_eval: 6.19335	valid_1

[250]	training's rmse: 2.27073	training's custom_asymmetric_eval: 5.69605	valid_1's rmse: 2.29194	valid_1's custom_asymmetric_eval: 5.7964
[300]	training's rmse: 2.25133	training's custom_asymmetric_eval: 5.59876	valid_1's rmse: 2.29045	valid_1's custom_asymmetric_eval: 5.78903
[350]	training's rmse: 2.23495	training's custom_asymmetric_eval: 5.51771	valid_1's rmse: 2.29095	valid_1's custom_asymmetric_eval: 5.79222
Early stopping, best iteration is:
[305]	training's rmse: 2.24943	training's custom_asymmetric_eval: 5.58949	valid_1's rmse: 2.29029	valid_1's custom_asymmetric_eval: 5.78822
make shifting..
have nans 0 in 12277556 samples(0.0%)
have nans 0 in 7672008 samples(0.0%)
setting labels..
start training..
Training until validation scores don't improve for 50 rounds
[50]	training's rmse: 2.56187	training's custom_asymmetric_eval: 7.3976	valid_1's rmse: 2.48248	valid_1's custom_asymmetric_eval: 6.94497
[100]	training's rmse: 2.39047	training's custom_asymmetric_eval: 6.34755	valid_1'

In [115]:
feature_names = feature_names[:-28]

In [17]:
predictions = []

for i in tqdm_notebook(range(1, 29)):
    model = utils.load_pickle(f"fold_{i}.pkl")
    prediction = evaluate_data[["id"]].reset_index(drop=True)
    prediction["day"] = 1913 + i
    prediction["prediction"] = model.predict(evaluate_data[feature_names])
#     evaluate_data[f"prediction_{i}"] = prediction["prediction"]
#     feature_names.append(f"prediction_{i}")
    predictions.append(prediction)

HBox(children=(IntProgress(value=0, max=28), HTML(value='')))

In [18]:
predictions = pd.concat(predictions)

In [21]:
make_submission(predictions)

HBox(children=(IntProgress(value=0, max=30490), HTML(value='')))

(60980, 29)


In [16]:
from sklearn.model_selection import GroupKFold
k=3
kf = GroupKFold(k)
target = train_data["demand"]

prediction = np.zeros(evaluate_data.shape[0])
vall = np.zeros(target.shape[0])

print(f"have for train {train_data.shape}")
groups = train_data["week"].apply(str) + "_" + train_data["year"].apply(str)
for fold_number, (train_idx, val_idx) in enumerate(kf.split(train_data, target, groups)):
    print(f"{fold_number} iteration")
    train_set = lgb.Dataset(
        train_data.iloc[train_idx].drop(columns=non_feature_columns + ['demand']),
        label=target.iloc[train_idx],
        categorical_feature=cat_features)

    valid_set = lgb.Dataset(
        train_data.iloc[val_idx].drop(columns=non_feature_columns + ['demand']),
        label=target.iloc[val_idx],
        categorical_feature=cat_features)

    model = lgb.train(
        params,
        train_set,
        valid_sets = [train_set, valid_set],
        verbose_eval = 50,
        num_boost_round = 2000,
        early_stopping_rounds = 50, 
        fobj = custom_asymmetric_train, 
        feval = custom_asymmetric_valid
    )
    utils.save_pickle(f"/mnt/m5_all_train_data/fold_{fold_number}.pkl", model)
    
    prediction += model.predict(evaluate_data.drop(columns=non_feature_columns + ['demand'])) / k
    vall[val_idx] = model.predict(train_data.iloc[val_idx].drop(columns=non_feature_columns + ['demand']))

have for train (21234917, 312)


KeyboardInterrupt: 

In [16]:
k=3
prediction = np.zeros(evaluate_data.shape[0])

for fold_number in range(k):
    print(f"{fold_number} iteration")
    
    model = utils.load_pickle(f"/mnt/m5_all_train_data/fold_{fold_number}.pkl")
    
    prediction += model.predict(evaluate_data.drop(columns=non_feature_columns + ['demand'])) / k

0 iteration
1 iteration
2 iteration


In [17]:
prediction.shape

(853720,)

In [18]:
prediction

array([0.75460327, 2.06362885, 1.5948935 , ..., 0.23694366, 0.23694366,
       0.32777138])

In [15]:
def feature_selection(model, data, target, metric=mean_squared_error):
    standart_score = metric(target, model.predict(data))
    column_score = {}
    
    for column in tqdm_notebook(data.columns):
        column_copy = data[column].copy() # for restore after
        
        data[column] = np.random.permutation(data[column].values)
        prediction = model.predict(data)
        score = metric(target, prediction)
        column_score[column] = standart_score - score
        
        data[column] = column_copy
    
    return column_score
    

In [10]:
train_data.shape

(23457073, 135)

In [11]:
model = utils.load_pickle("model_last_lqb.pickle")

In [None]:
sample_data = train_data.sample(n=1000000)

scores = feature_selection(model, sample_data.drop(columns=non_feature_columns + ['demand']), sample_data['demand'])

HBox(children=(IntProgress(value=0, max=129), HTML(value='')))

In [18]:
stat = list(scores.items())

In [21]:
sorted(stat, key = lambda x: x[1])

[('mean_by_item_id_on_demand_at_56', -8.362889550913088),
 ('mean_by_item_id_on_demand_at_28', -2.483399785295159),
 ('sell_price', -1.5392525650632765),
 ('mean_by_item_id_on_demand_at_14', -1.2828870109179906),
 ('stddev_by_item_id_on_demand_at_56', -0.7929109543216972),
 ('max_by_item_id_on_demand_at_14', -0.546092736574149),
 ('max_by_item_id_on_demand_at_56', -0.45393249096581023),
 ('day_of_week', -0.43255045789472835),
 ('mean_by_item_id_on_sell_price_at_56', -0.339888555306155),
 ('stddev_by_item_id_on_demand_at_28', -0.3257408588468418),
 ('stddev_by_item_id_on_demand_at_14', -0.32442980723089354),
 ('max_by_item_id_on_sell_price_at_56', -0.309442211217029),
 ('dat_of_year', -0.283672609639952),
 ('store_id', -0.26281563178926914),
 ('previous_demand_by_item_id_at_1', -0.23594022054598263),
 ('stddev_by_cat_id_on_demand_at_14', -0.23575187947201082),
 ('min_by_item_id_on_sell_price_at_14', -0.23110244734017282),
 ('max_by_item_id_on_demand_at_28', -0.2279290607896951),
 ('mean

In [20]:
def make_submission(data, predictions = None, name="submission.csv"):
    if predictions is not None:
        preds = data[["id", "day"]]
        preds["prediction"] = predictions
    else:
        preds = data[["id", "day", "prediction"]]

    submission = []

    for iid, group in tqdm_notebook(preds.groupby("id")):
        group = group[group.day > 1913].sort_values("day")
        assert group.shape[0] == 28
        submission.append([iid] + group.prediction.tolist())
        
    predict_columns = [f"F{i}" for i in range(1, 29)]
    submission = pd.DataFrame(submission, columns=["id"] + predict_columns)
    sample = pd.read_csv('sample_submission.csv')
    sample = sample[~sample.id.isin(submission.id)]

    result_submission = pd.concat([submission, sample])
    print(result_submission.shape)
    result_submission.to_csv(name, index=False)

In [22]:
make_submission(evaluate_data, prediction)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(IntProgress(value=0, max=30490), HTML(value='')))


(60980, 29)


In [30]:
prediction = model.predict(evaluate_data.drop(columns=non_feature_columns + ['demand']))
make_submission(evaluate_data, prediction)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


HBox(children=(IntProgress(value=0, max=30490), HTML(value='')))


(60980, 29)


In [18]:
mean_squared_error(target, vall)

5.069556186646546

In [25]:
for store_id in stores:
    print(f"processing {store_id}")
    features = utils.read_parquet(f"{train_data_dir}/{store_id}_data.parquet")
    train_data = features \
        .where(SF.col("day") <= 1885) \
        .where(SF.col("day") > 1120) \
        .toPandas()

    test_data = features \
        .where(SF.col("day") > 1885) \
        .where(SF.col("day") <= 1913) \
        .toPandas()
    

    target = train_data["demand"]
    train_data.drop(columns=non_feature_columns + ["demand"], inplace=True)
    
    test_target = test_data["demand"]
    test_data.drop(columns=non_feature_columns + ["demand"], inplace=True)
    
    print(f"have for train {train_data.shape}, test {test_data.shape}")
    
    train_data = lgb.Dataset(train_data, label=target, categorical_feature=cat_features)
    
    valid_data = lgb.Dataset(test_data, label=test_target, categorical_feature=cat_features)
    
    model = lgb.train(
        params,
        train_data,
        valid_sets = [train_data, valid_data],
        verbose_eval = 50,
        num_boost_round = 10000,
        early_stopping_rounds = 50, 
        fobj = custom_asymmetric_train, 
        feval = custom_asymmetric_valid
    )
    preds = model.predict(test_data)
    print(f"mse = {mean_squared_error(test_target, preds)}")
    
    utils.save_pickle(f"{train_data_dir}/{store_id}_model.bin", model)

processing CA_1




have for train (2276492, 128), test (85372, 128)




Training until validation scores don't improve for 50 rounds
[50]	training's custom_asymmetric_eval: 5.28413	valid_1's custom_asymmetric_eval: 5.28686
Early stopping, best iteration is:
[48]	training's custom_asymmetric_eval: 5.32168	valid_1's custom_asymmetric_eval: 5.27534
mse = 4.803474874533536
processing CA_2




have for train (2144577, 128), test (85372, 128)




Training until validation scores don't improve for 50 rounds
[50]	training's custom_asymmetric_eval: 3.61668	valid_1's custom_asymmetric_eval: 4.70584
Early stopping, best iteration is:
[40]	training's custom_asymmetric_eval: 3.72185	valid_1's custom_asymmetric_eval: 4.69407
mse = 4.266637403726252
processing CA_3




have for train (2273097, 128), test (85372, 128)




Training until validation scores don't improve for 50 rounds
[50]	training's custom_asymmetric_eval: 9.67373	valid_1's custom_asymmetric_eval: 8.84125
[100]	training's custom_asymmetric_eval: 8.4713	valid_1's custom_asymmetric_eval: 8.81137
Early stopping, best iteration is:
[81]	training's custom_asymmetric_eval: 8.83927	valid_1's custom_asymmetric_eval: 8.78487
mse = 8.136635648989277
processing CA_4




have for train (2257060, 128), test (85372, 128)




Training until validation scores don't improve for 50 rounds
[50]	training's custom_asymmetric_eval: 2.14694	valid_1's custom_asymmetric_eval: 2.20869
Early stopping, best iteration is:
[36]	training's custom_asymmetric_eval: 2.2183	valid_1's custom_asymmetric_eval: 2.20515
mse = 1.9962724377856542
processing TX_1




have for train (2279516, 128), test (85372, 128)




Training until validation scores don't improve for 50 rounds
[50]	training's custom_asymmetric_eval: 3.64844	valid_1's custom_asymmetric_eval: 3.56351
Early stopping, best iteration is:
[31]	training's custom_asymmetric_eval: 3.9282	valid_1's custom_asymmetric_eval: 3.48184
mse = 3.1599931965224193
processing TX_2




have for train (2278389, 128), test (85372, 128)




Training until validation scores don't improve for 50 rounds
[50]	training's custom_asymmetric_eval: 4.64998	valid_1's custom_asymmetric_eval: 4.1466
Early stopping, best iteration is:
[23]	training's custom_asymmetric_eval: 5.42237	valid_1's custom_asymmetric_eval: 3.91691
mse = 3.5658944818311618
processing TX_3




have for train (2267567, 128), test (85372, 128)




Training until validation scores don't improve for 50 rounds
[50]	training's custom_asymmetric_eval: 4.1545	valid_1's custom_asymmetric_eval: 4.06623
Early stopping, best iteration is:
[23]	training's custom_asymmetric_eval: 4.75005	valid_1's custom_asymmetric_eval: 3.95547
mse = 3.5923416950845755
processing WI_1




have for train (2277493, 128), test (85372, 128)




Training until validation scores don't improve for 50 rounds
[50]	training's custom_asymmetric_eval: 3.09331	valid_1's custom_asymmetric_eval: 3.22568
[100]	training's custom_asymmetric_eval: 2.8412	valid_1's custom_asymmetric_eval: 3.27913
Early stopping, best iteration is:
[51]	training's custom_asymmetric_eval: 3.08641	valid_1's custom_asymmetric_eval: 3.22277
mse = 2.9458078199969537
processing WI_2




have for train (2276604, 128), test (85372, 128)




Training until validation scores don't improve for 50 rounds
[50]	training's custom_asymmetric_eval: 6.51876	valid_1's custom_asymmetric_eval: 10.5165
[100]	training's custom_asymmetric_eval: 5.5164	valid_1's custom_asymmetric_eval: 10.3331
[150]	training's custom_asymmetric_eval: 5.00204	valid_1's custom_asymmetric_eval: 10.3276
Early stopping, best iteration is:
[103]	training's custom_asymmetric_eval: 5.47311	valid_1's custom_asymmetric_eval: 10.3104
mse = 9.347140120664857
processing WI_3




have for train (2272558, 128), test (85372, 128)




Training until validation scores don't improve for 50 rounds
[50]	training's custom_asymmetric_eval: 3.72283	valid_1's custom_asymmetric_eval: 4.82572
[100]	training's custom_asymmetric_eval: 3.2996	valid_1's custom_asymmetric_eval: 4.81875
Early stopping, best iteration is:
[56]	training's custom_asymmetric_eval: 3.64919	valid_1's custom_asymmetric_eval: 4.79747
mse = 4.374154486800307


In [26]:
features_columns = train_data.feature_name
utils.save_pickle("/mnt/m5_features.json", features_columns)

In [27]:
features_columns = utils.load_pickle("/mnt/m5_features.json")

In [28]:
from tqdm import tqdm_notebook
preds = []

for store_id in tqdm_notebook(stores):
    model = utils.load_pickle(f"{train_data_dir}/{store_id}_model.bin")
    inference_features = utils \
        .read_parquet(f"{train_data_dir}/{store_id}_data.parquet") \
        .where(SF.col("day") > 1913) \
        .toPandas()
    pred = inference_features[["id", "day"]]
    prediction = model.predict(inference_features[features_columns])
    pred["prediction"] = prediction
    preds.append(pred)


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':





In [29]:
preds = pd.concat(preds)

In [30]:
submission = []

for iid, group in tqdm_notebook(preds.groupby("id")):
    group = group[group.day > 1913].sort_values("day")
    assert group.shape[0] == 28
    submission.append([iid] + group.prediction.tolist())

In [31]:
predict_columns = [f"F{i}" for i in range(1, 29)]
submission = pd.DataFrame(submission, columns=["id"] + predict_columns)
sample = pd.read_csv('sample_submission.csv')
sample = sample[~sample.id.isin(submission.id)]

result_submission = pd.concat([submission, sample])
print(result_submission.shape)
result_submission.to_csv("submission.csv", index=False)

(60980, 29)


In [None]:
import pyspark.sql.types as ST
from tqdm import tqdm_notebook

for i in tqdm_notebook(range(1, 29)):    
    print(f"processing {i} day")
    
    models = {}
    features = []
    
    for store_id in stores:
        models[store_id] = utils.load_pickle(f"{train_data_dir}/{store_id}_model.bin")
        features.append(utils \
            .read_parquet(f"{train_data_dir}/{store_id}_data.parquet") \
            .withColumn("demand", SF.col("demand").cast(ST.FloatType())) \
            .where(SF.col("dat") > 1913 - 60) \
            .toPandas())
    features = pd.concat(features)    
    
    
    
    schema = features.select(meta_columns + feature_meta_columns).schema
    schema = schema.add(ST.StructField("demand", ST.FloatType(), True))

    @SF.pandas_udf(schema, SF.PandasUDFType.GROUPED_MAP)
    def predict_func(df):
        store = df["store_id"].iloc[0]
        preds =  models[store].predict(df[features_columns].astype(float))
        df["demand"] = preds
        return df[meta_columns + feature_meta_columns + ["demand"]]

    
    data = features \
        .where(SF.col("day") > 1913 - 60) \
        .where(SF.col("day") <= 1913) \
        .select(meta_columns + feature_meta_columns + ["demand"])

    for_prediction = features \
            .where(SF.col("day") == 1914)
    
    predictions = for_prediction.groupby("store_id").apply(predict_func)

    next_data = features \
        .where(SF.col("day") == 1913 + i + 1) \
        .select(meta_columns + feature_meta_columns)

    if i == 28:
        data = utils.concatenate([data, predictions])
        utils.write_parquet(data, f"{train_data_dir}/inference_{i}.parquet")
        break
    else:
        data = utils.concatenate([data, predictions, next_data])
        
    data = get_features(data, inference=True)
    
    utils.write_parquet(data, f"{train_data_dir}/inference_{i}.parquet")
    data = utils.read_parquet(f"{train_data_dir}/inference_{i}.parquet").cache()

    for_prediction = data.where(SF.col("day") == 1913 + i + 1)

    data = data.where(SF.col("day") < 1913 + i + 1).select(meta_columns + feature_meta_columns)



In [114]:
tes =  utils.read_parquet("/mnt/m5_all_train_data/CA_1_1_iter_inference.parquet").toPandas()



In [124]:
tes.dtypes["previous_demand_by_item_id&store_id_at_1"]

dtype('O')

In [118]:
tes.demand.isna().sum()

3049

In [9]:
test_data = pd.read_parquet("/mnt/m5_test_data.parquet")
inference_data = pd.read_parquet("/mnt/m5_inference_data.parquet")

In [10]:
train_data = utils.read_pandas_parquet_from_spark('/mnt/m5_train_data.parquet')
# потому что ебучий спарк не может отправить в пандас такой объем данных

HBox(children=(IntProgress(value=0, max=36), HTML(value='')))




In [11]:
train_data.shape

(16845993, 235)

In [12]:
meta_columns = ["store_id", "state_id", "cat_id", "dept_id", "item_id", "id"]
target = train_data["demand"]
train_data.drop(columns= meta_columns + ["demand"], inplace=True)
train_data.dtypes.unique() # check for types

array([dtype('int64'), dtype('int32'), dtype('float32'), dtype('float64')],
      dtype=object)

In [13]:
test_target = test_data["demand"]
test_data.drop(columns=meta_columns + ["demand"], inplace=True)

In [14]:
for column in train_data.columns:
    train_data[column] = train_data[column].astype("float32")
    test_data[column] = test_data[column].astype("float32")

In [11]:
def get_model(activation = "tanh"):
    model = Sequential()
    model.add(Dense(128, input_shape=(train_data.shape[1],)))
    model.add(Activation(activation))
    model.add(Dropout(0.4))
    model.add(Dense(100))
    model.add(Activation(activation))
    model.add(Dense(64))
    model.add(Activation(activation))
    model.add(Dense(32))
    model.add(Activation(activation))
    model.add(Dropout(0.4))
    model.add(Dense(1)) # result layer
    return model

In [16]:
model = get_model()
model.summary()

lr = 0.0005
decay = 2e-6
tensorboard_callback = TensorBoard(
    log_dir=f"/home/timothyxp/kaggle/m5_accuracy/tensorboard_logs/mlp_fe1_lr={lr}_decay={decay}_ex=1")
model.compile(
    loss="mse",
    optimizer=Adam(lr, decay=decay),
    metrics=["mae"]
)

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_11 (Dense)             (None, 128)               29312     
_________________________________________________________________
activation_9 (Activation)    (None, 128)               0         
_________________________________________________________________
dropout_5 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_12 (Dense)             (None, 100)               12900     
_________________________________________________________________
activation_10 (Activation)   (None, 100)               0         
_________________________________________________________________
dense_13 (Dense)             (None, 64)                6464      
_________________________________________________________________
activation_11 (Activation)   (None, 64)               

In [32]:
meta_data.head(1)

Unnamed: 0,store_id,state_id,cat_id,dept_id,item_id,id,prediction,day
0,1,0,0,1,227,FOODS_2_012_CA_2_validation,0.641679,1940


In [17]:
model.fit(
    train_data,
    target,
    batch_size=512,
    epochs=200,
    verbose=1,
    shuffle=True,
    callbacks=[tensorboard_callback],
    validation_data=(
        train_data,
        target
    )
)

Train on 16845993 samples, validate on 16845993 samples
Epoch 1/200
 2619392/16845993 [===>..........................] - ETA: 4:00 - loss: 12.8114 - mae: 1.6069

KeyboardInterrupt: 

In [18]:
params = {
        'boosting_type': 'gbdt',
        'objective': 'tweedie',
        'tweedie_variance_power': 1.1,
        'metric': 'rmse',
        'subsample': 0.5,
        'subsample_freq': 1,
        'learning_rate': 0.03,
        'n_estimators': 300,
        'boost_from_average': False,
} 

lqb_model = lgb.LGBMRegressor(**params)

In [19]:
lqb_model.fit(train_data, target, eval_set=(test_data, test_target), verbose=4)

[4]	valid_0's rmse: 3.50962
[8]	valid_0's rmse: 3.38814
[12]	valid_0's rmse: 3.26013
[16]	valid_0's rmse: 3.13554
[20]	valid_0's rmse: 3.01981
[24]	valid_0's rmse: 2.91612
[28]	valid_0's rmse: 2.82506
[32]	valid_0's rmse: 2.74643
[36]	valid_0's rmse: 2.68035
[40]	valid_0's rmse: 2.62406
[44]	valid_0's rmse: 2.57644
[48]	valid_0's rmse: 2.53613
[52]	valid_0's rmse: 2.50304
[56]	valid_0's rmse: 2.47347
[60]	valid_0's rmse: 2.45077
[64]	valid_0's rmse: 2.43096
[68]	valid_0's rmse: 2.4137
[72]	valid_0's rmse: 2.39892
[76]	valid_0's rmse: 2.38751
[80]	valid_0's rmse: 2.37778
[84]	valid_0's rmse: 2.36942
[88]	valid_0's rmse: 2.3624
[92]	valid_0's rmse: 2.35583
[96]	valid_0's rmse: 2.35058
[100]	valid_0's rmse: 2.34591
[104]	valid_0's rmse: 2.34151
[108]	valid_0's rmse: 2.33817
[112]	valid_0's rmse: 2.3353
[116]	valid_0's rmse: 2.33213
[120]	valid_0's rmse: 2.3289
[124]	valid_0's rmse: 2.32686
[128]	valid_0's rmse: 2.32399
[132]	valid_0's rmse: 2.3219
[136]	valid_0's rmse: 2.31891
[140]	valid

LGBMRegressor(boost_from_average=False, boosting_type='gbdt',
       class_weight=None, colsample_bytree=1.0, importance_type='split',
       learning_rate=0.03, max_depth=-1, metric='rmse',
       min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
       n_estimators=300, n_jobs=-1, num_leaves=31, objective='tweedie',
       random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
       subsample=0.5, subsample_for_bin=200000, subsample_freq=1,
       tweedie_variance_power=1.1)

In [20]:
lqb_model

LGBMRegressor(boost_from_average=False, boosting_type='gbdt',
       class_weight=None, colsample_bytree=1.0, importance_type='split',
       learning_rate=0.03, max_depth=-1, metric='rmse',
       min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
       n_estimators=300, n_jobs=-1, num_leaves=31, objective='tweedie',
       random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
       subsample=0.5, subsample_for_bin=200000, subsample_freq=1,
       tweedie_variance_power=1.1)

In [21]:
test_pred = lqb_model.predict(test_data)

In [22]:
from sklearn.metrics import mean_squared_error
mean_squared_error(test_target, test_pred)

5.258001482143438

In [23]:
meta_data = inference_data[meta_columns]
inference_data.drop(columns=meta_columns + ["demand"], inplace=True)

In [24]:
preds = lqb_model.predict(inference_data)

In [25]:
meta_data["prediction"] = preds
meta_data["day"] = inference_data["day"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [26]:
submit = []
for iid, group in meta_data[["day", "prediction", "id"]].groupby("id"):
    group["day"] = "F" + (group["day"] - 1913).apply(str)
    group = group.sort_values(by=["day"])
    submit.append([iid] + group["prediction"].tolist())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [27]:
predict_columns = [f"F{i}" for i in range(1, 29)]
submission = pd.DataFrame(submit, columns=["id"] + predict_columns)
sample = pd.read_csv('sample_submission.csv')
sample = sample[~sample.id.isin(submission.id)]

result_submission = pd.concat([submission, sample])
print(result_submission.shape)
result_submission.to_csv("submission.csv", index=False)

(60980, 29)

In [119]:
sample.shape

(30490, 29)

In [120]:
sample.head()

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
30490,HOBBIES_1_001_CA_1_evaluation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
30491,HOBBIES_1_002_CA_1_evaluation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
30492,HOBBIES_1_003_CA_1_evaluation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
30493,HOBBIES_1_004_CA_1_evaluation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
30494,HOBBIES_1_005_CA_1_evaluation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [70]:
sample.shape

(30490, 29)