In [1]:
# CA
from  datetime import datetime, timedelta
import gc
import numpy as np, pandas as pd
import lightgbm as lgb

In [2]:
# Column dtypes handed to pd.read_csv when loading calendar.csv.
CAL_DTYPES = dict(
    event_name_1="category",
    event_name_2="category",
    event_type_1="category",
    event_type_2="category",
    weekday="category",
    wm_yr_wk="int16",
    wday="int16",
    month="int16",
    year="int16",
    snap_CA="float32",
    snap_TX="float32",
    snap_WI="float32",
)

# Column dtypes handed to pd.read_csv when loading sell_prices.csv.
PRICE_DTYPES = dict(
    store_id="category",
    item_id="category",
    wm_yr_wk="int16",
    sell_price="float32",
)

In [3]:
# Show up to 50 columns when a DataFrame is rendered.
pd.options.display.max_columns = 50

# Presumably the forecast horizon in days (28 days are predicted further down);
# NOTE(review): the visible code hard-codes 28 instead of reading `h` - confirm.
h = 28 
# Number of days of history kept before a predicted day so that lag and
# rolling features can be computed (used by create_dt and the prediction loop).
max_lags = 57
# Index of the last `d_<n>` column that is loaded as observed sales.
tr_last = 1913
# First calendar date that the prediction loop forecasts.
fday = datetime(2016,4, 25) 
fday

datetime.datetime(2016, 4, 25, 0, 0)

In [4]:
def create_dt(is_train = True, nrows = None, first_day = 1200, validate = True, store_id = None):
    """Build the long-format sales table joined with calendar and price data.

    Reads ``sell_prices.csv``, ``calendar.csv`` and one of the
    ``sales_train_*.csv`` files from the working directory. Every categorical
    column is replaced by ``int16`` category codes shifted so the smallest
    code is 0 (pandas codes a missing value as -1, so a missing value ends up
    as 0). The ``d_<n>`` day columns are melted into one row per id and day,
    then calendar and prices are inner-joined, so rows without a calendar or
    price entry are dropped.

    Parameters
    ----------
    is_train : bool
        True loads days ``max(1, first_day)`` .. ``tr_last``. False loads days
        ``max(tr_last - max_lags, first_day)`` .. ``tr_last`` and appends ``h``
        further days whose ``sales`` are NaN (to be filled by predictions).
    nrows : int or None
        Forwarded to ``pd.read_csv`` for the sales file.
    first_day : int
        Lower bound for the first ``d_<n>`` column that is loaded.
    validate : bool
        True reads ``sales_train_validation.csv``, False reads
        ``sales_train_evaluation.csv``.
    store_id : int or None
        Integer store code to keep; None keeps every store.
        NOTE(review): the code is compared against codes derived separately
        from the price file and the sales file; they only agree when both
        files contain the same set of stores (e.g. not with a small ``nrows``).

    Returns
    -------
    pandas.DataFrame
        One row per (id, day) with id/category codes, ``d``, ``sales``, the
        calendar columns and ``sell_price``.
    """
    prices = pd.read_csv("sell_prices.csv", dtype = PRICE_DTYPES)
    for col, col_dtype in PRICE_DTYPES.items():
        if col_dtype == "category":
            prices[col] = prices[col].cat.codes.astype("int16")
            prices[col] -= prices[col].min()

    if store_id is not None:
        prices = prices.loc[prices["store_id"] == store_id]

    cal = pd.read_csv("calendar.csv", dtype = CAL_DTYPES)
    cal["date"] = pd.to_datetime(cal["date"])
    for col, col_dtype in CAL_DTYPES.items():
        if col_dtype == "category":
            cal[col] = cal[col].cat.codes.astype("int16")
            cal[col] -= cal[col].min()

    start_day = max(1 if is_train else tr_last - max_lags, first_day)
    numcols = [f"d_{day}" for day in range(start_day, tr_last + 1)]
    catcols = ['id', 'item_id', 'dept_id','store_id', 'cat_id', 'state_id']
    dtype = {numcol: "float32" for numcol in numcols}
    dtype.update({col: "category" for col in catcols if col != "id"})

    sales_file = "sales_train_validation.csv" if validate else "sales_train_evaluation.csv"
    dt = pd.read_csv(sales_file, nrows = nrows, usecols = catcols + numcols, dtype = dtype)

    for col in catcols:
        if col != "id":
            dt[col] = dt[col].cat.codes.astype("int16")
            dt[col] -= dt[col].min()

    if store_id is not None:
        # The inner merge with the store-filtered prices below would discard
        # every other store anyway; dropping them before the melt yields the
        # same rows in the same order while keeping the long table small.
        dt = dt.loc[dt["store_id"] == store_id].reset_index(drop = True)

    if not is_train:
        # Placeholder columns for the days that will be forecast.
        for day in range(tr_last + 1, tr_last + h + 1):
            dt[f"d_{day}"] = np.nan

    dt = pd.melt(dt,
                  id_vars = catcols,
                  value_vars = [col for col in dt.columns if col.startswith("d_")],
                  var_name = "d",
                  value_name = "sales")

    dt = dt.merge(cal, on= "d", copy = False)
    dt = dt.merge(prices, on = ["store_id", "item_id", "wm_yr_wk"], copy = False)

    return dt

In [5]:
def _date_part(dates, attr):
    """Return the ``.dt`` component ``attr`` of a datetime Series.

    ``weekofyear`` was removed from the ``.dt`` accessor in pandas 2.0, so it
    is served by ``isocalendar().week`` whenever that method exists and falls
    back to the legacy attribute on older pandas versions.
    """
    accessor = dates.dt
    if attr == "weekofyear" and hasattr(accessor, "isocalendar"):
        return accessor.isocalendar().week
    return getattr(accessor, attr)


def create_fea(dt):
    """Add lag, rolling-mean and calendar features to ``dt`` in place.

    Expects the columns ``id``, ``sales`` and ``date``. Rows of one ``id``
    must already be in chronological order, because lags and rolling windows
    are computed by row position within each ``id`` group.

    Columns written:
      * ``lag_7``, ``lag_28``: ``sales`` shifted by 7 / 28 rows per id.
      * ``rmean_<lag>_<win>``: rolling mean over ``win`` rows (7 and 28) of
        the corresponding lag column, per id.
      * ``wday``, ``week``, ``month``, ``quarter``, ``year``, ``mday``: an
        existing column of that name is only cast to ``int16``; a missing one
        is derived from ``date``.

    Returns None; the frame passed in is modified.
    """
    lags = [7, 28]
    lag_cols = [f"lag_{lag}" for lag in lags]
    for lag, lag_col in zip(lags, lag_cols):
        dt[lag_col] = dt[["id", "sales"]].groupby("id")["sales"].shift(lag)

    wins = [7, 28]
    for win in wins:
        for lag, lag_col in zip(lags, lag_cols):
            dt[f"rmean_{lag}_{win}"] = dt[["id", lag_col]].groupby("id")[lag_col].transform(lambda x: x.rolling(win).mean())

    # feature column name -> name of the ``Series.dt`` component it comes from
    date_features = {
        "wday": "weekday",
        "week": "weekofyear",
        "month": "month",
        "quarter": "quarter",
        "year": "year",
        "mday": "day",
    }

    for date_feat_name, date_feat_func in date_features.items():
        if date_feat_name in dt.columns:
            dt[date_feat_name] = dt[date_feat_name].astype("int16")
        else:
            dt[date_feat_name] = _date_part(dt["date"], date_feat_func).astype("int16")

In [6]:
%%time
FIRST_DAY = 1
STORE_ID = 0
df = create_dt(is_train=True, first_day= FIRST_DAY, store_id = STORE_ID)
df.shape

CPU times: user 29.7 s, sys: 11.9 s, total: 41.5 s
Wall time: 44 s


(4702895, 22)

In [7]:
df.shape

(4702895, 22)

In [8]:
df.head()

Unnamed: 0,id,item_id,dept_id,store_id,cat_id,state_id,d,sales,date,wm_yr_wk,weekday,wday,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price
0,HOBBIES_1_008_CA_1_validation,7,0,0,0,0,d_1,12.0,2011-01-29,11101,2,1,1,2011,0,0,0,0,0.0,0.0,0.0,0.46
1,HOBBIES_1_008_CA_1_validation,7,0,0,0,0,d_2,15.0,2011-01-30,11101,3,2,1,2011,0,0,0,0,0.0,0.0,0.0,0.46
2,HOBBIES_1_008_CA_1_validation,7,0,0,0,0,d_3,0.0,2011-01-31,11101,1,3,1,2011,0,0,0,0,0.0,0.0,0.0,0.46
3,HOBBIES_1_008_CA_1_validation,7,0,0,0,0,d_4,0.0,2011-02-01,11101,5,4,2,2011,0,0,0,0,1.0,1.0,0.0,0.46
4,HOBBIES_1_008_CA_1_validation,7,0,0,0,0,d_5,0.0,2011-02-02,11101,6,5,2,2011,0,0,0,0,1.0,0.0,1.0,0.46


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4702895 entries, 0 to 4702894
Data columns (total 22 columns):
id              object
item_id         int16
dept_id         int16
store_id        int16
cat_id          int16
state_id        int16
d               object
sales           float32
date            datetime64[ns]
wm_yr_wk        int16
weekday         int16
wday            int16
month           int16
year            int16
event_name_1    int16
event_type_1    int16
event_name_2    int16
event_type_2    int16
snap_CA         float32
snap_TX         float32
snap_WI         float32
sell_price      float32
dtypes: datetime64[ns](1), float32(5), int16(14), object(2)
memory usage: 358.8+ MB


In [10]:
%%time

create_fea(df)
df.shape

CPU times: user 14.3 s, sys: 930 ms, total: 15.2 s
Wall time: 12.3 s


(4702895, 31)

In [11]:
df.head()

Unnamed: 0,id,item_id,dept_id,store_id,cat_id,state_id,d,sales,date,wm_yr_wk,weekday,wday,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price,lag_7,lag_28,rmean_7_7,rmean_28_7,rmean_7_28,rmean_28_28,week,quarter,mday
0,HOBBIES_1_008_CA_1_validation,7,0,0,0,0,d_1,12.0,2011-01-29,11101,2,1,1,2011,0,0,0,0,0.0,0.0,0.0,0.46,,,,,,,4,1,29
1,HOBBIES_1_008_CA_1_validation,7,0,0,0,0,d_2,15.0,2011-01-30,11101,3,2,1,2011,0,0,0,0,0.0,0.0,0.0,0.46,,,,,,,4,1,30
2,HOBBIES_1_008_CA_1_validation,7,0,0,0,0,d_3,0.0,2011-01-31,11101,1,3,1,2011,0,0,0,0,0.0,0.0,0.0,0.46,,,,,,,5,1,31
3,HOBBIES_1_008_CA_1_validation,7,0,0,0,0,d_4,0.0,2011-02-01,11101,5,4,2,2011,0,0,0,0,1.0,1.0,0.0,0.46,,,,,,,5,1,1
4,HOBBIES_1_008_CA_1_validation,7,0,0,0,0,d_5,0.0,2011-02-02,11101,6,5,2,2011,0,0,0,0,1.0,0.0,1.0,0.46,,,,,,,5,1,2


In [12]:
df.dropna(inplace = True)
print(df.shape)
df.head()

(4535200, 31)


Unnamed: 0,id,item_id,dept_id,store_id,cat_id,state_id,d,sales,date,wm_yr_wk,weekday,wday,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price,lag_7,lag_28,rmean_7_7,rmean_28_7,rmean_7_28,rmean_28_28,week,quarter,mday
67234,HOBBIES_1_008_CA_1_validation,7,0,0,0,0,d_56,0.0,2011-03-25,11108,0,7,3,2011,0,0,0,0,0.0,0.0,0.0,0.42,0.0,0.0,2.0,4.0,1.642857,4.535714,12,1,25
67241,HOBBIES_1_009_CA_1_validation,8,0,0,0,0,d_56,0.0,2011-03-25,11108,0,7,3,2011,0,0,0,0,0.0,0.0,0.0,1.77,0.0,0.0,2.0,2.0,2.142857,2.178571,12,1,25
67248,HOBBIES_1_010_CA_1_validation,9,0,0,0,0,d_56,0.0,2011-03-25,11108,0,7,3,2011,0,0,0,0,0.0,0.0,0.0,3.17,0.0,0.0,0.0,0.142857,0.035714,0.214286,12,1,25
67255,HOBBIES_1_012_CA_1_validation,11,0,0,0,0,d_56,1.0,2011-03-25,11108,0,7,3,2011,0,0,0,0,0.0,0.0,0.0,6.27,1.0,1.0,0.428571,0.857143,0.535714,0.678571,12,1,25
67262,HOBBIES_1_015_CA_1_validation,14,0,0,0,0,d_56,1.0,2011-03-25,11108,0,7,3,2011,0,0,0,0,0.0,0.0,0.0,0.72,2.0,1.0,5.428571,8.0,6.142857,4.321429,12,1,25


In [13]:
df["store_id"].value_counts()

0    4535200
Name: store_id, dtype: int64

In [14]:
# change here according to store_id



df_CA1 = df.drop(columns = ['state_id', "store_id", 'snap_TX', 'snap_WI'])

In [15]:
print(df_CA1.shape)
df_CA1.head()

(4535200, 27)


Unnamed: 0,id,item_id,dept_id,cat_id,d,sales,date,wm_yr_wk,weekday,wday,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,sell_price,lag_7,lag_28,rmean_7_7,rmean_28_7,rmean_7_28,rmean_28_28,week,quarter,mday
67234,HOBBIES_1_008_CA_1_validation,7,0,0,d_56,0.0,2011-03-25,11108,0,7,3,2011,0,0,0,0,0.0,0.42,0.0,0.0,2.0,4.0,1.642857,4.535714,12,1,25
67241,HOBBIES_1_009_CA_1_validation,8,0,0,d_56,0.0,2011-03-25,11108,0,7,3,2011,0,0,0,0,0.0,1.77,0.0,0.0,2.0,2.0,2.142857,2.178571,12,1,25
67248,HOBBIES_1_010_CA_1_validation,9,0,0,d_56,0.0,2011-03-25,11108,0,7,3,2011,0,0,0,0,0.0,3.17,0.0,0.0,0.0,0.142857,0.035714,0.214286,12,1,25
67255,HOBBIES_1_012_CA_1_validation,11,0,0,d_56,1.0,2011-03-25,11108,0,7,3,2011,0,0,0,0,0.0,6.27,1.0,1.0,0.428571,0.857143,0.535714,0.678571,12,1,25
67262,HOBBIES_1_015_CA_1_validation,14,0,0,d_56,1.0,2011-03-25,11108,0,7,3,2011,0,0,0,0,0.0,0.72,2.0,1.0,5.428571,8.0,6.142857,4.321429,12,1,25


In [16]:
# Columns that LightGBM is told to treat as categorical features.
cat_feats = ['item_id', 'dept_id', 'cat_id'] + ["event_name_1", "event_name_2", "event_type_1", "event_type_2"]
# Identifier, target and raw calendar columns that are kept out of the feature matrix.
useless_cols = ["id", "date", "sales","d", "wm_yr_wk", "weekday", "year"]
# Every other column of df_CA1 becomes a model feature; train_cols is reused
# later to select the same columns at prediction time.
train_cols = df_CA1.columns[~df_CA1.columns.isin(useless_cols)]
X_train = df_CA1[train_cols]
y_train = df_CA1["sales"]

In [17]:
X_train.head()

Unnamed: 0,item_id,dept_id,cat_id,wday,month,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,sell_price,lag_7,lag_28,rmean_7_7,rmean_28_7,rmean_7_28,rmean_28_28,week,quarter,mday
67234,7,0,0,7,3,0,0,0,0,0.0,0.42,0.0,0.0,2.0,4.0,1.642857,4.535714,12,1,25
67241,8,0,0,7,3,0,0,0,0,0.0,1.77,0.0,0.0,2.0,2.0,2.142857,2.178571,12,1,25
67248,9,0,0,7,3,0,0,0,0,0.0,3.17,0.0,0.0,0.0,0.142857,0.035714,0.214286,12,1,25
67255,11,0,0,7,3,0,0,0,0,0.0,6.27,1.0,1.0,0.428571,0.857143,0.535714,0.678571,12,1,25
67262,14,0,0,7,3,0,0,0,0,0.0,0.72,2.0,1.0,5.428571,8.0,6.142857,4.321429,12,1,25


In [18]:
X_train.shape[0] * 0.3

1360560.0

In [19]:
%%time

# Fix the NumPy seed so the random hold-out below is reproducible.
np.random.seed(777)

# Draw 500,000 distinct row labels as a hold-out set; all remaining rows are used for training.
fake_valid_inds = np.random.choice(X_train.index.values, 500_000, replace = False)
train_inds = np.setdiff1d(X_train.index.values, fake_valid_inds)
# free_raw_data=False keeps the raw frames attached to the Dataset objects.
train_data = lgb.Dataset(X_train.loc[train_inds] , label = y_train.loc[train_inds], 
                         categorical_feature=cat_feats, free_raw_data=False)
fake_valid_data = lgb.Dataset(X_train.loc[fake_valid_inds], label = y_train.loc[fake_valid_inds],
                              categorical_feature=cat_feats,
                 free_raw_data=False)
# The hold-out is a plain random sample of rows, not a time-based split, so its
# score only monitors training and is not an estimate of forecast accuracy.

CPU times: user 1.42 s, sys: 199 ms, total: 1.62 s
Wall time: 1.27 s


In [20]:
fake_valid_inds.shape

(500000,)

In [21]:
train_inds.shape

(4035200,)

In [22]:
del df, X_train, y_train, fake_valid_inds,train_inds ; gc.collect()

130

In [151]:
# LightGBM training parameters.
# The original literal listed "metric" twice; Python keeps only the last
# occurrence, so the single entry below carries that effective value.
params = {
    "objective": "poisson",
    "metric": ["rmse"],
    "force_row_wise": True,
    "learning_rate": 0.075,
    "sub_row": 0.75,        # LightGBM alias of bagging_fraction
    "bagging_freq": 1,      # re-sample the bagging subset at every iteration
    "lambda_l2": 0.1,
    "verbosity": 1,
    "num_iterations": 1400,
    "num_leaves": 128,
    "min_data_in_leaf": 100,
}

In [152]:
%%time

m_lgb = lgb.train(params, train_data, valid_sets = [fake_valid_data], verbose_eval=20) 



[20]	valid_0's rmse: 2.98807
[40]	valid_0's rmse: 2.64594
[60]	valid_0's rmse: 2.56086
[80]	valid_0's rmse: 2.53969
[100]	valid_0's rmse: 2.53123
[120]	valid_0's rmse: 2.5244
[140]	valid_0's rmse: 2.51824
[160]	valid_0's rmse: 2.51189
[180]	valid_0's rmse: 2.50454
[200]	valid_0's rmse: 2.49867
[220]	valid_0's rmse: 2.49425
[240]	valid_0's rmse: 2.49127
[260]	valid_0's rmse: 2.48831
[280]	valid_0's rmse: 2.48247
[300]	valid_0's rmse: 2.47913
[320]	valid_0's rmse: 2.47687
[340]	valid_0's rmse: 2.47371
[360]	valid_0's rmse: 2.47089
[380]	valid_0's rmse: 2.46834
[400]	valid_0's rmse: 2.46599
[420]	valid_0's rmse: 2.46203
[440]	valid_0's rmse: 2.4588
[460]	valid_0's rmse: 2.45779
[480]	valid_0's rmse: 2.45619
[500]	valid_0's rmse: 2.4539
[520]	valid_0's rmse: 2.45258
[540]	valid_0's rmse: 2.45084
[560]	valid_0's rmse: 2.44977
[580]	valid_0's rmse: 2.44851
[600]	valid_0's rmse: 2.44703
[620]	valid_0's rmse: 2.44585
[640]	valid_0's rmse: 2.44449
[660]	valid_0's rmse: 2.44411
[680]	valid_0's r

In [153]:
m_lgb.save_model("model_" + str(STORE_ID) + ".lgb")

<lightgbm.basic.Booster at 0x7ff0d9659fd0>

In [154]:
#m_lgb = lgb.Booster(model_file= "model_" + str(STORE_ID) + ".lgb")

In [155]:
m_lgb

<lightgbm.basic.Booster at 0x7ff0d9659fd0>

In [177]:
%%time

# alphas = [1.028, 1.023, 1.018]
alphas = [1.025, 1.017, 1.013]
weights = [1/len(alphas)]*len(alphas)
sub = 0.

for icount, (alpha, weight) in enumerate(zip(alphas, weights)):

    te = create_dt(False, store_id = 0)
    mask = te["store_id"] == 0
    te_CA1 = te[mask]
    te_CA1 = te_CA1.drop("snap_TX", axis = 1)
    te_CA1 = te_CA1.drop("snap_WI", axis = 1)
    
    cols = [f"F{i}" for i in range(1,29)]

    for tdelta in range(0, 28):
        day = fday + timedelta(days=tdelta)
        print(tdelta, day)
        tst = te_CA1[(te_CA1.date >= day - timedelta(days=max_lags)) & (te_CA1.date <= day)].copy()
        create_fea(tst)
        tst = tst.loc[tst.date == day , train_cols]
        te_CA1.loc[te_CA1.date == day, "sales"] = alpha*m_lgb.predict(tst) # magic multiplier by 
        

    te_sub = te_CA1.loc[te_CA1.date >= fday, ["id", "sales"]].copy()

    te_sub["F"] = [f"F{rank}" for rank in te_sub.groupby("id")["id"].cumcount()+1]
    te_sub = te_sub.set_index(["id", "F" ]).unstack()["sales"][cols].reset_index()
    te_sub.fillna(0., inplace = True)
    te_sub.sort_values("id", inplace = True)
    te_sub.reset_index(drop=True, inplace = True)
    #te_sub.to_csv(f"submission_{icount}.csv",index=False)
    if icount == 0 :
        sub = te_sub
        sub[cols] *= weight
    else:
        sub[cols] += te_sub[cols]*weight
    print(icount, alpha, weight)


sub2 = sub.copy()
sub2["id"] = sub2["id"].str.replace("validation$", "evaluation")
sub = pd.concat([sub, sub2], axis=0, sort=False)
sub.to_csv("CA_1.csv",index=False)

0 2016-04-25 00:00:00
1 2016-04-26 00:00:00
2 2016-04-27 00:00:00
3 2016-04-28 00:00:00
4 2016-04-29 00:00:00
5 2016-04-30 00:00:00
6 2016-05-01 00:00:00
7 2016-05-02 00:00:00
8 2016-05-03 00:00:00
9 2016-05-04 00:00:00
10 2016-05-05 00:00:00
11 2016-05-06 00:00:00
12 2016-05-07 00:00:00
13 2016-05-08 00:00:00
14 2016-05-09 00:00:00
15 2016-05-10 00:00:00
16 2016-05-11 00:00:00
17 2016-05-12 00:00:00
18 2016-05-13 00:00:00
19 2016-05-14 00:00:00
20 2016-05-15 00:00:00
21 2016-05-16 00:00:00
22 2016-05-17 00:00:00
23 2016-05-18 00:00:00
24 2016-05-19 00:00:00
25 2016-05-20 00:00:00
26 2016-05-21 00:00:00
27 2016-05-22 00:00:00
0 1.025 0.3333333333333333
0 2016-04-25 00:00:00
1 2016-04-26 00:00:00
2 2016-04-27 00:00:00
3 2016-04-28 00:00:00
4 2016-04-29 00:00:00
5 2016-04-30 00:00:00
6 2016-05-01 00:00:00
7 2016-05-02 00:00:00
8 2016-05-03 00:00:00
9 2016-05-04 00:00:00
10 2016-05-05 00:00:00
11 2016-05-06 00:00:00
12 2016-05-07 00:00:00
13 2016-05-08 00:00:00
14 2016-05-09 00:00:00
15 2

In [178]:
sub.shape

(6098, 29)

In [179]:
evaluation_set = pd.read_csv("sales_train_evaluation.csv")
evaluation_set.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,d_5,d_6,d_7,d_8,d_9,d_10,d_11,d_12,d_13,d_14,d_15,d_16,d_17,d_18,d_19,...,d_1917,d_1918,d_1919,d_1920,d_1921,d_1922,d_1923,d_1924,d_1925,d_1926,d_1927,d_1928,d_1929,d_1930,d_1931,d_1932,d_1933,d_1934,d_1935,d_1936,d_1937,d_1938,d_1939,d_1940,d_1941
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,2,0,3,5,0,0,1,1,0,2,1,2,2,1,0,2,4,0,0,0,0,3,3,0,1
1,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,2,1,1,0,0,0,0,0
2,HOBBIES_1_003_CA_1_evaluation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,1,0,2,1,0,0,0,0,2,1,3,0,0,1,0,1,0,2,0,0,0,2,3,0,1
3,HOBBIES_1_004_CA_1_evaluation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,2,4,1,6,4,0,0,0,2,2,4,2,1,1,1,1,1,0,4,0,1,3,0,2,6
4,HOBBIES_1_005_CA_1_evaluation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,3,1,0,3,2,3,1,1,3,2,3,2,2,2,2,0,0,0,2,1,0,0,2,1,0


In [180]:
mask = evaluation_set["store_id"] == "CA_1"
CA_1_eva = evaluation_set[mask]
CA_1_eva.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,d_5,d_6,d_7,d_8,d_9,d_10,d_11,d_12,d_13,d_14,d_15,d_16,d_17,d_18,d_19,...,d_1917,d_1918,d_1919,d_1920,d_1921,d_1922,d_1923,d_1924,d_1925,d_1926,d_1927,d_1928,d_1929,d_1930,d_1931,d_1932,d_1933,d_1934,d_1935,d_1936,d_1937,d_1938,d_1939,d_1940,d_1941
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,2,0,3,5,0,0,1,1,0,2,1,2,2,1,0,2,4,0,0,0,0,3,3,0,1
1,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,2,1,1,0,0,0,0,0
2,HOBBIES_1_003_CA_1_evaluation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,1,0,2,1,0,0,0,0,2,1,3,0,0,1,0,1,0,2,0,0,0,2,3,0,1
3,HOBBIES_1_004_CA_1_evaluation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,2,4,1,6,4,0,0,0,2,2,4,2,1,1,1,1,1,0,4,0,1,3,0,2,6
4,HOBBIES_1_005_CA_1_evaluation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,3,1,0,3,2,3,1,1,3,2,3,2,2,2,2,0,0,0,2,1,0,0,2,1,0


In [181]:
CA_1_eva = CA_1_eva.sort_values('id')
CA_1_eva

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,d_5,d_6,d_7,d_8,d_9,d_10,d_11,d_12,d_13,d_14,d_15,d_16,d_17,d_18,d_19,...,d_1917,d_1918,d_1919,d_1920,d_1921,d_1922,d_1923,d_1924,d_1925,d_1926,d_1927,d_1928,d_1929,d_1930,d_1931,d_1932,d_1933,d_1934,d_1935,d_1936,d_1937,d_1938,d_1939,d_1940,d_1941
1612,FOODS_1_001_CA_1_evaluation,FOODS_1_001,FOODS_1,FOODS,CA_1,CA,3,0,0,1,4,2,0,2,0,0,0,0,3,1,3,0,2,1,2,...,0,0,1,1,0,6,1,0,3,0,0,0,2,0,0,2,3,1,0,0,0,1,0,0,0
1613,FOODS_1_002_CA_1_evaluation,FOODS_1_002,FOODS_1,FOODS,CA_1,CA,0,1,0,1,1,1,0,0,0,0,1,1,0,2,0,1,0,0,0,...,0,0,1,0,1,3,0,0,1,0,2,1,0,0,1,2,1,0,1,1,1,0,1,1,2
1614,FOODS_1_003_CA_1_evaluation,FOODS_1_003,FOODS_1,FOODS,CA_1,CA,0,0,0,0,1,3,1,1,1,0,1,0,0,2,1,1,1,0,0,...,0,0,0,0,3,1,0,0,1,2,2,0,0,0,0,2,0,0,0,0,0,2,2,0,1
1615,FOODS_1_004_CA_1_evaluation,FOODS_1_004,FOODS_1,FOODS,CA_1,CA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,2,6,4,6,4,3,3,4,2,3,4,0,7,7,1,5,6,2,5,1,2,1,1,3,4
1616,FOODS_1_005_CA_1_evaluation,FOODS_1_005,FOODS_1,FOODS,CA_1,CA,3,9,3,3,0,2,1,2,1,7,4,0,2,3,6,15,5,0,0,...,0,1,3,4,2,1,1,4,0,3,2,2,1,1,1,2,1,11,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1607,HOUSEHOLD_2_512_CA_1_evaluation,HOUSEHOLD_2_512,HOUSEHOLD_2,HOUSEHOLD,CA_1,CA,0,0,0,0,0,2,1,1,2,0,0,0,0,0,0,0,2,0,0,...,0,0,0,0,0,0,2,1,0,0,1,1,1,0,1,1,1,1,0,0,0,2,2,0,0
1608,HOUSEHOLD_2_513_CA_1_evaluation,HOUSEHOLD_2_513,HOUSEHOLD_2,HOUSEHOLD,CA_1,CA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,2,1,0,1,0,1,0,0
1609,HOUSEHOLD_2_514_CA_1_evaluation,HOUSEHOLD_2_514,HOUSEHOLD_2,HOUSEHOLD,CA_1,CA,2,1,1,0,1,0,1,0,0,0,1,2,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0,1,0,0,0,1,2,0,1,0,0,0,1,0,0,0
1610,HOUSEHOLD_2_515_CA_1_evaluation,HOUSEHOLD_2_515,HOUSEHOLD_2,HOUSEHOLD,CA_1,CA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,1,0


In [182]:
CA_1_eva = CA_1_eva.iloc[:, -28:]
CA_1_eva

Unnamed: 0,d_1914,d_1915,d_1916,d_1917,d_1918,d_1919,d_1920,d_1921,d_1922,d_1923,d_1924,d_1925,d_1926,d_1927,d_1928,d_1929,d_1930,d_1931,d_1932,d_1933,d_1934,d_1935,d_1936,d_1937,d_1938,d_1939,d_1940,d_1941
1612,2,0,0,0,0,1,1,0,6,1,0,3,0,0,0,2,0,0,2,3,1,0,0,0,1,0,0,0
1613,0,0,0,0,0,1,0,1,3,0,0,1,0,2,1,0,0,1,2,1,0,1,1,1,0,1,1,2
1614,3,1,0,0,0,0,0,3,1,0,0,1,2,2,0,0,0,0,2,0,0,0,0,0,2,2,0,1
1615,0,4,5,2,6,4,6,4,3,3,4,2,3,4,0,7,7,1,5,6,2,5,1,2,1,1,3,4
1616,1,0,0,0,1,3,4,2,1,1,4,0,3,2,2,1,1,1,2,1,11,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1607,0,0,1,0,0,0,0,0,0,2,1,0,0,1,1,1,0,1,1,1,1,0,0,0,2,2,0,0
1608,0,0,0,0,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,2,1,0,1,0,1,0,0
1609,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,1,2,0,1,0,0,0,1,0,0,0
1610,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,1,0


In [183]:
pred_period1 = sub[0:3049]
pred_period1

F,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,F11,F12,F13,F14,F15,F16,F17,F18,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,FOODS_1_001_CA_1_validation,0.878364,0.829572,0.848728,0.836217,1.173502,1.134845,1.439513,1.059209,0.917849,1.000201,0.813199,1.217841,1.397406,1.049995,1.002165,0.887082,0.886191,0.861336,1.131346,1.370400,1.265341,0.933114,0.816087,0.813362,0.822805,1.087391,1.248901,1.224653
1,FOODS_1_002_CA_1_validation,0.414614,0.399910,0.433227,0.484933,0.403850,0.309014,0.495441,0.339045,0.353143,0.376748,0.268261,0.346113,0.565997,0.290914,0.390233,0.359515,0.356162,0.403333,0.374120,0.540778,0.578704,0.402710,0.353642,0.401473,0.460387,0.441846,0.534733,0.490458
2,FOODS_1_003_CA_1_validation,0.843870,0.718708,0.677600,0.697648,0.809299,0.588544,0.679869,0.992295,0.843043,0.849760,0.632878,0.933543,0.879856,0.456238,0.898400,0.755887,0.775140,0.768175,0.910920,0.847311,0.908218,0.855722,0.749595,0.731016,0.753382,0.844895,0.817588,0.821473
3,FOODS_1_004_CA_1_validation,0.029978,0.033677,0.035516,0.039073,0.048831,0.069186,0.033178,1.079415,1.068895,1.073385,1.095266,1.097741,1.580490,1.000525,0.883000,1.079952,1.331692,1.524504,1.984955,3.333186,3.135908,2.453523,2.165799,1.718986,1.866847,1.712999,2.641309,2.765411
4,FOODS_1_005_CA_1_validation,1.194233,1.054762,1.261058,1.285837,1.206779,1.422418,1.373997,1.138873,1.113892,1.163622,1.000102,1.337408,1.420022,1.587784,1.034771,0.979699,0.937789,0.936413,1.073373,1.422926,1.343912,0.948132,0.833976,0.843749,0.902930,1.037588,1.250754,1.227034
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3044,HOUSEHOLD_2_512_CA_1_validation,0.729156,0.669769,0.692886,0.720662,0.716123,1.042600,1.210155,0.724151,0.594300,0.597692,0.626124,0.702935,0.826547,0.607114,0.734776,0.601657,0.672184,0.676922,0.766367,0.900967,0.966661,0.744732,0.772489,0.723920,0.735400,0.816349,0.894720,0.988479
3045,HOUSEHOLD_2_513_CA_1_validation,0.493034,0.429075,0.397221,0.450880,0.465377,0.629204,0.668265,0.595862,0.514106,0.516006,0.287266,0.377188,0.620693,0.505673,0.420077,0.406354,0.377031,0.342279,0.431738,0.604017,0.677624,0.408741,0.381495,0.361719,0.360387,0.392729,0.696057,0.749905
3046,HOUSEHOLD_2_514_CA_1_validation,0.287094,0.234703,0.198081,0.183279,0.245102,0.474192,0.367370,0.281413,0.245411,0.238366,0.257623,0.269196,0.387918,0.324677,0.244823,0.250958,0.238867,0.223752,0.273854,0.419129,0.403396,0.269247,0.244710,0.237939,0.222775,0.281859,0.399614,0.384432
3047,HOUSEHOLD_2_515_CA_1_validation,0.005935,0.006065,0.005694,0.005354,0.006115,0.004818,0.005969,0.334317,0.383289,0.322592,0.445297,0.262077,0.263997,0.259794,0.323546,0.333814,0.280828,0.308275,0.308416,0.340683,0.306617,0.354505,0.365438,0.342323,0.308178,0.283009,0.313172,0.274423


In [184]:
pred_period1 = pred_period1.sort_values('id')
pred_period1 

F,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,F11,F12,F13,F14,F15,F16,F17,F18,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,FOODS_1_001_CA_1_validation,0.878364,0.829572,0.848728,0.836217,1.173502,1.134845,1.439513,1.059209,0.917849,1.000201,0.813199,1.217841,1.397406,1.049995,1.002165,0.887082,0.886191,0.861336,1.131346,1.370400,1.265341,0.933114,0.816087,0.813362,0.822805,1.087391,1.248901,1.224653
1,FOODS_1_002_CA_1_validation,0.414614,0.399910,0.433227,0.484933,0.403850,0.309014,0.495441,0.339045,0.353143,0.376748,0.268261,0.346113,0.565997,0.290914,0.390233,0.359515,0.356162,0.403333,0.374120,0.540778,0.578704,0.402710,0.353642,0.401473,0.460387,0.441846,0.534733,0.490458
2,FOODS_1_003_CA_1_validation,0.843870,0.718708,0.677600,0.697648,0.809299,0.588544,0.679869,0.992295,0.843043,0.849760,0.632878,0.933543,0.879856,0.456238,0.898400,0.755887,0.775140,0.768175,0.910920,0.847311,0.908218,0.855722,0.749595,0.731016,0.753382,0.844895,0.817588,0.821473
3,FOODS_1_004_CA_1_validation,0.029978,0.033677,0.035516,0.039073,0.048831,0.069186,0.033178,1.079415,1.068895,1.073385,1.095266,1.097741,1.580490,1.000525,0.883000,1.079952,1.331692,1.524504,1.984955,3.333186,3.135908,2.453523,2.165799,1.718986,1.866847,1.712999,2.641309,2.765411
4,FOODS_1_005_CA_1_validation,1.194233,1.054762,1.261058,1.285837,1.206779,1.422418,1.373997,1.138873,1.113892,1.163622,1.000102,1.337408,1.420022,1.587784,1.034771,0.979699,0.937789,0.936413,1.073373,1.422926,1.343912,0.948132,0.833976,0.843749,0.902930,1.037588,1.250754,1.227034
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3044,HOUSEHOLD_2_512_CA_1_validation,0.729156,0.669769,0.692886,0.720662,0.716123,1.042600,1.210155,0.724151,0.594300,0.597692,0.626124,0.702935,0.826547,0.607114,0.734776,0.601657,0.672184,0.676922,0.766367,0.900967,0.966661,0.744732,0.772489,0.723920,0.735400,0.816349,0.894720,0.988479
3045,HOUSEHOLD_2_513_CA_1_validation,0.493034,0.429075,0.397221,0.450880,0.465377,0.629204,0.668265,0.595862,0.514106,0.516006,0.287266,0.377188,0.620693,0.505673,0.420077,0.406354,0.377031,0.342279,0.431738,0.604017,0.677624,0.408741,0.381495,0.361719,0.360387,0.392729,0.696057,0.749905
3046,HOUSEHOLD_2_514_CA_1_validation,0.287094,0.234703,0.198081,0.183279,0.245102,0.474192,0.367370,0.281413,0.245411,0.238366,0.257623,0.269196,0.387918,0.324677,0.244823,0.250958,0.238867,0.223752,0.273854,0.419129,0.403396,0.269247,0.244710,0.237939,0.222775,0.281859,0.399614,0.384432
3047,HOUSEHOLD_2_515_CA_1_validation,0.005935,0.006065,0.005694,0.005354,0.006115,0.004818,0.005969,0.334317,0.383289,0.322592,0.445297,0.262077,0.263997,0.259794,0.323546,0.333814,0.280828,0.308275,0.308416,0.340683,0.306617,0.354505,0.365438,0.342323,0.308178,0.283009,0.313172,0.274423


In [185]:
pred_period1 = pred_period1.drop(columns='id')
pred_period1.columns = CA_1_eva.columns

In [186]:
pred_period1

Unnamed: 0,d_1914,d_1915,d_1916,d_1917,d_1918,d_1919,d_1920,d_1921,d_1922,d_1923,d_1924,d_1925,d_1926,d_1927,d_1928,d_1929,d_1930,d_1931,d_1932,d_1933,d_1934,d_1935,d_1936,d_1937,d_1938,d_1939,d_1940,d_1941
0,0.878364,0.829572,0.848728,0.836217,1.173502,1.134845,1.439513,1.059209,0.917849,1.000201,0.813199,1.217841,1.397406,1.049995,1.002165,0.887082,0.886191,0.861336,1.131346,1.370400,1.265341,0.933114,0.816087,0.813362,0.822805,1.087391,1.248901,1.224653
1,0.414614,0.399910,0.433227,0.484933,0.403850,0.309014,0.495441,0.339045,0.353143,0.376748,0.268261,0.346113,0.565997,0.290914,0.390233,0.359515,0.356162,0.403333,0.374120,0.540778,0.578704,0.402710,0.353642,0.401473,0.460387,0.441846,0.534733,0.490458
2,0.843870,0.718708,0.677600,0.697648,0.809299,0.588544,0.679869,0.992295,0.843043,0.849760,0.632878,0.933543,0.879856,0.456238,0.898400,0.755887,0.775140,0.768175,0.910920,0.847311,0.908218,0.855722,0.749595,0.731016,0.753382,0.844895,0.817588,0.821473
3,0.029978,0.033677,0.035516,0.039073,0.048831,0.069186,0.033178,1.079415,1.068895,1.073385,1.095266,1.097741,1.580490,1.000525,0.883000,1.079952,1.331692,1.524504,1.984955,3.333186,3.135908,2.453523,2.165799,1.718986,1.866847,1.712999,2.641309,2.765411
4,1.194233,1.054762,1.261058,1.285837,1.206779,1.422418,1.373997,1.138873,1.113892,1.163622,1.000102,1.337408,1.420022,1.587784,1.034771,0.979699,0.937789,0.936413,1.073373,1.422926,1.343912,0.948132,0.833976,0.843749,0.902930,1.037588,1.250754,1.227034
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3044,0.729156,0.669769,0.692886,0.720662,0.716123,1.042600,1.210155,0.724151,0.594300,0.597692,0.626124,0.702935,0.826547,0.607114,0.734776,0.601657,0.672184,0.676922,0.766367,0.900967,0.966661,0.744732,0.772489,0.723920,0.735400,0.816349,0.894720,0.988479
3045,0.493034,0.429075,0.397221,0.450880,0.465377,0.629204,0.668265,0.595862,0.514106,0.516006,0.287266,0.377188,0.620693,0.505673,0.420077,0.406354,0.377031,0.342279,0.431738,0.604017,0.677624,0.408741,0.381495,0.361719,0.360387,0.392729,0.696057,0.749905
3046,0.287094,0.234703,0.198081,0.183279,0.245102,0.474192,0.367370,0.281413,0.245411,0.238366,0.257623,0.269196,0.387918,0.324677,0.244823,0.250958,0.238867,0.223752,0.273854,0.419129,0.403396,0.269247,0.244710,0.237939,0.222775,0.281859,0.399614,0.384432
3047,0.005935,0.006065,0.005694,0.005354,0.006115,0.004818,0.005969,0.334317,0.383289,0.322592,0.445297,0.262077,0.263997,0.259794,0.323546,0.333814,0.280828,0.308275,0.308416,0.340683,0.306617,0.354505,0.365438,0.342323,0.308178,0.283009,0.313172,0.274423


In [187]:
CA_1_eva

Unnamed: 0,d_1914,d_1915,d_1916,d_1917,d_1918,d_1919,d_1920,d_1921,d_1922,d_1923,d_1924,d_1925,d_1926,d_1927,d_1928,d_1929,d_1930,d_1931,d_1932,d_1933,d_1934,d_1935,d_1936,d_1937,d_1938,d_1939,d_1940,d_1941
1612,2,0,0,0,0,1,1,0,6,1,0,3,0,0,0,2,0,0,2,3,1,0,0,0,1,0,0,0
1613,0,0,0,0,0,1,0,1,3,0,0,1,0,2,1,0,0,1,2,1,0,1,1,1,0,1,1,2
1614,3,1,0,0,0,0,0,3,1,0,0,1,2,2,0,0,0,0,2,0,0,0,0,0,2,2,0,1
1615,0,4,5,2,6,4,6,4,3,3,4,2,3,4,0,7,7,1,5,6,2,5,1,2,1,1,3,4
1616,1,0,0,0,1,3,4,2,1,1,4,0,3,2,2,1,1,1,2,1,11,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1607,0,0,1,0,0,0,0,0,0,2,1,0,0,1,1,1,0,1,1,1,1,0,0,0,2,2,0,0
1608,0,0,0,0,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,2,1,0,1,0,1,0,0
1609,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,1,2,0,1,0,0,0,1,0,0,0
1610,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,1,0


In [188]:
from typing import Union

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm_notebook as tqdm


class WRMSSEEvaluator(object):
    """Weighted RMSSE scorer over twelve aggregation levels.

    For every grouping in ``group_ids`` the constructor stores, as attributes
    ``lv<k>_scale``, ``lv<k>_train_df``, ``lv<k>_valid_df`` and
    ``lv<k>_weight`` (k = 1..12):

      * the aggregated training series and their scale, i.e. the mean squared
        day-to-day difference counted from the first non-zero value,
      * the aggregated validation series,
      * the normalised weight of every series, derived from units sold times
        ``sell_price`` over the last 28 training days.
    """

    def __init__(self, train_df: pd.DataFrame, valid_df: pd.DataFrame, calendar: pd.DataFrame, prices: pd.DataFrame):
        """Precompute scales, weights and aggregated targets.

        Parameters
        ----------
        train_df : DataFrame with the id columns (``item_id``, ``dept_id``,
            ``cat_id``, ``store_id``, ``state_id``, ...) and the ``d_*``
            training columns. It is copied, the caller's frame is not modified.
        valid_df : DataFrame with the ``d_*`` columns to score against. If it
            lacks the id columns they are joined from ``train_df`` by index.
        calendar : DataFrame providing ``d`` and ``wm_yr_wk``.
        prices : DataFrame providing ``item_id``, ``store_id``, ``wm_yr_wk``
            and ``sell_price``.
        """
        # Work on a copy so that adding 'all_id' does not alter the caller's
        # frame (which may itself be a slice of a larger frame).
        train_df = train_df.copy()

        train_y = train_df.loc[:, train_df.columns.str.startswith('d_')]
        train_target_columns = train_y.columns.tolist()
        weight_columns = train_y.iloc[:, -28:].columns.tolist()

        train_df['all_id'] = 0  # for lv1 aggregation

        id_columns = train_df.loc[:, ~train_df.columns.str.startswith('d_')].columns.tolist()
        valid_target_columns = valid_df.loc[:, valid_df.columns.str.startswith('d_')].columns.tolist()

        if not all([c in valid_df.columns for c in id_columns]):
            valid_df = pd.concat([train_df[id_columns], valid_df], axis=1, sort=False)

        self.train_df = train_df
        self.valid_df = valid_df
        self.calendar = calendar
        self.prices = prices

        self.weight_columns = weight_columns
        self.id_columns = id_columns
        self.valid_target_columns = valid_target_columns

        weight_df = self.get_weight_df()

        self.group_ids = (
            'all_id',
            'state_id',
            'store_id',
            'cat_id',
            'dept_id',
            ['state_id', 'cat_id'],
            ['state_id', 'dept_id'],
            ['store_id', 'cat_id'],
            ['store_id', 'dept_id'],
            'item_id',
            ['item_id', 'state_id'],
            ['item_id', 'store_id']
        )

        for i, group_id in enumerate(tqdm(self.group_ids)):
            train_y = train_df.groupby(group_id)[train_target_columns].sum()
            scale = []
            for _, row in train_y.iterrows():
                # Ignore the leading zeros before the series' first non-zero value.
                series = row.values[np.argmax(row.values != 0):]
                scale.append(((series[1:] - series[:-1]) ** 2).mean())

            setattr(self, f'lv{i + 1}_scale', np.array(scale))
            setattr(self, f'lv{i + 1}_train_df', train_y)
            setattr(self, f'lv{i + 1}_valid_df', valid_df.groupby(group_id)[valid_target_columns].sum())

            lv_weight = weight_df.groupby(group_id)[weight_columns].sum().sum(axis=1)
            setattr(self, f'lv{i + 1}_weight', lv_weight / lv_weight.sum())

    def get_weight_df(self) -> pd.DataFrame:
        """Return the id columns plus sales value (units * price) per weight day.

        The result has one row per row of ``self.train_df``, in the same order
        and with the same index.
        """
        day_to_week = self.calendar.set_index('d')['wm_yr_wk'].to_dict()
        weight_df = self.train_df[['item_id', 'store_id'] + self.weight_columns].set_index(['item_id', 'store_id'])
        weight_df = weight_df.stack().reset_index().rename(columns={'level_2': 'd', 0: 'value'})
        weight_df['wm_yr_wk'] = weight_df['d'].map(day_to_week)

        weight_df = weight_df.merge(self.prices, how='left', on=['item_id', 'store_id', 'wm_yr_wk'])
        weight_df['value'] = weight_df['value'] * weight_df['sell_price']
        weight_df = weight_df.set_index(['item_id', 'store_id', 'd']).unstack(level=2)['value']

        # Put the rows back into train_df order. A materialised list is passed
        # because .loc may not accept a one-shot zip iterator.
        row_keys = list(zip(self.train_df.item_id, self.train_df.store_id))
        weight_df = weight_df.loc[row_keys, :]
        # Reuse train_df's own index: the concat below aligns on index labels,
        # so a fresh 0..n-1 index would pair the weights with the wrong rows
        # whenever train_df was filtered or sorted.
        weight_df.index = self.train_df.index
        weight_df = pd.concat([self.train_df[self.id_columns], weight_df], axis=1, sort=False)
        return weight_df

    def rmsse(self, valid_preds: pd.DataFrame, lv: int) -> pd.Series:
        """Return the RMSSE of every series of aggregation level ``lv`` (1-based).

        ``valid_preds`` must already be aggregated to that level, i.e. share
        index and columns with ``lv<lv>_valid_df``.
        """
        valid_y = getattr(self, f'lv{lv}_valid_df')
        score = ((valid_y - valid_preds) ** 2).mean(axis=1)
        scale = getattr(self, f'lv{lv}_scale')
        return (score / scale).map(np.sqrt)

    def score(self, valid_preds: Union[pd.DataFrame, np.ndarray]) -> float:
        """Return the mean over all levels of the weighted RMSSE.

        ``valid_preds`` holds bottom-level predictions with the same shape as
        the validation targets. An ndarray is matched to the validation rows
        by position; a DataFrame is aligned by its index labels, so its index
        must match that of the validation frame.
        """
        assert self.valid_df[self.valid_target_columns].shape == valid_preds.shape

        if isinstance(valid_preds, np.ndarray):
            # Give the array the validation frame's index so that the
            # label-aligned concat below pairs rows by position.
            valid_preds = pd.DataFrame(valid_preds, columns=self.valid_target_columns,
                                       index=self.valid_df.index)

        valid_preds = pd.concat([self.valid_df[self.id_columns], valid_preds], axis=1, sort=False)

        all_scores = []
        for i, group_id in enumerate(self.group_ids):
            lv_scores = self.rmsse(valid_preds.groupby(group_id)[self.valid_target_columns].sum(), i + 1)
            weight = getattr(self, f'lv{i + 1}_weight')
            lv_scores = pd.concat([weight, lv_scores], axis=1, sort=False).prod(axis=1)
            all_scores.append(lv_scores.sum())

        return np.mean(all_scores)

In [189]:
# Commented-out alternative setup (not executed): builds a train/validation split
# and random predictions straight from sales_train_validation.csv.
#train_df = pd.read_csv('../input/m5-forecasting-accuracy/sales_train_validation.csv')
#train_fold_df = train_df.iloc[:, :-28]
#valid_fold_df = train_df.iloc[:, -28:]
#valid_preds = valid_fold_df.copy() + np.random.randint(100, size=valid_fold_df.shape)
# Calendar and price tables passed to WRMSSEEvaluator further down. Unlike the
# reads inside create_dt, no dtype map is applied here.
calendar = pd.read_csv("calendar.csv")
prices = pd.read_csv("sell_prices.csv")

In [190]:
# Keep every column except the last 28, then order the rows by series id.
train_fold_df = evaluation_set.iloc[:, :-28].sort_values('id')

In [191]:
# Restrict the training fold to the rows of store CA_1.
mask = train_fold_df["store_id"].eq("CA_1")
train_fold_df = train_fold_df.loc[mask]

In [192]:
# Display the training fold after the CA_1 store filter.
train_fold_df

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,d_5,d_6,d_7,d_8,d_9,d_10,d_11,d_12,d_13,d_14,d_15,d_16,d_17,d_18,d_19,...,d_1889,d_1890,d_1891,d_1892,d_1893,d_1894,d_1895,d_1896,d_1897,d_1898,d_1899,d_1900,d_1901,d_1902,d_1903,d_1904,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913
1612,FOODS_1_001_CA_1_evaluation,FOODS_1_001,FOODS_1,FOODS,CA_1,CA,3,0,0,1,4,2,0,2,0,0,0,0,3,1,3,0,2,1,2,...,0,4,0,0,4,1,3,0,1,0,2,2,0,1,1,0,2,0,4,1,1,0,1,1,0
1613,FOODS_1_002_CA_1_evaluation,FOODS_1_002,FOODS_1,FOODS,CA_1,CA,0,1,0,1,1,1,0,0,0,0,1,1,0,2,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,1,3,1,0,0,1,2,0,0,0
1614,FOODS_1_003_CA_1_evaluation,FOODS_1_003,FOODS_1,FOODS,CA_1,CA,0,0,0,0,1,3,1,1,1,0,1,0,0,2,1,1,1,0,0,...,1,1,0,0,1,0,1,1,0,1,0,0,1,2,0,3,0,2,1,1,0,1,0,1,0
1615,FOODS_1_004_CA_1_evaluation,FOODS_1_004,FOODS_1,FOODS,CA_1,CA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1616,FOODS_1_005_CA_1_evaluation,FOODS_1_005,FOODS_1,FOODS,CA_1,CA,3,9,3,3,0,2,1,2,1,7,4,0,2,3,6,15,5,0,0,...,0,3,0,0,2,0,0,3,0,2,0,2,0,1,2,3,1,1,2,0,2,2,1,4,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1607,HOUSEHOLD_2_512_CA_1_evaluation,HOUSEHOLD_2_512,HOUSEHOLD_2,HOUSEHOLD,CA_1,CA,0,0,0,0,0,2,1,1,2,0,0,0,0,0,0,0,2,0,0,...,1,0,1,1,0,1,1,0,0,0,0,1,2,0,0,0,1,0,6,1,0,1,0,1,0
1608,HOUSEHOLD_2_513_CA_1_evaluation,HOUSEHOLD_2_513,HOUSEHOLD_2,HOUSEHOLD,CA_1,CA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1,1,0,0,0,0,2,0,0,0,0,0,1,1,0,3
1609,HOUSEHOLD_2_514_CA_1_evaluation,HOUSEHOLD_2_514,HOUSEHOLD_2,HOUSEHOLD,CA_1,CA,2,1,1,0,1,0,1,0,0,0,1,2,0,0,0,0,0,0,0,...,0,0,2,1,0,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0
1610,HOUSEHOLD_2_515_CA_1_evaluation,HOUSEHOLD_2_515,HOUSEHOLD_2,HOUSEHOLD,CA_1,CA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [193]:
# (rows, columns) of the CA_1 training fold.
train_fold_df.shape

(3049, 1919)

In [194]:
# (rows, columns) of CA_1_eva, which is passed as the second argument to
# WRMSSEEvaluator below.
CA_1_eva.shape

(3049, 28)

In [195]:
# (rows, columns) of the predictions scored below; evaluator.score asserts that
# this shape equals that of the validation target columns.
pred_period1.shape

(3049, 28)

In [196]:
# List the column labels of CA_1_eva.
CA_1_eva.columns

Index(['d_1914', 'd_1915', 'd_1916', 'd_1917', 'd_1918', 'd_1919', 'd_1920',
       'd_1921', 'd_1922', 'd_1923', 'd_1924', 'd_1925', 'd_1926', 'd_1927',
       'd_1928', 'd_1929', 'd_1930', 'd_1931', 'd_1932', 'd_1933', 'd_1934',
       'd_1935', 'd_1936', 'd_1937', 'd_1938', 'd_1939', 'd_1940', 'd_1941'],
      dtype='object')

In [197]:
# Score the CA_1 predictions. reset_index(drop=True) returns a re-indexed copy
# (it does not modify the frame in place), so it is applied directly to each
# argument: score() joins predictions to the id columns by index label, which
# requires every frame to share the same 0..n-1 index.
evaluator = WRMSSEEvaluator(
    train_fold_df.reset_index(drop=True),
    CA_1_eva.reset_index(drop=True),
    calendar,
    prices,
)
evaluator.score(pred_period1.reset_index(drop=True))

start


HBox(children=(IntProgress(value=0, max=12), HTML(value='')))

all_id
state_id
store_id
cat_id
dept_id
['state_id', 'cat_id']
['state_id', 'dept_id']
['store_id', 'cat_id']
['store_id', 'dept_id']
item_id
['item_id', 'state_id']
['item_id', 'store_id']



0.44200160064488947

In [98]:
# NOTE(review): bare literal, presumably a score noted down from another run — confirm.
# Its execution count (98) is out of sequence with the preceding cells (189-197).
0.44833263676135743

0.44529593426108205