In [1]:
# CA
from  datetime import datetime, timedelta
import gc
import numpy as np, pandas as pd
import lightgbm as lgb

In [4]:
# Column dtypes used when reading calendar.csv, grouped by target dtype.
# "category" columns are later replaced by their integer category codes.
CAL_DTYPES = {
    **{col: "category" for col in ("event_name_1", "event_name_2",
                                   "event_type_1", "event_type_2", "weekday")},
    **{col: "int16" for col in ("wm_yr_wk", "wday", "month", "year")},
    **{col: "float32" for col in ("snap_CA", "snap_TX", "snap_WI")},
}

# Column dtypes used when reading sell_prices.csv.
PRICE_DTYPES = {
    "store_id": "category",
    "item_id": "category",
    "wm_yr_wk": "int16",
    "sell_price": "float32",
}

In [5]:
# Render up to 50 columns when a DataFrame is displayed.
pd.set_option("display.max_columns", 50)

# Presumably the forecast horizon in days (the prediction cell forecasts 28 days).
h = 28
# Days of history kept before the first forecast day; create_dt and the
# prediction loop both use it to size the look-back window.
max_lags = 57
# Index of the last day column (d_1913) read from the sales file.
tr_last = 1913
# Date on which the day-by-day prediction loop starts.
fday = datetime(year=2016, month=4, day=25)
fday

datetime.datetime(2016, 4, 25, 0, 0)

In [6]:
def create_dt(is_train = True, nrows = None, first_day = 1200, validate = True, store_id = None):
    """Build the long-format table (one row per item id and day) used for modelling.

    Reads ``sell_prices.csv``, ``calendar.csv`` and one of the sales files from
    the working directory, replaces every categorical column by its integer
    category code, melts the daily ``d_*`` columns into rows and joins the
    calendar and price information onto them.

    Relies on the notebook-level globals ``PRICE_DTYPES``, ``CAL_DTYPES``,
    ``tr_last``, ``max_lags`` and ``h``.

    Args:
        is_train: When True, day columns are loaded from
            ``max(1, first_day)`` up to ``tr_last``. When False, loading starts
            at ``max(tr_last - max_lags, first_day)`` and ``h`` empty (NaN)
            day columns are appended after ``tr_last`` to hold predictions.
        nrows: Forwarded to ``pd.read_csv`` for the sales file (``None`` reads
            every row).
        first_day: Lower bound for the first day column that is loaded.
        validate: True reads ``sales_train_validation.csv``, False reads
            ``sales_train_evaluation.csv``.
        store_id: Integer store code (after category encoding) to keep, or
            ``None`` to keep every store.

    Returns:
        pandas.DataFrame with the id/category columns, ``d``, ``sales``, the
        calendar columns and ``sell_price``. Both joins are inner joins, so
        rows without a matching calendar day or price are dropped.
    """
    prices = pd.read_csv("sell_prices.csv", dtype = PRICE_DTYPES)
    for col, col_dtype in PRICE_DTYPES.items():
        if col_dtype == "category":
            prices[col] = prices[col].cat.codes.astype("int16")
            prices[col] -= prices[col].min()

    if store_id is not None:
        prices = prices.loc[prices["store_id"] == store_id]

    cal = pd.read_csv("calendar.csv", dtype = CAL_DTYPES)
    cal["date"] = pd.to_datetime(cal["date"])
    for col, col_dtype in CAL_DTYPES.items():
        if col_dtype == "category":
            cal[col] = cal[col].cat.codes.astype("int16")
            cal[col] -= cal[col].min()

    start_day = max(1 if is_train else tr_last - max_lags, first_day)
    numcols = [f"d_{day}" for day in range(start_day, tr_last + 1)]
    catcols = ['id', 'item_id', 'dept_id', 'store_id', 'cat_id', 'state_id']
    dtype = {numcol: "float32" for numcol in numcols}
    dtype.update({col: "category" for col in catcols if col != "id"})

    # Both sales files are read the same way; only the file name differs.
    sales_file = "sales_train_validation.csv" if validate else "sales_train_evaluation.csv"
    dt = pd.read_csv(sales_file, nrows = nrows, usecols = catcols + numcols, dtype = dtype)

    # NOTE(review): sales and price tables are category-encoded independently;
    # the codes only line up if both contain the same set of values (which may
    # not hold when `nrows` truncates the sales file) - confirm before relying on it.
    for col in catcols:
        if col != "id":
            dt[col] = dt[col].cat.codes.astype("int16")
            dt[col] -= dt[col].min()

    if store_id is not None:
        # Drop other stores while the frame is still wide. The inner join with
        # the already filtered `prices` would discard these rows anyway, so the
        # result is unchanged but far fewer rows are melted and merged.
        dt = dt.loc[dt["store_id"] == store_id].copy()

    if not is_train:
        # Placeholder columns for the days that will be predicted.
        for day in range(tr_last + 1, tr_last + h + 1):
            dt[f"d_{day}"] = np.nan

    dt = pd.melt(dt,
                 id_vars = catcols,
                 value_vars = [col for col in dt.columns if col.startswith("d_")],
                 var_name = "d",
                 value_name = "sales")

    dt = dt.merge(cal, on = "d", copy = False)
    dt = dt.merge(prices, on = ["store_id", "item_id", "wm_yr_wk"], copy = False)

    return dt

In [7]:
def _week_of_year(dates):
    """Return the ISO week number of a datetime Series on old and new pandas."""
    accessor = dates.dt
    if hasattr(accessor, "isocalendar"):
        # `Series.dt.weekofyear` was removed in pandas 2.0; this is its replacement.
        return accessor.isocalendar().week
    return accessor.weekofyear


def create_fea(dt):
    """Add lag, rolling-mean and calendar features to ``dt`` in place.

    Args:
        dt: DataFrame with at least the columns ``id``, ``sales`` and ``date``
            (datetime). Rows are assumed to already be in chronological order
            within each ``id``; shifts and rolling windows follow row order.

    Columns written:
        * ``lag_7``, ``lag_28``: ``sales`` shifted by 7 / 28 rows within each ``id``.
        * ``rmean_{lag}_{win}``: mean of ``lag_{lag}`` over the last ``win``
          rows of the same ``id``, for every lag and window in (7, 28).
        * ``wday``, ``week``, ``month``, ``quarter``, ``year``, ``mday``: cast to
          ``int16`` if the column already exists, otherwise derived from
          ``date`` and stored as ``int16``.

    Returns:
        None. ``dt`` is modified in place.
    """
    lags = [7, 28]
    lag_cols = [f"lag_{lag}" for lag in lags]
    for lag, lag_col in zip(lags, lag_cols):
        dt[lag_col] = dt[["id", "sales"]].groupby("id")["sales"].shift(lag)

    wins = [7, 28]
    for win in wins:
        for lag, lag_col in zip(lags, lag_cols):
            dt[f"rmean_{lag}_{win}"] = (
                dt[["id", lag_col]]
                .groupby("id")[lag_col]
                .transform(lambda x: x.rolling(win).mean())
            )

    # Feature name -> attribute of the `.dt` accessor that produces it.
    date_features = {
        "wday": "weekday",
        "week": "weekofyear",
        "month": "month",
        "quarter": "quarter",
        "year": "year",
        "mday": "day",
#         "ime": "is_month_end",
#         "ims": "is_month_start",
    }

    for date_feat_name, date_feat_func in date_features.items():
        if date_feat_name in dt.columns:
            # Already supplied (e.g. by the calendar file): only normalise the dtype.
            dt[date_feat_name] = dt[date_feat_name].astype("int16")
        elif date_feat_func == "weekofyear":
            dt[date_feat_name] = _week_of_year(dt["date"]).astype("int16")
        else:
            dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")

In [8]:
%%time
# Build the training frame for a single store.
# FIRST_DAY = 1 makes create_dt load every day column starting at d_1.
FIRST_DAY = 1
# STORE_ID is the integer category code that create_dt compares against the
# encoded `store_id` column, not a store name such as "CA_1".
# NOTE(review): the saved outputs below show store_id == 2 and CA_3 ids, which
# does not match STORE_ID = 1 - the outputs look stale; re-run to confirm.
STORE_ID = 1
df = create_dt(is_train=True, first_day= FIRST_DAY, store_id = STORE_ID)
df.shape

CPU times: user 33.2 s, sys: 13.6 s, total: 46.9 s
Wall time: 56.5 s


(4671941, 22)

In [9]:
df.head()

Unnamed: 0,id,item_id,dept_id,store_id,cat_id,state_id,d,sales,date,wm_yr_wk,weekday,wday,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price
0,HOBBIES_1_008_CA_3_validation,7,0,2,0,0,d_1,11.0,2011-01-29,11101,2,1,1,2011,0,0,0,0,0.0,0.0,0.0,0.46
1,HOBBIES_1_008_CA_3_validation,7,0,2,0,0,d_2,12.0,2011-01-30,11101,3,2,1,2011,0,0,0,0,0.0,0.0,0.0,0.46
2,HOBBIES_1_008_CA_3_validation,7,0,2,0,0,d_3,33.0,2011-01-31,11101,1,3,1,2011,0,0,0,0,0.0,0.0,0.0,0.46
3,HOBBIES_1_008_CA_3_validation,7,0,2,0,0,d_4,13.0,2011-02-01,11101,5,4,2,2011,0,0,0,0,1.0,1.0,0.0,0.46
4,HOBBIES_1_008_CA_3_validation,7,0,2,0,0,d_5,53.0,2011-02-02,11101,6,5,2,2011,0,0,0,0,1.0,0.0,1.0,0.46


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4671941 entries, 0 to 4671940
Data columns (total 22 columns):
id              object
item_id         int16
dept_id         int16
store_id        int16
cat_id          int16
state_id        int16
d               object
sales           float32
date            datetime64[ns]
wm_yr_wk        int16
weekday         int16
wday            int16
month           int16
year            int16
event_name_1    int16
event_type_1    int16
event_name_2    int16
event_type_2    int16
snap_CA         float32
snap_TX         float32
snap_WI         float32
sell_price      float32
dtypes: datetime64[ns](1), float32(5), int16(14), object(2)
memory usage: 356.4+ MB


In [11]:
df['store_id'].value_counts()

2    4671941
Name: store_id, dtype: int64

In [12]:
%%time

create_fea(df)
df.shape

CPU times: user 16 s, sys: 1.21 s, total: 17.3 s
Wall time: 15.7 s


(4671941, 31)

In [13]:
df.head()

Unnamed: 0,id,item_id,dept_id,store_id,cat_id,state_id,d,sales,date,wm_yr_wk,weekday,wday,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price,lag_7,lag_28,rmean_7_7,rmean_28_7,rmean_7_28,rmean_28_28,week,quarter,mday
0,HOBBIES_1_008_CA_3_validation,7,0,2,0,0,d_1,11.0,2011-01-29,11101,2,1,1,2011,0,0,0,0,0.0,0.0,0.0,0.46,,,,,,,4,1,29
1,HOBBIES_1_008_CA_3_validation,7,0,2,0,0,d_2,12.0,2011-01-30,11101,3,2,1,2011,0,0,0,0,0.0,0.0,0.0,0.46,,,,,,,4,1,30
2,HOBBIES_1_008_CA_3_validation,7,0,2,0,0,d_3,33.0,2011-01-31,11101,1,3,1,2011,0,0,0,0,0.0,0.0,0.0,0.46,,,,,,,5,1,31
3,HOBBIES_1_008_CA_3_validation,7,0,2,0,0,d_4,13.0,2011-02-01,11101,5,4,2,2011,0,0,0,0,1.0,1.0,0.0,0.46,,,,,,,5,1,1
4,HOBBIES_1_008_CA_3_validation,7,0,2,0,0,d_5,53.0,2011-02-02,11101,6,5,2,2011,0,0,0,0,1.0,0.0,1.0,0.46,,,,,,,5,1,2


In [14]:
df.dropna(inplace = True)
print(df.shape)
df.head()

(4504246, 31)


Unnamed: 0,id,item_id,dept_id,store_id,cat_id,state_id,d,sales,date,wm_yr_wk,weekday,wday,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price,lag_7,lag_28,rmean_7_7,rmean_28_7,rmean_7_28,rmean_28_28,week,quarter,mday
66758,HOBBIES_1_008_CA_3_validation,7,0,2,0,0,d_56,44.0,2011-03-25,11108,0,7,3,2011,0,0,0,0,0.0,0.0,0.0,0.42,30.0,8.0,29.714285,23.285715,18.0,18.392857,12,1,25
66765,HOBBIES_1_009_CA_3_validation,8,0,2,0,0,d_56,0.0,2011-03-25,11108,0,7,3,2011,0,0,0,0,0.0,0.0,0.0,1.56,0.0,0.0,1.142857,1.428571,1.5,1.5,12,1,25
66772,HOBBIES_1_010_CA_3_validation,9,0,2,0,0,d_56,1.0,2011-03-25,11108,0,7,3,2011,0,0,0,0,0.0,0.0,0.0,2.97,0.0,0.0,0.714286,0.714286,0.535714,0.678571,12,1,25
66779,HOBBIES_1_012_CA_3_validation,11,0,2,0,0,d_56,1.0,2011-03-25,11108,0,7,3,2011,0,0,0,0,0.0,0.0,0.0,6.27,1.0,3.0,1.0,1.857143,1.142857,1.571429,12,1,25
66786,HOBBIES_1_015_CA_3_validation,14,0,2,0,0,d_56,6.0,2011-03-25,11108,0,7,3,2011,0,0,0,0,0.0,0.0,0.0,0.72,2.0,14.0,6.714286,8.428572,8.75,7.357143,12,1,25


In [15]:
df["store_id"].value_counts()

2    4504246
Name: store_id, dtype: int64

In [16]:
# change here according to store_id

df_CA1 = df.drop(columns = ['state_id', "store_id", 'snap_TX', 'snap_WI'])

In [17]:
print(df_CA1.shape)
df_CA1.head()

(4504246, 27)


Unnamed: 0,id,item_id,dept_id,cat_id,d,sales,date,wm_yr_wk,weekday,wday,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,sell_price,lag_7,lag_28,rmean_7_7,rmean_28_7,rmean_7_28,rmean_28_28,week,quarter,mday
66758,HOBBIES_1_008_CA_3_validation,7,0,0,d_56,44.0,2011-03-25,11108,0,7,3,2011,0,0,0,0,0.0,0.42,30.0,8.0,29.714285,23.285715,18.0,18.392857,12,1,25
66765,HOBBIES_1_009_CA_3_validation,8,0,0,d_56,0.0,2011-03-25,11108,0,7,3,2011,0,0,0,0,0.0,1.56,0.0,0.0,1.142857,1.428571,1.5,1.5,12,1,25
66772,HOBBIES_1_010_CA_3_validation,9,0,0,d_56,1.0,2011-03-25,11108,0,7,3,2011,0,0,0,0,0.0,2.97,0.0,0.0,0.714286,0.714286,0.535714,0.678571,12,1,25
66779,HOBBIES_1_012_CA_3_validation,11,0,0,d_56,1.0,2011-03-25,11108,0,7,3,2011,0,0,0,0,0.0,6.27,1.0,3.0,1.0,1.857143,1.142857,1.571429,12,1,25
66786,HOBBIES_1_015_CA_3_validation,14,0,0,d_56,6.0,2011-03-25,11108,0,7,3,2011,0,0,0,0,0.0,0.72,2.0,14.0,6.714286,8.428572,8.75,7.357143,12,1,25


In [18]:
# Columns passed to LightGBM as categorical features (integer-coded by create_dt).
cat_feats = ['item_id', 'dept_id', 'cat_id'] + ["event_name_1", "event_name_2", "event_type_1", "event_type_2"]
# Identifier, target and raw calendar columns that are kept out of the feature matrix.
useless_cols = ["id", "date", "sales","d", "wm_yr_wk", "weekday", "year"]
# Every other column of df_CA1 is a model feature, in its original column order.
# `train_cols` is reused by the prediction loop to select the same features.
train_cols = df_CA1.columns[~df_CA1.columns.isin(useless_cols)]
X_train = df_CA1[train_cols]
y_train = df_CA1["sales"]

In [19]:
X_train.head()

Unnamed: 0,item_id,dept_id,cat_id,wday,month,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,sell_price,lag_7,lag_28,rmean_7_7,rmean_28_7,rmean_7_28,rmean_28_28,week,quarter,mday
66758,7,0,0,7,3,0,0,0,0,0.0,0.42,30.0,8.0,29.714285,23.285715,18.0,18.392857,12,1,25
66765,8,0,0,7,3,0,0,0,0,0.0,1.56,0.0,0.0,1.142857,1.428571,1.5,1.5,12,1,25
66772,9,0,0,7,3,0,0,0,0,0.0,2.97,0.0,0.0,0.714286,0.714286,0.535714,0.678571,12,1,25
66779,11,0,0,7,3,0,0,0,0,0.0,6.27,1.0,3.0,1.0,1.857143,1.142857,1.571429,12,1,25
66786,14,0,0,7,3,0,0,0,0,0.0,0.72,2.0,14.0,6.714286,8.428572,8.75,7.357143,12,1,25


In [20]:
%%time

# Seed NumPy's global RNG so the hold-out sample drawn below is reproducible.
np.random.seed(777)

# Hold out 500,000 index labels, drawn without replacement, as a "fake"
# validation set; all remaining labels form the training set.
fake_valid_inds = np.random.choice(X_train.index.values, 500_000, replace = False)
train_inds = np.setdiff1d(X_train.index.values, fake_valid_inds)
train_data = lgb.Dataset(X_train.loc[train_inds] , label = y_train.loc[train_inds], 
                         categorical_feature=cat_feats, free_raw_data=False)
fake_valid_data = lgb.Dataset(X_train.loc[fake_valid_inds], label = y_train.loc[fake_valid_inds],
                              categorical_feature=cat_feats,
                 free_raw_data=False)
# The validation rows are a plain random sample, not a time-based hold-out.
# NOTE(review): validation and training rows therefore cover the same dates, so
# the validation RMSE may be optimistic compared with true future performance.

CPU times: user 1.53 s, sys: 256 ms, total: 1.79 s
Wall time: 1.46 s


In [21]:
fake_valid_inds.shape

(500000,)

In [22]:
train_inds.shape

(4004246,)

In [23]:
del df, X_train, y_train, fake_valid_inds,train_inds ; gc.collect()

90

In [24]:
# LightGBM training parameters.
# The original literal listed "metric" twice ("rmse" and ["rmse"]); Python keeps
# only the last value, so the key is now written once with that value.
params = {
    "objective": "poisson",
    "metric": ["rmse"],
    "force_row_wise": True,
    "learning_rate": 0.075,
#         "sub_feature" : 0.8,
    "sub_row": 0.75,
    "bagging_freq": 1,
    "lambda_l2": 0.1,
#         "nthread" : 4
    "verbosity": 1,
    "num_iterations": 1200,
    "num_leaves": 128,
    "min_data_in_leaf": 100,
}

In [25]:
%%time

# Train the booster on `train_data`, reporting the metric on the random
# hold-out set every 20 iterations.
# NOTE(review): the `verbose_eval` argument was removed from `lgb.train` in
# LightGBM 4.0; on newer versions use `callbacks=[lgb.log_evaluation(20)]`.
m_lgb = lgb.train(params, train_data, valid_sets = [fake_valid_data], verbose_eval=20) 



[20]	valid_0's rmse: 4.38563
[40]	valid_0's rmse: 3.82861
[60]	valid_0's rmse: 3.69677
[80]	valid_0's rmse: 3.65965
[100]	valid_0's rmse: 3.64498
[120]	valid_0's rmse: 3.63519
[140]	valid_0's rmse: 3.62373
[160]	valid_0's rmse: 3.61124
[180]	valid_0's rmse: 3.59914
[200]	valid_0's rmse: 3.58494
[220]	valid_0's rmse: 3.57727
[240]	valid_0's rmse: 3.56588
[260]	valid_0's rmse: 3.55597
[280]	valid_0's rmse: 3.54922
[300]	valid_0's rmse: 3.54278
[320]	valid_0's rmse: 3.53272
[340]	valid_0's rmse: 3.52631
[360]	valid_0's rmse: 3.52002
[380]	valid_0's rmse: 3.51593
[400]	valid_0's rmse: 3.51013
[420]	valid_0's rmse: 3.50288
[440]	valid_0's rmse: 3.49841
[460]	valid_0's rmse: 3.49633
[480]	valid_0's rmse: 3.49026
[500]	valid_0's rmse: 3.48648
[520]	valid_0's rmse: 3.4822
[540]	valid_0's rmse: 3.47952
[560]	valid_0's rmse: 3.47584
[580]	valid_0's rmse: 3.47234
[600]	valid_0's rmse: 3.46893
[620]	valid_0's rmse: 3.46547
[640]	valid_0's rmse: 3.46426
[660]	valid_0's rmse: 3.46273
[680]	valid_0's

In [26]:
m_lgb.save_model("model_" + str(STORE_ID) + ".lgb")

<lightgbm.basic.Booster at 0x7f98a91ca890>

In [27]:
#m_lgb = lgb.Booster(model_file= "model_" + str(STORE_ID) + ".lgb")

In [28]:
m_lgb

<lightgbm.basic.Booster at 0x7f98a91ca890>

In [30]:
%%time

alphas = [1.028, 1.023, 1.018]
weights = [1/len(alphas)]*len(alphas)
sub = 0.

for icount, (alpha, weight) in enumerate(zip(alphas, weights)):

    te = create_dt(False, store_id = 1)
    mask = te["store_id"] == 1
    te_CA1 = te[mask]
    te_CA1 = te_CA1.drop("snap_TX", axis = 1)
    te_CA1 = te_CA1.drop("snap_WI", axis = 1)
    
    cols = [f"F{i}" for i in range(1,29)]

    for tdelta in range(0, 28):
        day = fday + timedelta(days=tdelta)
        print(tdelta, day)
        tst = te_CA1[(te_CA1.date >= day - timedelta(days=max_lags)) & (te_CA1.date <= day)].copy()
        create_fea(tst)
        tst = tst.loc[tst.date == day , train_cols]
        te_CA1.loc[te_CA1.date == day, "sales"] = alpha*m_lgb.predict(tst) # magic multiplier by 
        

    te_sub = te_CA1.loc[te_CA1.date >= fday, ["id", "sales"]].copy()

    te_sub["F"] = [f"F{rank}" for rank in te_sub.groupby("id")["id"].cumcount()+1]
    te_sub = te_sub.set_index(["id", "F" ]).unstack()["sales"][cols].reset_index()
    te_sub.fillna(0., inplace = True)
    te_sub.sort_values("id", inplace = True)
    te_sub.reset_index(drop=True, inplace = True)
    #te_sub.to_csv(f"submission_{icount}.csv",index=False)
    if icount == 0 :
        sub = te_sub
        sub[cols] *= weight
    else:
        sub[cols] += te_sub[cols]*weight
    print(icount, alpha, weight)


sub2 = sub.copy()
sub2["id"] = sub2["id"].str.replace("validation$", "evaluation")
sub = pd.concat([sub, sub2], axis=0, sort=False)
sub.to_csv("CA_2.csv",index=False)

0 2016-04-25 00:00:00
1 2016-04-26 00:00:00
2 2016-04-27 00:00:00
3 2016-04-28 00:00:00
4 2016-04-29 00:00:00
5 2016-04-30 00:00:00
6 2016-05-01 00:00:00
7 2016-05-02 00:00:00
8 2016-05-03 00:00:00
9 2016-05-04 00:00:00
10 2016-05-05 00:00:00
11 2016-05-06 00:00:00
12 2016-05-07 00:00:00
13 2016-05-08 00:00:00
14 2016-05-09 00:00:00
15 2016-05-10 00:00:00
16 2016-05-11 00:00:00
17 2016-05-12 00:00:00
18 2016-05-13 00:00:00
19 2016-05-14 00:00:00
20 2016-05-15 00:00:00
21 2016-05-16 00:00:00
22 2016-05-17 00:00:00
23 2016-05-18 00:00:00
24 2016-05-19 00:00:00
25 2016-05-20 00:00:00
26 2016-05-21 00:00:00
27 2016-05-22 00:00:00
0 1.028 0.3333333333333333
0 2016-04-25 00:00:00
1 2016-04-26 00:00:00
2 2016-04-27 00:00:00
3 2016-04-28 00:00:00
4 2016-04-29 00:00:00
5 2016-04-30 00:00:00
6 2016-05-01 00:00:00
7 2016-05-02 00:00:00
8 2016-05-03 00:00:00
9 2016-05-04 00:00:00
10 2016-05-05 00:00:00
11 2016-05-06 00:00:00
12 2016-05-07 00:00:00
13 2016-05-08 00:00:00
14 2016-05-09 00:00:00
15 2

In [77]:
sub.shape

(6098, 29)

In [76]:
sub.head()

F,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,F11,F12,F13,F14,F15,F16,F17,F18,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,FOODS_1_001_CA_3_validation,1.206988,1.085061,0.98427,0.978099,1.321499,1.064815,1.785777,0.873728,0.898837,0.844383,0.93221,1.14212,1.643105,1.538392,1.159466,1.042358,0.928478,0.901299,1.215475,1.76042,1.852182,1.227675,1.067192,0.986361,0.935664,1.284434,1.654123,1.903352
1,FOODS_1_002_CA_3_validation,0.516288,0.483796,0.513579,0.505823,0.41295,0.520568,0.683233,0.515541,0.480019,0.506328,0.579737,0.452254,0.543407,0.733803,0.549102,0.53383,0.511066,0.503416,0.431699,0.5574,0.640403,0.545621,0.525551,0.509158,0.523097,0.444505,0.546274,0.62508
2,FOODS_1_003_CA_3_validation,0.909705,0.768261,0.879471,0.816307,0.900153,1.390661,1.135084,0.919849,0.86048,0.875115,0.693357,0.904542,1.299739,0.776227,0.944743,0.882258,0.869662,0.864576,0.903471,1.290856,1.102637,0.910138,0.857237,0.911219,0.918977,0.974731,1.39454,1.209604
3,FOODS_1_004_CA_3_validation,0.632958,0.662426,0.201111,0.213544,0.240101,0.257691,0.334925,0.30762,0.282181,0.285804,0.287585,0.32002,0.399565,0.443964,0.328079,0.330375,0.34024,0.370582,0.386855,0.504916,0.490735,0.449029,0.512389,0.522089,0.551723,0.566864,0.773257,0.940997
4,FOODS_1_005_CA_3_validation,2.106064,1.756214,1.736908,1.641401,1.930983,2.539602,1.791107,1.661761,1.777059,1.711315,1.266173,1.791195,2.329885,1.719375,1.580214,1.533351,1.494325,1.423851,1.600085,2.263964,2.281264,1.629829,1.465484,1.364178,1.321698,1.444605,1.859589,2.030914


In [78]:
sub.tail()

F,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,F11,F12,F13,F14,F15,F16,F17,F18,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
3044,HOUSEHOLD_2_512_CA_3_evaluation,1.132292,0.969994,1.088643,1.13333,1.376913,1.530162,1.467778,1.292819,1.522118,1.603029,1.410445,1.886277,1.960347,1.560496,1.745769,1.638929,1.555383,1.484855,1.542624,1.887684,1.953548,1.555129,1.469173,1.41488,1.511435,1.604545,1.95688,2.032895
3045,HOUSEHOLD_2_513_CA_3_evaluation,1.350279,1.192961,1.113899,1.184158,1.69958,1.814625,1.700663,1.767309,1.760778,1.632103,1.531615,1.501435,1.743459,1.222281,1.464031,1.464566,1.417666,1.447428,1.596289,1.683206,1.890511,1.607305,1.585087,1.5509,1.561031,1.582138,1.936671,2.080852
3046,HOUSEHOLD_2_514_CA_3_evaluation,0.162072,0.233353,0.185977,0.196151,0.187312,0.320115,0.270724,0.224778,0.206238,0.201331,0.156628,0.207073,0.267383,0.278845,0.225371,0.221792,0.202238,0.200549,0.210975,0.287232,0.331648,0.218936,0.218061,0.207314,0.205724,0.214551,0.287474,0.33846
3047,HOUSEHOLD_2_515_CA_3_evaluation,0.175777,0.170541,0.163519,0.15163,0.179331,0.154998,0.194287,0.171467,0.195292,0.169771,0.093668,0.18367,0.233353,0.13601,0.174317,0.18009,0.173975,0.165858,0.206044,0.24204,0.241713,0.181922,0.177841,0.17203,0.162489,0.192452,0.22825,0.217183
3048,HOUSEHOLD_2_516_CA_3_evaluation,0.126493,0.125558,0.131001,0.107366,0.121293,0.153937,0.156494,0.169734,0.167725,0.169741,0.353588,0.191351,0.195772,0.190042,0.155961,0.15418,0.153336,0.156282,0.17702,0.191131,0.153635,0.155736,0.154547,0.156479,0.156698,0.172831,0.188593,0.15521


In [79]:
sub.loc[3048]

F,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,F11,F12,F13,F14,F15,F16,F17,F18,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
3048,HOUSEHOLD_2_516_CA_3_validation,0.126493,0.125558,0.131001,0.107366,0.121293,0.153937,0.156494,0.169734,0.167725,0.169741,0.353588,0.191351,0.195772,0.190042,0.155961,0.15418,0.153336,0.156282,0.17702,0.191131,0.153635,0.155736,0.154547,0.156479,0.156698,0.172831,0.188593,0.15521
3048,HOUSEHOLD_2_516_CA_3_evaluation,0.126493,0.125558,0.131001,0.107366,0.121293,0.153937,0.156494,0.169734,0.167725,0.169741,0.353588,0.191351,0.195772,0.190042,0.155961,0.15418,0.153336,0.156282,0.17702,0.191131,0.153635,0.155736,0.154547,0.156479,0.156698,0.172831,0.188593,0.15521


In [80]:
evaluation_set = pd.read_csv("sales_train_evaluation.csv")
evaluation_set.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,d_5,d_6,d_7,d_8,d_9,d_10,d_11,d_12,d_13,d_14,d_15,d_16,d_17,d_18,d_19,...,d_1917,d_1918,d_1919,d_1920,d_1921,d_1922,d_1923,d_1924,d_1925,d_1926,d_1927,d_1928,d_1929,d_1930,d_1931,d_1932,d_1933,d_1934,d_1935,d_1936,d_1937,d_1938,d_1939,d_1940,d_1941
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,2,0,3,5,0,0,1,1,0,2,1,2,2,1,0,2,4,0,0,0,0,3,3,0,1
1,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,2,1,1,0,0,0,0,0
2,HOBBIES_1_003_CA_1_evaluation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,1,0,2,1,0,0,0,0,2,1,3,0,0,1,0,1,0,2,0,0,0,2,3,0,1
3,HOBBIES_1_004_CA_1_evaluation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,2,4,1,6,4,0,0,0,2,2,4,2,1,1,1,1,1,0,4,0,1,3,0,2,6
4,HOBBIES_1_005_CA_1_evaluation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,3,1,0,3,2,3,1,1,3,2,3,2,2,2,2,0,0,0,2,1,0,0,2,1,0


In [108]:
# Keep only the rows of one store from the evaluation file.
# NOTE(review): the variable is named CA_1_eva but the filter selects "CA_2",
# while the saved output below shows CA_3 rows - confirm which store is meant
# and that it matches the store the forecasts in `sub` were produced for.
mask = evaluation_set["store_id"] == "CA_2"
CA_1_eva = evaluation_set[mask]
CA_1_eva.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,d_5,d_6,d_7,d_8,d_9,d_10,d_11,d_12,d_13,d_14,d_15,d_16,d_17,d_18,d_19,...,d_1917,d_1918,d_1919,d_1920,d_1921,d_1922,d_1923,d_1924,d_1925,d_1926,d_1927,d_1928,d_1929,d_1930,d_1931,d_1932,d_1933,d_1934,d_1935,d_1936,d_1937,d_1938,d_1939,d_1940,d_1941
6098,HOBBIES_1_001_CA_3_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_3,CA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,1,0,2,1,0,1,2,1,1,4,1,1,0,0,0,2,6,0,1,0,2,1,0,1,0
6099,HOBBIES_1_002_CA_3_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_3,CA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
6100,HOBBIES_1_003_CA_3_evaluation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_3,CA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0
6101,HOBBIES_1_004_CA_3_evaluation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_3,CA,0,0,0,0,0,0,0,0,0,0,0,0,10,2,0,0,0,0,0,...,10,1,12,9,9,2,10,5,5,7,0,0,0,10,8,2,10,6,0,5,2,4,2,0,5
6102,HOBBIES_1_005_CA_3_evaluation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_3,CA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,5,3,1,3,1,4,5,2,2,2,0,2,2,0,3,1,2,3,2,0,1,2,2,2,1


In [109]:
CA_1_eva = CA_1_eva.sort_values('id')
CA_1_eva

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,d_5,d_6,d_7,d_8,d_9,d_10,d_11,d_12,d_13,d_14,d_15,d_16,d_17,d_18,d_19,...,d_1917,d_1918,d_1919,d_1920,d_1921,d_1922,d_1923,d_1924,d_1925,d_1926,d_1927,d_1928,d_1929,d_1930,d_1931,d_1932,d_1933,d_1934,d_1935,d_1936,d_1937,d_1938,d_1939,d_1940,d_1941
7710,FOODS_1_001_CA_3_evaluation,FOODS_1_001,FOODS_1,FOODS,CA_3,CA,1,2,1,1,1,2,0,1,1,1,0,0,3,3,2,7,1,2,4,...,0,8,1,0,0,1,0,0,1,1,0,0,0,0,0,1,2,2,0,0,1,0,3,2,2
7711,FOODS_1_002_CA_3_evaluation,FOODS_1_002,FOODS_1,FOODS,CA_3,CA,1,0,1,2,0,0,0,1,1,0,1,0,0,0,1,0,0,0,0,...,2,0,2,0,0,1,1,2,0,0,0,0,1,0,3,1,1,2,0,0,0,0,1,2,0
7712,FOODS_1_003_CA_3_evaluation,FOODS_1_003,FOODS_1,FOODS,CA_3,CA,1,1,1,3,0,2,1,1,1,1,1,0,0,2,3,5,1,5,0,...,1,0,0,3,0,0,1,3,0,1,1,0,0,0,3,1,0,3,2,4,0,0,0,0,3
7713,FOODS_1_004_CA_3_evaluation,FOODS_1_004,FOODS_1,FOODS,CA_3,CA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,5,12,6,10,13,7,12,7,4,5,16,12,9,8,8,12,7,7,11,19
7714,FOODS_1_005_CA_3_evaluation,FOODS_1_005,FOODS_1,FOODS,CA_3,CA,10,3,2,10,1,4,6,1,13,11,2,4,10,3,13,3,1,7,3,...,0,2,1,5,1,1,4,2,8,3,1,4,1,3,2,0,6,1,4,0,3,1,11,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7705,HOUSEHOLD_2_512_CA_3_evaluation,HOUSEHOLD_2_512,HOUSEHOLD_2,HOUSEHOLD,CA_3,CA,0,0,0,0,0,1,0,0,0,0,0,0,1,0,2,1,0,1,0,...,1,1,0,2,1,0,1,0,0,0,3,0,0,0,0,1,1,1,0,2,0,0,2,1,2
7706,HOUSEHOLD_2_513_CA_3_evaluation,HOUSEHOLD_2_513,HOUSEHOLD_2,HOUSEHOLD,CA_3,CA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,2,1,2,1,1,3,1,0,2,2,1,4,3,1,3,0,0,0,0,5,0,0,3,2,1
7707,HOUSEHOLD_2_514_CA_3_evaluation,HOUSEHOLD_2_514,HOUSEHOLD_2,HOUSEHOLD,CA_3,CA,0,2,0,0,0,0,2,0,0,0,0,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,3,1
7708,HOUSEHOLD_2_515_CA_3_evaluation,HOUSEHOLD_2_515,HOUSEHOLD_2,HOUSEHOLD,CA_3,CA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0


In [110]:
CA_1_eva = CA_1_eva.iloc[:, -28:]
CA_1_eva

Unnamed: 0,d_1914,d_1915,d_1916,d_1917,d_1918,d_1919,d_1920,d_1921,d_1922,d_1923,d_1924,d_1925,d_1926,d_1927,d_1928,d_1929,d_1930,d_1931,d_1932,d_1933,d_1934,d_1935,d_1936,d_1937,d_1938,d_1939,d_1940,d_1941
7710,1,0,1,0,8,1,0,0,1,0,0,1,1,0,0,0,0,0,1,2,2,0,0,1,0,3,2,2
7711,0,1,1,2,0,2,0,0,1,1,2,0,0,0,0,1,0,3,1,1,2,0,0,0,0,1,2,0
7712,1,1,0,1,0,0,3,0,0,1,3,0,1,1,0,0,0,3,1,0,3,2,4,0,0,0,0,3
7713,0,0,0,0,0,0,0,0,5,12,6,10,13,7,12,7,4,5,16,12,9,8,8,12,7,7,11,19
7714,4,0,12,0,2,1,5,1,1,4,2,8,3,1,4,1,3,2,0,6,1,4,0,3,1,11,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7705,1,0,0,1,1,0,2,1,0,1,0,0,0,3,0,0,0,0,1,1,1,0,2,0,0,2,1,2
7706,1,3,0,2,1,2,1,1,3,1,0,2,2,1,4,3,1,3,0,0,0,0,5,0,0,3,2,1
7707,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,3,1
7708,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0


In [111]:
# `sub` is the "validation" forecasts followed by an equally sized "evaluation"
# copy, so its first half is the set of forecasts to compare with the actuals.
# Using half the length avoids hard-coding the per-store item count (3049).
pred_period1 = sub.iloc[: len(sub) // 2]
pred_period1

F,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,F11,F12,F13,F14,F15,F16,F17,F18,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,FOODS_1_001_CA_3_validation,1.206988,1.085061,0.984270,0.978099,1.321499,1.064815,1.785777,0.873728,0.898837,0.844383,0.932210,1.142120,1.643105,1.538392,1.159466,1.042358,0.928478,0.901299,1.215475,1.760420,1.852182,1.227675,1.067192,0.986361,0.935664,1.284434,1.654123,1.903352
1,FOODS_1_002_CA_3_validation,0.516288,0.483796,0.513579,0.505823,0.412950,0.520568,0.683233,0.515541,0.480019,0.506328,0.579737,0.452254,0.543407,0.733803,0.549102,0.533830,0.511066,0.503416,0.431699,0.557400,0.640403,0.545621,0.525551,0.509158,0.523097,0.444505,0.546274,0.625080
2,FOODS_1_003_CA_3_validation,0.909705,0.768261,0.879471,0.816307,0.900153,1.390661,1.135084,0.919849,0.860480,0.875115,0.693357,0.904542,1.299739,0.776227,0.944743,0.882258,0.869662,0.864576,0.903471,1.290856,1.102637,0.910138,0.857237,0.911219,0.918977,0.974731,1.394540,1.209604
3,FOODS_1_004_CA_3_validation,0.632958,0.662426,0.201111,0.213544,0.240101,0.257691,0.334925,0.307620,0.282181,0.285804,0.287585,0.320020,0.399565,0.443964,0.328079,0.330375,0.340240,0.370582,0.386855,0.504916,0.490735,0.449029,0.512389,0.522089,0.551723,0.566864,0.773257,0.940997
4,FOODS_1_005_CA_3_validation,2.106064,1.756214,1.736908,1.641401,1.930983,2.539602,1.791107,1.661761,1.777059,1.711315,1.266173,1.791195,2.329885,1.719375,1.580214,1.533351,1.494325,1.423851,1.600085,2.263964,2.281264,1.629829,1.465484,1.364178,1.321698,1.444605,1.859589,2.030914
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3044,HOUSEHOLD_2_512_CA_3_validation,1.132292,0.969994,1.088643,1.133330,1.376913,1.530162,1.467778,1.292819,1.522118,1.603029,1.410445,1.886277,1.960347,1.560496,1.745769,1.638929,1.555383,1.484855,1.542624,1.887684,1.953548,1.555129,1.469173,1.414880,1.511435,1.604545,1.956880,2.032895
3045,HOUSEHOLD_2_513_CA_3_validation,1.350279,1.192961,1.113899,1.184158,1.699580,1.814625,1.700663,1.767309,1.760778,1.632103,1.531615,1.501435,1.743459,1.222281,1.464031,1.464566,1.417666,1.447428,1.596289,1.683206,1.890511,1.607305,1.585087,1.550900,1.561031,1.582138,1.936671,2.080852
3046,HOUSEHOLD_2_514_CA_3_validation,0.162072,0.233353,0.185977,0.196151,0.187312,0.320115,0.270724,0.224778,0.206238,0.201331,0.156628,0.207073,0.267383,0.278845,0.225371,0.221792,0.202238,0.200549,0.210975,0.287232,0.331648,0.218936,0.218061,0.207314,0.205724,0.214551,0.287474,0.338460
3047,HOUSEHOLD_2_515_CA_3_validation,0.175777,0.170541,0.163519,0.151630,0.179331,0.154998,0.194287,0.171467,0.195292,0.169771,0.093668,0.183670,0.233353,0.136010,0.174317,0.180090,0.173975,0.165858,0.206044,0.242040,0.241713,0.181922,0.177841,0.172030,0.162489,0.192452,0.228250,0.217183


In [112]:
pred_period1 = pred_period1.sort_values('id')
pred_period1 

F,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,F11,F12,F13,F14,F15,F16,F17,F18,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,FOODS_1_001_CA_3_validation,1.206988,1.085061,0.984270,0.978099,1.321499,1.064815,1.785777,0.873728,0.898837,0.844383,0.932210,1.142120,1.643105,1.538392,1.159466,1.042358,0.928478,0.901299,1.215475,1.760420,1.852182,1.227675,1.067192,0.986361,0.935664,1.284434,1.654123,1.903352
1,FOODS_1_002_CA_3_validation,0.516288,0.483796,0.513579,0.505823,0.412950,0.520568,0.683233,0.515541,0.480019,0.506328,0.579737,0.452254,0.543407,0.733803,0.549102,0.533830,0.511066,0.503416,0.431699,0.557400,0.640403,0.545621,0.525551,0.509158,0.523097,0.444505,0.546274,0.625080
2,FOODS_1_003_CA_3_validation,0.909705,0.768261,0.879471,0.816307,0.900153,1.390661,1.135084,0.919849,0.860480,0.875115,0.693357,0.904542,1.299739,0.776227,0.944743,0.882258,0.869662,0.864576,0.903471,1.290856,1.102637,0.910138,0.857237,0.911219,0.918977,0.974731,1.394540,1.209604
3,FOODS_1_004_CA_3_validation,0.632958,0.662426,0.201111,0.213544,0.240101,0.257691,0.334925,0.307620,0.282181,0.285804,0.287585,0.320020,0.399565,0.443964,0.328079,0.330375,0.340240,0.370582,0.386855,0.504916,0.490735,0.449029,0.512389,0.522089,0.551723,0.566864,0.773257,0.940997
4,FOODS_1_005_CA_3_validation,2.106064,1.756214,1.736908,1.641401,1.930983,2.539602,1.791107,1.661761,1.777059,1.711315,1.266173,1.791195,2.329885,1.719375,1.580214,1.533351,1.494325,1.423851,1.600085,2.263964,2.281264,1.629829,1.465484,1.364178,1.321698,1.444605,1.859589,2.030914
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3044,HOUSEHOLD_2_512_CA_3_validation,1.132292,0.969994,1.088643,1.133330,1.376913,1.530162,1.467778,1.292819,1.522118,1.603029,1.410445,1.886277,1.960347,1.560496,1.745769,1.638929,1.555383,1.484855,1.542624,1.887684,1.953548,1.555129,1.469173,1.414880,1.511435,1.604545,1.956880,2.032895
3045,HOUSEHOLD_2_513_CA_3_validation,1.350279,1.192961,1.113899,1.184158,1.699580,1.814625,1.700663,1.767309,1.760778,1.632103,1.531615,1.501435,1.743459,1.222281,1.464031,1.464566,1.417666,1.447428,1.596289,1.683206,1.890511,1.607305,1.585087,1.550900,1.561031,1.582138,1.936671,2.080852
3046,HOUSEHOLD_2_514_CA_3_validation,0.162072,0.233353,0.185977,0.196151,0.187312,0.320115,0.270724,0.224778,0.206238,0.201331,0.156628,0.207073,0.267383,0.278845,0.225371,0.221792,0.202238,0.200549,0.210975,0.287232,0.331648,0.218936,0.218061,0.207314,0.205724,0.214551,0.287474,0.338460
3047,HOUSEHOLD_2_515_CA_3_validation,0.175777,0.170541,0.163519,0.151630,0.179331,0.154998,0.194287,0.171467,0.195292,0.169771,0.093668,0.183670,0.233353,0.136010,0.174317,0.180090,0.173975,0.165858,0.206044,0.242040,0.241713,0.181922,0.177841,0.172030,0.162489,0.192452,0.228250,0.217183


In [113]:
CA_1_eva.head()

Unnamed: 0,d_1914,d_1915,d_1916,d_1917,d_1918,d_1919,d_1920,d_1921,d_1922,d_1923,d_1924,d_1925,d_1926,d_1927,d_1928,d_1929,d_1930,d_1931,d_1932,d_1933,d_1934,d_1935,d_1936,d_1937,d_1938,d_1939,d_1940,d_1941
7710,1,0,1,0,8,1,0,0,1,0,0,1,1,0,0,0,0,0,1,2,2,0,0,1,0,3,2,2
7711,0,1,1,2,0,2,0,0,1,1,2,0,0,0,0,1,0,3,1,1,2,0,0,0,0,1,2,0
7712,1,1,0,1,0,0,3,0,0,1,3,0,1,1,0,0,0,3,1,0,3,2,4,0,0,0,0,3
7713,0,0,0,0,0,0,0,0,5,12,6,10,13,7,12,7,4,5,16,12,9,8,8,12,7,7,11,19
7714,4,0,12,0,2,1,5,1,1,4,2,8,3,1,4,1,3,2,0,6,1,4,0,3,1,11,2,1


In [114]:
pred_period1 = pred_period1.drop(columns='id')
pred_period1.columns = CA_1_eva.columns

In [115]:
pred_period1

Unnamed: 0,d_1914,d_1915,d_1916,d_1917,d_1918,d_1919,d_1920,d_1921,d_1922,d_1923,d_1924,d_1925,d_1926,d_1927,d_1928,d_1929,d_1930,d_1931,d_1932,d_1933,d_1934,d_1935,d_1936,d_1937,d_1938,d_1939,d_1940,d_1941
0,1.206988,1.085061,0.984270,0.978099,1.321499,1.064815,1.785777,0.873728,0.898837,0.844383,0.932210,1.142120,1.643105,1.538392,1.159466,1.042358,0.928478,0.901299,1.215475,1.760420,1.852182,1.227675,1.067192,0.986361,0.935664,1.284434,1.654123,1.903352
1,0.516288,0.483796,0.513579,0.505823,0.412950,0.520568,0.683233,0.515541,0.480019,0.506328,0.579737,0.452254,0.543407,0.733803,0.549102,0.533830,0.511066,0.503416,0.431699,0.557400,0.640403,0.545621,0.525551,0.509158,0.523097,0.444505,0.546274,0.625080
2,0.909705,0.768261,0.879471,0.816307,0.900153,1.390661,1.135084,0.919849,0.860480,0.875115,0.693357,0.904542,1.299739,0.776227,0.944743,0.882258,0.869662,0.864576,0.903471,1.290856,1.102637,0.910138,0.857237,0.911219,0.918977,0.974731,1.394540,1.209604
3,0.632958,0.662426,0.201111,0.213544,0.240101,0.257691,0.334925,0.307620,0.282181,0.285804,0.287585,0.320020,0.399565,0.443964,0.328079,0.330375,0.340240,0.370582,0.386855,0.504916,0.490735,0.449029,0.512389,0.522089,0.551723,0.566864,0.773257,0.940997
4,2.106064,1.756214,1.736908,1.641401,1.930983,2.539602,1.791107,1.661761,1.777059,1.711315,1.266173,1.791195,2.329885,1.719375,1.580214,1.533351,1.494325,1.423851,1.600085,2.263964,2.281264,1.629829,1.465484,1.364178,1.321698,1.444605,1.859589,2.030914
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3044,1.132292,0.969994,1.088643,1.133330,1.376913,1.530162,1.467778,1.292819,1.522118,1.603029,1.410445,1.886277,1.960347,1.560496,1.745769,1.638929,1.555383,1.484855,1.542624,1.887684,1.953548,1.555129,1.469173,1.414880,1.511435,1.604545,1.956880,2.032895
3045,1.350279,1.192961,1.113899,1.184158,1.699580,1.814625,1.700663,1.767309,1.760778,1.632103,1.531615,1.501435,1.743459,1.222281,1.464031,1.464566,1.417666,1.447428,1.596289,1.683206,1.890511,1.607305,1.585087,1.550900,1.561031,1.582138,1.936671,2.080852
3046,0.162072,0.233353,0.185977,0.196151,0.187312,0.320115,0.270724,0.224778,0.206238,0.201331,0.156628,0.207073,0.267383,0.278845,0.225371,0.221792,0.202238,0.200549,0.210975,0.287232,0.331648,0.218936,0.218061,0.207314,0.205724,0.214551,0.287474,0.338460
3047,0.175777,0.170541,0.163519,0.151630,0.179331,0.154998,0.194287,0.171467,0.195292,0.169771,0.093668,0.183670,0.233353,0.136010,0.174317,0.180090,0.173975,0.165858,0.206044,0.242040,0.241713,0.181922,0.177841,0.172030,0.162489,0.192452,0.228250,0.217183


In [116]:
CA_1_eva

Unnamed: 0,d_1914,d_1915,d_1916,d_1917,d_1918,d_1919,d_1920,d_1921,d_1922,d_1923,d_1924,d_1925,d_1926,d_1927,d_1928,d_1929,d_1930,d_1931,d_1932,d_1933,d_1934,d_1935,d_1936,d_1937,d_1938,d_1939,d_1940,d_1941
7710,1,0,1,0,8,1,0,0,1,0,0,1,1,0,0,0,0,0,1,2,2,0,0,1,0,3,2,2
7711,0,1,1,2,0,2,0,0,1,1,2,0,0,0,0,1,0,3,1,1,2,0,0,0,0,1,2,0
7712,1,1,0,1,0,0,3,0,0,1,3,0,1,1,0,0,0,3,1,0,3,2,4,0,0,0,0,3
7713,0,0,0,0,0,0,0,0,5,12,6,10,13,7,12,7,4,5,16,12,9,8,8,12,7,7,11,19
7714,4,0,12,0,2,1,5,1,1,4,2,8,3,1,4,1,3,2,0,6,1,4,0,3,1,11,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7705,1,0,0,1,1,0,2,1,0,1,0,0,0,3,0,0,0,0,1,1,1,0,2,0,0,2,1,2
7706,1,3,0,2,1,2,1,1,3,1,0,2,2,1,4,3,1,3,0,0,0,0,5,0,0,3,2,1
7707,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,3,1
7708,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0


In [117]:
from typing import Union

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm_notebook as tqdm


class WRMSSEEvaluator(object):
    """Score forecasts with a weighted RMSSE averaged over 12 aggregation levels.

    The evaluator aggregates the training sales, the hold-out sales and the
    predictions by each entry of ``group_ids`` (total, state, store, category,
    department, their combinations, item, item/state, item/store). For level
    ``n`` (1-based, in ``group_ids`` order) the constructor stores:

    * ``lv{n}_train_df`` -- training sales summed per group,
    * ``lv{n}_valid_df`` -- hold-out sales summed per group,
    * ``lv{n}_scale``    -- per-group mean squared day-to-day difference of the
      training series, computed from its first non-zero day onwards,
    * ``lv{n}_weight``   -- per-group share of revenue (units * sell_price)
      over the last 28 training days, normalised to sum to 1 within the level.
    """

    def __init__(self, train_df: pd.DataFrame, valid_df: pd.DataFrame, calendar: pd.DataFrame, prices: pd.DataFrame):
        """Precompute per-level scales, weights and aggregated actuals.

        Args:
            train_df: one row per series; id columns plus ``d_*`` sales columns.
            valid_df: hold-out ``d_*`` columns, row-aligned with ``train_df``.
                If any id column is missing, the id columns of ``train_df``
                are concatenated to it column-wise.
            calendar: needs columns ``d`` and ``wm_yr_wk``.
            prices: needs ``item_id``, ``store_id``, ``wm_yr_wk``, ``sell_price``.
        """
        # Shallow copy: the helper column below is added to our own frame only,
        # so the caller's DataFrame is not modified (and no sales data is copied).
        train_df = train_df.copy(deep=False)
        train_df['all_id'] = 0  # constant key so level 1 aggregates every series

        is_day_column = train_df.columns.str.startswith('d_')
        train_target_columns = train_df.columns[is_day_column].tolist()
        weight_columns = train_target_columns[-28:]  # weights use the last 28 training days
        id_columns = train_df.columns[~is_day_column].tolist()
        valid_target_columns = valid_df.columns[valid_df.columns.str.startswith('d_')].tolist()

        if not all(c in valid_df.columns for c in id_columns):
            # Attach the id columns so the hold-out frame can be grouped like the train frame.
            valid_df = pd.concat([train_df[id_columns], valid_df], axis=1, sort=False)

        self.train_df = train_df
        self.valid_df = valid_df
        self.calendar = calendar
        self.prices = prices

        self.weight_columns = weight_columns
        self.id_columns = id_columns
        self.valid_target_columns = valid_target_columns

        weight_df = self.get_weight_df()

        self.group_ids = (
            'all_id',
            'state_id',
            'store_id',
            'cat_id',
            'dept_id',
            ['state_id', 'cat_id'],
            ['state_id', 'dept_id'],
            ['store_id', 'cat_id'],
            ['store_id', 'dept_id'],
            'item_id',
            ['item_id', 'state_id'],
            ['item_id', 'store_id']
        )

        for i, group_id in enumerate(tqdm(self.group_ids)):
            level = i + 1
            train_y = train_df.groupby(group_id)[train_target_columns].sum()

            scale = []
            for values in train_y.values:
                # Drop the leading zeros: the series starts at its first non-zero day.
                series = values[np.argmax(values != 0):]
                # NOTE(review): a series that never changes after that day gives
                # scale 0, which turns the ratio in rmsse() into inf/NaN.
                scale.append(((series[1:] - series[:-1]) ** 2).mean())

            setattr(self, f'lv{level}_scale', np.array(scale))
            setattr(self, f'lv{level}_train_df', train_y)
            setattr(self, f'lv{level}_valid_df', valid_df.groupby(group_id)[valid_target_columns].sum())

            lv_weight = weight_df.groupby(group_id)[weight_columns].sum().sum(axis=1)
            setattr(self, f'lv{level}_weight', lv_weight / lv_weight.sum())

    def get_weight_df(self) -> pd.DataFrame:
        """Return revenue (units * sell_price) per series for each weight day.

        Returns:
            A frame with the id columns of ``train_df`` followed by one column
            per day in ``weight_columns``, in the same row order as ``train_df``.
        """
        day_to_week = self.calendar.set_index('d')['wm_yr_wk'].to_dict()

        # Long format: one row per (item, store, day) with the units sold.
        weight_df = self.train_df[['item_id', 'store_id'] + self.weight_columns].melt(
            id_vars=['item_id', 'store_id'], var_name='d', value_name='value')
        weight_df['wm_yr_wk'] = weight_df['d'].map(day_to_week)

        # Prices are weekly, so join on the week of each day.
        weight_df = weight_df.merge(self.prices, how='left', on=['item_id', 'store_id', 'wm_yr_wk'])
        weight_df['value'] = weight_df['value'] * weight_df['sell_price']

        # Back to wide format, then restore the row order of train_df.
        weight_df = weight_df.set_index(['item_id', 'store_id', 'd']).unstack(level=2)['value']
        # A list (not a one-shot zip iterator) so the selector can be traversed more than once.
        row_keys = list(zip(self.train_df.item_id, self.train_df.store_id))
        weight_df = weight_df.loc[row_keys, :].reset_index(drop=True)
        weight_df = pd.concat([self.train_df[self.id_columns], weight_df], axis=1, sort=False)
        return weight_df

    def rmsse(self, valid_preds: pd.DataFrame, lv: int) -> pd.Series:
        """Return sqrt(mean squared error / scale) per group for level ``lv``.

        Args:
            valid_preds: predictions already summed per group for this level,
                indexed and labelled like ``lv{lv}_valid_df``.
            lv: 1-based level number.
        """
        valid_y = getattr(self, f'lv{lv}_valid_df')
        score = ((valid_y - valid_preds) ** 2).mean(axis=1)
        # scale is a plain array in the group order produced by groupby in __init__,
        # so it is applied to the per-group errors by position.
        scale = getattr(self, f'lv{lv}_scale')
        return (score / scale).map(np.sqrt)

    def score(self, valid_preds: Union[pd.DataFrame, np.ndarray]) -> float:
        """Return the mean over all levels of the weight-summed per-group RMSSE.

        Args:
            valid_preds: predictions with the same shape as the hold-out target
                columns, row-aligned with ``valid_df``. An ndarray is given the
                hold-out column names.

        Raises:
            AssertionError: if the shape differs from the hold-out targets.
        """
        assert self.valid_df[self.valid_target_columns].shape == valid_preds.shape

        if isinstance(valid_preds, np.ndarray):
            valid_preds = pd.DataFrame(valid_preds, columns=self.valid_target_columns)

        valid_preds = pd.concat([self.valid_df[self.id_columns], valid_preds], axis=1, sort=False)

        all_scores = []
        for i, group_id in enumerate(self.group_ids):
            lv_scores = self.rmsse(valid_preds.groupby(group_id)[self.valid_target_columns].sum(), i + 1)
            weight = getattr(self, f'lv{i + 1}_weight')
            # Align weight and score on the group index, multiply row-wise, then total.
            lv_scores = pd.concat([weight, lv_scores], axis=1, sort=False).prod(axis=1)
            all_scores.append(lv_scores.sum())

        return np.mean(all_scores)

In [118]:
# Commented-out earlier variant, kept for reference: it split
# sales_train_validation.csv into a training fold and a last-28-day fold and
# scored the fold plus random integer noise.
#train_df = pd.read_csv('../input/m5-forecasting-accuracy/sales_train_validation.csv')
#train_fold_df = train_df.iloc[:, :-28]
#valid_fold_df = train_df.iloc[:, -28:]
#valid_preds = valid_fold_df.copy() + np.random.randint(100, size=valid_fold_df.shape)
# Raw calendar and price tables (default dtypes); both are handed to
# WRMSSEEvaluator in the scoring cell.
calendar = pd.read_csv("calendar.csv")
prices = pd.read_csv("sell_prices.csv")

In [119]:
# Training fold: every column except the final 28, with rows ordered by series id.
train_fold_df = evaluation_set.iloc[:, :-28].sort_values('id')

In [120]:
# Shape of the training fold before the store filter in the next cell.
train_fold_df.shape

(30490, 1919)

In [121]:
# Keep only the rows of a single store.
# NOTE(review): the filter literal is "CA_2", but the recorded outputs below show
# CA_3 ids and the hold-out frame is named CA_1_eva -- confirm which store is intended.
mask = train_fold_df["store_id"].eq("CA_2")
train_fold_df = train_fold_df.loc[mask]

In [122]:
# Preview only: the re-indexed frame is displayed but not assigned, so
# train_fold_df itself keeps its original row labels.
train_fold_df.reset_index(drop = True)

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,d_5,d_6,d_7,d_8,d_9,d_10,d_11,d_12,d_13,d_14,d_15,d_16,d_17,d_18,d_19,...,d_1889,d_1890,d_1891,d_1892,d_1893,d_1894,d_1895,d_1896,d_1897,d_1898,d_1899,d_1900,d_1901,d_1902,d_1903,d_1904,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913
0,FOODS_1_001_CA_3_evaluation,FOODS_1_001,FOODS_1,FOODS,CA_3,CA,1,2,1,1,1,2,0,1,1,1,0,0,3,3,2,7,1,2,4,...,0,0,1,0,0,0,1,0,0,4,2,1,0,0,0,0,0,13,0,0,0,0,0,1,0
1,FOODS_1_002_CA_3_evaluation,FOODS_1_002,FOODS_1,FOODS,CA_3,CA,1,0,1,2,0,0,0,1,1,0,1,0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0,1,0,0,1,2,1,0,1,0,0,1,0,0,1,1
2,FOODS_1_003_CA_3_evaluation,FOODS_1_003,FOODS_1,FOODS,CA_3,CA,1,1,1,3,0,2,1,1,1,1,1,0,0,2,3,5,1,5,0,...,2,3,1,1,0,0,0,0,3,1,1,3,0,0,0,0,1,2,3,0,3,0,0,4,0
3,FOODS_1_004_CA_3_evaluation,FOODS_1_004,FOODS_1,FOODS,CA_3,CA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,FOODS_1_005_CA_3_evaluation,FOODS_1_005,FOODS_1,FOODS,CA_3,CA,10,3,2,10,1,4,6,1,13,11,2,4,10,3,13,3,1,7,3,...,9,1,0,1,9,0,1,0,5,1,4,6,3,1,2,4,2,0,1,0,1,0,3,11,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3044,HOUSEHOLD_2_512_CA_3_evaluation,HOUSEHOLD_2_512,HOUSEHOLD_2,HOUSEHOLD,CA_3,CA,0,0,0,0,0,1,0,0,0,0,0,0,1,0,2,1,0,1,0,...,0,0,1,2,2,1,1,0,3,0,4,3,2,1,0,1,2,0,0,6,0,5,1,3,2
3045,HOUSEHOLD_2_513_CA_3_evaluation,HOUSEHOLD_2_513,HOUSEHOLD_2,HOUSEHOLD,CA_3,CA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,1,1,0,1,0,2,1,0,0,5,2,0,0,1,2,1,0,1,0,1,2,2,4,2,5
3046,HOUSEHOLD_2_514_CA_3_evaluation,HOUSEHOLD_2_514,HOUSEHOLD_2,HOUSEHOLD,CA_3,CA,0,2,0,0,0,0,2,0,0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0
3047,HOUSEHOLD_2_515_CA_3_evaluation,HOUSEHOLD_2_515,HOUSEHOLD_2,HOUSEHOLD,CA_3,CA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0


In [123]:
# Preview only: the re-indexed frame is displayed but not assigned, so
# CA_1_eva itself keeps its original row labels.
CA_1_eva.reset_index(drop = True)

Unnamed: 0,d_1914,d_1915,d_1916,d_1917,d_1918,d_1919,d_1920,d_1921,d_1922,d_1923,d_1924,d_1925,d_1926,d_1927,d_1928,d_1929,d_1930,d_1931,d_1932,d_1933,d_1934,d_1935,d_1936,d_1937,d_1938,d_1939,d_1940,d_1941
0,1,0,1,0,8,1,0,0,1,0,0,1,1,0,0,0,0,0,1,2,2,0,0,1,0,3,2,2
1,0,1,1,2,0,2,0,0,1,1,2,0,0,0,0,1,0,3,1,1,2,0,0,0,0,1,2,0
2,1,1,0,1,0,0,3,0,0,1,3,0,1,1,0,0,0,3,1,0,3,2,4,0,0,0,0,3
3,0,0,0,0,0,0,0,0,5,12,6,10,13,7,12,7,4,5,16,12,9,8,8,12,7,7,11,19
4,4,0,12,0,2,1,5,1,1,4,2,8,3,1,4,1,3,2,0,6,1,4,0,3,1,11,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3044,1,0,0,1,1,0,2,1,0,1,0,0,0,3,0,0,0,0,1,1,1,0,2,0,0,2,1,2
3045,1,3,0,2,1,2,1,1,3,1,0,2,2,1,4,3,1,3,0,0,0,0,5,0,0,3,2,1
3046,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,3,1
3047,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0


In [124]:
# Row count after the store filter; it has to equal that of CA_1_eva and
# pred_period1, which are joined to this frame side by side in the scoring cell.
train_fold_df.shape

(3049, 1919)

In [125]:
# Hold-out actuals: expected one row per training series.
CA_1_eva.shape

(3049, 28)

In [126]:
# Predictions must have exactly the shape of the hold-out actuals;
# WRMSSEEvaluator.score asserts this.
pred_period1.shape

(3049, 28)

In [127]:
# Check the hold-out column labels: the evaluator selects target columns by the 'd_' prefix.
CA_1_eva.columns

Index(['d_1914', 'd_1915', 'd_1916', 'd_1917', 'd_1918', 'd_1919', 'd_1920',
       'd_1921', 'd_1922', 'd_1923', 'd_1924', 'd_1925', 'd_1926', 'd_1927',
       'd_1928', 'd_1929', 'd_1930', 'd_1931', 'd_1932', 'd_1933', 'd_1934',
       'd_1935', 'd_1936', 'd_1937', 'd_1938', 'd_1939', 'd_1940', 'd_1941'],
      dtype='object')

In [128]:
# Build the evaluator on freshly re-indexed frames so train ids, actuals and
# predictions line up row by row, then score the predictions.
evaluator = WRMSSEEvaluator(
    train_df=train_fold_df.reset_index(drop=True),
    valid_df=CA_1_eva.reset_index(drop=True),
    calendar=calendar,
    prices=prices,
)
evaluator.score(valid_preds=pred_period1.reset_index(drop=True))

Python 3.7.4 (default, Aug 13 2019, 15:17:50) 
Type 'copyright', 'credits' or 'license' for more information
IPython 7.8.0 -- An enhanced Interactive Python. Type '?' for help.

In [1]: exit

Python 3.7.4 (default, Aug 13 2019, 15:17:50) 
Type 'copyright', 'credits' or 'license' for more information
IPython 7.8.0 -- An enhanced Interactive Python. Type '?' for help.

In [1]: exit

start
HBox(children=(IntProgress(value=0, max=12), HTML(value='')))
all_id
state_id
store_id
cat_id
dept_id
['state_id', 'cat_id']
['state_id', 'dept_id']
['store_id', 'cat_id']
['store_id', 'dept_id']
item_id
['item_id', 'state_id']
['item_id', 'store_id']



0.5809044846978456

In [71]:
# NOTE(review): execution count 71 is lower than the cells above (In [116]-[128]),
# so this output comes from an earlier run; re-run top to bottom before trusting it.
pred_period1.shape

(3049, 28)

In [72]:
# NOTE(review): out-of-order cell (In [72] after In [128]); duplicates the shape check above.
CA_1_eva.shape

(3049, 28)

In [73]:
# NOTE(review): the recorded row labels here (1612, ...) differ from the In [116]
# display of the same name (7710, ...), so this output is from a different
# run or subset -- TODO confirm which one is intended.
CA_1_eva

Unnamed: 0,d_1914,d_1915,d_1916,d_1917,d_1918,d_1919,d_1920,d_1921,d_1922,d_1923,d_1924,d_1925,d_1926,d_1927,d_1928,d_1929,d_1930,d_1931,d_1932,d_1933,d_1934,d_1935,d_1936,d_1937,d_1938,d_1939,d_1940,d_1941
1612,2,0,0,0,0,1,1,0,6,1,0,3,0,0,0,2,0,0,2,3,1,0,0,0,1,0,0,0
1613,0,0,0,0,0,1,0,1,3,0,0,1,0,2,1,0,0,1,2,1,0,1,1,1,0,1,1,2
1614,3,1,0,0,0,0,0,3,1,0,0,1,2,2,0,0,0,0,2,0,0,0,0,0,2,2,0,1
1615,0,4,5,2,6,4,6,4,3,3,4,2,3,4,0,7,7,1,5,6,2,5,1,2,1,1,3,4
1616,1,0,0,0,1,3,4,2,1,1,4,0,3,2,2,1,1,1,2,1,11,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1607,0,0,1,0,0,0,0,0,0,2,1,0,0,1,1,1,0,1,1,1,1,0,0,0,2,2,0,0
1608,0,0,0,0,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,2,1,0,1,0,1,0,0
1609,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,1,2,0,1,0,0,0,1,0,0,0
1610,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,1,0


In [74]:
# NOTE(review): the recorded output includes an `all_id` column, so this display
# was captured after an evaluator run had added that helper column to the frame.
train_fold_df

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,d_5,d_6,d_7,d_8,d_9,d_10,d_11,d_12,d_13,d_14,d_15,d_16,d_17,d_18,d_19,...,d_1890,d_1891,d_1892,d_1893,d_1894,d_1895,d_1896,d_1897,d_1898,d_1899,d_1900,d_1901,d_1902,d_1903,d_1904,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913,all_id
7710,FOODS_1_001_CA_3_evaluation,FOODS_1_001,FOODS_1,FOODS,CA_3,CA,1,2,1,1,1,2,0,1,1,1,0,0,3,3,2,7,1,2,4,...,0,1,0,0,0,1,0,0,4,2,1,0,0,0,0,0,13,0,0,0,0,0,1,0,0
7711,FOODS_1_002_CA_3_evaluation,FOODS_1_002,FOODS_1,FOODS,CA_3,CA,1,0,1,2,0,0,0,1,1,0,1,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,1,0,1,0,0,1,2,1,0,1,0,0,1,0,0,1,1,0
7712,FOODS_1_003_CA_3_evaluation,FOODS_1_003,FOODS_1,FOODS,CA_3,CA,1,1,1,3,0,2,1,1,1,1,1,0,0,2,3,5,1,5,0,...,3,1,1,0,0,0,0,3,1,1,3,0,0,0,0,1,2,3,0,3,0,0,4,0,0
7713,FOODS_1_004_CA_3_evaluation,FOODS_1_004,FOODS_1,FOODS,CA_3,CA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7714,FOODS_1_005_CA_3_evaluation,FOODS_1_005,FOODS_1,FOODS,CA_3,CA,10,3,2,10,1,4,6,1,13,11,2,4,10,3,13,3,1,7,3,...,1,0,1,9,0,1,0,5,1,4,6,3,1,2,4,2,0,1,0,1,0,3,11,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7705,HOUSEHOLD_2_512_CA_3_evaluation,HOUSEHOLD_2_512,HOUSEHOLD_2,HOUSEHOLD,CA_3,CA,0,0,0,0,0,1,0,0,0,0,0,0,1,0,2,1,0,1,0,...,0,1,2,2,1,1,0,3,0,4,3,2,1,0,1,2,0,0,6,0,5,1,3,2,0
7706,HOUSEHOLD_2_513_CA_3_evaluation,HOUSEHOLD_2_513,HOUSEHOLD_2,HOUSEHOLD,CA_3,CA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,1,0,1,0,2,1,0,0,5,2,0,0,1,2,1,0,1,0,1,2,2,4,2,5,0
7707,HOUSEHOLD_2_514_CA_3_evaluation,HOUSEHOLD_2_514,HOUSEHOLD_2,HOUSEHOLD,CA_3,CA,0,2,0,0,0,0,2,0,0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0
7708,HOUSEHOLD_2_515_CA_3_evaluation,HOUSEHOLD_2_515,HOUSEHOLD_2,HOUSEHOLD,CA_3,CA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0
