In [7]:
# General imports
import numpy as np
import pandas as pd
import os, sys, gc, time, warnings, pickle, psutil, random

warnings.filterwarnings('ignore')

In [8]:
def seed_everything(seed=0):
    """Seed the global random number generators used in this notebook.

    Both Python's ``random`` module and NumPy's legacy global generator are
    seeded with the same value.

    :param seed: seed value handed to each generator     # type: int
    """
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

In [9]:
# Read data
def get_data_by_store(store):
    """Assemble the feature grid for one store.

    Concatenates the BASE, PRICE and CALENDAR pickles column-wise, keeps the
    rows whose 'store_id' equals `store`, then appends the mean-encoding
    columns listed in `mean_features` and the lag columns read from LAGS.

    :param store: store identifier compared against the 'store_id' column
    :return: tuple ``(df, features)``; `df` holds 'id', 'd', TARGET and the
             feature columns for rows with ``d >= START_TRAIN`` (index reset),
             `features` is the list of feature column names in column order
    """
    # Read and concat basic features
    df = pd.concat([pd.read_pickle(BASE),
                    pd.read_pickle(PRICE).iloc[:,2:],
                    pd.read_pickle(CALENDAR).iloc[:,2:]],
                    axis=1)

    # Leave only relevant store
    df = df[df['store_id']==store]

    # With memory limits we have to read
    # lags and mean encoding features
    # separately and drop items that we don't need.
    # As our Features Grids are aligned
    # we can use index to keep only necessary rows
    # Alignment is good for us as concat uses less memory than merge.
    df2 = pd.read_pickle(MEAN_ENC)[mean_features]
    df2 = df2[df2.index.isin(df.index)]

    df3 = pd.read_pickle(LAGS).iloc[:,3:]
    df3 = df3[df3.index.isin(df.index)]

    df = pd.concat([df, df2], axis=1)
    del df2 # to not reach memory limit

    df = pd.concat([df, df3], axis=1)
    del df3 # to not reach memory limit

    # Columns that are never used as model inputs, whatever the store.
    remove_features = ['id','state_id','store_id','date','wm_yr_wk','d',TARGET,'cluster']
    remove_features += ['rolling_quantile_' + quantile + '_' + str(window)
                        for window in (28, 56, 168)
                        for quantile in ('97', '87.5', '50', '22.5', '3')]

    # 'snow_m' is dropped for these stores and kept for every other store.
    # The check uses the `store` argument (it used to read the caller's
    # global loop variable `store_id`, which only worked by coincidence).
    if store in ['CA_1', 'CA_2', 'CA_3','CA_4','TX_1','TX_2','TX_3']:
        remove_features.append('snow_m')

    # Create features list (column order of df is preserved)
    excluded = set(remove_features)
    features = [col for col in list(df) if col not in excluded]
    df = df[['id','d',TARGET]+features]

    # Skipping first n rows
    df = df[df['d']>=START_TRAIN].reset_index(drop=True)

    return df, features

# Recombine Test set after training
def get_base_test():
    """Load the per-store test pickles and stack them into one DataFrame.

    For every entry of STORES_IDS the file 'test_<store_id><VER>.pkl' is read,
    a 'store_id' column is set on it, and all frames are concatenated in
    STORES_IDS order with a fresh 0..n-1 index.

    :return: the combined DataFrame (an empty DataFrame if STORES_IDS is empty)
    """
    store_frames = []

    for store_id in STORES_IDS:
        temp_df = pd.read_pickle('test_'+store_id+str(VER)+'.pkl')
        temp_df['store_id'] = store_id
        store_frames.append(temp_df)

    # pd.concat raises on an empty list, so keep the old "empty frame" result.
    if not store_frames:
        return pd.DataFrame()

    # Concatenate once instead of re-copying the accumulated frame per store.
    return pd.concat(store_frames).reset_index(drop=True)


########################### Helper to make dynamic rolling lags
#################################################################################
def make_lag(LAG_DAY):
    """Build a single lag feature of TARGET from the global `base_test`.

    :param LAG_DAY: number of rows to shift TARGET by within each 'id' group
    :return: one-column DataFrame named 'sales_lag_<LAG_DAY>' (float16),
             indexed like `base_test`
    """
    # Work on an explicit copy so the new column is never written through a
    # selection of base_test.
    lag_df = base_test[['id','d',TARGET]].copy()
    col_name = 'sales_lag_'+str(LAG_DAY)
    # groupby().shift() is the built-in form of transform(lambda x: x.shift(n)).
    lag_df[col_name] = lag_df.groupby(['id'])[TARGET].shift(LAG_DAY).astype(np.float16)
    return lag_df[[col_name]]

In [10]:
def make_lag_roll(LAG_DAY,lag_df_new):
    """Compute shifted rolling-mean features of TARGET from the global `base_test`.

    For every ``(shift_day, roll_wind)`` pair a column
    'rolling_mean_tmp_<shift_day>_<roll_wind>' is added: within each 'id'
    group TARGET is shifted by `shift_day` rows and averaged over a rolling
    window of `roll_wind` rows. Rows are sorted by 'd' first so the shift
    follows day order.

    :param LAG_DAY: sequence of ``[shift_day, roll_wind]`` pairs
    :param lag_df_new: ignored; kept only so existing call sites keep working
    :return: DataFrame sorted by 'd' containing 'id', 'd' and the rolling
             columns (TARGET itself is dropped)
    """
    lag_df = base_test[['id','d',TARGET]].sort_values(by=["d"])
    target_by_id = lag_df.groupby(['id'])[TARGET]

    for shift_day, roll_wind in LAG_DAY:
        col_name = 'rolling_mean_tmp_'+str(shift_day)+'_'+str(roll_wind)
        # transform() runs immediately, so the lambda sees this iteration's values.
        lag_df[col_name] = target_by_id.transform(
            lambda x: x.shift(shift_day).rolling(roll_wind).mean())

    # Drop the target by its configured name rather than a hard-coded "sales".
    return lag_df.drop(columns=[TARGET])

In [11]:
import lightgbm as lgb
# Base LightGBM parameters shared by every per-store model.
# The 'seed' entry is added later by the configuration cell.
lgb_params = {
    "boosting_type": "gbdt",
    "objective": "tweedie",
    "tweedie_variance_power": 1.1,
    "metric": "rmse",
    # row subsampling, re-drawn every iteration
    "subsample": 0.5,
    "subsample_freq": 1,
    "learning_rate": 0.03,
    # NOTE(review): presumably LightGBM's alias for L2 regularisation - confirm
    "lambda": 0.1,
    "num_leaves": 2047,          # 2**11 - 1
    "min_data_in_leaf": 4095,    # 2**12 - 1
    "feature_fraction": 0.5,
    "max_bin": 100,
    "n_estimators": 1400,
    "boost_from_average": False,
    "verbose": -1,
}



# lgb_params ={
#         "objective" : "tweedie",
#         "metric" :"rmse",
#         "force_row_wise" : True,
#         "learning_rate" : 0.075,
#         "sub_feature" : 0.8,
#         "sub_row" : 0.75,
#         "bagging_freq" : 1,
#         "lambda_l2" : 0.1,
#         "metric": ["rmse"],
#         "nthread": -1,
#         "tweedie_variance_power":1.1,
#     'verbosity': 1,
# #     'num_iterations' : 1500,
#     'num_leaves': 128,
#     "min_data_in_leaf": 104,
#     }




# Let's look closer on params

## 'boosting_type': 'gbdt'
# we have 'goss' option for faster training
# but it normally leads to underfit.
# Also there is good 'dart' mode
# but it takes forever to train
# and model performance depends 
# a lot on random factor 
# https://www.kaggle.com/c/home-credit-default-risk/discussion/60921

## 'objective': 'tweedie'
# Tweedie Gradient Boosting for Extremely
# Unbalanced Zero-inflated Data
# https://arxiv.org/pdf/1811.10192.pdf
# and many more articles about Tweedie
#
# Strange (for me) but Tweedie is close in results
# to my own ugly loss.
# My advice here - make OWN LOSS function
# https://www.kaggle.com/c/m5-forecasting-accuracy/discussion/140564
# https://www.kaggle.com/c/m5-forecasting-accuracy/discussion/143070
# I think many of you already using it (after poisson kernel appeared) 
# (kagglers are very good with "params" testing and tuning).
# Try to figure out why Tweedie works.
# probably it will show you new features options
# or data transformation (Target transformation?).

## 'tweedie_variance_power': 1.1
# default = 1.5
# set this closer to 2 to shift towards a Gamma distribution
# set this closer to 1 to shift towards a Poisson distribution
# my CV shows 1.1 is optimal 
# but you can make your own choice

## 'metric': 'rmse'
# Doesn't mean anything to us
# as competition metric is different
# and we don't use early stoppings here.
# So rmse serves just for general 
# model performance overview.
# Also we use "fake" validation set
# (as it makes part of the training set)
# so even general rmse score doesn't mean anything))
# https://www.kaggle.com/c/m5-forecasting-accuracy/discussion/133834

## 'subsample': 0.5
# Serves to fight with overfit
# this will randomly select part of data without resampling
# Chosen by CV (my CV can be wrong!)
# Next kernel will be about CV

##'subsample_freq': 1
# frequency for bagging
# default value - seems ok

## 'learning_rate': 0.03
# Chosen by CV
# Smaller - longer training
# but there is an option to stop 
# in "local minimum"
# Bigger - faster training
# but there is a chance to
# not find the "global minimum"

## 'num_leaves': 2**11-1
## 'min_data_in_leaf': 2**12-1
# Force model to use more features
# We need it to reduce "recursive"
# error impact.
# It also leads to overfitting,
# which is why we use a small
# 'max_bin': 100

## l1, l2 regularizations
# https://towardsdatascience.com/l1-and-l2-regularization-methods-ce25e7fc831c
# Good tiny explanation
# l2 can work with bigger num_leaves
# but my CV doesn't show boost
                    
## 'n_estimators': 1400
# CV shows that there should be
# different values for each state/store.
# Current value was chosen 
# for general purpose.
# As we don't use any early stopping,
# be careful not to overfit the Public LB.

##'feature_fraction': 0.5
# LightGBM will randomly select 
# part of features on each iteration (tree).
# We have maaaany features
# and many of them are "duplicates"
# and many just "noise"
# good values here - 0.5-0.7 (by CV)

## 'boost_from_average': False
# There is some "problem"
# to code boost_from_average for 
# custom loss
# 'True' makes training faster
# BUT use it carefully
# https://github.com/microsoft/LightGBM/issues/1514

In [12]:
# Run-wide configuration: model version, seeding, limits, feature paths,
# store list and the lag / rolling-window definitions.
VER = 3                          # Our model version
SEED = 42                        # We want all things
seed_everything(SEED)            # to be as deterministic
lgb_params['seed'] = SEED        # as possible
N_CORES = psutil.cpu_count()     # Available CPU cores


#LIMITS and const
TARGET      = 'sales'            # Our target
START_TRAIN = 0                  # We can skip some rows (Nans/faster training)
END_TRAIN   = 1941               # End day of our train set, change this part for final
P_HORIZON   = 28                 # Prediction horizon

#MEAN ENCODING features
## Columns read from MEAN_ENC by get_data_by_store
mean_features   = ['enc_cat_id_mean','enc_cat_id_std',
                   'enc_dept_id_mean','enc_dept_id_std',
                   'enc_item_id_mean','enc_item_id_std']

#PATHS for Features
BASE     = 'grid_part_1.pkl'
PRICE    = 'grid_part_2.pkl'
CALENDAR = 'grid_part_3.pkl'
LAGS     = 'lags_df_28_v3.pkl'
MEAN_ENC = 'mean_encoding_df.pkl'


#STORES ids
# Only the 'store_id' column is needed, so skip parsing every other column.
STORES_IDS = pd.read_csv('sales_train_evaluation.csv', usecols=['store_id'])['store_id']#change this part for final
STORES_IDS = list(STORES_IDS.unique())

#SPLITS for lags creation
SHIFT_DAY  = 28
N_LAGS     = 15
LAGS_SPLIT = list(range(SHIFT_DAY, SHIFT_DAY+N_LAGS))
# [shift_day, roll_wind] pairs consumed by make_lag_roll
ROLS_SPLIT = [[shift_day, roll_wind]
              for shift_day in [1,7,14]
              for roll_wind in [7,14,28,56]]

In [13]:
# Train one LightGBM model per store. For every store this writes:
#   'test_<store><VER>.pkl'          - rows kept for the recursive prediction step
#   '<store>_fe_imp_<VER>.csv'       - gain-based feature importances
#   'lgb_model_<store>_v<VER>.bin'   - the pickled booster
for store_id in STORES_IDS:
    print('Train', store_id)

    # Get grid for current store
    grid_df, features_columns = get_data_by_store(store_id)

    print(features_columns)
    # Masks for
    # Train (all days up to and including END_TRAIN)
    # "Validation" (last P_HORIZON days of train - not a real validation set)
    # Test (all days after END_TRAIN-100, i.e. the forecast days
    #       plus a history gap for the recursive features)
    train_mask = grid_df['d']<=END_TRAIN
    valid_mask = train_mask&(grid_df['d']>(END_TRAIN-P_HORIZON))
    preds_mask = grid_df['d']>(END_TRAIN-100)
    # Per-row sample weight looked up from 'tm_y'.
    # NOTE(review): a 'tm_y' value outside 0..5 maps to NaN - confirm none occur.
    w_weight=grid_df['tm_y'].map({0:0.88,1:0.91,2:0.94,3:0.97,4:1,5:1.03})

    # Apply masks and build the lgb datasets
    # https://github.com/Microsoft/LightGBM/issues/1032
    # "To avoid any conversions, you should always use np.float32"
    # https://www.kaggle.com/c/talkingdata-adtracking-fraud-detection/discussion/53773
    train_data = lgb.Dataset(grid_df[train_mask][features_columns],
                            label=grid_df[train_mask][TARGET],
                            weight=w_weight[train_mask])

    valid_data = lgb.Dataset(grid_df[valid_mask][features_columns],
                            label=grid_df[valid_mask][TARGET],
                            weight=w_weight[valid_mask])

    # Saving part of the dataset for later predictions
    # Removing features that we need to calculate recursively
    grid_df = grid_df[preds_mask].reset_index(drop=True)
    keep_cols = [col for col in list(grid_df) if '_tmp_' not in col]
    grid_df = grid_df[keep_cols]
    grid_df.to_pickle('test_'+store_id+str(VER)+'.pkl')
    del grid_df
    gc.collect()

    # Launch seeder again to make lgb training 100% deterministic
    # with each "code line" np.random "evolves"
    # so we need (may want) to "reset" it
    seed_everything(SEED)
    # NOTE(review): `verbose_eval` is not accepted by newer LightGBM releases
    # (callbacks=[lgb.log_evaluation(100)] is the replacement) - confirm the
    # installed version before upgrading.
    estimator = lgb.train(lgb_params,
                          train_data,
                          valid_sets = [valid_data],
                          verbose_eval = 100,
                          )

    # Store gain-based feature importances, most important first
    imp_type = "gain"
    features = estimator.feature_name()
    importances = estimator.feature_importance(imp_type)
    importance_df=pd.DataFrame(features,columns=['features'])
    importance_df['importances']=importances
    importance_df=importance_df.sort_values(by='importances', ascending=False)
    importance_df.to_csv(store_id+'_fe_imp_'+str(VER)+'.csv',index=False)
    del importance_df
    gc.collect()

    # Save model - it's not real '.bin' but a pickle file
    # estimator = lgb.Booster(model_file='model.txt')
    # can only predict with the best iteration (or the saving iteration)
    # pickle.dump gives us more flexibility
    # like estimator.predict(TEST, num_iteration=100)
    # num_iteration - number of iteration want to predict with,
    # NULL or <= 0 means use best iteration
    model_name = 'lgb_model_'+store_id+'_v'+str(VER)+'.bin'
    # Context manager guarantees the file is flushed and closed
    with open(model_name, 'wb') as model_file:
        pickle.dump(estimator, model_file)

    # Remove temporary objects to free ram memory
    del train_data, valid_data, estimator
    gc.collect()

Train CA_1
['item_id', 'dept_id', 'cat_id', 'release', 'sell_price', 'price_max', 'price_min', 'price_std', 'price_mean', 'price_norm', 'price_rank_dept', 'price_nunique', 'item_nunique', 'price_momentum', 'price_momentum_m', 'price_momentum_y', 'temperature_high', 'temperature_con', 'rainfall_m', 'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2', 'snap_CA', 'snap_TX', 'snap_WI', 'is_first_half_month', 'event_bef_weekend', 'event_after_weekend', 'NBA', 'event_attention_after', 'event_attention_bef', 'event_attention_sum', 'tm_d', 'tm_w', 'tm_m', 'tm_q', 'tm_y', 'tm_wm', 'tm_dw', 'tm_w_end', 'enc_cat_id_mean', 'enc_cat_id_std', 'enc_dept_id_mean', 'enc_dept_id_std', 'enc_item_id_mean', 'enc_item_id_std', 'sales_lag_28', 'sales_lag_29', 'sales_lag_30', 'sales_lag_31', 'sales_lag_32', 'sales_lag_33', 'sales_lag_34', 'sales_lag_35', 'sales_lag_36', 'sales_lag_37', 'sales_lag_38', 'sales_lag_39', 'sales_lag_40', 'sales_lag_41', 'sales_lag_42', 'rolling_mean_7', 'rolling_std_7',

[100]	valid_0's rmse: 1.61594
[200]	valid_0's rmse: 1.58323
[300]	valid_0's rmse: 1.56971
[400]	valid_0's rmse: 1.56221
[500]	valid_0's rmse: 1.55542
[600]	valid_0's rmse: 1.54941
[700]	valid_0's rmse: 1.54422
[800]	valid_0's rmse: 1.53875
[900]	valid_0's rmse: 1.53429
[1000]	valid_0's rmse: 1.53031
[1100]	valid_0's rmse: 1.5258
[1200]	valid_0's rmse: 1.52214
[1300]	valid_0's rmse: 1.51756
[1400]	valid_0's rmse: 1.51388
Train TX_2
['item_id', 'dept_id', 'cat_id', 'release', 'sell_price', 'price_max', 'price_min', 'price_std', 'price_mean', 'price_norm', 'price_rank_dept', 'price_nunique', 'item_nunique', 'price_momentum', 'price_momentum_m', 'price_momentum_y', 'temperature_high', 'temperature_con', 'rainfall_m', 'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2', 'snap_CA', 'snap_TX', 'snap_WI', 'is_first_half_month', 'event_bef_weekend', 'event_after_weekend', 'NBA', 'event_attention_after', 'event_attention_bef', 'event_attention_sum', 'tm_d', 'tm_w', 'tm_m', 'tm_q', 'tm

[100]	valid_0's rmse: 1.91969
[200]	valid_0's rmse: 1.87168
[300]	valid_0's rmse: 1.85618
[400]	valid_0's rmse: 1.84568
[500]	valid_0's rmse: 1.83675
[600]	valid_0's rmse: 1.82874
[700]	valid_0's rmse: 1.81982
[800]	valid_0's rmse: 1.8135
[900]	valid_0's rmse: 1.80739
[1000]	valid_0's rmse: 1.80081
[1100]	valid_0's rmse: 1.79536
[1200]	valid_0's rmse: 1.79113
[1300]	valid_0's rmse: 1.78537
[1400]	valid_0's rmse: 1.78155


In [14]:
# Recursive day-by-day prediction: for each forecast day the rolling features
# are recomputed from base_test (which already contains the predictions of the
# previous days), every store model predicts its rows, and the predictions are
# written back into base_test[TARGET].

# Create Dummy DataFrame to store predictions
all_preds = pd.DataFrame()

# Join back the Test dataset with
# a small part of the training data
# to make recursive features
base_test = get_base_test()

# Timer to measure predictions time
main_time = time.time()

# Feature columns passed to the models. The order is kept exactly as in the
# original hand-written lists so it stays aligned with the trained models.
NO_SNOW_STORES = ['CA_1', 'CA_2', 'CA_3','CA_4','TX_1','TX_2','TX_3']
BASE_MODEL_FEATURES = ['item_id', 'dept_id', 'cat_id', 'release', 'sell_price', 'price_max',
                       'price_min', 'price_std', 'price_mean', 'price_norm', 'price_rank_dept',
                       'price_nunique', 'item_nunique', 'price_momentum', 'price_momentum_m',
                       'price_momentum_y', 'temperature_high', 'temperature_con', 'rainfall_m',
                       'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2', 'snap_CA',
                       'snap_TX', 'snap_WI', 'is_first_half_month', 'event_bef_weekend', 'event_after_weekend',
                       'NBA', 'event_attention_after', 'event_attention_bef', 'event_attention_sum', 'tm_d',
                       'tm_w', 'tm_m', 'tm_q', 'tm_y', 'tm_wm', 'tm_dw', 'tm_w_end', 'enc_cat_id_mean',
                       'enc_cat_id_std', 'enc_dept_id_mean', 'enc_dept_id_std', 'enc_item_id_mean',
                       'enc_item_id_std', 'sales_lag_28', 'sales_lag_29', 'sales_lag_30', 'sales_lag_31',
                       'sales_lag_32', 'sales_lag_33', 'sales_lag_34', 'sales_lag_35', 'sales_lag_36',
                       'sales_lag_37', 'sales_lag_38', 'sales_lag_39', 'sales_lag_40', 'sales_lag_41',
                       'sales_lag_42', 'rolling_mean_7', 'rolling_std_7', 'rolling_mean_14', 'rolling_std_14',
                       'rolling_mean_28', 'rolling_std_28', 'rolling_mean_56', 'rolling_std_56',
                       'rolling_mean_168', 'rolling_std_168', 'rolling_mean_tmp_1_7', 'rolling_mean_tmp_1_14',
                       'rolling_mean_tmp_1_28', 'rolling_mean_tmp_1_56', 'rolling_mean_tmp_7_7',
                       'rolling_mean_tmp_7_14', 'rolling_mean_tmp_7_28', 'rolling_mean_tmp_7_56',
                       'rolling_mean_tmp_14_7', 'rolling_mean_tmp_14_14', 'rolling_mean_tmp_14_28', 'rolling_mean_tmp_14_56']
# Every other store additionally uses 'snow_m', placed right after 'rainfall_m'.
_snow_pos = BASE_MODEL_FEATURES.index('rainfall_m') + 1
SNOW_MODEL_FEATURES = BASE_MODEL_FEATURES[:_snow_pos] + ['snow_m'] + BASE_MODEL_FEATURES[_snow_pos:]

# Loop over each prediction day
# As rolling lags are the most timeconsuming
# we will calculate it for whole day
for PREDICT_DAY in range(1, P_HORIZON+1):
    print('Predict | Day:', PREDICT_DAY)
    start_time = time.time()

    # Make temporary grid to calculate rolling lags
    grid_df = base_test.copy()

    # The second argument is ignored by make_lag_roll; it is passed only
    # because the function's signature requires it.
    lag_df_new = make_lag_roll(ROLS_SPLIT, pd.DataFrame())

    grid_df = grid_df.merge(lag_df_new, on=['id','d'], how='left')

    # Rows of the day being predicted (same for every store)
    day_mask = base_test['d']==(END_TRAIN+PREDICT_DAY)

    for store_id in STORES_IDS:

        if store_id in NO_SNOW_STORES:
            MODEL_FEATURES = BASE_MODEL_FEATURES
        else:
            MODEL_FEATURES = SNOW_MODEL_FEATURES

        # Read all our models and make predictions
        # for each day/store pairs
        model_path = 'lgb_model_'+store_id+'_v'+str(VER)+'.bin'

        # The model files are pickles written by the training cell; never
        # point this at a file from an untrusted source.
        with open(model_path, 'rb') as model_file:
            estimator = pickle.load(model_file)

        store_mask = base_test['store_id']==store_id

        mask = (day_mask)&(store_mask)
        # Single .loc assignment: chained indexing (base_test[TARGET][mask] = ...)
        # is not guaranteed to write into base_test and does nothing under
        # pandas Copy-on-Write.
        base_test.loc[mask, TARGET] = estimator.predict(grid_df[mask][MODEL_FEATURES])

    # Make good column naming and add
    # to all_preds DataFrame
    temp_df = base_test[day_mask][['id',TARGET]]
    temp_df.columns = ['id','F'+str(PREDICT_DAY)]
    if 'id' in list(all_preds):
        all_preds = all_preds.merge(temp_df, on=['id'], how='left')
    else:
        all_preds = temp_df.copy()

    print('#'*10, ' %0.2f min round |' % ((time.time() - start_time) / 60),
                  ' %0.2f min total |' % ((time.time() - main_time) / 60),
                  ' %0.2f day sales |' % (temp_df['F'+str(PREDICT_DAY)].sum()))

    del temp_df, lag_df_new

all_preds = all_preds.reset_index(drop=True)
all_preds.head()

Predict | Day: 1
##########  6.26 min round |  6.26 min total |  39940.96 day sales |
Predict | Day: 2
##########  6.25 min round |  12.51 min total |  37478.91 day sales |
Predict | Day: 3
##########  6.16 min round |  18.67 min total |  37072.50 day sales |
Predict | Day: 4
##########  6.18 min round |  24.85 min total |  37229.89 day sales |
Predict | Day: 5
##########  6.22 min round |  31.07 min total |  42737.04 day sales |
Predict | Day: 6
##########  6.16 min round |  37.23 min total |  50730.33 day sales |
Predict | Day: 7
##########  6.22 min round |  43.44 min total |  51383.39 day sales |
Predict | Day: 8
##########  6.16 min round |  49.61 min total |  45792.62 day sales |
Predict | Day: 9
##########  6.20 min round |  55.81 min total |  38586.46 day sales |
Predict | Day: 10
##########  6.17 min round |  61.98 min total |  43933.42 day sales |
Predict | Day: 11
##########  6.16 min round |  68.14 min total |  45738.50 day sales |
Predict | Day: 12
##########  6.21 min rou

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,HOBBIES_1_001_CA_1_evaluation,0.871715,0.740823,0.729272,0.78777,0.964455,1.19888,1.048892,1.031525,0.860413,...,0.957519,1.327103,1.267551,0.940818,0.809681,0.759431,0.815764,1.013732,1.26327,1.032682
1,HOBBIES_1_002_CA_1_evaluation,0.202318,0.181573,0.189802,0.197203,0.219842,0.275442,0.33265,0.215238,0.186765,...,0.259099,0.360444,0.390963,0.239641,0.246143,0.24986,0.244179,0.30056,0.394818,0.395531
2,HOBBIES_1_003_CA_1_evaluation,0.583275,0.539881,0.557931,0.555725,0.72048,0.896028,0.87849,0.593979,0.494182,...,0.644588,0.782881,0.79245,0.582088,0.490889,0.499248,0.506289,0.717257,0.784503,0.778569
3,HOBBIES_1_004_CA_1_evaluation,1.547378,1.301782,1.291804,1.351488,1.997121,2.807,2.877837,1.955012,1.405988,...,1.785163,2.537764,2.97815,1.672182,1.365614,1.309382,1.295799,1.803006,2.539671,2.803592
4,HOBBIES_1_005_CA_1_evaluation,1.051996,0.94403,0.886019,0.970791,1.069658,1.463994,1.442337,1.20079,0.9461,...,1.214784,1.461678,1.406394,1.046216,0.912192,1.033188,1.010895,1.272049,1.467816,1.420012


In [15]:
all_preds.tail()

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
30485,FOODS_3_823_WI_3_evaluation,0.44791,0.410288,0.435384,0.502258,0.50421,0.514611,0.62553,0.532222,0.503045,...,0.558066,0.740527,0.894037,0.537693,0.549094,0.550521,0.457552,0.512809,0.529316,0.654919
30486,FOODS_3_824_WI_3_evaluation,0.288325,0.264065,0.234911,0.251829,0.236991,0.264338,0.284278,0.266665,0.232188,...,0.261918,0.389088,0.407397,0.273636,0.354071,0.332723,0.252303,0.251764,0.304472,0.321524
30487,FOODS_3_825_WI_3_evaluation,0.686911,0.487501,0.435093,0.433528,0.477902,0.580186,0.625219,0.640111,0.479588,...,0.857967,1.23356,1.332488,0.914569,1.128744,1.024473,0.681182,0.769696,0.802135,0.997408
30488,FOODS_3_826_WI_3_evaluation,1.157899,1.077533,0.980766,0.940052,1.163494,1.270242,1.265217,1.186162,1.095228,...,1.171475,1.549542,1.514961,1.244382,1.328783,1.394999,1.175473,1.226866,1.29653,1.372951
30489,FOODS_3_827_WI_3_evaluation,1.88667,1.684333,1.646254,1.770348,2.152549,2.231746,2.009391,1.846938,1.578984,...,1.733238,1.935421,1.782624,1.545675,1.678027,1.673649,1.64255,1.921347,2.024265,2.032788


In [16]:
all_preds.shape

(30490, 29)

In [17]:
all_preds.describe()

Unnamed: 0,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
count,30490.0,30490.0,30490.0,30490.0,30490.0,30490.0,30490.0,30490.0,30490.0,30490.0,...,30490.0,30490.0,30490.0,30490.0,30490.0,30490.0,30490.0,30490.0,30490.0,30490.0
mean,1.309969,1.22922,1.21589,1.221052,1.401674,1.663835,1.685254,1.50189,1.265545,1.440912,...,1.531355,1.929065,1.954514,1.514215,1.414108,1.487371,1.373109,1.545985,1.787825,1.649998
std,2.682452,2.508659,2.480715,2.500355,2.959639,3.495356,3.471978,3.086707,2.53472,2.854623,...,3.239086,4.113723,4.134026,3.076588,2.995685,3.138808,2.867314,3.343571,3.949186,3.618581
min,0.001337,0.015409,0.017044,0.018711,0.02078,0.020091,0.019376,0.016279,0.011991,0.015878,...,0.01622,0.019304,0.019017,0.013573,0.014718,0.012739,0.015467,0.016737,0.017004,0.014679
25%,0.264733,0.271613,0.269213,0.270642,0.301116,0.350112,0.355311,0.315951,0.280947,0.314738,...,0.318354,0.387893,0.392886,0.305012,0.284568,0.298825,0.287048,0.32135,0.361036,0.335025
50%,0.607742,0.585137,0.572363,0.572396,0.648369,0.767843,0.787126,0.69928,0.601046,0.678079,...,0.69335,0.864666,0.889723,0.697945,0.636501,0.661504,0.622591,0.697839,0.793357,0.749669
75%,1.322388,1.225537,1.218688,1.21982,1.387277,1.660314,1.69952,1.482228,1.267837,1.431411,...,1.489354,1.875673,1.924002,1.502078,1.3788,1.446056,1.339334,1.493697,1.706726,1.606655
max,104.046681,97.155341,98.427506,100.878712,139.16347,155.373349,145.434235,123.521475,101.95479,100.930131,...,137.331283,170.798539,170.746052,115.752798,110.133046,103.6299,106.456351,137.656262,174.370301,151.625014


In [18]:
# all the following is changed

In [19]:
# Actual sales for days d_1914..d_1941, used to fill the validation part
# of the submission.
train_df = pd.read_csv('sales_train_evaluation.csv')
train_df = train_df[['id'] + ['d_%d' % day for day in range(1914, 1942)]]

In [20]:
train_df.head()

Unnamed: 0,id,d_1914,d_1915,d_1916,d_1917,d_1918,d_1919,d_1920,d_1921,d_1922,...,d_1932,d_1933,d_1934,d_1935,d_1936,d_1937,d_1938,d_1939,d_1940,d_1941
0,HOBBIES_1_001_CA_1_evaluation,0,0,0,2,0,3,5,0,0,...,2,4,0,0,0,0,3,3,0,1
1,HOBBIES_1_002_CA_1_evaluation,0,1,0,0,0,0,0,0,0,...,0,1,2,1,1,0,0,0,0,0
2,HOBBIES_1_003_CA_1_evaluation,0,0,1,1,0,2,1,0,0,...,1,0,2,0,0,0,2,3,0,1
3,HOBBIES_1_004_CA_1_evaluation,0,0,1,2,4,1,6,4,0,...,1,1,0,4,0,1,3,0,2,6
4,HOBBIES_1_005_CA_1_evaluation,1,0,2,3,1,0,3,2,3,...,0,0,0,2,1,0,0,2,1,0


In [21]:
# Sample submission: its column names and its 'id' column are reused by the
# cells below.
submission = pd.read_csv('sample_submission.csv')

In [22]:
submission.head()

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,HOBBIES_1_001_CA_1_validation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,HOBBIES_1_002_CA_1_validation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,HOBBIES_1_003_CA_1_validation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,HOBBIES_1_004_CA_1_validation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,HOBBIES_1_005_CA_1_validation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
submission.tail()

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
60975,FOODS_3_823_WI_3_evaluation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
60976,FOODS_3_824_WI_3_evaluation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
60977,FOODS_3_825_WI_3_evaluation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
60978,FOODS_3_826_WI_3_evaluation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
60979,FOODS_3_827_WI_3_evaluation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
# Rewrite the ids from the '..._evaluation' form to '..._validation' so they
# can be joined against the validation ids of the sample submission.
train_df['id']=train_df['id'].str.replace('evaluation','validation')

In [25]:
train_df.head()

Unnamed: 0,id,d_1914,d_1915,d_1916,d_1917,d_1918,d_1919,d_1920,d_1921,d_1922,...,d_1932,d_1933,d_1934,d_1935,d_1936,d_1937,d_1938,d_1939,d_1940,d_1941
0,HOBBIES_1_001_CA_1_validation,0,0,0,2,0,3,5,0,0,...,2,4,0,0,0,0,3,3,0,1
1,HOBBIES_1_002_CA_1_validation,0,1,0,0,0,0,0,0,0,...,0,1,2,1,1,0,0,0,0,0
2,HOBBIES_1_003_CA_1_validation,0,0,1,1,0,2,1,0,0,...,1,0,2,0,0,0,2,3,0,1
3,HOBBIES_1_004_CA_1_validation,0,0,1,2,4,1,6,4,0,...,1,1,0,4,0,1,3,0,2,6
4,HOBBIES_1_005_CA_1_validation,1,0,2,3,1,0,3,2,3,...,0,0,0,2,1,0,0,2,1,0


In [26]:
# Rename the columns positionally to the submission header (id, F1..F28).
# NOTE: this needs `submission` to still hold all of its columns, so it must
# run before the cell that narrows `submission` down to ['id'].
train_df.columns=submission.columns

In [27]:
train_df.head()

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,HOBBIES_1_001_CA_1_validation,0,0,0,2,0,3,5,0,0,...,2,4,0,0,0,0,3,3,0,1
1,HOBBIES_1_002_CA_1_validation,0,1,0,0,0,0,0,0,0,...,0,1,2,1,1,0,0,0,0,0
2,HOBBIES_1_003_CA_1_validation,0,0,1,1,0,2,1,0,0,...,1,0,2,0,0,0,2,3,0,1
3,HOBBIES_1_004_CA_1_validation,0,0,1,2,4,1,6,4,0,...,1,1,0,4,0,1,3,0,2,6
4,HOBBIES_1_005_CA_1_validation,1,0,2,3,1,0,3,2,3,...,0,0,0,2,1,0,0,2,1,0


In [28]:
train_df.tail()

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
30485,FOODS_3_823_WI_3_validation,0,0,0,2,2,0,0,0,2,...,1,0,3,0,1,1,0,0,1,1
30486,FOODS_3_824_WI_3_validation,0,1,1,1,0,0,0,0,1,...,0,0,0,0,0,0,1,0,1,0
30487,FOODS_3_825_WI_3_validation,0,0,1,1,0,2,1,1,0,...,0,0,1,2,0,1,0,1,0,2
30488,FOODS_3_826_WI_3_validation,1,3,0,1,2,1,0,2,1,...,1,1,1,4,6,0,1,1,1,0
30489,FOODS_3_827_WI_3_validation,0,0,0,0,0,1,1,1,2,...,1,2,0,5,4,0,2,2,5,1


In [29]:
train_df.shape

(30490, 29)

In [30]:
submission.shape

(60980, 29)

In [31]:
# Keep only the id column as a row template, then left-join the actual sales
# onto it; ids with no match in train_df get NaN in every F column.
submission = submission[['id']]
sub1 = submission.merge(train_df, on=['id'], how='left')

In [32]:
sub1.head()

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,HOBBIES_1_001_CA_1_validation,0.0,0.0,0.0,2.0,0.0,3.0,5.0,0.0,0.0,...,2.0,4.0,0.0,0.0,0.0,0.0,3.0,3.0,0.0,1.0
1,HOBBIES_1_002_CA_1_validation,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,2.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
2,HOBBIES_1_003_CA_1_validation,0.0,0.0,1.0,1.0,0.0,2.0,1.0,0.0,0.0,...,1.0,0.0,2.0,0.0,0.0,0.0,2.0,3.0,0.0,1.0
3,HOBBIES_1_004_CA_1_validation,0.0,0.0,1.0,2.0,4.0,1.0,6.0,4.0,0.0,...,1.0,1.0,0.0,4.0,0.0,1.0,3.0,0.0,2.0,6.0
4,HOBBIES_1_005_CA_1_validation,1.0,0.0,2.0,3.0,1.0,0.0,3.0,2.0,3.0,...,0.0,0.0,0.0,2.0,1.0,0.0,0.0,2.0,1.0,0.0


In [33]:
sub1.tail()

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
60975,FOODS_3_823_WI_3_evaluation,,,,,,,,,,...,,,,,,,,,,
60976,FOODS_3_824_WI_3_evaluation,,,,,,,,,,...,,,,,,,,,,
60977,FOODS_3_825_WI_3_evaluation,,,,,,,,,,...,,,,,,,,,,
60978,FOODS_3_826_WI_3_evaluation,,,,,,,,,,...,,,,,,,,,,
60979,FOODS_3_827_WI_3_evaluation,,,,,,,,,,...,,,,,,,,,,


In [34]:
# Keep only the leading rows that were filled from train_df; the rows after
# them are all-NaN after the left merge. The row count is taken from train_df
# instead of the hard-coded 30490.
# Assumes the matched (validation) ids come first in `submission` - the
# displayed head/tail of sub1 show this ordering.
sub1 = sub1[:len(train_df)]

In [35]:
sub1.head()

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,HOBBIES_1_001_CA_1_validation,0.0,0.0,0.0,2.0,0.0,3.0,5.0,0.0,0.0,...,2.0,4.0,0.0,0.0,0.0,0.0,3.0,3.0,0.0,1.0
1,HOBBIES_1_002_CA_1_validation,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,2.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
2,HOBBIES_1_003_CA_1_validation,0.0,0.0,1.0,1.0,0.0,2.0,1.0,0.0,0.0,...,1.0,0.0,2.0,0.0,0.0,0.0,2.0,3.0,0.0,1.0
3,HOBBIES_1_004_CA_1_validation,0.0,0.0,1.0,2.0,4.0,1.0,6.0,4.0,0.0,...,1.0,1.0,0.0,4.0,0.0,1.0,3.0,0.0,2.0,6.0
4,HOBBIES_1_005_CA_1_validation,1.0,0.0,2.0,3.0,1.0,0.0,3.0,2.0,3.0,...,0.0,0.0,0.0,2.0,1.0,0.0,0.0,2.0,1.0,0.0


In [36]:
sub1.tail()

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
30485,FOODS_3_823_WI_3_validation,0.0,0.0,0.0,2.0,2.0,0.0,0.0,0.0,2.0,...,1.0,0.0,3.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0
30486,FOODS_3_824_WI_3_validation,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
30487,FOODS_3_825_WI_3_validation,0.0,0.0,1.0,1.0,0.0,2.0,1.0,1.0,0.0,...,0.0,0.0,1.0,2.0,0.0,1.0,0.0,1.0,0.0,2.0
30488,FOODS_3_826_WI_3_validation,1.0,3.0,0.0,1.0,2.0,1.0,0.0,2.0,1.0,...,1.0,1.0,1.0,4.0,6.0,0.0,1.0,1.0,1.0,0.0
30489,FOODS_3_827_WI_3_validation,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,2.0,...,1.0,2.0,0.0,5.0,4.0,0.0,2.0,2.0,5.0,1.0


In [37]:
# Left-join `all_preds` onto `submission` by `id`: every row of `submission`
# is kept in order, and ids missing from `all_preds` become NaN rows.
sub2 = pd.merge(submission, all_preds, how='left', on='id')

In [38]:
# First five rows of the frame merged with `all_preds`.
sub2.head(n=5)

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,HOBBIES_1_001_CA_1_validation,,,,,,,,,,...,,,,,,,,,,
1,HOBBIES_1_002_CA_1_validation,,,,,,,,,,...,,,,,,,,,,
2,HOBBIES_1_003_CA_1_validation,,,,,,,,,,...,,,,,,,,,,
3,HOBBIES_1_004_CA_1_validation,,,,,,,,,,...,,,,,,,,,,
4,HOBBIES_1_005_CA_1_validation,,,,,,,,,,...,,,,,,,,,,


In [39]:
# Last five rows of the frame merged with `all_preds`.
sub2.tail(n=5)

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
60975,FOODS_3_823_WI_3_evaluation,0.44791,0.410288,0.435384,0.502258,0.50421,0.514611,0.62553,0.532222,0.503045,...,0.558066,0.740527,0.894037,0.537693,0.549094,0.550521,0.457552,0.512809,0.529316,0.654919
60976,FOODS_3_824_WI_3_evaluation,0.288325,0.264065,0.234911,0.251829,0.236991,0.264338,0.284278,0.266665,0.232188,...,0.261918,0.389088,0.407397,0.273636,0.354071,0.332723,0.252303,0.251764,0.304472,0.321524
60977,FOODS_3_825_WI_3_evaluation,0.686911,0.487501,0.435093,0.433528,0.477902,0.580186,0.625219,0.640111,0.479588,...,0.857967,1.23356,1.332488,0.914569,1.128744,1.024473,0.681182,0.769696,0.802135,0.997408
60978,FOODS_3_826_WI_3_evaluation,1.157899,1.077533,0.980766,0.940052,1.163494,1.270242,1.265217,1.186162,1.095228,...,1.171475,1.549542,1.514961,1.244382,1.328783,1.394999,1.175473,1.226866,1.29653,1.372951
60979,FOODS_3_827_WI_3_evaluation,1.88667,1.684333,1.646254,1.770348,2.152549,2.231746,2.009391,1.846938,1.578984,...,1.733238,1.935421,1.782624,1.545675,1.678027,1.673649,1.64255,1.921347,2.024265,2.032788


In [40]:
# Keep only the `*_evaluation` rows of `sub2`. The previews of the merge show
# predictions on `_evaluation` ids and NaN on `_validation` ids.
# The original positional slice `sub2[30490:]` was not idempotent: running the
# cell a second time sliced the already-trimmed frame and left it empty. It
# also hard-coded a row count and depended on the row order of `submission`.
# Selecting on the id suffix is safe to re-run and keeps the index labels.
# NOTE(review): assumes every id ends in `_validation` or `_evaluation`, as in
# the previews - confirm against the submission template.
sub2 = sub2[sub2['id'].str.endswith('_evaluation', na=False)]

In [41]:
# First five rows of `sub2` after the trim in the previous cell.
sub2.head(n=5)

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
30490,HOBBIES_1_001_CA_1_evaluation,0.871715,0.740823,0.729272,0.78777,0.964455,1.19888,1.048892,1.031525,0.860413,...,0.957519,1.327103,1.267551,0.940818,0.809681,0.759431,0.815764,1.013732,1.26327,1.032682
30491,HOBBIES_1_002_CA_1_evaluation,0.202318,0.181573,0.189802,0.197203,0.219842,0.275442,0.33265,0.215238,0.186765,...,0.259099,0.360444,0.390963,0.239641,0.246143,0.24986,0.244179,0.30056,0.394818,0.395531
30492,HOBBIES_1_003_CA_1_evaluation,0.583275,0.539881,0.557931,0.555725,0.72048,0.896028,0.87849,0.593979,0.494182,...,0.644588,0.782881,0.79245,0.582088,0.490889,0.499248,0.506289,0.717257,0.784503,0.778569
30493,HOBBIES_1_004_CA_1_evaluation,1.547378,1.301782,1.291804,1.351488,1.997121,2.807,2.877837,1.955012,1.405988,...,1.785163,2.537764,2.97815,1.672182,1.365614,1.309382,1.295799,1.803006,2.539671,2.803592
30494,HOBBIES_1_005_CA_1_evaluation,1.051996,0.94403,0.886019,0.970791,1.069658,1.463994,1.442337,1.20079,0.9461,...,1.214784,1.461678,1.406394,1.046216,0.912192,1.033188,1.010895,1.272049,1.467816,1.420012


In [42]:
# Last five rows of `sub2` after the trim in the previous cell.
sub2.tail(n=5)

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
60975,FOODS_3_823_WI_3_evaluation,0.44791,0.410288,0.435384,0.502258,0.50421,0.514611,0.62553,0.532222,0.503045,...,0.558066,0.740527,0.894037,0.537693,0.549094,0.550521,0.457552,0.512809,0.529316,0.654919
60976,FOODS_3_824_WI_3_evaluation,0.288325,0.264065,0.234911,0.251829,0.236991,0.264338,0.284278,0.266665,0.232188,...,0.261918,0.389088,0.407397,0.273636,0.354071,0.332723,0.252303,0.251764,0.304472,0.321524
60977,FOODS_3_825_WI_3_evaluation,0.686911,0.487501,0.435093,0.433528,0.477902,0.580186,0.625219,0.640111,0.479588,...,0.857967,1.23356,1.332488,0.914569,1.128744,1.024473,0.681182,0.769696,0.802135,0.997408
60978,FOODS_3_826_WI_3_evaluation,1.157899,1.077533,0.980766,0.940052,1.163494,1.270242,1.265217,1.186162,1.095228,...,1.171475,1.549542,1.514961,1.244382,1.328783,1.394999,1.175473,1.226866,1.29653,1.372951
60979,FOODS_3_827_WI_3_evaluation,1.88667,1.684333,1.646254,1.770348,2.152549,2.231746,2.009391,1.846938,1.578984,...,1.733238,1.935421,1.782624,1.545675,1.678027,1.673649,1.64255,1.921347,2.024265,2.032788


In [43]:
# Stack `sub1` on top of `sub2` row-wise (the default axis); each part keeps
# its own index labels because the index is not reset.
final_sub = pd.concat([sub1, sub2])

In [44]:
# First five rows of the combined frame.
final_sub.head(n=5)

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,HOBBIES_1_001_CA_1_validation,0.0,0.0,0.0,2.0,0.0,3.0,5.0,0.0,0.0,...,2.0,4.0,0.0,0.0,0.0,0.0,3.0,3.0,0.0,1.0
1,HOBBIES_1_002_CA_1_validation,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,2.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
2,HOBBIES_1_003_CA_1_validation,0.0,0.0,1.0,1.0,0.0,2.0,1.0,0.0,0.0,...,1.0,0.0,2.0,0.0,0.0,0.0,2.0,3.0,0.0,1.0
3,HOBBIES_1_004_CA_1_validation,0.0,0.0,1.0,2.0,4.0,1.0,6.0,4.0,0.0,...,1.0,1.0,0.0,4.0,0.0,1.0,3.0,0.0,2.0,6.0
4,HOBBIES_1_005_CA_1_validation,1.0,0.0,2.0,3.0,1.0,0.0,3.0,2.0,3.0,...,0.0,0.0,0.0,2.0,1.0,0.0,0.0,2.0,1.0,0.0


In [45]:
# Last five rows of the combined frame.
final_sub.tail(n=5)

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
60975,FOODS_3_823_WI_3_evaluation,0.44791,0.410288,0.435384,0.502258,0.50421,0.514611,0.62553,0.532222,0.503045,...,0.558066,0.740527,0.894037,0.537693,0.549094,0.550521,0.457552,0.512809,0.529316,0.654919
60976,FOODS_3_824_WI_3_evaluation,0.288325,0.264065,0.234911,0.251829,0.236991,0.264338,0.284278,0.266665,0.232188,...,0.261918,0.389088,0.407397,0.273636,0.354071,0.332723,0.252303,0.251764,0.304472,0.321524
60977,FOODS_3_825_WI_3_evaluation,0.686911,0.487501,0.435093,0.433528,0.477902,0.580186,0.625219,0.640111,0.479588,...,0.857967,1.23356,1.332488,0.914569,1.128744,1.024473,0.681182,0.769696,0.802135,0.997408
60978,FOODS_3_826_WI_3_evaluation,1.157899,1.077533,0.980766,0.940052,1.163494,1.270242,1.265217,1.186162,1.095228,...,1.171475,1.549542,1.514961,1.244382,1.328783,1.394999,1.175473,1.226866,1.29653,1.372951
60979,FOODS_3_827_WI_3_evaluation,1.88667,1.684333,1.646254,1.770348,2.152549,2.231746,2.009391,1.846938,1.578984,...,1.733238,1.935421,1.782624,1.545675,1.678027,1.673649,1.64255,1.921347,2.024265,2.032788


In [46]:
# Summary statistics of the numeric columns; a full `count` row confirms that
# no forecast column contains NaN. The quartiles are the pandas defaults.
final_sub.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
count,60980.0,60980.0,60980.0,60980.0,60980.0,60980.0,60980.0,60980.0,60980.0,60980.0,...,60980.0,60980.0,60980.0,60980.0,60980.0,60980.0,60980.0,60980.0,60980.0,60980.0
mean,1.291144,1.196555,1.172803,1.18009,1.404281,1.658894,1.712289,1.459062,1.35947,1.369866,...,1.488423,1.85512,1.959891,1.451794,1.342951,1.352016,1.292704,1.470795,1.738747,1.716078
std,2.973043,2.742751,2.698433,2.699774,3.296382,3.797963,3.751909,3.272647,3.131778,3.092646,...,3.454514,4.273605,4.429373,3.197715,3.062429,3.073316,2.913035,3.430803,4.020074,3.965984
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.062723,0.0,0.0,0.0,...,0.0,0.051484,0.099705,0.0,0.0,0.0,0.0,0.0,0.028131,0.070596
50%,0.479861,0.448875,0.437892,0.434384,0.560304,0.748415,0.804849,0.609568,0.528643,0.552832,...,0.608027,0.87436,0.956525,0.609129,0.532451,0.534875,0.504871,0.609712,0.793876,0.782614
75%,1.246498,1.092544,1.069105,1.065233,1.405028,1.89411,2.0,1.510469,1.281412,1.3536,...,1.513243,2.0,2.0,1.519498,1.306199,1.324526,1.221027,1.485956,2.0,2.0
max,133.0,117.0,113.0,100.878712,196.0,155.373349,145.434235,123.521475,143.0,107.0,...,143.0,170.798539,187.0,115.752798,110.133046,103.6299,106.456351,137.656262,174.370301,151.625014


In [47]:
# Write the combined frame to CSV, leaving out the index column.
final_sub.to_csv(path_or_buf='lgb_bystore_final2.csv', index=False)