In [13]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Make features

In [0]:
FEATURE_TYPE = 'TZ_HD'

In [0]:
PATH_INPUT= "/content/drive/My Drive/Kaggle/ashrae-energy/input/"
PATH_OUTPUT = "/content/drive/My Drive/Kaggle/ashrae-energy/output/"

In [0]:
# General imports
import numpy as np
import pandas as pd
import os, gc, sys, warnings, random, math, psutil, pickle,datetime

from sklearn import metrics
from sklearn.model_selection import train_test_split


# warnings.filterwarnings('ignore')

In [0]:
########################### Helpers
#################################################################################
## Seeder
# :seed to make all processes deterministic     # type: int
def seed_everything(seed=0):
    random.seed(seed)
    np.random.seed(seed)
    
## Simple "Memory profilersls" to see memory usage
def get_memory_usage():
    return np.round(psutil.Process(os.getpid()).memory_info()[0]/2.**30, 2) 
        
def sizeof_fmt(num, suffix='B'):
    for unit in ['','Ki','Mi','Gi','Ti','Pi','Ei','Zi']:
        if abs(num) < 1024.0:
            return "%3.1f%s%s" % (num, unit, suffix)
        num /= 1024.0
    return "%.1f%s%s" % (num, 'Yi', suffix)

def rmse(predictions, targets):
    return np.sqrt(((predictions - targets) ** 2).mean())

In [0]:
########################### Vars
#################################################################################
SEED = 42
LOCAl_TEST = False
seed_everything(SEED)
TARGET = 'meter_reading'

In [19]:
########################### DATA LOAD
#################################################################################
print('Load Data')
train_df = pd.read_pickle(PATH_INPUT + 'train_df_' + FEATURE_TYPE +'.pkl')
test_df = pd.read_pickle(PATH_INPUT + 'test_df_' + FEATURE_TYPE + '.pkl')

Load Data


# Model

In [0]:
########################### Model params
import lightgbm as lgb
lgb_params = {
                    'objective':'regression',
                    'boosting_type':'gbdt',
                    'metric':'rmse',
                    'n_jobs':-1,
                    'learning_rate':0.05,
                    'num_leaves': 2**8,
                    'max_depth':-1,
                    'tree_learner':'serial',
                    'colsample_bytree': 0.7,
                    'subsample_freq':1,
                    'subsample':0.7,
                    'n_estimators':800,
                    'max_bin':255,
                    'verbose':-1,
                    'seed': SEED,
                    'early_stopping_rounds':100, 
                } 

#CV concept
##Basics
Cross-validation is a technique for evaluating ML models by training several ML models on subsets of the available input data and evaluating them on the complementary subset of the data.

In k-fold cross-validation, you split the input data into k subsets of data (also known as folds).

##Main strategy
Divide Train set in subsets (Training set itself + Validation set)  
Define Validation Metric (in our case it is RMSE/RMSLE)  
Stop training when Validation metric stops improving  
Make predictions for Test set  

Seems simple but he devil's always in the details.

In [22]:
########################### Create Holdout sets
#################################################################################
# Holdout set 1
# Split train set by building_id -> 20% to houldout
train_buildings, test_buildings = train_test_split(train_df['site_id'].unique(), test_size=0.20, random_state=SEED)

holdout_subset_1 = train_df[train_df['site_id'].isin(test_buildings)].reset_index(drop=True)
train_df = train_df[train_df['site_id'].isin(train_buildings)].reset_index(drop=True)

# Holdout set 2
# Split train set by site_id -> 20% to houldout                   
train_buildings, test_buildings = train_test_split(train_df['building_id'].unique(), test_size=0.20, random_state=SEED)

holdout_subset_2 = train_df[train_df['building_id'].isin(test_buildings)].reset_index(drop=True)
train_df = train_df[train_df['building_id'].isin(train_buildings)].reset_index(drop=True)
                    
# Holdout set 3
# Split train set by month -> first and last months to holdout
holdout_subset_3 = train_df[(train_df['DT_M']==1)|(train_df['DT_M']==12)].reset_index(drop=True)
train_df = train_df[(train_df['DT_M']!=1)&(train_df['DT_M']!=12)].reset_index(drop=True)

# Transform target and check shape
for df in [train_df, holdout_subset_1, holdout_subset_2, holdout_subset_3]:
    df[TARGET] = np.log1p(df[TARGET])
    print(df.shape)


(10073851, 26)
(4913301, 26)
(3152349, 26)
(2076599, 26)


In [0]:
########################### Features to use and eval sets
# for validation "purity" we will also remove site_id, building_id, DT_M
remove_columns = ['timestamp','timediff', 'site_id','building_id','DT_M',TARGET]
features_columns = [col for col in list(train_df) if col not in remove_columns]

X = train_df[features_columns]
y = train_df[TARGET]

split_by_building = train_df['building_id']
split_by_site = train_df['site_id']
split_by_month = train_df['DT_M']

del train_df

In [28]:
## Let's creat dataframes to compare results
## We will join prepdictions
RESULTS_1 = holdout_subset_1[[TARGET]]
RESULTS_2 = holdout_subset_2[[TARGET]]
RESULTS_3 = holdout_subset_3[[TARGET]]

all_results = {
        1: [RESULTS_1, holdout_subset_1, '    site_id holdout'],
        2: [RESULTS_2, holdout_subset_2, 'building_id holdout'],
        3: [RESULTS_3, holdout_subset_3, '      month holdout']
    }

for _,df in all_results.items():
    df[0]['test'] = 0    
    print('Ground RMSE for', df[2], '|',
          rmse(df[0][TARGET], df[0]['test']))
    del df[0]['test']
    print('#'*20)    
    
# We will always use same number of splits
# for training model
# Number of splits depends on data structure
# something in range 5-10
# 5 - is a common number of splits
# 10+ is too much (we will not have enough diversity in data)
# Here we will use 3 for faster training
# but you can change it by yourself
N_SPLITS = 3

Ground RMSE for     site_id holdout | 4.807110987267514
####################
Ground RMSE for building_id holdout | 4.618632715552271
####################
Ground RMSE for       month holdout | 4.614684845953343
####################


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':


In [29]:
# We don't know where to stop
# so we will try to guess 
# number of boosting rounds
for n_rounds in [25,50,100,200]:
    print('#'*20)
    print('No Validation training...', n_rounds, 'boosting rounds')
    corrected_lgb_params = lgb_params.copy()
    corrected_lgb_params['n_estimators'] = n_rounds
    corrected_lgb_params['early_stopping_rounds'] = None

    train_data = lgb.Dataset(X, label=y)
    
    estimator = lgb.train(
                corrected_lgb_params,
                train_data
            )

    for _,df in all_results.items():
        df[0]['no_validation_'+str(n_rounds)] = estimator.predict(df[1][features_columns])
        print('RMSE for',
              df[2], '|',
              rmse(df[0][TARGET], df[0]['no_validation_'+str(n_rounds)]))
        print('#'*20)

# Be careful. We are printing rmse results
# for our simulated test set
# but in real Data set we do not have True labels (obviously)
# and can't be sure that we stopped in right round
# lb probing can give you some idea how good our training is
# but this leads to nowhere -> overfits or completely bad results
# bad practice for real life problems!
#

####################
No Validation training... 25 boosting rounds


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()


RMSE for     site_id holdout | 2.2742564723968663
####################
RMSE for building_id holdout | 1.7172894438254553
####################
RMSE for       month holdout | 1.3906882230495015
####################
####################
No Validation training... 50 boosting rounds
RMSE for     site_id holdout | 2.2874379206325846
####################
RMSE for building_id holdout | 1.6445555378951613
####################
RMSE for       month holdout | 1.1436442338626065
####################
####################
No Validation training... 100 boosting rounds
RMSE for     site_id holdout | 2.3209319267109754
####################
RMSE for building_id holdout | 1.6353771891762319
####################
RMSE for       month holdout | 0.9971273246014412
####################
####################
No Validation training... 200 boosting rounds
RMSE for     site_id holdout | 2.334050130177219
####################
RMSE for building_id holdout | 1.6522075943878518
####################
RMSE for       month

#Findings
The main finding here is that we have "data leakage" in our dataset. And not single one.

Leakage by site_id -> our model doesn't generalize well for unkown site_id  
Leakage by building_id -> our model doesn't generalize well for unkown building_id  
What we can do here and do we have to do anything?

Good thing is all our test buildings and test sites present in train set.

Probably we don't need to smooth differences between them and can even make differences more explicit.

In [30]:
print('#'*20)
print('KFold (with shuffle) training...')

from sklearn.model_selection import KFold
folds = KFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

for _,df in all_results.items():
    df[0]['shuffle_kfold'] = 0
        
for fold_, (trn_idx, val_idx) in enumerate(folds.split(X, y)):
    print('Fold:',fold_+1)
    tr_x, tr_y = X.iloc[trn_idx,:], y[trn_idx]    
    vl_x, v_y = X.iloc[val_idx,:], y[val_idx]    
    train_data = lgb.Dataset(tr_x, label=tr_y)
    valid_data = lgb.Dataset(vl_x, label=v_y)  

    estimator = lgb.train(
            lgb_params,
            train_data,
            valid_sets = [train_data, valid_data],
            verbose_eval = 100,
        )
    
    for _,df in all_results.items():
        df[0]['shuffle_kfold'] += estimator.predict(df[1][features_columns])/N_SPLITS

for _,df in all_results.items():
    print('RMSE for', df[2], '|',
          rmse(df[0][TARGET], df[0]['shuffle_kfold']))
    print('#'*20)    

####################
KFold (with shuffle) training...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Fold: 1




Training until validation scores don't improve for 100 rounds.
[100]	training's rmse: 0.87474	valid_1's rmse: 0.876358
[200]	training's rmse: 0.750556	valid_1's rmse: 0.753105
[300]	training's rmse: 0.703227	valid_1's rmse: 0.707422
[400]	training's rmse: 0.672909	valid_1's rmse: 0.678719
[500]	training's rmse: 0.648457	valid_1's rmse: 0.655786
[600]	training's rmse: 0.629374	valid_1's rmse: 0.638129
[700]	training's rmse: 0.613532	valid_1's rmse: 0.623677
[800]	training's rmse: 0.600264	valid_1's rmse: 0.611714
Did not meet early stopping. Best iteration is:
[800]	training's rmse: 0.600264	valid_1's rmse: 0.611714


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Fold: 2
Training until validation scores don't improve for 100 rounds.
[100]	training's rmse: 0.869661	valid_1's rmse: 0.870622
[200]	training's rmse: 0.738275	valid_1's rmse: 0.740923
[300]	training's rmse: 0.689533	valid_1's rmse: 0.693602
[400]	training's rmse: 0.660869	valid_1's rmse: 0.666227
[500]	training's rmse: 0.640752	valid_1's rmse: 0.647522
[600]	training's rmse: 0.62199	valid_1's rmse: 0.630044
[700]	training's rmse: 0.607214	valid_1's rmse: 0.616488
[800]	training's rmse: 0.593822	valid_1's rmse: 0.604449
Did not meet early stopping. Best iteration is:
[800]	training's rmse: 0.593822	valid_1's rmse: 0.604449
Fold: 3
Training until validation scores don't improve for 100 rounds.
[100]	training's rmse: 0.8735	valid_1's rmse: 0.875105
[200]	training's rmse: 0.746877	valid_1's rmse: 0.749756
[300]	training's rmse: 0.701396	valid_1's rmse: 0.705682
[400]	training's rmse: 0.672582	valid_1's rmse: 0.678194
[500]	training's rmse: 0.649688	valid_1's rmse: 0.656686
[600]	training'

In [31]:
print('#'*20)
print('KFold (no shuffle) training...')

from sklearn.model_selection import KFold
folds = KFold(n_splits=N_SPLITS, shuffle=False, random_state=SEED)

for _,df in all_results.items():
    df[0]['no_shuffle_kfold'] = 0
        
for fold_, (trn_idx, val_idx) in enumerate(folds.split(X, y)):
    print('Fold:',fold_+1)
    tr_x, tr_y = X.iloc[trn_idx,:], y[trn_idx]    
    vl_x, v_y = X.iloc[val_idx,:], y[val_idx]    
    train_data = lgb.Dataset(tr_x, label=tr_y)
    valid_data = lgb.Dataset(vl_x, label=v_y)  

    estimator = lgb.train(
            lgb_params,
            train_data,
            valid_sets = [train_data, valid_data],
            verbose_eval = 100,
        )
    
    for _,df in all_results.items():
        df[0]['no_shuffle_kfold'] += estimator.predict(df[1][features_columns])/N_SPLITS

for _,df in all_results.items():
    print('RMSE for', df[2], '|',
          rmse(df[0][TARGET], df[0]['no_shuffle_kfold']))
    print('#'*20) 

####################
KFold (no shuffle) training...
Fold: 1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Training until validation scores don't improve for 100 rounds.
[100]	training's rmse: 0.847835	valid_1's rmse: 1.15986
[200]	training's rmse: 0.72624	valid_1's rmse: 1.08594
[300]	training's rmse: 0.682169	valid_1's rmse: 1.06507
[400]	training's rmse: 0.651024	valid_1's rmse: 1.05655
[500]	training's rmse: 0.62815	valid_1's rmse: 1.04695
[600]	training's rmse: 0.611629	valid_1's rmse: 1.04193
[700]	training's rmse: 0.596353	valid_1's rmse: 1.0387
[800]	training's rmse: 0.585398	valid_1's rmse: 1.0358
Did not meet early stopping. Best iteration is:
[800]	training's rmse: 0.585398	valid_1's rmse: 1.0358


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Fold: 2
Training until validation scores don't improve for 100 rounds.
[100]	training's rmse: 0.863509	valid_1's rmse: 1.10204
[200]	training's rmse: 0.73376	valid_1's rmse: 1.03265
[300]	training's rmse: 0.682657	valid_1's rmse: 1.00588
[400]	training's rmse: 0.649351	valid_1's rmse: 0.993896
[500]	training's rmse: 0.62701	valid_1's rmse: 0.985094
[600]	training's rmse: 0.608378	valid_1's rmse: 0.97795
[700]	training's rmse: 0.593227	valid_1's rmse: 0.973172
[800]	training's rmse: 0.580292	valid_1's rmse: 0.969315
Did not meet early stopping. Best iteration is:
[800]	training's rmse: 0.580292	valid_1's rmse: 0.969315
Fold: 3
Training until validation scores don't improve for 100 rounds.
[100]	training's rmse: 0.849172	valid_1's rmse: 1.10609
[200]	training's rmse: 0.719012	valid_1's rmse: 1.05951
[300]	training's rmse: 0.670583	valid_1's rmse: 1.04589
[400]	training's rmse: 0.638682	valid_1's rmse: 1.03842
[500]	training's rmse: 0.617865	valid_1's rmse: 1.0359
[600]	training's rmse: 0

# Findings
The main finding here is that we have one more "data leakage".

Leakage by date/month
Consumptions differ a lot month by month.

We can't exclude any data by month as we need to predict consumptions for the whole year.

Our task becoming more and more interesting as we have to validate our features somehow.

We can't use normal kfold for validation because if the model knows how much energy was spent at 8 am it can make a good prediction for 9 am, but we don't have such data in our test set.

In [32]:

print('#'*20)
print('GroupKFold building_id split training...') 

from sklearn.model_selection import GroupKFold
folds = GroupKFold(n_splits=N_SPLITS)

for _,df in all_results.items():
    df[0]['Groupkfold_by_building'] = 0
      
for fold_, (trn_idx, val_idx) in enumerate(folds.split(X, y, groups=split_by_building)):
    print('Fold:',fold_+1)
    tr_x, tr_y = X.iloc[trn_idx,:], y[trn_idx]    
    vl_x, v_y = X.iloc[val_idx,:], y[val_idx]    
    train_data = lgb.Dataset(tr_x, label=tr_y)
    valid_data = lgb.Dataset(vl_x, label=v_y)  

    estimator = lgb.train(
            lgb_params,
            train_data,
            valid_sets = [train_data, valid_data],
            verbose_eval = 100,
        )

    for _,df in all_results.items():
        df[0]['Groupkfold_by_building'] += estimator.predict(df[1][features_columns])/N_SPLITS

for _,df in all_results.items():
    print('RMSE for', df[2], '|',
          rmse(df[0][TARGET], df[0]['Groupkfold_by_building']))
    print('#'*20)  

####################
GroupKFold building_id split training...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


Fold: 1




Training until validation scores don't improve for 100 rounds.
[100]	training's rmse: 0.77119	valid_1's rmse: 1.66047
Early stopping, best iteration is:
[43]	training's rmse: 1.02073	valid_1's rmse: 1.62707


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Fold: 2
Training until validation scores don't improve for 100 rounds.
[100]	training's rmse: 0.804663	valid_1's rmse: 1.51358
Early stopping, best iteration is:
[69]	training's rmse: 0.911884	valid_1's rmse: 1.50585
Fold: 3
Training until validation scores don't improve for 100 rounds.
[100]	training's rmse: 0.798701	valid_1's rmse: 1.58371
[200]	training's rmse: 0.679938	valid_1's rmse: 1.58645
Early stopping, best iteration is:
[102]	training's rmse: 0.794456	valid_1's rmse: 1.58297
RMSE for     site_id holdout | 2.2704603497893334
####################
RMSE for building_id holdout | 1.640569959892638
####################
RMSE for       month holdout | 1.1367912247815963
####################


In [33]:
print('#'*20)
print('GroupKFold site_id split training...') 

from sklearn.model_selection import GroupKFold
folds = GroupKFold(n_splits=N_SPLITS)

for _,df in all_results.items():
    df[0]['Groupkfold_by_site'] = 0
      
for fold_, (trn_idx, val_idx) in enumerate(folds.split(X, y, groups=split_by_site)):
    print('Fold:',fold_+1)
    tr_x, tr_y = X.iloc[trn_idx,:], y[trn_idx]    
    vl_x, v_y = X.iloc[val_idx,:], y[val_idx]    
    train_data = lgb.Dataset(tr_x, label=tr_y)
    valid_data = lgb.Dataset(vl_x, label=v_y)  

    estimator = lgb.train(
            lgb_params,
            train_data,
            valid_sets = [train_data, valid_data],
            verbose_eval = 100,
        )

    for _,df in all_results.items():
        df[0]['Groupkfold_by_site'] += estimator.predict(df[1][features_columns])/N_SPLITS

for _,df in all_results.items():
    print('RMSE for', df[2], '|',
          rmse(df[0][TARGET], df[0]['Groupkfold_by_site']))
    print('#'*20)  

####################
GroupKFold site_id split training...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Fold: 1




Training until validation scores don't improve for 100 rounds.
[100]	training's rmse: 0.769206	valid_1's rmse: 1.79544
Early stopping, best iteration is:
[28]	training's rmse: 1.21042	valid_1's rmse: 1.66623


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Fold: 2
Training until validation scores don't improve for 100 rounds.
[100]	training's rmse: 0.7078	valid_1's rmse: 1.98514
Early stopping, best iteration is:
[67]	training's rmse: 0.802957	valid_1's rmse: 1.97598
Fold: 3
Training until validation scores don't improve for 100 rounds.
[100]	training's rmse: 0.861892	valid_1's rmse: 1.58356
Early stopping, best iteration is:
[33]	training's rmse: 1.22853	valid_1's rmse: 1.54034
RMSE for     site_id holdout | 2.250923852725451
####################
RMSE for building_id holdout | 1.6522709855706978
####################
RMSE for       month holdout | 1.3160760376500211
####################


In [34]:
print('#'*20)
print('GroupKFold month split training...') 

from sklearn.model_selection import GroupKFold
folds = GroupKFold(n_splits=N_SPLITS)

for _,df in all_results.items():
    df[0]['Groupkfold_by_month'] = 0
      
for fold_, (trn_idx, val_idx) in enumerate(folds.split(X, y, groups=split_by_month)):
    print('Fold:',fold_+1)
    tr_x, tr_y = X.iloc[trn_idx,:], y[trn_idx]    
    vl_x, v_y = X.iloc[val_idx,:], y[val_idx]    
    train_data = lgb.Dataset(tr_x, label=tr_y)
    valid_data = lgb.Dataset(vl_x, label=v_y)  

    estimator = lgb.train(
            lgb_params,
            train_data,
            valid_sets = [train_data, valid_data],
            verbose_eval = 100,
        )

    for _,df in all_results.items():
        df[0]['Groupkfold_by_month'] += estimator.predict(df[1][features_columns])/N_SPLITS

for _,df in all_results.items():
    print('RMSE for', df[2], '|',
          rmse(df[0][TARGET], df[0]['Groupkfold_by_month']))
    print('#'*20)  


####################
GroupKFold month split training...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Fold: 1




Training until validation scores don't improve for 100 rounds.
[100]	training's rmse: 0.856559	valid_1's rmse: 0.986958
[200]	training's rmse: 0.725803	valid_1's rmse: 0.894818
[300]	training's rmse: 0.681801	valid_1's rmse: 0.870058
[400]	training's rmse: 0.651497	valid_1's rmse: 0.854304
[500]	training's rmse: 0.628581	valid_1's rmse: 0.843428
[600]	training's rmse: 0.606537	valid_1's rmse: 0.833487
[700]	training's rmse: 0.590192	valid_1's rmse: 0.825601
[800]	training's rmse: 0.577681	valid_1's rmse: 0.821295
Did not meet early stopping. Best iteration is:
[800]	training's rmse: 0.577681	valid_1's rmse: 0.821295


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Fold: 2
Training until validation scores don't improve for 100 rounds.
[100]	training's rmse: 0.868092	valid_1's rmse: 0.939032
[200]	training's rmse: 0.732562	valid_1's rmse: 0.851734
[300]	training's rmse: 0.685349	valid_1's rmse: 0.826592
[400]	training's rmse: 0.656456	valid_1's rmse: 0.813387
[500]	training's rmse: 0.631096	valid_1's rmse: 0.800774
[600]	training's rmse: 0.610125	valid_1's rmse: 0.791017
[700]	training's rmse: 0.59408	valid_1's rmse: 0.785393
[800]	training's rmse: 0.582009	valid_1's rmse: 0.78125
Did not meet early stopping. Best iteration is:
[800]	training's rmse: 0.582009	valid_1's rmse: 0.78125
Fold: 3
Training until validation scores don't improve for 100 rounds.
[100]	training's rmse: 0.871622	valid_1's rmse: 1.03008
[200]	training's rmse: 0.746366	valid_1's rmse: 0.95461
[300]	training's rmse: 0.698075	valid_1's rmse: 0.930615
[400]	training's rmse: 0.670373	valid_1's rmse: 0.918616
[500]	training's rmse: 0.64618	valid_1's rmse: 0.909512
[600]	training's r

#Findings
Same as before. "Leakage" prevents our model to generalize well.

#Summary
For test set predictions our training set MUST have all building_ids and all months to make more accurate predictions.

I would recommend trying train/skip/validate for feature validation:

Train set - first 4 month
Skip - next 4 month
Valid set - last 4 month
For test set predictions use slightly more boosting rounds than validation scheme early stopping will show.

Train several seed models (not kfold, just different seed).

Average results.

In [0]:
########################### Model

# Models saving
model_filename = 'lgbm'
models = []

# Load train_df from hdd
train_df = pd.read_pickle(pass_input + 'train_df_' + MODEL_TYPE +'.pkl')

remove_columns = ['timestamp', 'timediff', TARGET]
features_columns = [col for col in list(train_df) if col not in remove_columns]

if LOCAl_TEST:
    tr_data = lgb.Dataset(train_df.iloc[:15000000][features_columns], label=np.log1p(train_df.iloc[:15000000][TARGET]))
    vl_data = lgb.Dataset(train_df.iloc[15000000:][features_columns], label=np.log1p(train_df.iloc[15000000:][TARGET]))
    eval_sets = [tr_data,vl_data]
else:
    tr_data = lgb.Dataset(train_df[features_columns], label=np.log1p(train_df[TARGET]))
    eval_sets = [tr_data]

# Remove train_df from hdd
# os.system('rm train_df_{}.pkl'.format(MODEL_TYPE))

# Lets make 5 seeds mix model
for cur_seed in [42,43,44,45,46]:
    
    # Seed everything
    seed_everything(cur_seed)
    lgb_params['seed'] = cur_seed
    
    estimator = lgb.train(
                lgb_params,
                tr_data,
                valid_sets = eval_sets,
                verbose_eval = 100,
            )

    # For CV you may add fold number
    # pickle.dump(estimator, open(model_filename + '__fold_' + str(i) + '.bin', "wb"))
    pickle.dump(estimator, open(pass_output + model_filename + '__seed_' + str(cur_seed)  + MODEL_TYPE+ '.bin', 'wb'))
    models.append(pass_output + model_filename + '__seed_' + str(cur_seed)  +  MODEL_TYPE + '.bin')

if not LOCAl_TEST:
    del tr_data, train_df
    gc.collect()



Training until validation scores don't improve for 100 rounds.


KeyboardInterrupt: ignored

In [0]:
########################### Predict
#################################################################################
if not LOCAl_TEST:
    
    # Load test_df from hdd
    test_df = pd.read_pickle('test_df_' + MODEL_TYPE +'.pkl')
    
    # Remove unused columns
    test_df = test_df[features_columns]
    
    # Remove test_df from hdd
    # os.system('rm ' + 'test_df_' + MODEL_TYPE +'.pkl')
    
    # Read submission file
    submission = pd.read_pickle('/content/drive/My Drive/Kaggle/ashrae-energy/input/sample_submission.pkl')

    # Remove row_id for a while
    del submission['row_id']
    
    for model_path in models:
        print('Predictions for', model_path)
        #Just now 
        estimator = pickle.load(open(pass_output + model_path, 'rb'))
        # estimator = pickle.load(open(model_path, 'rb'))

        predictions = []
        batch_size = 2000000
        for batch in range(int(len(test_df)/batch_size)+1):
            print('Predicting batch:', batch)
            predictions += list(np.expm1(estimator.predict(test_df[features_columns].iloc[batch*batch_size:(batch+1)*batch_size])))
            
        submission['meter_reading'] += predictions
        
    # Average over models
    submission['meter_reading'] /= len(models)
    
    # Delete test_df
    del test_df
     
    # Fix negative values
    # https://note.nkmk.me/python-numpy-clip/
    submission['meter_reading'] = submission['meter_reading'].clip(0,None)

    # Restore row_id
    submission['row_id'] = submission.index
    
    ########################### Check
    print(submission.iloc[:20])
    print(submission['meter_reading'].describe())

In [0]:
########################### Export
#################################################################################
if not LOCAl_TEST:
    # submission.to_csv('submission.csv', index=False)
    submission.to_csv('submission' + MODEL_TYPE + '.csv.gz',index=False,compression='gzip')

In [0]:
########################### Export
#################################################################################
if not LOCAl_TEST:
    # submission.to_csv('submission.csv', index=False)
    submission.to_csv(pass_output + 'submission' + MODEL_TYPE + '.csv.gz',index=False,compression='gzip')