In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
import matplotlib.pylab as plt
%matplotlib inline
import seaborn as sns
import gc, math, os, warnings
warnings.simplefilter("ignore")
from tqdm import tqdm_notebook

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, StratifiedKFold, GridSearchCV
from catboost import CatBoostRegressor

In [2]:
# Directory holding the pre-computed feature pickles.
# NOTE(review): read_pickle executes arbitrary code on load; only point this at trusted files.
FOLDER = "../features/"
df_train, df_test = (
    pd.read_pickle(os.path.join(FOLDER, pickle_name))
    for pickle_name in ("train_features.pkl", "test_features.pkl")
)

# Quick sanity check on the loaded frame sizes.
print("train/test shape is:", df_train.shape, df_test.shape)

train/test shape is: (19228395, 18) (41697600, 17)


In [3]:
# Show the first five training rows to eyeball the available columns.
df_train.head(n=5)

Unnamed: 0,building_id,meter,timestamp,meter_reading,site_id,primary_use,square_feet,year_built,floor_count,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,apparent_temperature,log_meter_reading,DT_hour,DT_day_week,timestamp_og
16418015,1262,0,0.0,110.51,14,6,11.085938,,,3.934783,1.25,-3.282609,0.0,0.350981,4.714114,0,4,2016-01-01
16418016,1262,1,0.0,116.0,14,6,11.085938,,,3.934783,1.25,-3.282609,0.0,0.350981,4.762174,0,4,2016-01-01
16418017,1262,3,0.0,855.1,14,6,11.085938,,,3.934783,1.25,-3.282609,0.0,0.350981,6.752387,0,4,2016-01-01
2639831,226,0,0.0,29.45,2,6,11.210938,1930.0,,15.6,6.0,-5.6,0.0,11.43978,3.416086,0,4,2016-01-01
3350958,262,0,0.0,16.84,2,1,9.945312,1999.0,,15.6,6.0,-5.6,0.0,11.43978,2.881443,0,4,2016-01-01


In [4]:
# Columns that are handed to the model as categorical features.
categorical_features = ["building_id", "primary_use", "DT_day_week", "DT_hour"]

# Cast those columns to pandas' category dtype in both frames.
for frame in (df_train, df_test):
    for cat_col in categorical_features:
        frame[cat_col] = frame[cat_col].astype('category')

In [5]:
# Columns that must not be used as model inputs: raw/derived timestamps,
# the site/meter keys used to split the data, both target columns, and
# two building attributes that are excluded.
drop_features = [
    "timestamp",
    "timestamp_og",
    "site_id",
    'meter',
    "meter_reading",
    "log_meter_reading",
    "year_built",
    "floor_count",
]

# Every remaining training column, in original column order, is a feature.
all_features = list(df_train.columns[~df_train.columns.isin(drop_features)])

In [6]:
# Re-check the frame sizes and how many model inputs were kept.
n_features_used = len(all_features)
print("train/test shape is:", df_train.shape, df_test.shape)
print("features used # is", n_features_used)

# Preview only the feature columns (slice the rows first so no large copy is made).
df_train.head()[all_features]

train/test shape is: (19228395, 18) (41697600, 17)
features used # is 10


Unnamed: 0,building_id,primary_use,square_feet,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,apparent_temperature,DT_hour,DT_day_week
16418015,1262,6,11.085938,3.934783,1.25,-3.282609,0.0,0.350981,0,4
16418016,1262,6,11.085938,3.934783,1.25,-3.282609,0.0,0.350981,0,4
16418017,1262,6,11.085938,3.934783,1.25,-3.282609,0.0,0.350981,0,4
2639831,226,6,11.210938,15.6,6.0,-5.6,0.0,11.43978,0,4
3350958,262,1,9.945312,15.6,6.0,-5.6,0.0,11.43978,0,4


In [7]:
# Train one CatBoost regressor per (site_id, meter) pair and per CV fold.
#
# Results produced by this cell:
#   models[site_id]  -> [ {meter_id: [fold_0_model, fold_1_model, ...]} ]
#   cv_scores        -> parallel lists of site_id / meter_id / mean fold RMSE
cv = 2
models = {}
site_model = {}
cv_scores = {"site_id": [], "meter_id":[], "cv_score": []}
USE_GRID_SEARCH = False

# Fixed CatBoost settings. They are used as-is when USE_GRID_SEARCH is False and
# as the base configuration (overridden by the best grid point) when it is True.
param = {
    'eval_metric': 'RMSE',
    'loss_function': 'RMSE',
    'learning_rate': 0.01,
    'n_estimators': 8000,
    'l2_leaf_reg': 6,
    "max_depth": 5,
    'random_seed': 1993,
    'task_type':'GPU',
}

for site_id in tqdm_notebook(range(16), desc="site_id"):

    # meter_id -> list of fold models for the current site
    site_model = {}
    for meter_id in range(4):

        ### train a single model for each site/meter
        # reset_index so positional fold indices can be used with .loc below
        X_train_site = df_train[(df_train.site_id==site_id) & (df_train.meter==meter_id)].reset_index(drop=True)

        # Not every site has every meter type; skip before allocating anything.
        if(len(X_train_site)==0):
            print("Site Id:", site_id, "meter Id:", meter_id, "has no training data!")
            continue

        y_train_site = X_train_site.log_meter_reading
        # out-of-fold predictions, filled in fold by fold
        y_pred_train_site = np.zeros(X_train_site.shape[0])

        print("==> Training model for site_id:", site_id, "meter:", meter_id, "...")

        # Folds are stratified on building_id so each building appears in both
        # the training and validation part. random_state is deliberately NOT
        # passed: with shuffle=False it has no effect and recent scikit-learn
        # versions raise a ValueError for that combination.
        skf = StratifiedKFold(n_splits=cv, shuffle=False)

        #################### grid search #############################################
        if(USE_GRID_SEARCH):
            # The search runs on CatBoost itself with CatBoost parameter names, so
            # the winning parameters can be passed straight to CatBoostRegressor.
            kf = KFold(n_splits=cv)
            estimator = CatBoostRegressor(**param, verbose=0)
            param_grid = {
                'max_depth': [4, 6, 8],
                'learning_rate': [0.03],
                'l2_leaf_reg': [5],
            }
            # refit=False: only best_params_ is needed, the per-fold models are
            # trained below.
            gbm = GridSearchCV(estimator, param_grid, cv=kf,
                               scoring='neg_mean_squared_error', refit=False)
            # cat_features is forwarded to CatBoostRegressor.fit for every candidate.
            gbm.fit(X_train_site[all_features], y_train_site,
                    cat_features=categorical_features)
            print('Best parameters found by grid search are:', gbm.best_params_)
            fold_params = dict(param, **gbm.best_params_)
        else:
            fold_params = param

        site_model[meter_id] = []
        score = 0

        for fold, (train_index, valid_index) in enumerate(skf.split(X_train_site, X_train_site.building_id)):
            X_train, X_valid = X_train_site.loc[train_index, all_features], X_train_site.loc[valid_index, all_features]
            y_train, y_valid = y_train_site.iloc[train_index], y_train_site.iloc[valid_index]

            print("train/valid size", X_train.shape, X_valid.shape)

            model_cat = CatBoostRegressor(**fold_params)

            # NOTE(review): the validation fold drives early stopping and is also
            # the fold the RMSE below is measured on, so the reported CV score is
            # likely somewhat optimistic.
            model_cat.fit(X_train, y_train,
                eval_set=[(X_valid, y_valid)],
                cat_features=categorical_features,
                early_stopping_rounds=200,
                verbose=1000)

            site_model[meter_id].append(model_cat)

            y_pred_valid = model_cat.predict(X_valid)
            y_pred_train_site[valid_index] = y_pred_valid

            rmse = np.sqrt(mean_squared_error(y_valid, y_pred_valid))
            print("Site Id:", site_id, "meter Id:", meter_id, ", Fold:", fold+1, ", RMSE:", rmse)
            # running mean of the fold RMSEs
            score += rmse / cv

            gc.collect()

        cv_scores["site_id"].append(site_id)
        cv_scores["meter_id"].append(meter_id)
        cv_scores["cv_score"].append(score)

        # RMSE over all out-of-fold predictions of this site/meter
        print("\nSite Id:", site_id, "meter Id:", meter_id, ", CV RMSE:", np.sqrt(mean_squared_error(y_train_site, y_pred_train_site)), "\n")

    # Wrapped in a one-element list: downstream code reads models[site_id][0][meter_id][fold].
    models[site_id] = [site_model]

HBox(children=(FloatProgress(value=0.0, description='site_id', max=16.0, style=ProgressStyle(description_width…

==> Training model for site_id: 0 meter: 0 ...
train/valid size (278987, 10) (279013, 10)
0:	learn: 1.2981758	test: 1.2504429	best: 1.2504429 (0)	total: 13.4ms	remaining: 1m 47s
1000:	learn: 0.3663096	test: 0.3934498	best: 0.3934498 (1000)	total: 9.83s	remaining: 1m 8s
2000:	learn: 0.3308717	test: 0.3605547	best: 0.3605547 (2000)	total: 19.9s	remaining: 59.6s
3000:	learn: 0.3137739	test: 0.3466158	best: 0.3466158 (3000)	total: 29.8s	remaining: 49.6s
4000:	learn: 0.3032409	test: 0.3391425	best: 0.3391425 (4000)	total: 39.4s	remaining: 39.3s
5000:	learn: 0.2956877	test: 0.3345346	best: 0.3345346 (5000)	total: 49s	remaining: 29.4s
6000:	learn: 0.2900280	test: 0.3312238	best: 0.3312238 (6000)	total: 58.7s	remaining: 19.6s
7000:	learn: 0.2854970	test: 0.3286211	best: 0.3286211 (6999)	total: 1m 8s	remaining: 9.78s
7999:	learn: 0.2815435	test: 0.3265855	best: 0.3265855 (7999)	total: 1m 18s	remaining: 0us
bestTest = 0.3265854508
bestIteration = 7999
Site Id: 0 meter Id: 0 , Fold: 1 , RMSE: 0.3

1000:	learn: 0.8698120	test: 0.9553520	best: 0.9553520 (1000)	total: 12.9s	remaining: 1m 30s
2000:	learn: 0.8136614	test: 0.9127182	best: 0.9127182 (2000)	total: 25.6s	remaining: 1m 16s
3000:	learn: 0.7804046	test: 0.8907234	best: 0.8907234 (3000)	total: 38.5s	remaining: 1m 4s
4000:	learn: 0.7584178	test: 0.8773469	best: 0.8773469 (4000)	total: 51.2s	remaining: 51.2s
5000:	learn: 0.7432106	test: 0.8716230	best: 0.8716230 (5000)	total: 1m 3s	remaining: 38.3s
6000:	learn: 0.7321039	test: 0.8680481	best: 0.8680481 (6000)	total: 1m 16s	remaining: 25.5s
7000:	learn: 0.7227858	test: 0.8657150	best: 0.8657150 (7000)	total: 1m 29s	remaining: 12.7s
7999:	learn: 0.7150910	test: 0.8640507	best: 0.8640071 (7991)	total: 1m 41s	remaining: 0us
bestTest = 0.8640071466
bestIteration = 7991
Shrink model to first 7992 iterations.
Site Id: 2 meter Id: 1 , Fold: 2 , RMSE: 0.8640072113092728

Site Id: 2 meter Id: 1 , CV RMSE: 0.9080025435377924 

Site Id: 2 meter Id: 2 has no training data!
==> Training mod

train/valid size (157707, 10) (157687, 10)
0:	learn: 1.1814899	test: 1.1361110	best: 1.1361110 (0)	total: 6.99ms	remaining: 55.9s
1000:	learn: 0.1675209	test: 0.3028922	best: 0.3028545 (993)	total: 8.05s	remaining: 56.3s
2000:	learn: 0.1508508	test: 0.2978334	best: 0.2978334 (2000)	total: 15.9s	remaining: 47.5s
3000:	learn: 0.1442663	test: 0.2959735	best: 0.2959702 (2999)	total: 23.5s	remaining: 39.2s
4000:	learn: 0.1396852	test: 0.2946667	best: 0.2946617 (3985)	total: 31.4s	remaining: 31.4s
5000:	learn: 0.1360817	test: 0.2938313	best: 0.2938313 (5000)	total: 39.3s	remaining: 23.6s
6000:	learn: 0.1331843	test: 0.2933749	best: 0.2933747 (5997)	total: 47.2s	remaining: 15.7s
7000:	learn: 0.1308406	test: 0.2929810	best: 0.2929792 (6993)	total: 55.1s	remaining: 7.87s
7999:	learn: 0.1288665	test: 0.2927770	best: 0.2927750 (7996)	total: 1m 3s	remaining: 0us
bestTest = 0.2927749901
bestIteration = 7996
Shrink model to first 7997 iterations.
Site Id: 6 meter Id: 0 , Fold: 2 , RMSE: 0.2927741270

0:	learn: 1.2290966	test: 1.2440611	best: 1.2440611 (0)	total: 18.3ms	remaining: 2m 26s
1000:	learn: 0.2557270	test: 0.2871067	best: 0.2871067 (1000)	total: 13.8s	remaining: 1m 36s
2000:	learn: 0.2114084	test: 0.2562458	best: 0.2562445 (1999)	total: 27.2s	remaining: 1m 21s
3000:	learn: 0.1948091	test: 0.2473722	best: 0.2473645 (2999)	total: 41.2s	remaining: 1m 8s
4000:	learn: 0.1853094	test: 0.2433417	best: 0.2433417 (4000)	total: 55.1s	remaining: 55.1s
5000:	learn: 0.1788925	test: 0.2413043	best: 0.2413040 (4997)	total: 1m 9s	remaining: 41.4s
6000:	learn: 0.1741448	test: 0.2400450	best: 0.2400450 (6000)	total: 1m 22s	remaining: 27.6s
7000:	learn: 0.1702785	test: 0.2393136	best: 0.2392973 (6979)	total: 1m 36s	remaining: 13.8s
7999:	learn: 0.1671259	test: 0.2387585	best: 0.2387501 (7970)	total: 1m 50s	remaining: 0us
bestTest = 0.2387500902
bestIteration = 7970
Shrink model to first 7971 iterations.
Site Id: 9 meter Id: 0 , Fold: 1 , RMSE: 0.2387500208388016
train/valid size (512006, 10)

bestTest = 1.134400476
bestIteration = 476
Shrink model to first 477 iterations.
Site Id: 11 meter Id: 1 , Fold: 1 , RMSE: 1.134400679979846
train/valid size (13843, 10) (13841, 10)
0:	learn: 2.1883884	test: 2.9836319	best: 2.9836319 (0)	total: 11.3ms	remaining: 1m 30s
bestTest = 1.363251972
bestIteration = 398
Shrink model to first 399 iterations.
Site Id: 11 meter Id: 1 , Fold: 2 , RMSE: 1.3632522398922682

Site Id: 11 meter Id: 1 , CV RMSE: 1.2540494962907593 

Site Id: 11 meter Id: 2 has no training data!
==> Training model for site_id: 11 meter: 3 ...
train/valid size (21700, 10) (21703, 10)
0:	learn: 2.0547287	test: 2.1885169	best: 2.1885169 (0)	total: 10.3ms	remaining: 1m 22s
1000:	learn: 0.7951501	test: 0.8710327	best: 0.8709330 (975)	total: 11.2s	remaining: 1m 18s
bestTest = 0.8701787085
bestIteration = 1083
Shrink model to first 1084 iterations.
Site Id: 11 meter Id: 3 , Fold: 1 , RMSE: 0.8701786063595794
train/valid size (21703, 10) (21700, 10)
0:	learn: 1.9090270	test: 2.31

7999:	learn: 1.0180717	test: 1.4482078	best: 1.4481840 (7823)	total: 1m 33s	remaining: 0us
bestTest = 1.448184006
bestIteration = 7823
Shrink model to first 7824 iterations.
Site Id: 13 meter Id: 2 , Fold: 2 , RMSE: 1.4481835665728362

Site Id: 13 meter Id: 2 , CV RMSE: 1.4094288980519858 

Site Id: 13 meter Id: 3 has no training data!
==> Training model for site_id: 14 meter: 0 ...
train/valid size (425093, 10) (425137, 10)
0:	learn: 1.0937139	test: 1.0760682	best: 1.0760682 (0)	total: 15.7ms	remaining: 2m 5s
1000:	learn: 0.3025761	test: 0.3108231	best: 0.3108231 (1000)	total: 12.5s	remaining: 1m 27s
2000:	learn: 0.2681859	test: 0.2882034	best: 0.2881701 (1998)	total: 25s	remaining: 1m 15s
3000:	learn: 0.2539999	test: 0.2841534	best: 0.2841502 (2989)	total: 37.7s	remaining: 1m 2s
4000:	learn: 0.2460113	test: 0.2828172	best: 0.2827299 (3975)	total: 50.4s	remaining: 50.4s
bestTest = 0.2825920629
bestIteration = 4390
Shrink model to first 4391 iterations.
Site Id: 14 meter Id: 0 , Fold: 

3000:	learn: 0.4636943	test: 0.5608240	best: 0.5608008 (2999)	total: 26.9s	remaining: 44.9s
4000:	learn: 0.4486237	test: 0.5548178	best: 0.5548165 (3998)	total: 35.7s	remaining: 35.7s
5000:	learn: 0.4374381	test: 0.5515996	best: 0.5515926 (4991)	total: 44.5s	remaining: 26.7s
6000:	learn: 0.4287569	test: 0.5493700	best: 0.5493695 (5998)	total: 53.4s	remaining: 17.8s
7000:	learn: 0.4211933	test: 0.5475856	best: 0.5475846 (6999)	total: 1m 2s	remaining: 8.9s
7999:	learn: 0.4149670	test: 0.5463785	best: 0.5463785 (7999)	total: 1m 11s	remaining: 0us
bestTest = 0.546378488
bestIteration = 7999
Site Id: 15 meter Id: 1 , Fold: 1 , RMSE: 0.5463783727929123
train/valid size (223044, 10) (223025, 10)
0:	learn: 1.6015094	test: 1.6107634	best: 1.6107634 (0)	total: 11.1ms	remaining: 1m 29s
1000:	learn: 0.5508452	test: 0.5877139	best: 0.5877139 (1000)	total: 9.08s	remaining: 1m 3s
2000:	learn: 0.5015714	test: 0.5595648	best: 0.5595529 (1998)	total: 18.1s	remaining: 54.4s
3000:	learn: 0.4768045	test: 0

In [8]:
### compute weight cv score
def weight_cv_score(df_train, cv_score_df):
    """Combine per-(site, meter) CV scores into one row-weighted RMSE.

    For every (site_id, meter) pair that has rows in ``df_train`` the matching
    ``cv_score`` is looked up in ``cv_score_df`` by key, squared, and weighted
    by that pair's share of the training rows. The result is the square root
    of the weighted sum, rounded to 4 decimals. A summary line per pair and a
    separator per site are printed along the way.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Must provide ``site_id``, ``meter`` and ``building_id`` columns.
    cv_score_df : pandas.DataFrame
        Must provide ``site_id``, ``meter_id`` and ``cv_score`` columns, with
        exactly one row per (site_id, meter_id) pair present in ``df_train``.

    Returns
    -------
    numpy.float64
        The weighted overall score.

    Raises
    ------
    ValueError
        If a pair with training rows has zero or several score rows.
    """
    total_rows = df_train.shape[0]
    portions = []
    scores = []

    for site_id in tqdm_notebook(range(16), desc="site_id"):
        for meter_id in range(4):
            site_meter_mask = (df_train.site_id == site_id) & (df_train.meter == meter_id)
            n_rows = int(site_meter_mask.sum())
            if n_rows == 0:
                continue

            # Look the score up by key so the weight and the score can never be
            # paired with the wrong (site, meter) combination.
            score_mask = (cv_score_df.site_id == site_id) & (cv_score_df.meter_id == meter_id)
            score_values = cv_score_df.loc[score_mask, 'cv_score'].values

            portion = n_rows / total_rows
            portion_rate = np.round(portion * 100, 2)
            n_buildings = df_train.loc[site_meter_mask, 'building_id'].nunique()
            print("site_id:", site_id, "meter:",meter_id, "---", n_buildings,"buildings", portion_rate, "% :", score_values.round(4))

            if len(score_values) != 1:
                raise ValueError(
                    "expected exactly one cv_score for site_id=%s, meter_id=%s but found %d"
                    % (site_id, meter_id, len(score_values))
                )

            portions.append(portion)
            scores.append(score_values[0])
        print("--------------------------------------------")

    # weighted mean of squared RMSEs, then back to RMSE scale
    cv_weight_score = np.array(portions) * (np.array(scores) ** 2)
    weighted_score = np.round(cv_weight_score.sum() ** 0.5, 4)

    return weighted_score
# Tabulate the collected per-(site, meter) scores and report the row-weighted total.
cv_score_df = pd.DataFrame(cv_scores)
overall_cv_score = weight_cv_score(df_train, cv_score_df)
print("Overall CV score is:", overall_cv_score)

HBox(children=(FloatProgress(value=0.0, description='site_id', max=16.0, style=ProgressStyle(description_width…

site_id: 0 meter: 0 --- 105 buildings 2.9 % : [0.3627]
site_id: 0 meter: 1 --- 24 buildings 0.84 % : [1.3414]
--------------------------------------------
site_id: 1 meter: 0 --- 51 buildings 2.33 % : [0.6715]
site_id: 1 meter: 3 --- 12 buildings 0.54 % : [1.7752]
--------------------------------------------
site_id: 2 meter: 0 --- 135 buildings 6.11 % : [0.4142]
site_id: 2 meter: 1 --- 99 buildings 4.29 % : [0.907]
site_id: 2 meter: 3 --- 55 buildings 2.04 % : [0.9517]
--------------------------------------------
site_id: 3 meter: 0 --- 274 buildings 12.31 % : [0.4424]
--------------------------------------------
site_id: 4 meter: 0 --- 91 buildings 3.86 % : [0.2271]
--------------------------------------------
site_id: 5 meter: 0 --- 89 buildings 4.02 % : [0.6616]
--------------------------------------------
site_id: 6 meter: 0 --- 36 buildings 1.64 % : [0.2996]
site_id: 6 meter: 1 --- 21 buildings 0.65 % : [1.1871]
site_id: 6 meter: 2 --- 23 buildings 0.89 % : [1.495]
--------------

---

In [9]:
# del df_train, X_train_site, y_train_site, X_train, y_train, dtrain, X_valid, y_valid, dvalid, y_pred_train_site, y_pred_valid, rmse, score, cv_scores
# gc.collect()

In [10]:
# # look at feature importance
# for site_id in range(16):
#     for meter_id in models[site_id][0].keys():
#         for cv_id in range(cv):
#             lgb.plot_importance(models[site_id][0][meter_id][cv_id], figsize=(9,6))
#             plt.title(str(site_id) + "_" + str(meter_id)+"_"+str(cv_id))
#             plt.show()

In [11]:
### make predictions
# One frame of (row_id, prediction) per site/meter pair; concatenated later.
df_test_sites = []

for site_id in tqdm_notebook(range(16), desc="site_id"):
    for meter_id in range(4):

        site_meter_rows = df_test[(df_test.site_id==site_id) & (df_test.meter==meter_id)]

        # Nothing to predict for this site/meter combination.
        if site_meter_rows.shape[0] == 0:
            continue

        row_ids_site = site_meter_rows.row_id
        X_test_site = site_meter_rows[all_features]

        # Average the predictions of the fold models trained for this pair.
        fold_models = models[site_id][0][meter_id]
        y_pred_test_site = np.zeros(X_test_site.shape[0])
        for fold in range(cv):
            y_pred_test_site += fold_models[fold].predict(X_test_site) / cv
            gc.collect()

        # Predictions are still on the training-target (log) scale at this point.
        df_test_sites.append(
            pd.DataFrame({"row_id": row_ids_site, "meter_reading": y_pred_test_site})
        )

        gc.collect()

HBox(children=(FloatProgress(value=0.0, description='site_id', max=16.0, style=ProgressStyle(description_width…




In [12]:
# Stack the per-site/meter prediction frames into one submission table.
submit = pd.concat(df_test_sites)

# Undo the log transform with expm1, floor at zero, and keep 4 decimals.
submit["meter_reading"] = np.expm1(submit["meter_reading"]).clip(lower=0).round(4)

print(submit.shape)
submit.head()

(41697600, 2)


Unnamed: 0,row_id,meter_reading
0,0,172.9397
1156320,66,110.1342
1121280,64,284.9228
1103760,63,338.3966
1138800,65,416.4631


In [13]:
### should be (41697600, 2)
WRITE_TOKEN = True
SUBMIT_TOKEN = True

# Only write the file when the frame has the expected shape and writing is enabled.
shape_is_expected = submit.shape == (41697600, 2)
if shape_is_expected and WRITE_TOKEN:
    SUBMISSION_FILE_PATH = "../ensembles/sub_cat3.csv.gz"
    print("Good, the submission is ready to submit..., submission shape is", submit.shape)
    print("Writing the submission to csv.gz file ...")
    submit.to_csv(SUBMISSION_FILE_PATH, index=False, compression='gzip')
#     if(SUBMIT_TOKEN):
#         print("Submitting the csv.gz file to Kaggle platform ...")
#         !kaggle competitions submit -c ashrae-energy-prediction -f "../submission/sub_cat.csv.gz" -m "cat automatic submit ..."
#         print("Submitted successfully! Good luck ...")
else:
    print("There are some things wrong ... The submission is not submitted ...")

Good, the submission is ready to submit..., submission shape is (41697600, 2)
Writing the submission to csv.gz file ...
