In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
import matplotlib.pylab as plt
%matplotlib inline
import seaborn as sns
import gc, math, os, warnings
warnings.simplefilter("ignore")
from tqdm import tqdm_notebook

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, StratifiedKFold, GridSearchCV
import xgboost as xgb

In [2]:
# Read the pickled train/test feature frames from the shared feature folder.
FOLDER = "../features/"
df_train, df_test = (
    pd.read_pickle(FOLDER + pickle_name)
    for pickle_name in ("train_features.pkl", "test_features.pkl")
)

print("train/test shape is:", df_train.shape, df_test.shape)

train/test shape is: (19228395, 18) (41697600, 17)


In [3]:
# Show the first five training rows as a sanity check on the loaded columns.
df_train.head(n=5)

Unnamed: 0,building_id,meter,timestamp,meter_reading,site_id,primary_use,square_feet,year_built,floor_count,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,apparent_temperature,log_meter_reading,DT_hour,DT_day_week,timestamp_og
16418015,1262,0,0.0,110.51,14,6,11.085938,,,3.934783,1.25,-3.282609,0.0,0.350981,4.714114,0,4,2016-01-01
16418016,1262,1,0.0,116.0,14,6,11.085938,,,3.934783,1.25,-3.282609,0.0,0.350981,4.762174,0,4,2016-01-01
16418017,1262,3,0.0,855.1,14,6,11.085938,,,3.934783,1.25,-3.282609,0.0,0.350981,6.752387,0,4,2016-01-01
2639831,226,0,0.0,29.45,2,6,11.210938,1930.0,,15.6,6.0,-5.6,0.0,11.43978,3.416086,0,4,2016-01-01
3350958,262,0,0.0,16.84,2,1,9.945312,1999.0,,15.6,6.0,-5.6,0.0,11.43978,2.881443,0,4,2016-01-01


In [4]:
# # set category feature
# categorical_features = [
#     "building_id",  #
#     "primary_use",
#     "DT_day_week",
#     "DT_hour",
# ]
# for col in categorical_features:
#     df_train[col] = df_train[col].astype('category')
#     df_test[col] = df_test[col].astype('category')

In [5]:
# Columns that must not be fed to the models: the timestamps, the site/meter
# keys used to split the data, both target columns, and year_built/floor_count.
drop_features = [
    "timestamp",
    "timestamp_og",
    "site_id",
    "meter",
    "meter_reading",
    "log_meter_reading",
    "year_built",
    "floor_count",
]
# Every other training column, in its original order, is a model input.
all_features = list(filter(lambda name: name not in drop_features, df_train.columns))

In [6]:
# Report frame sizes and the number of model inputs, then preview the feature columns only.
print("train/test shape is:", df_train.shape, df_test.shape)
print("features used # is", len(all_features))
df_train.loc[:, all_features].head()

train/test shape is: (19228395, 18) (41697600, 17)
features used # is 10


Unnamed: 0,building_id,primary_use,square_feet,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,apparent_temperature,DT_hour,DT_day_week
16418015,1262,6,11.085938,3.934783,1.25,-3.282609,0.0,0.350981,0,4
16418016,1262,6,11.085938,3.934783,1.25,-3.282609,0.0,0.350981,0,4
16418017,1262,6,11.085938,3.934783,1.25,-3.282609,0.0,0.350981,0,4
2639831,226,6,11.210938,15.6,6.0,-5.6,0.0,11.43978,0,4
3350958,262,1,9.945312,15.6,6.0,-5.6,0.0,11.43978,0,4


In [7]:
# Train one XGBoost regressor per (site_id, meter) pair with `cv`-fold cross-validation.
# Results collected for later cells:
#   cv_scores     -- mean fold RMSE for every pair that had training rows
#   df_test_sites -- one frame per pair holding row_id and the fold-averaged test
#                    prediction (on the scale of `log_meter_reading`)
cv = 2  # number of folds; also the divisor used when averaging fold scores/predictions
models = {}
site_model = {}
cv_scores = {"site_id": [], "meter_id":[], "cv_score": []}
USE_GRID_SEARCH = False
df_test_sites =[]

for site_id in tqdm_notebook(range(16), desc="site_id"):

    models[site_id] = []
    site_model = {}
    for meter_id in range(4):

        ### train a single model for each site/meter
        X_train_site = df_train[(df_train.site_id==site_id) & (df_train.meter==meter_id)].reset_index(drop=True)

        # Nothing to fit for this pair: skip it before slicing the (larger) test frame.
        # Any test rows of such a pair get no prediction; the submission shape check
        # in the final cell is what would reveal that.
        if(len(X_train_site)==0):
            print("Site Id:", site_id, "meter Id:", meter_id, "has no training data!")
            continue

        y_train_site = X_train_site.log_meter_reading
        y_pred_train_site = np.zeros(X_train_site.shape[0])  # out-of-fold predictions

        ### TEST
        X_test_site = df_test[(df_test.site_id==site_id) & (df_test.meter==meter_id)]
        row_ids_site = X_test_site.row_id
        X_test_site = X_test_site[all_features]
        y_pred_test_site = np.zeros(X_test_site.shape[0])  # accumulates the fold average

        print("==> Training model for site_id:", site_id, "meter:", meter_id, "...")

        # Folds are stratified on building_id. No random_state here: it only has an
        # effect with shuffle=True, and recent scikit-learn raises a ValueError when
        # it is passed together with shuffle=False.
        skf = StratifiedKFold(n_splits=cv, shuffle=False)

        site_model[meter_id] = []
        score = 0

        # NOTE(review): 'metric', 'num_leaves' and 'lambda_l2' below look like LightGBM
        # parameter names. XGBoost documents 'eval_metric', 'max_leaves' and 'reg_lambda'
        # instead, so these three entries are probably ignored -- confirm. They are kept
        # unchanged so results stay comparable with the recorded scores; renaming them
        # would change the fitted models.
        #################### grid search #############################################
        if(USE_GRID_SEARCH):
            kf = KFold(n_splits=cv)  # plain K-fold is only needed by the grid search
            estimator = xgb.XGBRegressor()
            param_grid = {
                'objective':['reg:squarederror'],
                'metric': ['rmse'],
                'num_leaves': [8, 16, 32],
                'learning_rate': [0.03],
                'n_estimators': [5000],
                'subsample': [0.8],
                'lambda_l2': [0.1],
                "max_depth": [6],
                'seed': [42],
                'n_jobs':[-1],
                'tree_method': ['gpu_hist'],
            }
            gbm = GridSearchCV(estimator, param_grid, cv=kf)
            gbm.fit(X_train_site[all_features], y_train_site)
            print('Best parameters found by grid search are:', gbm.best_params_)
        else:
            param = {
                'objective':'reg:squarederror',
                'metric': 'rmse',
                'num_leaves': 8,
                'learning_rate': 0.03,
                'n_estimators': 5000,
                'subsample': 0.8,
                "lambda_l2":0.1,
                "max_depth":6,
                'seed': 42,
                'n_jobs':-1,
                'tree_method':'gpu_hist',
            }

        #skf.split(X_train_site, X_train_site.building_id)
        #kf.split(X_train_site)
        for fold, (train_index, valid_index) in enumerate(skf.split(X_train_site, X_train_site.building_id)):
            # .loc with positional fold indices is valid because the index was reset above.
            X_train, X_valid = X_train_site.loc[train_index, all_features], X_train_site.loc[valid_index, all_features]
            y_train, y_valid = y_train_site.iloc[train_index], y_train_site.iloc[valid_index]

            print("train/valid size", X_train.shape, X_valid.shape)

            if(USE_GRID_SEARCH):
                model_lgb = xgb.XGBRegressor(**gbm.best_params_)
            else:
                model_lgb = xgb.XGBRegressor(**param)

            # Early stopping monitors the held-out fold.
            model_lgb.fit(X_train, y_train,
                eval_set=[(X_valid, y_valid)],
                early_stopping_rounds=200,
                verbose=1000)

#             site_model[meter_id].append(model_lgb)

            y_pred_valid = model_lgb.predict(X_valid)
            y_pred_train_site[valid_index] = y_pred_valid

            rmse = np.sqrt(mean_squared_error(y_valid, y_pred_valid))
            print("Site Id:", site_id, "meter Id:", meter_id, ", Fold:", fold+1, ", RMSE:", rmse)
            score += rmse / cv

            ### TEST
            # Predict the test rows right away so the fold model can be freed afterwards.
            y_pred_test_site += model_lgb.predict(X_test_site) / cv


            del train_index, valid_index, X_train, X_valid, y_train, y_valid, model_lgb

            gc.collect()
            gc.collect()

        cv_scores["site_id"].append(site_id)
        cv_scores["meter_id"].append(meter_id)
        cv_scores["cv_score"].append(score)

        # RMSE over all out-of-fold predictions (differs slightly from the mean of fold RMSEs in `score`).
        print("\nSite Id:", site_id, "meter Id:", meter_id, ", CV RMSE:", np.sqrt(mean_squared_error(y_train_site, y_pred_train_site)), "\n")

        df_test_site = pd.DataFrame({"row_id": row_ids_site, "meter_reading": y_pred_test_site})
        df_test_sites.append(df_test_site)

        del X_train_site, y_train_site, y_pred_train_site
        del X_test_site, row_ids_site, y_pred_test_site, df_test_site
        gc.collect()
        gc.collect()

    models[site_id].append(site_model)

HBox(children=(FloatProgress(value=0.0, description='site_id', max=16.0, style=ProgressStyle(description_width…

==> Training model for site_id: 0 meter: 0 ...
train/valid size (278987, 10) (279013, 10)
[0]	validation_0-rmse:4.79409
Will train until validation_0-rmse hasn't improved in 200 rounds.
[1000]	validation_0-rmse:0.300341
[2000]	validation_0-rmse:0.298874
Stopping. Best iteration:
[1941]	validation_0-rmse:0.298724

Site Id: 0 meter Id: 0 , Fold: 1 , RMSE: 0.29872380149572253
train/valid size (279013, 10) (278987, 10)
[0]	validation_0-rmse:4.79773
Will train until validation_0-rmse hasn't improved in 200 rounds.
Stopping. Best iteration:
[446]	validation_0-rmse:0.38305

Site Id: 0 meter Id: 0 , Fold: 2 , RMSE: 0.3830501730569645

Site Id: 0 meter Id: 0 , CV RMSE: 0.34348265301353 

==> Training model for site_id: 0 meter: 1 ...
train/valid size (80990, 10) (81006, 10)
[0]	validation_0-rmse:6.52285
Will train until validation_0-rmse hasn't improved in 200 rounds.
Stopping. Best iteration:
[269]	validation_0-rmse:1.29846

Site Id: 0 meter Id: 1 , Fold: 1 , RMSE: 1.2984642885757607
train/val

Stopping. Best iteration:
[479]	validation_0-rmse:1.09847

Site Id: 6 meter Id: 1 , Fold: 2 , RMSE: 1.0984688951408317

Site Id: 6 meter Id: 1 , CV RMSE: 1.1790860169246407 

==> Training model for site_id: 6 meter: 2 ...
train/valid size (85295, 10) (85306, 10)
[0]	validation_0-rmse:5.09721
Will train until validation_0-rmse hasn't improved in 200 rounds.
Stopping. Best iteration:
[226]	validation_0-rmse:1.3958

Site Id: 6 meter Id: 2 , Fold: 1 , RMSE: 1.3957955532131805
train/valid size (85306, 10) (85295, 10)
[0]	validation_0-rmse:4.57279
Will train until validation_0-rmse hasn't improved in 200 rounds.
Stopping. Best iteration:
[425]	validation_0-rmse:1.47286

Site Id: 6 meter Id: 2 , Fold: 2 , RMSE: 1.472861422863208

Site Id: 6 meter Id: 2 , CV RMSE: 1.4348435006707128 

Site Id: 6 meter Id: 3 has no training data!
==> Training model for site_id: 7 meter: 0 ...
train/valid size (46073, 10) (46079, 10)
[0]	validation_0-rmse:6.63082
Will train until validation_0-rmse hasn't improve

Site Id: 11 meter Id: 0 , Fold: 1 , RMSE: 0.1908495462043449
train/valid size (21702, 10) (21698, 10)
[0]	validation_0-rmse:4.78922
Will train until validation_0-rmse hasn't improved in 200 rounds.
Stopping. Best iteration:
[257]	validation_0-rmse:0.192247

Site Id: 11 meter Id: 0 , Fold: 2 , RMSE: 0.19224690049620505

Site Id: 11 meter Id: 0 , CV RMSE: 0.19154943317381332 

==> Training model for site_id: 11 meter: 1 ...
train/valid size (13841, 10) (13843, 10)
[0]	validation_0-rmse:2.24807
Will train until validation_0-rmse hasn't improved in 200 rounds.
Stopping. Best iteration:
[78]	validation_0-rmse:1.14712

Site Id: 11 meter Id: 1 , Fold: 1 , RMSE: 1.1471209691384228
train/valid size (13843, 10) (13841, 10)
[0]	validation_0-rmse:3.31013
Will train until validation_0-rmse hasn't improved in 200 rounds.
Stopping. Best iteration:
[114]	validation_0-rmse:1.41715

Site Id: 11 meter Id: 1 , Fold: 2 , RMSE: 1.4171463382218643

Site Id: 11 meter Id: 1 , CV RMSE: 1.2892129845093603 

Site

Will train until validation_0-rmse hasn't improved in 200 rounds.
[1000]	validation_0-rmse:0.51829
Stopping. Best iteration:
[1183]	validation_0-rmse:0.517557

Site Id: 15 meter Id: 1 , Fold: 2 , RMSE: 0.5175568560510011

Site Id: 15 meter Id: 1 , CV RMSE: 0.5294040426329114 

==> Training model for site_id: 15 meter: 2 ...
train/valid size (246742, 10) (246754, 10)
[0]	validation_0-rmse:5.30097
Will train until validation_0-rmse hasn't improved in 200 rounds.
Stopping. Best iteration:
[401]	validation_0-rmse:1.06597

Site Id: 15 meter Id: 2 , Fold: 1 , RMSE: 1.065972836247144
train/valid size (246754, 10) (246742, 10)
[0]	validation_0-rmse:5.28104
Will train until validation_0-rmse hasn't improved in 200 rounds.
Stopping. Best iteration:
[658]	validation_0-rmse:0.888258

Site Id: 15 meter Id: 2 , Fold: 2 , RMSE: 0.8882574684259177

Site Id: 15 meter Id: 2 , CV RMSE: 0.981149291088162 

==> Training model for site_id: 15 meter: 3 ...
train/valid size (7398, 10) (7399, 10)
[0]	validatio

In [8]:
### compute weight cv score
def weight_cv_score(df_train, cv_score_df):
    """Combine per-(site, meter) CV RMSEs into one row-weighted RMSE.

    Each model's squared score is weighted by the share of training rows it
    was trained on; the square root of the weighted sum is returned.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training frame; the columns ``site_id``, ``meter`` and ``building_id``
        are read.
    cv_score_df : pandas.DataFrame
        One row per trained model with the columns ``site_id``, ``meter_id``
        and ``cv_score``. Row order does not matter: scores are matched to
        their training rows by (site_id, meter_id), not by position.

    Returns
    -------
    float
        Weighted RMSE rounded to 4 decimals.

    Raises
    ------
    ValueError
        If a (site, meter) pair with training rows does not have exactly one
        score in ``cv_score_df``.
    """
    total_rows = df_train.shape[0]

    # One pass over the training frame instead of a full boolean scan per
    # (site, meter) pair; this is fast enough that no progress bar is needed.
    group_stats = df_train.groupby(["site_id", "meter"])["building_id"].agg(["size", "nunique"])
    stats_by_key = {
        key: (int(n_rows), int(n_buildings))
        for key, n_rows, n_buildings in zip(group_stats.index, group_stats["size"], group_stats["nunique"])
        if n_rows > 0
    }

    weighted_squares = []
    for site_id in range(16):
        for meter_id in range(4):
            if (site_id, meter_id) not in stats_by_key:
                continue
            n_rows, n_buildings = stats_by_key[(site_id, meter_id)]
            portion = n_rows / total_rows

            # A single combined mask keeps the boolean key aligned with the frame it filters.
            same_model = (cv_score_df.site_id == site_id) & (cv_score_df.meter_id == meter_id)
            scores = cv_score_df.loc[same_model, "cv_score"].values
            if len(scores) != 1:
                raise ValueError(
                    "expected exactly one cv_score for site_id=%s meter_id=%s, found %d"
                    % (site_id, meter_id, len(scores))
                )

            portion_rate = np.round(portion * 100, 2)
            print("site_id:", site_id, "meter:",meter_id, "---", n_buildings,"buildings", portion_rate, "% :", scores.round(4))
            weighted_squares.append(portion * scores[0] ** 2)
        print("--------------------------------------------")

    weighted_score = np.round(np.sum(weighted_squares) ** 0.5, 4)

    return weighted_score
# Tabulate the per-model CV scores gathered during training and report the row-weighted total.
cv_score_df = pd.DataFrame(data=cv_scores)
print("Overall CV score is:", weight_cv_score(df_train, cv_score_df))

HBox(children=(FloatProgress(value=0.0, description='site_id', max=16.0, style=ProgressStyle(description_width…

site_id: 0 meter: 0 --- 105 buildings 2.9 % : [0.3409]
site_id: 0 meter: 1 --- 24 buildings 0.84 % : [1.35]
--------------------------------------------
site_id: 1 meter: 0 --- 51 buildings 2.33 % : [0.7004]
site_id: 1 meter: 3 --- 12 buildings 0.54 % : [1.7214]
--------------------------------------------
site_id: 2 meter: 0 --- 135 buildings 6.11 % : [0.4153]
site_id: 2 meter: 1 --- 99 buildings 4.29 % : [0.9081]
site_id: 2 meter: 3 --- 55 buildings 2.04 % : [0.9458]
--------------------------------------------
site_id: 3 meter: 0 --- 274 buildings 12.31 % : [0.4321]
--------------------------------------------
site_id: 4 meter: 0 --- 91 buildings 3.86 % : [0.2191]
--------------------------------------------
site_id: 5 meter: 0 --- 89 buildings 4.02 % : [0.6528]
--------------------------------------------
site_id: 6 meter: 0 --- 36 buildings 1.64 % : [0.3011]
site_id: 6 meter: 1 --- 21 buildings 0.65 % : [1.1765]
site_id: 6 meter: 2 --- 23 buildings 0.89 % : [1.4343]
--------------

---

In [9]:
# del df_train, X_train_site, y_train_site, X_train, y_train, dtrain, X_valid, y_valid, dvalid, y_pred_train_site, y_pred_valid, rmse, score, cv_scores
# gc.collect()

In [10]:
# # look at feature importance
# for site_id in range(16):
#     for meter_id in models[site_id][0].keys():
#         for cv_id in range(cv):
#             xgb.plot_importance(models[site_id][0][meter_id][cv_id], figsize=(9,6))
#             plt.title(str(site_id) + "_" + str(meter_id)+"_"+str(cv_id))
#             plt.show()

In [11]:
# ### make predictions
# df_test_sites = []

# for site_id in tqdm_notebook(range(16), desc="site_id"):
    
#     for meter_id in range(4):
    
#         X_test_site = df_test[(df_test.site_id==site_id) & (df_test.meter==meter_id)]
#         row_ids_site = X_test_site.row_id

#         X_test_site = X_test_site[all_features]
#         y_pred_test_site = np.zeros(X_test_site.shape[0])
        
#         if(len(X_test_site)==0):
#             continue

#         for fold in range(cv):
#             model_lgb = models[site_id][0][meter_id][fold]
#             y_pred_test_site += model_lgb.predict(X_test_site, num_iteration=model_lgb.best_iteration_) / cv
#             gc.collect()
        
#         df_test_site = pd.DataFrame({"row_id": row_ids_site, "meter_reading": y_pred_test_site})
#         df_test_sites.append(df_test_site)
        
#         gc.collect()

In [12]:
# Stack the per-(site, meter) prediction frames, then undo the log transform with
# expm1, floor the readings at zero and round them to 4 decimals.
submit = pd.concat(df_test_sites)
submit["meter_reading"] = np.expm1(submit["meter_reading"]).clip(lower=0).round(4)
print(submit.shape)
submit.head()

(41697600, 2)


Unnamed: 0,row_id,meter_reading
0,0,170.9701
1156320,66,116.6722
1121280,64,260.7244
1103760,63,322.7231
1138800,65,440.0671


In [13]:
### should be (41697600, 2)
WRITE_TOKEN = True
SUBMIT_TOKEN = False
if((submit.shape == (41697600, 2)) * (WRITE_TOKEN)):
    print("Good, the submission is ready to submit..., submission shape is", submit.shape)
    SUBMISSION_FILE_PATH = "../submission/sub_xgb.csv.gz"
    print("Writing the submission to csv.gz file ...")
    submit.to_csv(SUBMISSION_FILE_PATH, index=False, compression='gzip')
    if(SUBMIT_TOKEN):
        print("Submitting the csv.gz file to Kaggle platform ...")
        !kaggle competitions submit -c ashrae-energy-prediction -f "../submission/sub_xgb.csv.gz" -m "xgb automatic submit ..."
        print("Submitted successfully! Good luck ...")
else:
    print("There are some things wrong ... The submission is not submitted ...")

Good, the submission is ready to submit..., submission shape is (41697600, 2)
Writing the submission to csv.gz file ...
