In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
import matplotlib.pylab as plt
%matplotlib inline
import seaborn as sns
import gc, math, os, warnings
warnings.simplefilter("ignore")
from tqdm import tqdm_notebook

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, StratifiedKFold, GridSearchCV
import lightgbm as lgb

In [2]:
# Load the pickled train/test feature tables stored under FOLDER.
FOLDER = "../features/"
df_train, df_test = (
    pd.read_pickle(FOLDER + file_name)
    for file_name in ("train_features.pkl", "test_features.pkl")
)

print("train/test shape is:", df_train.shape, df_test.shape)

train/test shape is: (19228395, 18) (41697600, 17)


In [3]:
# Peek at the first five training rows to sanity-check the loaded columns.
df_train.head(n=5)

Unnamed: 0,building_id,meter,timestamp,meter_reading,site_id,primary_use,square_feet,year_built,floor_count,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,IsHoliday,log_meter_reading,DT_hour,DT_day_week,timestamp_og
16418015,1262,0,0.0,110.51,14,6,11.085938,,,3.934783,1.25,-3.282609,0.0,1,4.714114,0,4,2016-01-01
16418016,1262,1,0.0,116.0,14,6,11.085938,,,3.934783,1.25,-3.282609,0.0,1,4.762174,0,4,2016-01-01
16418017,1262,3,0.0,855.1,14,6,11.085938,,,3.934783,1.25,-3.282609,0.0,1,6.752387,0,4,2016-01-01
2639831,226,0,0.0,29.45,2,6,11.210938,1930.0,,15.6,6.0,-5.6,0.0,1,3.416086,0,4,2016-01-01
3350958,262,0,0.0,16.84,2,1,9.945312,1999.0,,15.6,6.0,-5.6,0.0,1,2.881443,0,4,2016-01-01


In [4]:
# Columns handed to LightGBM as categorical features ('IsHoliday' is currently disabled).
categorical_features = ["building_id", "primary_use", "DT_day_week", "DT_hour"]

# Cast those columns to pandas' category dtype in both frames.
for frame in (df_train, df_test):
    for col in categorical_features:
        frame[col] = frame[col].astype('category')

In [5]:
# Columns that must not be fed to the models:
#   - timestamps and the two target columns (meter_reading / log_meter_reading)
#   - site_id and meter, which select the training slice instead of being inputs
#   - IsHoliday and floor_count, which are simply left out
drop_features = [
    "timestamp", "timestamp_og",
    "site_id", 'meter',
    "meter_reading", "log_meter_reading",
    "IsHoliday", "floor_count",
]

# Everything else, in the original column order of df_train, is a model input.
all_features = list(filter(lambda name: name not in drop_features, df_train.columns))

In [6]:
# Report the frame sizes and the number of model inputs, then preview the inputs.
train_shape, test_shape = df_train.shape, df_test.shape
print("train/test shape is:", train_shape, test_shape)
print("features used # is", len(all_features))
df_train.loc[:, all_features].head()

train/test shape is: (19228395, 18) (41697600, 17)
features used # is 10


Unnamed: 0,building_id,primary_use,square_feet,year_built,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,DT_hour,DT_day_week
16418015,1262,6,11.085938,,3.934783,1.25,-3.282609,0.0,0,4
16418016,1262,6,11.085938,,3.934783,1.25,-3.282609,0.0,0,4
16418017,1262,6,11.085938,,3.934783,1.25,-3.282609,0.0,0,4
2639831,226,6,11.210938,1930.0,15.6,6.0,-5.6,0.0,0,4
3350958,262,1,9.945312,1999.0,15.6,6.0,-5.6,0.0,0,4


In [7]:
# Train one LightGBM regressor per (site_id, meter) pair, using every training
# row of that pair (no hold-out set, no early stopping).
#
# Layout of the result, as consumed by the prediction cell:
#   models[site_id]    -> one-element list wrapping that site's dict
#   models[site_id][0] -> {meter_id: [fitted LGBMRegressor]}
# Pairs without training rows get no entry in the inner dict.
cv = 2
models = {}
cv_scores = {"site_id": [], "meter_id":[], "cv_score": []}  # stays empty: no CV scoring is run here
USE_GRID_SEARCH = False
# Splitter for the optional grid search, which used to reference an undefined `kf`.
# Unshuffled contiguous folds, i.e. what passing cv=2 to GridSearchCV would build.
kf = KFold(n_splits=cv)

for site_id in tqdm_notebook(range(16), desc="site_id"):

    models[site_id] = []
    site_model = {}
    for meter_id in range(4):

        ### train a single model for each site/meter
        X_train_site = df_train[(df_train.site_id==site_id) & (df_train.meter==meter_id)].reset_index(drop=True)

        if(len(X_train_site)==0):
            print("Site Id:", site_id, "meter Id:", meter_id, "has no training data!")
            continue

        # Target is the log-transformed reading; the submission cell inverts it with expm1.
        y_train_site = X_train_site.log_meter_reading

        print("==> Training model for site_id:", site_id, "meter:", meter_id, "...")

        #################### grid search #############################################
        if(USE_GRID_SEARCH):
            estimator = lgb.LGBMRegressor()
            param_grid = {
                'metric': ['rmse'],
                'num_leaves': [4, 8, 16, 32, 64],
                'learning_rate': [0.02],
                'n_estimators': [40000],
                'subsample': [0.85],
                'colsample_bytree': [0.85],
                'seed': [42],
                'n_jobs':[-1]
            }
            gbm = GridSearchCV(estimator, param_grid, cv=kf)
            gbm.fit(X_train_site[all_features], y_train_site)
            print('Best parameters found by grid search are:', gbm.best_params_)
            # Feed the winning configuration into the final fit below. Without this,
            # `param` was undefined (or stale from a previous run) on this path.
            param = gbm.best_params_
        else:
            param = {
                'metric': 'rmse',
                'num_leaves': 8,
                'learning_rate': 0.5,
                'n_estimators': 10000,
                'subsample': 0.85,
                'colsample_bytree': 1,
                'seed': 42,
                'n_jobs':-1
            }

        # Train on WHOLE slice: all n_estimators rounds are run since there is no eval set.
        model_lgb = lgb.LGBMRegressor(**param)
        model_lgb.fit(X_train_site[all_features], y_train_site,
                      categorical_feature=categorical_features,
                      verbose=1000)

        # Wrapped in a list so the prediction cell can index it as [meter_id][0].
        site_model[meter_id] = [model_lgb]

    models[site_id].append(site_model)

HBox(children=(IntProgress(value=0, description='site_id', max=16, style=ProgressStyle(description_width='init…

==> Training model for site_id: 0 meter: 0 ...
==> Training model for site_id: 0 meter: 1 ...
Site Id: 0 meter Id: 2 has no training data!
Site Id: 0 meter Id: 3 has no training data!
==> Training model for site_id: 1 meter: 0 ...
Site Id: 1 meter Id: 1 has no training data!
Site Id: 1 meter Id: 2 has no training data!
==> Training model for site_id: 1 meter: 3 ...
==> Training model for site_id: 2 meter: 0 ...
==> Training model for site_id: 2 meter: 1 ...
Site Id: 2 meter Id: 2 has no training data!
==> Training model for site_id: 2 meter: 3 ...
==> Training model for site_id: 3 meter: 0 ...
Site Id: 3 meter Id: 1 has no training data!
Site Id: 3 meter Id: 2 has no training data!
Site Id: 3 meter Id: 3 has no training data!
==> Training model for site_id: 4 meter: 0 ...
Site Id: 4 meter Id: 1 has no training data!
Site Id: 4 meter Id: 2 has no training data!
Site Id: 4 meter Id: 3 has no training data!
==> Training model for site_id: 5 meter: 0 ...
Site Id: 5 meter Id: 1 has no train

---

In [8]:
# Disabled memory clean-up, kept for reference only.
# NOTE(review): X_train, y_train, dtrain, X_valid, y_valid, dvalid, y_pred_valid and rmse
# are not assigned by any live code in the cells shown here, so uncommenting the `del`
# as-is would raise NameError; trim the list to names that exist before re-enabling.
# del df_train, X_train_site, y_train_site, X_train, y_train, dtrain, X_valid, y_valid, dvalid, y_pred_train_site, y_pred_valid, rmse, score, cv_scores
# gc.collect()

In [9]:
### make predictions
# Score the test set one (site_id, meter) slice at a time with the model trained
# for that slice, collecting one (row_id, meter_reading) frame per slice.
# Predictions stay in the log-transformed target space at this point.
df_test_sites = []

for site_id in tqdm_notebook(range(16), desc="site_id"):

    for meter_id in range(4):

        X_test_site = df_test[(df_test.site_id==site_id) & (df_test.meter==meter_id)]
        if(len(X_test_site)==0):
            continue

        row_ids_site = X_test_site.row_id
        X_test_site = X_test_site[all_features]

        # models[site_id] is a one-element list wrapping {meter_id: [model]};
        # the inner dict has no key for pairs that had no training rows.
        meter_models = models[site_id][0].get(meter_id)
        if not meter_models:
            # Previously an opaque KeyError. Keep the rows so the submission stays
            # complete, falling back to the zero default (0 after expm1).
            print("Site Id:", site_id, "meter Id:", meter_id,
                  "has no trained model! Predicting zeros for", len(X_test_site), "rows.")
            y_pred_test_site = np.zeros(X_test_site.shape[0])
        else:
            model_lgb = meter_models[0]
            # best_iteration_ is only set by early stopping; None means "use all trees".
            y_pred_test_site = model_lgb.predict(X_test_site, num_iteration=model_lgb.best_iteration_)

        df_test_site = pd.DataFrame({"row_id": row_ids_site, "meter_reading": y_pred_test_site})
        df_test_sites.append(df_test_site)

        gc.collect()

HBox(children=(IntProgress(value=0, description='site_id', max=16, style=ProgressStyle(description_width='init…




In [10]:
# Stack the per-slice predictions, undo the log transform with expm1,
# floor negative readings at zero and keep four decimals.
submit = pd.concat(df_test_sites)
submit["meter_reading"] = (
    np.expm1(submit["meter_reading"])
    .clip(lower=0)
    .round(4)
)
submit.head()

Unnamed: 0,row_id,meter_reading
0,0,141.3757
1156320,66,99.6008
1121280,64,532.0747
1103760,63,271.1895
1138800,65,486.4675


In [12]:
### should be (41697600, 2)
SUBMIT_TOKEN = True
if((submit.shape == (41697600, 2)) * (SUBMIT_TOKEN)):
    print("Good, the submission is ready to submit..., submission shape is", submit.shape)
    SUBMISSION_FILE_PATH = "../submission/sub.csv.gz"
    print("Writing the submission to csv.gz file ...")
    submit.to_csv(SUBMISSION_FILE_PATH, index=False, compression='gzip')
    print("Submitting the csv.gz file to Kaggle platform ...")
    !kaggle competitions submit -c ashrae-energy-prediction -f "../submission/sub.csv.gz" -m "automatic submit ..."
    print("Submitted successfully! Good luck ...")
else:
    print("There are some things wrong ... The submission is not submitted ...")

Good, the submission is ready to submit..., submission shape is (41697600, 2)
Writing the submission to csv.gz file ...
Submitting the csv.gz file to Kaggle platform ...
100%|████████████████████████████████████████| 264M/264M [01:56<00:00, 2.38MB/s]
Successfully submitted to ASHRAE - Great Energy Predictor IIISubmitted successfully! Good luck ...
