**THIS NOTEBOOK CONTAINS THE CODE FOR MODELLING AND HYPERPARAMETER TUNING OF THE LIGHTGBM (LGBM) REGRESSOR.**

In [None]:
from google.colab import drive

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive; the dataset
# path read later in this notebook lives under that mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pickle
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from tqdm import tqdm
import sklearn

In [None]:
from sklearn.preprocessing import LabelEncoder,MinMaxScaler
from sklearn.model_selection import StratifiedKFold,KFold
import lightgbm as lgb
from lightgbm import LGBMRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error
import xgboost
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from scipy.stats import uniform,randint
from sklearn.model_selection import train_test_split

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the pickled training data from Drive. The context manager closes the
# file handle as soon as loading finishes, so it is not leaked for the rest of
# the session.
# NOTE(review): pickle.load can execute arbitrary code - only load files you trust.
with open('/content/drive/MyDrive/Project Energy Consumption/df_tr_red_final_modified.txt','rb') as file:
    df_tr_red_final=pickle.load(file)

In [None]:
# Remove the 'index' and 'timestamp' columns in place.
# Not idempotent: running this cell a second time raises KeyError.
df_tr_red_final.drop(columns=['index', 'timestamp'], inplace=True)

In [None]:
# Renumber the rows 0..n-1 in place. With drop=False (the default) the previous
# index is kept as a new column (named 'index' when the index is unnamed).
df_tr_red_final.reset_index(drop=False, inplace=True)

**TARGET TRANSFORMATION**

1.   AS THE METRIC IS RMSLE, I TAKE THE LOG1P OF THE METER READINGS AND THEN USE RMSE AS THE EVALUATION METRIC (RMSE ON THE LOG1P-TRANSFORMED TARGET IS EQUIVALENT TO RMSLE ON THE ORIGINAL READINGS).



In [None]:
# Pull the target column out of the feature frame (pop removes it in place and
# returns it) and move it to log1p space, so that RMSE on y_tr matches RMSLE
# on the raw meter readings.
y_tr = np.log1p(df_tr_red_final.pop('meter_reading'))

**DROPPING THE FEATURES WHICH ARE NOT IMPORTANT**

In [None]:
# Discard, in place, the features judged unimportant for the model.
df_tr_red_final.drop(
    columns=[
        'cloud_coverage',
        'sea_level_pressure',
        'wind_direction',
        'wind_speed',
        'is_summer_month',
        'is_pub_holiday',
    ],
    inplace=True,
)

In [None]:
# Remove the 'index' column (created by the earlier reset_index call) so it is
# not used as a feature.
df_tr_red_final.drop(columns='index', inplace=True)

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed makes the (shuffled)
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df_tr_red_final,
    y_tr,
    test_size=0.2,
    random_state=0,
)

In [None]:
# 3-fold splitter for the hyperparameter search.
# random_state is deliberately not passed: it only has an effect together with
# shuffle=True, and recent scikit-learn versions raise a ValueError when it is
# set while shuffle is left at False. The folds remain contiguous, unshuffled
# thirds; X_train has already been shuffled by train_test_split.
kf=KFold(n_splits=3)

**HYPERPARAMETER TUNING**



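1.   HERE I AM DOING THE HYPERPARAMETER TUNING USING RANDOMIZED SEARCH CV (`RandomizedSearchCV`) WITH THE USE OF GPU. NOTE: THE `LGBMRegressor` BELOW IS CREATED WITH DEFAULT SETTINGS, SO NO GPU DEVICE IS EXPLICITLY REQUESTED IN THE CODE.
2.   IT SAMPLES PARAMETER COMBINATIONS FROM THE CANDIDATE LISTS AND KEEPS THE ONE WITH THE BEST CROSS-VALIDATED RMSE, WHICH IS THEN USED FOR TRAINING.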





In [None]:
# Candidate values for each tuned LightGBM hyperparameter; the randomized
# search samples combinations from these lists.
params = {
    'max_depth': [3, 5, 7, 9, 11],
    'learning_rate': [0.1, 0.01, 0.03, 0.05],
    'colsample_bytree': [0.7, 0.8, 0.9, 1.0],
    'n_estimators': [300, 500, 800, 1200],
    'min_child_samples': [50, 100, 200, 300, 500],
}

# Base estimator with library defaults; the search overrides the tuned parameters.
lgb_reg = LGBMRegressor()

# Evaluate 8 sampled candidates on the folds produced by `kf`, scored by
# negative RMSE (higher is better, so best_score_ is a negative number).
random_clf = RandomizedSearchCV(
    estimator=lgb_reg,
    param_distributions=params,
    n_iter=8,
    scoring='neg_root_mean_squared_error',
    cv=kf,
    verbose=24,
    random_state=1,
    n_jobs=-1,
)
random_clf.fit(X_train, y_train)

Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed: 17.0min
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed: 17.8min
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed: 17.9min
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed: 29.3min
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed: 34.1min
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed: 40.8min
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed: 45.6min
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed: 51.6min
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed: 52.6min
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed: 52.7min
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed: 57.3min
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed: 63.5min
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed: 76.4min
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed: 76.5min
[Parallel(n_jobs=-1)]: Done  15 tasks      | elapsed: 80

RandomizedSearchCV(cv=KFold(n_splits=3, random_state=0, shuffle=False),
                   error_score=nan,
                   estimator=LGBMRegressor(boosting_type='gbdt',
                                           class_weight=None,
                                           colsample_bytree=1.0,
                                           importance_type='split',
                                           learning_rate=0.1, max_depth=-1,
                                           min_child_samples=20,
                                           min_child_weight=0.001,
                                           min_split_gain=0.0, n_estimators=100,
                                           n_jobs=-1, num_leaves=31,
                                           objective=None, random_...
                   iid='deprecated', n_iter=8, n_jobs=-1,
                   param_distributions={'colsample_bytree': [0.7, 0.8, 0.9,
                                                             1.0],
   

**BEST PARAMS**

In [None]:
# Show the parameter combination with the best mean cross-validated score.
random_clf.best_params_

{'colsample_bytree': 0.9,
 'learning_rate': 0.1,
 'max_depth': 7,
 'min_child_samples': 100,
 'n_estimators': 1200}

**BEST SCORE**

In [None]:
# Mean cross-validated score of the best candidate. The scorer is
# 'neg_root_mean_squared_error', so this is the negated RMSE in log1p space.
random_clf.best_score_

-0.8866650518027203

**PREDICTING ON THE TEST SET WITH THE BEST PARAMS FOUND FROM HYPERPARAMETER TUNING**

In [None]:
# Predict the held-out split with the fitted search object.
# NOTE(review): with the default refit=True this uses the best estimator refit on all of X_train - confirm against the installed scikit-learn version.
test_pred=random_clf.predict(X_test)

In [None]:
# RMSE on the held-out split; targets are in log1p space, so this is the RMSLE
# of the raw meter readings.
np.sqrt(
    mean_squared_error(y_true=y_test, y_pred=test_pred)
)

0.8857758144612343

**FITTING THE BEST MODEL ON THE FULL TRAINING DATA (TRAIN + HOLD-OUT SPLIT)**

In [None]:
# Final estimator; the hyperparameter values are copied by hand from the
# best_params_ reported by the randomized search above.
lgbm_reg_final = LGBMRegressor(
    n_estimators=1200,
    min_child_samples=100,
    max_depth=7,
    learning_rate=0.1,
    colsample_bytree=0.9,
    n_jobs=-1,
)

In [None]:
# Fit on the complete feature frame and target, i.e. including the rows that
# were held out as X_test / y_test during evaluation.
# NOTE(review): fit presumably returns the estimator itself, so lgbm_model and lgbm_reg_final would be the same object - confirm.
lgbm_model=lgbm_reg_final.fit(df_tr_red_final,y_tr)

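**STORING THE BEST MODEL AS A PICKLE FILE AND DUMPING IT ON DRIVE (NOTE: `filename` IS A RELATIVE PATH, SO THE FILE IS WRITTEN TO THE CURRENT WORKING DIRECTORY - CONFIRM THAT THIS IS ON DRIVE)**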

In [None]:
# Destination for the pickled model. The path is relative, so it resolves
# against the current working directory. The binary write handle stays open
# for the dump cell that follows.
filename = 'lgbm_model_2.txt'
my_file = open(file=filename, mode='wb')

In [None]:
# Serialize the fitted model into the already-open handle, then close the
# handle so buffered bytes are flushed to disk - even if pickling fails
# part-way through.
try:
    pickle.dump(lgbm_model,my_file)
finally:
    my_file.close()