**This file contains the code for the LGBM base model**

Importing the necessary files from drive

In [None]:
from google.colab import drive

In [None]:
# Mount Google Drive at /content/drive so the pickled dataset stored there can be opened by path.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pickle
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from tqdm import tqdm
import sklearn

In [None]:
from sklearn.preprocessing import LabelEncoder,MinMaxScaler
from sklearn.model_selection import StratifiedKFold,KFold
import lightgbm as lgb
from lightgbm import LGBMRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error
import xgboost as xgb
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from scipy.stats import uniform,randint
from sklearn.model_selection import train_test_split

In [None]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation and other library warnings
# (pandas / scikit-learn / LightGBM) that may be worth seeing during tuning.
warnings.filterwarnings('ignore')

In [None]:
# Load the pickled training frame from the mounted Drive.
# The `with` block closes the handle as soon as loading finishes (the original
# left it open for the whole session).
# NOTE: pickle.load can execute arbitrary code -- only load files you created yourself.
with open('/content/drive/MyDrive/Project Energy Consumption/df_tr_red_final_modified.txt','rb') as file:
    df_tr_red_final=pickle.load(file)

In [None]:
# Replace the current index with a fresh 0..n-1 index, in place.
# The previous index is kept as a new column (drop=False is the default).
df_tr_red_final.reset_index(inplace=True)

In [None]:
# Discard the leftover 'index' column and the raw 'timestamp' column; neither is fed to the model.
df_tr_red_final.drop(columns=['index','timestamp'],inplace=True)

In [None]:
# Discard 'level_0' -- presumably the column added by the reset_index() call above
# (pandas uses that name when an 'index' column already exists); TODO confirm.
df_tr_red_final.drop(columns='level_0',inplace=True)

**Target Transformation**

1.   Here I take log1p of the meter readings. Evaluating the base models with RMSE on this transformed target is then equivalent to RMSLE on the original readings, which is the metric we have to evaluate on.



In [None]:
# Pull the target column out of the feature frame (pop removes it in place and
# returns it) and move it to log1p space so that RMSE on y_tr corresponds to RMSLE.
y_tr=np.log1p(df_tr_red_final.pop('meter_reading'))

**Dropping the features which are not important**

In [None]:
# Remove, in place, the features that were judged unimportant.
df_tr_red_final.drop(
    columns=[
        'cloud_coverage',
        'sea_level_pressure',
        'wind_direction',
        'wind_speed',
        'is_summer_month',
        'is_pub_holiday',
    ],
    inplace=True,
)

**Custom Ensembling**

1.   First I split the training data 80-20. The 80% part is then split 50-50 into two halves (D1 and D2). From D1 I draw a sample with replacement; the base model is trained on that sample and then predicts on the remaining 50% (D2).



In [None]:
# Hold out 20% of the rows as the final test set; the seed keeps the split reproducible.
X_train,X_test,y_train,y_test=train_test_split(
    df_tr_red_final,
    y_tr,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Split the 80% training part in half: D1 is used to fit the base model,
# D2 is the half the base model later predicts on.
X_train_d1,X_train_d2,y_train_d1,y_train_d2=train_test_split(
    X_train,
    y_train,
    test_size=0.5,
    random_state=0,
)

**Sampling with replacement. Setting the random state makes the results reproducible.**

In [None]:
# Draw a sample of D1 with replacement (80% of its rows), seeded for reproducibility.
s3_d1=X_train_d1.sample(frac=0.8,replace=True,random_state=2)
# Take the targets for exactly the rows drawn above, duplicates included and in the same order.
# Looking them up by index ties y to X explicitly, instead of relying on a second
# .sample() call with the same seed happening to draw the same positions.
# This needs unique index labels, which the earlier reset_index() provides.
y3_d1=y_train_d1.loc[s3_d1.index]

**Hyperparameter tuning for the LGBM base model**

In [None]:
# Candidate values for the randomized search over LightGBM hyperparameters.
params=dict(
    max_depth=[3,5,7,9,11],
    learning_rate=[0.1,0.01,0.03,0.05],
    colsample_bytree=[0.7,0.8,0.9,1.0],
    n_estimators=[300,500,800,1200],
    min_child_samples=[50,100,200,300,500],
)

# Try 8 random combinations with 3-fold CV, scored by negated RMSE
# (the target is already in log1p space, so this corresponds to RMSLE).
lgb_reg=LGBMRegressor()
random_lgb=RandomizedSearchCV(
    lgb_reg,
    params,
    n_iter=8,
    scoring='neg_root_mean_squared_error',
    cv=3,
    verbose=1,
    random_state=42,
    n_jobs=-1,
)
random_lgb.fit(s3_d1,y3_d1)

Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed: 63.2min finished


RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=LGBMRegressor(boosting_type='gbdt',
                                           class_weight=None,
                                           colsample_bytree=1.0,
                                           importance_type='split',
                                           learning_rate=0.1, max_depth=-1,
                                           min_child_samples=20,
                                           min_child_weight=0.001,
                                           min_split_gain=0.0, n_estimators=100,
                                           n_jobs=-1, num_leaves=31,
                                           objective=None, random_state=None,
                                           reg_alpha=0.0, reg_lambda=0.0,
                                           silen...
                   iid='deprecated', n_iter=8, n_jobs=-1,
                   param_distributions={'colsample_bytree': [0.7, 0.8, 0.9,
  

**Finding the best score and params from the above search**

In [None]:
# Hyperparameter combination with the best mean cross-validation score.
random_lgb.best_params_

{'colsample_bytree': 1.0,
 'learning_rate': 0.1,
 'max_depth': 11,
 'min_child_samples': 300,
 'n_estimators': 800}

In [None]:
# Mean CV score of the best candidate. Scoring is 'neg_root_mean_squared_error',
# so this is the negated RMSE on the log1p target (closer to 0 is better).
random_lgb.best_score_

-0.9034979261671303

**Fitting the model with the best params on the sampled data**

In [None]:
# Build the base model with the hyperparameters reported by the randomized search above.
lgb_model=LGBMRegressor(
    n_estimators=800,
    min_child_samples=300,
    max_depth=11,
    learning_rate=0.1,
    colsample_bytree=1.0,
    n_jobs=-1,
)

In [None]:
# Fit the base model on the with-replacement sample drawn from D1.
lgb_model.fit(s3_d1,y3_d1)

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.1, max_depth=11,
              min_child_samples=300, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=800, n_jobs=-1, num_leaves=31, objective=None,
              random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

**Important Points**

1.   After saving my best model, I will predict on the other 50% of the data using my base model (LightGBM). The predictions are then converted into a dataframe that serves as input for my meta model, and the target variable is taken from that same 50% of the data (ground truth).



In [None]:
# Open the output file (binary write mode) that the fitted model is pickled into.
# The path is relative, so the file is created in the current working directory.
# NOTE(review): 'enesemble' looks like a typo for 'ensemble'; left unchanged
# because other notebooks may load the model by this exact name.
filename='lgb_model_enesemble.txt'
my_model=open(filename,'wb')

In [None]:
# Serialize the fitted model. Using the already-open handle as a context manager
# closes it afterwards, so the pickle is fully flushed to disk even if dump raises
# (the original never closed the file).
with my_model:
    pickle.dump(lgb_model,my_model)

In [None]:
# Predict on D2, the half of the training split the model was not fitted on.
# Predictions are in log1p space, like the training target.
s3_predict=lgb_model.predict(X_train_d2)

In [None]:
# Wrap the D2 predictions in a single-column frame (fresh 0..n-1 index) for the meta model.
s3_predict_df=pd.DataFrame({'s3_predict':s3_predict})

In [None]:
# Predict on the 20% hold-out test split (log1p space).
s3_predict_test=lgb_model.predict(X_test)

In [None]:
# Wrap the test-set predictions in a single-column frame (fresh 0..n-1 index).
s3_predict_test_df=pd.DataFrame({'s3_predict_test':s3_predict_test})

In [None]:
# Open the output file (binary write mode) for the pickled D2 predictions.
# The path is relative, so the file is created in the current working directory.
filename='s3_pred_df.txt'
my_file_3=open(filename,'wb')

In [None]:
# Serialize the D2 predictions. Using the already-open handle as a context manager
# closes it afterwards, so the pickle is fully flushed to disk even if dump raises
# (the original never closed the file).
with my_file_3:
    pickle.dump(s3_predict_df,my_file_3)

In [None]:
# Open the output file (binary write mode) for the pickled test-set predictions.
# The path is relative, so the file is created in the current working directory.
filename='s3_test_df.txt'
my_file_4=open(filename,'wb')

In [None]:
# Serialize the test-set predictions. Using the already-open handle as a context
# manager closes it afterwards, so the pickle is fully flushed to disk even if
# dump raises (the original never closed the file).
with my_file_4:
    pickle.dump(s3_predict_test_df,my_file_4)

                                                             **End of Notebook**