In [None]:
import pandas as pd
import numpy as np
import pickle
import datetime as dt
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
import gc
from numpy import percentile
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error as MSE
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn import preprocessing
import lightgbm as lgb
from prettytable import PrettyTable

In [None]:
def reduce_mem_usage(df, verbose=True):
    '''Downcast numeric columns of ``df`` to the smallest dtype that holds their value range.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified IN PLACE (columns are reassigned) and also returned.
    verbose : bool, default True
        When True, print the final memory usage and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with int16/int32/int64 and float16/float32/float64
        columns downcast where their min/max allow it. Columns of any other dtype
        are left untouched.

    Notes
    -----
    * Range checks are inclusive, so a column whose min/max sit exactly on a dtype
      limit (e.g. max == 127) still gets the smaller dtype.
    * Float columns are chosen by range only: float16 keeps roughly three significant
      decimal digits, so downcasting to it can lose precision.
    '''

    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # candidate dtypes, smallest first; the first one whose range fits is used
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # if nothing fits (e.g. min/max are NaN for an empty column) the column is left as is
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # out of float32 range, or min/max are NaN/inf: keep full precision
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # guard the percentage against a frame that reports zero bytes
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

# Linear regression(label encoding)


In [None]:
# Load the two pickled frames (cleaned_df and Featured_df) from Google Drive
with open("/content/drive/MyDrive/cleaned_df.pkl", "rb") as f:
    cleaned_df = pickle.load(f)

with open("/content/drive/MyDrive/Featured_df.pkl", "rb") as f:
    Featured_df = pickle.load(f)

In [None]:
# Remove the one-hot encoded primary_use columns: gather every column whose
# name starts with "primary" and drop them all in one call
l1 = [col for col in Featured_df.columns if col.startswith("primary")]

Featured_df = Featured_df.drop(columns=l1)


In [None]:
# Label-encode primary_use: the encoder is fitted on cleaned_df['primary_use'] and
# the resulting codes are stored in Featured_df['primary_use'].
# NOTE(review): this assumes the rows of cleaned_df line up one-to-one with the
# rows of Featured_df — confirm against the notebook that built Featured_df.
label_encoder = preprocessing.LabelEncoder()
Featured_df['primary_use']= label_encoder.fit_transform(cleaned_df['primary_use'])

In [None]:
# Separate the target (meter_reading) from the feature columns
y = Featured_df['meter_reading']
X = Featured_df.drop(columns=['meter_reading'])

In [None]:
# X and y have been built from Featured_df, so drop the name and run a garbage
# collection pass; the cell displays the value returned by gc.collect()
del Featured_df
gc.collect()

790

In [None]:
# Hold out 20% of the rows as the test set; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

# the unsplit data is not used after this point
del X
del y

In [None]:
# Fit a linear regression on the training split; the fitted estimator is the
# cell's last expression, so it is displayed as the cell output
model = LinearRegression()
model.fit(X_train, y_train)

LinearRegression()

In [None]:
# Predict on both splits so the train and test errors can be compared
y_predict_test = model.predict(X_test)
y_predict_train = model.predict(X_train)

# Root mean squared error of train data and test data.
# The square root is taken explicitly: the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# whereas np.sqrt(MSE(...)) gives the same value on every version.
print("RMSE of train data:", np.sqrt(MSE(y_train, y_predict_train)))
print("RMSE of test data:", np.sqrt(MSE(y_test, y_predict_test)))

RMSE of train data: 1.6757702209110554
RMSE of test data: 1.6774855671483084


Linear regression with label encoding performs slightly worse than linear regression with one-hot encoding. Hence we will use one-hot encoding for all other models.

# Decision tree

In [None]:
# Reload the pickled Featured_df from Google Drive
with open("/content/drive/MyDrive/Featured_df.pkl", "rb") as f:
    Featured_df = pickle.load(f)

# Target column vs. feature columns
y = Featured_df['meter_reading']
X = Featured_df.drop(columns=['meter_reading'])

# The frame itself is not used again in this cell once X and y exist
del Featured_df
gc.collect()

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)

In [None]:
# Hyperparameter tuning: 3-fold cross-validated search over the tree depth,
# scored by negative RMSE (values closer to zero are better). Only the four
# listed depths can be evaluated. Train scores are kept to show overfitting.
params = {'max_depth': [25, 30, 35, 40]}
clf = RandomizedSearchCV(
    DecisionTreeRegressor(random_state=0),
    params,
    scoring='neg_root_mean_squared_error',
    cv=3,
    random_state=0,
    return_train_score=True,
)
clf.fit(X_train, y_train)
pd.DataFrame(clf.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
0,150.380704,0.857357,3.989043,0.07413,25,{'max_depth': 25},-0.642098,-0.645421,-0.648611,-0.645377,0.002659,1,-0.381253,-0.407494,-0.41213,-0.400292,0.013595
1,175.317555,7.053305,7.350448,0.566913,30,{'max_depth': 30},-0.650094,-0.643975,-0.645567,-0.646545,0.002592,2,-0.203373,-0.237119,-0.238295,-0.226262,0.016193
2,187.832401,0.88784,8.807534,0.703918,35,{'max_depth': 35},-0.660738,-0.656569,-0.656952,-0.658086,0.001881,3,-0.101205,-0.128482,-0.129042,-0.119576,0.012992
3,185.203277,2.849498,8.059208,0.129561,40,{'max_depth': 40},-0.66495,-0.66348,-0.662534,-0.663655,0.000994,4,-0.043686,-0.062212,-0.064032,-0.056643,0.009192


From the above table we can see that max_depth=25 is the best hyperparameter value: it has the highest mean_test_score, i.e. the lowest cross-validated RMSE.

In [None]:

# Training Decision tree with max_depth=25 (the best depth found by the search above)
model = DecisionTreeRegressor(random_state=0, max_depth=25)
model.fit(X_train, y_train)

# Predict on both splits so the train and test errors can be compared
y_predict_test = model.predict(X_test)
y_predict_train = model.predict(X_train)

# Root mean squared error of train data and test data.
# The square root is taken explicitly: the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# whereas np.sqrt(MSE(...)) gives the same value on every version.
print("RMSE of train data:", np.sqrt(MSE(y_train, y_predict_train)))
print("RMSE of test data:", np.sqrt(MSE(y_test, y_predict_test)))

RMSE of train data: 0.42514026089113566
RMSE of test data: 0.611504709817372


Thus, with the decision tree we get a test RMSE of 0.6115. This is better than our baseline model (linear regression + OHE), whose RMSE is 1.6662.

# LGBM Model

In [None]:
# Reload the pickled Featured_df from Google Drive for the LightGBM experiment
with open("/content/drive/MyDrive/Featured_df.pkl", "rb") as f:
    Featured_df = pickle.load(f)


In [None]:
# Separate the target (meter_reading) from the feature columns
y = Featured_df['meter_reading']
X = Featured_df.drop(columns=['meter_reading'])

In [None]:
# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)

# Drop the unsplit data and the source frame, then run a collection pass
del X
del y
del Featured_df
gc.collect()

8

In [None]:
# RandomizedSearchCV was not used here because of limited computational power.
# Different sets of parameters were tried manually; only the best set is shown.
model = lgb.LGBMRegressor(max_depth=16, n_estimators=1300, random_state=0, n_jobs=-1,
                          num_leaves=200, learning_rate=1)
model.fit(X_train, y_train)


# Predict on both splits so the train and test errors can be compared
y_predict_test = model.predict(X_test)
y_predict_train = model.predict(X_train)

# Root mean squared error of train data and test data.
# The square root is taken explicitly: the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# whereas np.sqrt(MSE(...)) gives the same value on every version.
print("RMSE of train data:", np.sqrt(MSE(y_train, y_predict_train)))
print("RMSE of test data:", np.sqrt(MSE(y_test, y_predict_test)))


RMSE of train data: 0.48297907432123144
RMSE of test data: 0.5707418104775989


LGBM is the best model so far, with a test RMSE of 0.5707.

In [None]:
# Featured_df was already deleted right after the train/test split of the LGBM
# section, so a plain `del Featured_df` raises NameError when the notebook is run
# top to bottom. Remove the name only if it is still bound, then collect.
globals().pop("Featured_df", None)
gc.collect()

0

# LGBM with day_of_week and season feature

In [None]:
# Featured_df is deleted after the previous train/test split, so reload it before
# adding columns; without this the cell raises NameError on a fresh top-to-bottom run.
with open("/content/drive/MyDrive/Featured_df.pkl", "rb") as f:
    Featured_df = pickle.load(f)

# adding day_of_week feature (pandas numbers the days Monday=0 ... Sunday=6)
# NOTE(review): assumes the rows of cleaned_df line up one-to-one with the rows
# of Featured_df — confirm against the notebook that built Featured_df.
Featured_df["day_of_week"] = np.uint8(cleaned_df['timestamp'].dt.dayofweek)

# adding season feature: months 3-5 -> Spring, 6-8 -> Summer, 9-11 -> Autumn,
# everything else -> Winter. np.select evaluates whole columns at once instead
# of calling a Python lambda for every row.
month = Featured_df['Month']
Featured_df['season'] = np.select(
    [month.isin([3, 4, 5]), month.isin([6, 7, 8]), month.isin([9, 10, 11])],
    ['Spring', 'Summer', 'Autumn'],
    default='Winter',
)
del month

# one hot encoding season feature
Featured_df = pd.get_dummies(Featured_df, columns=['season'])

In [None]:
# Downcast the numeric columns to smaller dtypes to cut the frame's memory footprint
Featured_df = reduce_mem_usage(Featured_df)

Mem. usage decreased to 618.64 Mb (43.9% reduction)


In [None]:
# Separate the target (meter_reading) from the feature columns
y = Featured_df['meter_reading']
X = Featured_df.drop(columns=['meter_reading'])

In [None]:
# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)

# Neither the unsplit data nor the source frames are used after this point
del X, y, cleaned_df, Featured_df
gc.collect()

431

In [None]:
# Training the model with the same hyperparameters as the previous LightGBM run,
# so any change in RMSE comes from the added day_of_week and season features
model = lgb.LGBMRegressor(max_depth=16, n_estimators=1300, random_state=0, n_jobs=-1,
                          num_leaves=200, learning_rate=1)
model.fit(X_train, y_train)


# Predict on both splits so the train and test errors can be compared
y_predict_test = model.predict(X_test)
y_predict_train = model.predict(X_train)

# Root mean squared error of train data and test data.
# The square root is taken explicitly: the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# whereas np.sqrt(MSE(...)) gives the same value on every version.
print("RMSE of train data:", np.sqrt(MSE(y_train, y_predict_train)))
print("RMSE of test data:", np.sqrt(MSE(y_test, y_predict_test)))

RMSE of train data: 0.4680493259186183
RMSE of test data: 0.5568587537669808


After adding the day_of_week and season features, the LGBM model's test RMSE decreased from 0.5707 to 0.5569.

# Summary table

In [None]:
# Summary of train/test RMSE per model. The numbers are copied by hand from the
# printed results and rounded to four decimals.
# NOTE(review): the two "Linear regression ... outliers(OHE)" rows are not
# produced by any cell of this notebook — confirm them against their source.
x = PrettyTable()
x.field_names = ['Model', 'Train RMSE', 'Test RMSE']
for model_name, train_rmse, test_rmse in [
    ('Linear regression with outliers(OHE)', 1.8567, 1.8573),
    ('Linear regression without outliers(OHE)', 1.6645, 1.6663),
    ('Linear regression(label encoding)', 1.6758, 1.6775),
    ('Decision Tree(OHE)', 0.4251, 0.6115),
    ('LightGBM(OHE)', 0.4830, 0.5707),
    ('LightGBM with Weekday and season', 0.4680, 0.5569),
]:
    # fixed four-decimal formatting so values such as 0.4680 keep their trailing zero
    x.add_row([model_name, '{:.4f}'.format(train_rmse), '{:.4f}'.format(test_rmse)])
print(x)

+-----------------------------------------+------------+-----------+
|                  Model                  | Train RMSE | Test RMSE |
+-----------------------------------------+------------+-----------+
|   Linear regression with outliers(OHE)  |   1.8567   |   1.8573  |
| Linear regression without outliers(OHE) |   1.6645   |   1.6663  |
|    Linear regression(label encoding)    |   1.6758   |   1.6775  |
|            Decision Tree(OHE)           |   0.4251   |   0.6115  |
|              LightGBM(OHE)              |   0.4829   |   0.5707  |
|     LightGBM with Weekday and season    |   0.468    |   0.5568  |
+-----------------------------------------+------------+-----------+
