## Regression Analysis

In this notebook, we perform regression analysis on the cleaned dataset with the following regressors:

    1. Ridge Regressor
    2. Lasso Regressor
    3. Elastic Net Regressor

In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet

from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [2]:
# Read the cleaned training table from disk
dfTrain = pd.read_csv("data/train_cleaned.csv")

# Preview the first five rows
dfTrain.head(5)

Unnamed: 0,Year_Factor,State_Factor,building_class,facility_type,floor_area,year_built,energy_star_rating,ELEVATION,january_min_temp,january_avg_temp,...,days_below_0F,days_above_80F,days_above_90F,days_above_100F,days_above_110F,direction_max_wind_speed,direction_peak_wind_speed,max_wind_speed,days_with_fog,site_eui
0,1,0,0,13,61242.0,1942.0,11.0,2.4,36,50.5,...,0,14,0,0,0,1.0,1.0,1.0,109.142051,248.682615
1,1,0,0,55,274000.0,1955.0,45.0,1.8,36,50.5,...,0,14,0,0,0,1.0,62.779974,1.0,12.0,26.50015
2,1,0,0,48,280025.0,1951.0,97.0,1.8,36,50.5,...,0,14,0,0,0,1.0,62.779974,1.0,12.0,24.693619
3,1,0,0,6,55325.0,1980.0,46.0,1.8,36,50.5,...,0,14,0,0,0,1.0,62.779974,1.0,12.0,48.406926
4,1,0,0,56,66000.0,1985.0,100.0,2.4,36,50.5,...,0,14,0,0,0,1.0,1.0,1.0,109.142051,3.899395


In [12]:
# Hold out 25% of the rows for evaluation; every column whose name does not
# contain "site_eui" is used as a predictor, and site_eui is the target.
features = [name for name in dfTrain.columns if "site_eui" not in name]

X_train, X_test, y_train, y_test = train_test_split(
    dfTrain[features],
    dfTrain[['site_eui']],  # kept 2-D (single-column DataFrame)
    test_size=0.25,
    random_state=10,        # fixed seed so the split is reproducible
)

In [87]:
# Ridge Regression
def ridge_regression(X_train, y_train, X_test, y_test, alpha_ridge):
    """Fit a Ridge model and report its fit quality and learned parameters.

    Parameters
    ----------
    X_train, y_train
        Training features and target passed to ``Ridge.fit``. The target may
        be 1-D or a single-column 2-D object.
    X_test, y_test
        Held-out features and target, used only for scoring.
    alpha_ridge
        Value passed as ``alpha`` to ``Ridge`` (the regularization strength).

    Returns
    -------
    list
        ``[train_rmse, train_r2, test_rmse, test_r2, intercept, coef_1, ..., coef_n]``.
        The intercept and coefficients are those of the first (only) target.
    """
    rr = Ridge(alpha=alpha_ridge)
    rr.fit(X_train, y_train)

    # RMSE and R2, first on the training split and then on the test split.
    ret = []
    for X, y in ((X_train, y_train), (X_test, y_test)):
        y_pred = rr.predict(X)
        ret.append(np.sqrt(mean_squared_error(y, y_pred)))
        ret.append(r2_score(y, y_pred))

    # The shape of intercept_ / coef_ depends on whether y was passed as 1-D or
    # as a single-column 2-D object (see the scikit-learn Ridge documentation).
    # Normalising the shapes first lets both forms work instead of failing on
    # the [0] index for a 1-D target.
    ret.append(np.atleast_1d(rr.intercept_)[0])
    ret.extend(np.atleast_2d(rr.coef_)[0])
    return ret

In [91]:
# Lasso Regression
def lasso_regression(X_train, y_train, X_test, y_test, alpha_lasso):
    """Fit a Lasso model and report its fit quality and learned parameters.

    Parameters
    ----------
    X_train, y_train
        Training features and target passed to ``Lasso.fit``. The target may
        be 1-D or a single-column 2-D object.
    X_test, y_test
        Held-out features and target, used only for scoring.
    alpha_lasso
        Value passed as ``alpha`` to ``Lasso`` (the regularization strength).

    Returns
    -------
    list
        ``[train_rmse, train_r2, test_rmse, test_r2, intercept, coef_1, ..., coef_n]``.
        The intercept and coefficients are those of the first (only) target.
    """
    lr = Lasso(alpha=alpha_lasso)
    lr.fit(X_train, y_train)

    # RMSE and R2, first on the training split and then on the test split.
    ret = []
    for X, y in ((X_train, y_train), (X_test, y_test)):
        y_pred = lr.predict(X)
        ret.append(np.sqrt(mean_squared_error(y, y_pred)))
        ret.append(r2_score(y, y_pred))

    # intercept_ is a scalar when y is 1-D, so a bare intercept_[0] would fail
    # there; normalising the shapes handles both the 1-D and the single-column
    # 2-D target (see the scikit-learn Lasso documentation).
    ret.append(np.atleast_1d(lr.intercept_)[0])
    ret.extend(np.atleast_2d(lr.coef_)[0])
    return ret

In [94]:
# Elastic Net Regression
def elastic_net_regression(X_train, y_train, X_test, y_test, alpha_en):
    """Fit an ElasticNet model and report its fit quality and learned parameters.

    Only ``alpha`` is set; every other ElasticNet option (including the
    L1/L2 mixing ratio) is left at the library default.

    Parameters
    ----------
    X_train, y_train
        Training features and target passed to ``ElasticNet.fit``. The target
        may be 1-D or a single-column 2-D object.
    X_test, y_test
        Held-out features and target, used only for scoring.
    alpha_en
        Value passed as ``alpha`` to ``ElasticNet`` (the regularization strength).

    Returns
    -------
    list
        ``[train_rmse, train_r2, test_rmse, test_r2, intercept, coef_1, ..., coef_n]``.
        The intercept and coefficients are those of the first (only) target.
    """
    en = ElasticNet(alpha=alpha_en)
    en.fit(X_train, y_train)

    # RMSE and R2, first on the training split and then on the test split.
    ret = []
    for X, y in ((X_train, y_train), (X_test, y_test)):
        y_pred = en.predict(X)
        ret.append(np.sqrt(mean_squared_error(y, y_pred)))
        ret.append(r2_score(y, y_pred))

    # intercept_ is a scalar when y is 1-D, so a bare intercept_[0] would fail
    # there; normalising the shapes handles both the 1-D and the single-column
    # 2-D target (see the scikit-learn ElasticNet documentation).
    ret.append(np.atleast_1d(en.intercept_)[0])
    ret.extend(np.atleast_2d(en.coef_)[0])
    return ret

In [95]:
# Regularization strengths (alpha) to sweep. Note: alpha is the penalty weight
# of the models, not a learning rate.
alpha = [1e-15, 1e-10, 1e-8, 1e-4, 1e-3, 1e-2, 1, 5, 10, 20]

# Labels for the result tables: four metrics, the intercept, then one
# coefficient column per feature (same order the regression helpers return).
col = ['train_rmse', 'train_r2', 'test_rmse', 'test_r2', 'intercept'] + ['coef_' + feature for feature in X_train.columns]
ind = ['alpha_%.2g' % a for a in alpha]

# Build each table in one shot from the list of result rows (one row per alpha)
# instead of filling an empty object-dtype frame row by row; the columns end up
# numeric rather than object.
coef_matrix_ridge = pd.DataFrame(
    [ridge_regression(X_train, y_train, X_test, y_test, a) for a in alpha],
    index=ind, columns=col,
)
coef_matrix_lasso = pd.DataFrame(
    [lasso_regression(X_train, y_train, X_test, y_test, a) for a in alpha],
    index=ind, columns=col,
)
coef_matrix_elastic_net = pd.DataFrame(
    [elastic_net_regression(X_train, y_train, X_test, y_test, a) for a in alpha],
    index=ind, columns=col,
)

In [96]:
# Show floats with 4 significant digits (general format) so the wide table stays readable
pd.set_option('display.float_format', '{:,.4g}'.format)
coef_matrix_ridge

Unnamed: 0,train_rmse,train_r2,test_rmse,test_r2,intercept,coef_Year_Factor,coef_State_Factor,coef_building_class,coef_facility_type,coef_floor_area,...,coef_days_below_10F,coef_days_below_0F,coef_days_above_80F,coef_days_above_90F,coef_days_above_100F,coef_days_above_110F,coef_direction_max_wind_speed,coef_direction_peak_wind_speed,coef_max_wind_speed,coef_days_with_fog
alpha_1e-15,51.69,0.2037,53.43,0.1865,-1473,-5.413,8.047,-18.16,-0.5483,1.102e-05,...,-1.934,2.199,-0.1413,0.1036,-0.8587,-0.7409,-0.09786,0.05168,1.16,-0.02536
alpha_1e-10,51.69,0.2037,53.43,0.1865,-1473,-5.413,8.047,-18.16,-0.5483,1.102e-05,...,-1.934,2.199,-0.1413,0.1036,-0.8587,-0.7409,-0.09786,0.05168,1.16,-0.02536
alpha_1e-08,51.69,0.2037,53.43,0.1865,-1473,-5.413,8.047,-18.16,-0.5483,1.102e-05,...,-1.934,2.199,-0.1413,0.1036,-0.8587,-0.7409,-0.09786,0.05168,1.16,-0.02536
alpha_0.0001,51.69,0.2037,53.43,0.1865,-1473,-5.413,8.047,-18.16,-0.5483,1.102e-05,...,-1.934,2.199,-0.1413,0.1036,-0.8587,-0.741,-0.09786,0.05168,1.16,-0.02536
alpha_0.001,51.69,0.2037,53.43,0.1865,-1473,-5.412,8.047,-18.16,-0.5483,1.102e-05,...,-1.934,2.199,-0.1412,0.1035,-0.8586,-0.7411,-0.09785,0.05168,1.16,-0.02536
alpha_0.01,51.69,0.2037,53.43,0.1865,-1475,-5.409,8.044,-18.16,-0.5483,1.102e-05,...,-1.934,2.198,-0.141,0.1033,-0.8579,-0.7428,-0.09785,0.05168,1.16,-0.02536
alpha_1,51.69,0.2037,53.43,0.1865,-1618,-5.167,7.815,-18.15,-0.5483,1.102e-05,...,-1.899,2.134,-0.1253,0.08541,-0.8058,-0.8657,-0.09742,0.05156,1.153,-0.02529
alpha_5,51.69,0.2037,53.43,0.1865,-1723,-4.862,7.501,-18.14,-0.5482,1.101e-05,...,-1.854,2.059,-0.1074,0.06403,-0.7409,-0.9959,-0.09691,0.05142,1.145,-0.02522
alpha_10,51.69,0.2037,53.43,0.1866,-1681,-4.742,7.349,-18.13,-0.5481,1.101e-05,...,-1.834,2.035,-0.1029,0.05706,-0.7173,-1.014,-0.09675,0.05137,1.143,-0.0252
alpha_20,51.69,0.2037,53.43,0.1866,-1547,-4.625,7.164,-18.11,-0.5479,1.102e-05,...,-1.811,2.018,-0.1017,0.05226,-0.6986,-0.9797,-0.09666,0.05134,1.142,-0.0252


In [97]:
# Show floats with 4 significant digits (general format) so the wide table stays readable
pd.set_option('display.float_format', '{:,.4g}'.format)
coef_matrix_lasso

Unnamed: 0,train_rmse,train_r2,test_rmse,test_r2,intercept,coef_Year_Factor,coef_State_Factor,coef_building_class,coef_facility_type,coef_floor_area,...,coef_days_below_10F,coef_days_below_0F,coef_days_above_80F,coef_days_above_90F,coef_days_above_100F,coef_days_above_110F,coef_direction_max_wind_speed,coef_direction_peak_wind_speed,coef_max_wind_speed,coef_days_with_fog
alpha_1e-15,51.7,0.2034,53.42,0.1868,-14.75,-4.752,7.412,-18.15,-0.5462,1.105e-05,...,-0.8138,1.542,-0.1053,-0.06605,-0.6378,-0.03187,-0.09577,0.05153,1.143,-0.02537
alpha_1e-10,51.7,0.2034,53.42,0.1868,-14.75,-4.752,7.412,-18.15,-0.5462,1.105e-05,...,-0.8138,1.542,-0.1053,-0.06605,-0.6378,-0.03187,-0.09577,0.05153,1.143,-0.02537
alpha_1e-08,51.7,0.2034,53.42,0.1868,-14.75,-4.752,7.412,-18.15,-0.5462,1.105e-05,...,-0.8138,1.542,-0.1053,-0.06605,-0.6378,-0.03187,-0.09577,0.05153,1.143,-0.02537
alpha_0.0001,51.7,0.2034,53.42,0.1868,-14.72,-4.746,7.396,-18.15,-0.5462,1.105e-05,...,-0.8141,1.541,-0.1054,-0.0662,-0.6379,-0.02486,-0.09576,0.05153,1.143,-0.02538
alpha_0.001,51.7,0.2034,53.42,0.1868,-14.13,-4.695,7.256,-18.14,-0.5461,1.105e-05,...,-0.8166,1.53,-0.1066,-0.06678,-0.6366,-0.0,-0.09568,0.05151,1.142,-0.02543
alpha_0.01,51.7,0.2034,53.42,0.187,-10.16,-4.109,5.88,-18.08,-0.5447,1.107e-05,...,-0.8296,1.45,-0.1248,-0.05588,-0.6068,0.0,-0.09456,0.05123,1.122,-0.02579
alpha_1,51.86,0.1985,53.41,0.187,-106.1,-0.0,0.0,-11.96,-0.4606,1.316e-05,...,0.0,0.0,-0.1385,-0.1209,-0.1771,-0.0,-0.03009,0.03383,0.0,-0.02246
alpha_5,52.52,0.1779,53.92,0.1716,90.67,-0.0,0.0,-0.0,-0.3106,1.667e-05,...,0.0,0.0,-0.1852,-0.0,-0.0,-0.0,-0.01964,0.01804,0.0,-0.0299
alpha_10,52.63,0.1743,54.03,0.1682,105.9,-0.0,0.0,-0.0,-0.24,1.703e-05,...,0.0,0.0,-0.09049,-0.0,-0.0,-0.0,-0.008384,0.003458,-0.0,-0.03653
alpha_20,52.76,0.1704,54.16,0.1643,113.6,-0.0,0.0,-0.0,-0.1462,1.691e-05,...,-0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.005426,0.0,-0.0,-0.03462


In [98]:
# Show floats with 4 significant digits (general format) so the wide table stays readable
pd.set_option('display.float_format', '{:,.4g}'.format)
coef_matrix_elastic_net

Unnamed: 0,train_rmse,train_r2,test_rmse,test_r2,intercept,coef_Year_Factor,coef_State_Factor,coef_building_class,coef_facility_type,coef_floor_area,...,coef_days_below_10F,coef_days_below_0F,coef_days_above_80F,coef_days_above_90F,coef_days_above_100F,coef_days_above_110F,coef_direction_max_wind_speed,coef_direction_peak_wind_speed,coef_max_wind_speed,coef_days_with_fog
alpha_1e-15,51.7,0.2034,53.42,0.1868,-14.75,-4.752,7.412,-18.15,-0.5462,1.105e-05,...,-0.8138,1.542,-0.1053,-0.06605,-0.6378,-0.03187,-0.09577,0.05153,1.143,-0.02537
alpha_1e-10,51.7,0.2034,53.42,0.1868,-14.75,-4.752,7.412,-18.15,-0.5462,1.105e-05,...,-0.8138,1.542,-0.1053,-0.06605,-0.6378,-0.03187,-0.09577,0.05153,1.143,-0.02537
alpha_1e-08,51.7,0.2034,53.42,0.1868,-14.75,-4.752,7.412,-18.15,-0.5462,1.105e-05,...,-0.8138,1.542,-0.1053,-0.06605,-0.6378,-0.03187,-0.09577,0.05153,1.143,-0.02537
alpha_0.0001,51.7,0.2034,53.42,0.1868,-14.65,-4.737,7.376,-18.14,-0.5461,1.105e-05,...,-0.8149,1.54,-0.1054,-0.0666,-0.6375,-0.0218,-0.09578,0.05153,1.143,-0.02538
alpha_0.001,51.7,0.2034,53.42,0.1868,-13.66,-4.606,7.063,-18.09,-0.5457,1.106e-05,...,-0.8248,1.525,-0.1066,-0.07127,-0.6299,0.0,-0.09581,0.05153,1.143,-0.02543
alpha_0.01,51.7,0.2034,53.4,0.1873,-16.58,-3.604,4.761,-17.61,-0.5416,1.12e-05,...,-0.8528,1.371,-0.1253,-0.09637,-0.5599,0.0,-0.09538,0.05132,1.132,-0.02555
alpha_1,52.11,0.1905,53.58,0.1819,-117.0,-0.2659,0.0,-3.925,-0.4298,1.486e-05,...,0.0,0.0,-0.1265,-0.1566,-0.1534,-0.0,-0.02901,0.03327,0.0,-0.01764
alpha_5,52.44,0.1804,53.85,0.1738,52.54,-0.0,0.0,-0.24,-0.3422,1.641e-05,...,0.0,0.0,-0.1965,-0.02301,-0.0,-0.0,-0.02681,0.02642,0.0,-0.03232
alpha_10,52.55,0.1769,53.95,0.1707,94.39,-0.0,0.0,-0.0,-0.2893,1.684e-05,...,0.0,0.0,-0.1584,-0.0,-0.0,-0.0,-0.02241,0.01912,0.0,-0.03448
alpha_20,52.67,0.1731,54.06,0.1671,113.4,-0.0,0.0,-0.0,-0.2181,1.703e-05,...,0.0,0.0,-0.07248,-0.0,-0.0,-0.0,-0.00923,0.004005,-0.0,-0.0384


In [99]:
# Number of dropped features (coefficient exactly zero) per alpha - Ridge
# Only the coef_* columns are inspected, so the metric and intercept columns
# can never be counted as features.
coef_matrix_ridge.filter(like='coef_', axis=1).eq(0).sum(axis=1)

alpha_1e-15     0
alpha_1e-10     0
alpha_1e-08     0
alpha_0.0001    0
alpha_0.001     0
alpha_0.01      0
alpha_1         0
alpha_5         0
alpha_10        0
alpha_20        0
dtype: int64

In [100]:
# Number of dropped features (coefficient exactly zero) per alpha - Lasso
# Only the coef_* columns are inspected, so the metric and intercept columns
# can never be counted as features.
coef_matrix_lasso.filter(like='coef_', axis=1).eq(0).sum(axis=1)

alpha_1e-15      0
alpha_1e-10      0
alpha_1e-08      0
alpha_0.0001     0
alpha_0.001      1
alpha_0.01       2
alpha_1         32
alpha_5         47
alpha_10        49
alpha_20        52
dtype: int64

In [101]:
# Number of dropped features (coefficient exactly zero) per alpha - Elastic Net
# Only the coef_* columns are inspected, so the metric and intercept columns
# can never be counted as features.
coef_matrix_elastic_net.filter(like='coef_', axis=1).eq(0).sum(axis=1)

alpha_1e-15      0
alpha_1e-10      0
alpha_1e-08      0
alpha_0.0001     0
alpha_0.001      1
alpha_0.01       1
alpha_1         28
alpha_5         38
alpha_10        47
alpha_20        49
dtype: int64

# Results

Even after sweeping the regularization strength (alpha), these simple linear models - ridge, lasso and elastic net regression - cannot bring the training RMSE below 51.69. However, we can see that at higher alpha values lasso and elastic net drop a number of features from the regression by shrinking their coefficients to exactly zero, whereas ridge keeps every feature.

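Of the three regression models implemented in this notebook, Ridge regression with alpha = 0.01 gives R2 = 0.1865 and RMSE = 53.43 on the validation (held-out test) dataset. Elastic Net with alpha = 0.01 scores marginally better (R2 = 0.1873, RMSE = 53.4), so in practice the three models perform almost identically.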