## Regression Analysis

In this notebook, we perform regression analysis on the cleaned dataset with the following regressors:

1. Ridge Regressor
2. Lasso Regressor
3. Elastic Net Regressor

In [21]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet

from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [22]:
# Read the cleaned train and test CSVs (paths are relative to the notebook's working directory).
dfTrain, dfTest = [
    pd.read_csv(csv_path)
    for csv_path in ("data/train_cleaned.csv", "data/test_cleaned.csv")
]

In [23]:
# 'site_eui' is the regression target; every other training column is a predictor.
y_train = dfTrain['site_eui']
X_train = dfTrain.drop(columns='site_eui')

# X_test is the loaded test frame itself (same object, not a copy).
X_test = dfTest

In [24]:
# Ridge Regression
def ridge_regression(X, y, alpha_ridge):
    """Fit a Ridge model on (X, y) and summarise its in-sample fit.

    Parameters
    ----------
    X : array-like or DataFrame
        Feature matrix used to fit the model and to compute the metrics.
    y : array-like or Series
        Target values aligned with the rows of ``X``.
    alpha_ridge : float
        Regularisation strength, passed through as ``Ridge(alpha=...)``.

    Returns
    -------
    list
        ``[rmse, r2, intercept, coef_0, coef_1, ...]``; rmse and r2 are
        evaluated on the same ``X``/``y`` the model was fitted on.
    """
    model = Ridge(alpha=alpha_ridge)
    model.fit(X, y)

    # Predict and score on the arguments that were passed in. Previously this
    # used the notebook-level X_train / y_train, which silently ignored the
    # caller's data whenever it differed from those globals.
    y_pred = model.predict(X)

    rmse = np.sqrt(mean_squared_error(y, y_pred))
    r2 = r2_score(y, y_pred)
    return [rmse, r2, model.intercept_, *model.coef_]

In [25]:
# Lasso Regression
def lasso_regression(X, y, alpha_lasso):
    """Fit a Lasso model on (X, y) and summarise its in-sample fit.

    Parameters
    ----------
    X : array-like or DataFrame
        Feature matrix used to fit the model and to compute the metrics.
    y : array-like or Series
        Target values aligned with the rows of ``X``.
    alpha_lasso : float
        Regularisation strength, passed through as ``Lasso(alpha=...)``.

    Returns
    -------
    list
        ``[rmse, r2, intercept, coef_0, coef_1, ...]``; rmse and r2 are
        evaluated on the same ``X``/``y`` the model was fitted on.
    """
    model = Lasso(alpha=alpha_lasso)
    model.fit(X, y)

    # Predict and score on the arguments that were passed in. Previously this
    # used the notebook-level X_train / y_train, which silently ignored the
    # caller's data whenever it differed from those globals.
    y_pred = model.predict(X)

    rmse = np.sqrt(mean_squared_error(y, y_pred))
    r2 = r2_score(y, y_pred)
    return [rmse, r2, model.intercept_, *model.coef_]

In [30]:
# Elastic Net Regression
def elastic_net_regression(X, y, alpha_en):
    """Fit an ElasticNet model on (X, y) and summarise its in-sample fit.

    Parameters
    ----------
    X : array-like or DataFrame
        Feature matrix used to fit the model and to compute the metrics.
    y : array-like or Series
        Target values aligned with the rows of ``X``.
    alpha_en : float
        Regularisation strength, passed through as ``ElasticNet(alpha=...)``.
        All other ElasticNet settings are left at the estimator's defaults.

    Returns
    -------
    list
        ``[rmse, r2, intercept, coef_0, coef_1, ...]``; rmse and r2 are
        evaluated on the same ``X``/``y`` the model was fitted on.
    """
    model = ElasticNet(alpha=alpha_en)
    model.fit(X, y)

    # Predict and score on the arguments that were passed in. Previously this
    # used the notebook-level X_train / y_train, which silently ignored the
    # caller's data whenever it differed from those globals.
    y_pred = model.predict(X)

    rmse = np.sqrt(mean_squared_error(y, y_pred))
    r2 = r2_score(y, y_pred)
    return [rmse, r2, model.intercept_, *model.coef_]

In [31]:
# Regularisation strengths to sweep over, from effectively none up to 20.
alpha = [1e-15, 1e-10, 1e-8, 1e-4, 1e-3,1e-2, 1, 5, 10, 20]

# Result tables: one row per alpha; columns are the two fit metrics, the
# intercept, and then one coefficient per training feature.
col = ['rmse', 'r2', 'intercept'] + ['coef_' + feature for feature in X_train.columns]
ind = ['alpha_%.2g' % strength for strength in alpha]
coef_matrix_ridge, coef_matrix_lasso, coef_matrix_elastic_net = (
    pd.DataFrame(index=ind, columns=col) for _ in range(3)
)

# Fit each of the three regressors at every alpha and store its summary row.
for row, strength in enumerate(alpha):
    coef_matrix_ridge.iloc[row] = ridge_regression(X_train, y_train, strength)
    coef_matrix_lasso.iloc[row] = lasso_regression(X_train, y_train, strength)
    coef_matrix_elastic_net.iloc[row] = elastic_net_regression(X_train, y_train, strength)

In [40]:
# Show floats with four significant digits (general format, with thousands
# separators) so the wide coefficient table is easier to scan.
pd.set_option('display.float_format', '{:,.4g}'.format)
coef_matrix_ridge

Unnamed: 0,rmse,r2,intercept,coef_Year_Factor,coef_State_Factor,coef_building_class,coef_facility_type,coef_floor_area,coef_year_built,coef_energy_star_rating,...,coef_days_below_10F,coef_days_below_0F,coef_days_above_80F,coef_days_above_90F,coef_days_above_100F,coef_days_above_110F,coef_direction_max_wind_speed,coef_direction_peak_wind_speed,coef_max_wind_speed,coef_days_with_fog
alpha_1e-15,52.12,0.1997,-1444,-3.955,5.372,-17.33,-0.5581,1.119e-05,0.01561,-0.9398,...,-2.044,1.462,-0.2306,-0.004797,-0.8699,0.7659,-0.08533,0.04494,0.9908,-0.02781
alpha_1e-10,52.12,0.1997,-1444,-3.955,5.372,-17.33,-0.5581,1.119e-05,0.01561,-0.9398,...,-2.044,1.462,-0.2306,-0.004797,-0.8699,0.7659,-0.08533,0.04494,0.9908,-0.02781
alpha_1e-08,52.12,0.1997,-1444,-3.955,5.372,-17.33,-0.5581,1.119e-05,0.01561,-0.9398,...,-2.044,1.462,-0.2306,-0.004797,-0.8699,0.7659,-0.08533,0.04494,0.9908,-0.02781
alpha_0.0001,52.12,0.1997,-1444,-3.955,5.372,-17.33,-0.5581,1.119e-05,0.01561,-0.9398,...,-2.044,1.462,-0.2306,-0.004798,-0.8699,0.7659,-0.08533,0.04494,0.9908,-0.02781
alpha_0.001,52.12,0.1997,-1444,-3.955,5.372,-17.33,-0.5581,1.119e-05,0.01561,-0.9398,...,-2.044,1.462,-0.2306,-0.004807,-0.8699,0.7658,-0.08533,0.04494,0.9908,-0.02781
alpha_0.01,52.12,0.1997,-1445,-3.954,5.371,-17.33,-0.5581,1.119e-05,0.01561,-0.9398,...,-2.044,1.462,-0.2305,-0.004901,-0.8696,0.765,-0.08533,0.04494,0.9908,-0.02781
alpha_1,52.12,0.1997,-1518,-3.855,5.289,-17.33,-0.5581,1.119e-05,0.0156,-0.9398,...,-2.028,1.431,-0.2235,-0.01252,-0.8472,0.7044,-0.08515,0.04488,0.988,-0.02778
alpha_5,52.12,0.1997,-1583,-3.705,5.158,-17.32,-0.558,1.118e-05,0.01558,-0.9398,...,-2.001,1.385,-0.2135,-0.02363,-0.8118,0.6122,-0.0849,0.04481,0.9843,-0.02775
alpha_10,52.12,0.1997,-1557,-3.64,5.092,-17.31,-0.5579,1.119e-05,0.01557,-0.9398,...,-1.985,1.365,-0.2102,-0.02792,-0.795,0.5734,-0.08483,0.04479,0.9832,-0.02775
alpha_20,52.12,0.1997,-1462,-3.579,5.014,-17.3,-0.5578,1.119e-05,0.01554,-0.9398,...,-1.963,1.345,-0.2085,-0.03127,-0.7779,0.5428,-0.08479,0.04477,0.9827,-0.02777


In [38]:
# Show floats with four significant digits (general format, with thousands
# separators) so the wide coefficient table is easier to scan.
pd.set_option('display.float_format', '{:,.4g}'.format)
coef_matrix_lasso

Unnamed: 0,rmse,r2,intercept,coef_Year_Factor,coef_State_Factor,coef_building_class,coef_facility_type,coef_floor_area,coef_year_built,coef_energy_star_rating,...,coef_days_below_10F,coef_days_below_0F,coef_days_above_80F,coef_days_above_90F,coef_days_above_100F,coef_days_above_110F,coef_direction_max_wind_speed,coef_direction_peak_wind_speed,coef_max_wind_speed,coef_days_with_fog
alpha_1e-15,52.12,0.1995,-46.34,-3.608,5.519,-17.34,-0.5562,1.121e-05,0.01479,-0.9396,...,-0.8593,0.8859,-0.2131,-0.1086,-0.7544,1.574,-0.08453,0.04514,0.9899,-0.02849
alpha_1e-10,52.12,0.1995,-46.34,-3.608,5.519,-17.34,-0.5562,1.121e-05,0.01479,-0.9396,...,-0.8593,0.8859,-0.2131,-0.1086,-0.7544,1.574,-0.08453,0.04514,0.9899,-0.02849
alpha_1e-08,52.12,0.1995,-46.34,-3.608,5.519,-17.34,-0.5562,1.121e-05,0.01479,-0.9396,...,-0.8593,0.8859,-0.2131,-0.1086,-0.7544,1.574,-0.08453,0.04514,0.9899,-0.02849
alpha_0.0001,52.12,0.1995,-46.3,-3.604,5.502,-17.34,-0.5562,1.121e-05,0.01479,-0.9396,...,-0.859,0.8834,-0.2131,-0.109,-0.7523,1.556,-0.08452,0.04514,0.9897,-0.0285
alpha_0.001,52.12,0.1995,-45.93,-3.567,5.356,-17.33,-0.556,1.121e-05,0.01476,-0.9396,...,-0.8559,0.8601,-0.2129,-0.1128,-0.7315,1.387,-0.08444,0.04511,0.9884,-0.02853
alpha_0.01,52.12,0.1995,-39.04,-3.332,3.966,-17.26,-0.5543,1.124e-05,0.01447,-0.9395,...,-0.7652,0.6293,-0.2084,-0.1603,-0.5235,0.0,-0.08352,0.04487,0.9714,-0.02879
alpha_1,52.26,0.1953,-126.8,-0.0,0.0,-11.22,-0.4709,1.33e-05,0.001403,-0.9311,...,-0.0,-0.0,-0.1886,-0.1955,-0.05603,-0.0,-0.02769,0.02887,0.0,-0.02952
alpha_5,52.87,0.1764,97.73,-0.0,0.0,-0.0,-0.3253,1.686e-05,-0.01071,-0.9112,...,0.0,0.0,-0.1741,-0.0,-0.0,-0.0,-0.01667,0.01327,0.0,-0.03857
alpha_10,52.99,0.1727,114.7,-0.0,0.0,-0.0,-0.2537,1.723e-05,-0.01223,-0.9032,...,0.0,0.0,-0.07583,-0.0,-0.0,-0.0,-0.006509,0.0,-0.0,-0.04517
alpha_20,53.1,0.1692,119.4,-0.0,0.0,-0.0,-0.1612,1.707e-05,-0.006764,-0.8847,...,-0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.006632,0.0,-0.0,-0.04247


In [39]:
# Show floats with four significant digits (general format, with thousands
# separators) so the wide coefficient table is easier to scan.
pd.set_option('display.float_format', '{:,.4g}'.format)
coef_matrix_elastic_net

Unnamed: 0,rmse,r2,intercept,coef_Year_Factor,coef_State_Factor,coef_building_class,coef_facility_type,coef_floor_area,coef_year_built,coef_energy_star_rating,...,coef_days_below_10F,coef_days_below_0F,coef_days_above_80F,coef_days_above_90F,coef_days_above_100F,coef_days_above_110F,coef_direction_max_wind_speed,coef_direction_peak_wind_speed,coef_max_wind_speed,coef_days_with_fog
alpha_1e-15,52.12,0.1995,-46.34,-3.608,5.519,-17.34,-0.5562,1.121e-05,0.01479,-0.9396,...,-0.8593,0.8859,-0.2131,-0.1086,-0.7544,1.574,-0.08453,0.04514,0.9899,-0.02849
alpha_1e-10,52.12,0.1995,-46.34,-3.608,5.519,-17.34,-0.5562,1.121e-05,0.01479,-0.9396,...,-0.8593,0.8859,-0.2131,-0.1086,-0.7544,1.574,-0.08453,0.04514,0.9899,-0.02849
alpha_1e-08,52.12,0.1995,-46.34,-3.608,5.519,-17.34,-0.5562,1.121e-05,0.01479,-0.9396,...,-0.8593,0.8859,-0.2131,-0.1086,-0.7544,1.574,-0.08453,0.04514,0.9899,-0.02849
alpha_0.0001,52.12,0.1995,-46.21,-3.599,5.488,-17.33,-0.5562,1.121e-05,0.01479,-0.9396,...,-0.8599,0.8832,-0.213,-0.1096,-0.7513,1.553,-0.08453,0.04514,0.9899,-0.0285
alpha_0.001,52.12,0.1995,-45.26,-3.52,5.226,-17.29,-0.5557,1.122e-05,0.01473,-0.9395,...,-0.8635,0.8585,-0.2122,-0.1178,-0.7245,1.376,-0.08455,0.04513,0.99,-0.02854
alpha_0.01,52.12,0.1995,-43.34,-2.939,3.408,-16.81,-0.5517,1.135e-05,0.01422,-0.9391,...,-0.7939,0.6092,-0.2069,-0.1719,-0.5546,0.4893,-0.08414,0.04487,0.9799,-0.02858
alpha_1,52.49,0.1881,-133.6,-0.2205,0.0,-3.727,-0.4426,1.493e-05,0.0007466,-0.9234,...,0.0,0.0,-0.1478,-0.2036,-0.08859,-0.0,-0.0262,0.02808,0.0,-0.02462
alpha_5,52.8,0.1786,58.7,-0.0,0.0,-0.197,-0.3557,1.66e-05,-0.008844,-0.9113,...,0.0,0.0,-0.193,-0.02346,-0.0,-0.0,-0.02344,0.02135,0.0,-0.03926
alpha_10,52.9,0.1754,101.5,-0.0,0.0,-0.0,-0.3034,1.702e-05,-0.01194,-0.903,...,0.0,0.0,-0.1472,-0.0,-0.0,-0.0,-0.01936,0.01438,0.0,-0.04289
alpha_20,53.02,0.1716,121.1,-0.0,0.0,-0.0,-0.2313,1.72e-05,-0.01218,-0.8863,...,0.0,0.0,-0.06004,-0.0,-0.0,-0.0,-0.006821,0.0,-0.0,-0.04674


In [36]:
# Number of dropped features (coefficients that are exactly zero) per alpha - Ridge
# Only the coef_* columns are counted, so a zero-valued rmse / r2 / intercept
# can never be mistaken for a dropped feature.
(coef_matrix_ridge.filter(regex='^coef_') == 0).sum(axis=1)

alpha_1e-15     0
alpha_1e-10     0
alpha_1e-08     0
alpha_0.0001    0
alpha_0.001     0
alpha_0.01      0
alpha_1         0
alpha_5         0
alpha_10        0
alpha_20        0
dtype: int64

In [35]:
# Number of dropped features (coefficients that are exactly zero) per alpha - Lasso
# Only the coef_* columns are counted, so a zero-valued rmse / r2 / intercept
# can never be mistaken for a dropped feature.
(coef_matrix_lasso.filter(regex='^coef_') == 0).sum(axis=1)

alpha_1e-15      0
alpha_1e-10      0
alpha_1e-08      0
alpha_0.0001     0
alpha_0.001      0
alpha_0.01       2
alpha_1         32
alpha_5         47
alpha_10        50
alpha_20        52
dtype: int64

In [37]:
# Number of dropped features (coefficients that are exactly zero) per alpha - Elastic Net
# Only the coef_* columns are counted, so a zero-valued rmse / r2 / intercept
# can never be mistaken for a dropped feature.
(coef_matrix_elastic_net.filter(regex='^coef_') == 0).sum(axis=1)

alpha_1e-15      0
alpha_1e-10      0
alpha_1e-08      0
alpha_0.0001     0
alpha_0.001      0
alpha_0.01       0
alpha_1         27
alpha_5         40
alpha_10        47
alpha_20        50
dtype: int64