In [518]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%pylab inline
import scipy.stats
import statsmodels.formula.api as smf
import statsmodels.api as sm
import urllib
import requests
import zipfile
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 50)
from sklearn.decomposition import PCA
import geopandas as gpd
from sklearn.model_selection import train_test_split

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


# 1. data preparation

In [519]:
def get_LL84(url):
    data = pd.read_excel(url)
    cols = [x.encode('utf8').replace('\xc2\xb2', '2') for x in data.columns]
    data.columns = cols
    data.rename(columns = {'NYC Borough, Block, and Lot (BBL)':'BBL',
                           'NYC Borough, Block and Lot (BBL)':'BBL','Zip Code':'Zip',
                           'Site EUI (kBtu/ft2)':'Site EUI',
        'Site EUI\n(kBtu/ft2)':'Site EUI','DOF Benchmarking Submission Status':'Benchmarking Submission',
                           'Weather Normalized Source EUI\n(kBtu/ft2)':'Weather Normalized Source EUI',
                           'Weather Normalized Source EUI (kBtu/ft2)':'Weather Normalized Source EUI',
                           'Municipally Supplied Potable Water - Indoor Intensity (gal/ft2)':'Indoor Water Intensity(gal/ft2)',
                           'Indoor Water Intensity (All Water Sources)\n(gal/ft2)':'Indoor Water Intensity(gal/ft2)',
                           'Water per Square Foot':'Indoor Water Intensity(gal/ft2)',
                           'Total GHG Emissions\n(MtCO2e)':'GHG',
                           'Total GHG Emissions (Metric Tons CO2e)':'GHG',
                           'Municipally Supplied Potable Water - Indoor Intensity (gal/ft2)':'Indoor Water Intensity(gal/ft2)',
                               'Property Floor Area (Buildngs and Parking)\n(ft2)':'Property Floor Area(ft2)',
                           'DOF Property Floor Area (ft2)':'Property Floor Area(ft2)',
                               'DOF Property Floor Area (Buildngs and Parking)\n(ft2)':'Property Floor Area(ft2)',
                           'DOF Property Floor Area (ft²)':'Property Floor Area(ft2)',
                               'DOF Number of Buildings':'Number of Buildings',
                           'Number of Buildings - Self-reported':'Number of Buildings',
                          'Primary Property Type - Self Selected':'Reported Facility Type'},inplace=True)
    
    
    data = data[data['ENERGY STAR Score']>0][['BBL','Zip','Benchmarking Submission','Site EUI','Weather Normalized Source EUI',
                 'Indoor Water Intensity(gal/ft2)','Reported Water Method','ENERGY STAR Score'
                ,'GHG','Property Floor Area(ft2)','Reported Facility Type','Number of Buildings']]
    return data

In [520]:
#LL13 = get_LL84('http://www.nyc.gov/html/gbee/downloads/excel/2013_nyc_ll84_disclosure.xlsx')

In [521]:
#LL14 = get_LL84('http://www.nyc.gov/html/gbee/downloads/excel/150428_2014_nyc_ll84_disclosure.xlsx')

In [522]:
#LL15 = get_LL84('http://www.nyc.gov/html/gbee/downloads/excel/2015_nyc_cy2014__ll84_disclosure_data.xlsx')

In [523]:
LL16 = get_LL84('http://www.nyc.gov/html/gbee/downloads/excel/nyc_benchmarking_disclosure_data_reported_in_2016.xlsx')

In [524]:
PRICE = pd.read_csv('Housing_price/2016.csv')

## As we have only 4 years of data here, we would now only use data of 2016 

In [525]:
LL16.BBL = LL16.BBL.astype('str').str[:-2]

In [526]:
merged16 = pd.merge(LL16,PRICE,on='BBL')

In [527]:
merged16 = merged16[merged16.VALUE>0]

In [528]:
merged16.dropna(axis=0,inplace = True)

In [529]:
merged16.columns = [x.replace(' ','_') for x in merged16.columns]

In [530]:
merged16.head()

Unnamed: 0,BBL,Zip,Benchmarking_Submission,Site_EUI,Weather_Normalized_Source_EUI,Indoor_Water_Intensity(gal/ft2),Reported_Water_Method,ENERGY_STAR_Score,GHG,Property_Floor_Area(ft2),Reported_Facility_Type,Number_of_Buildings,VALUE
2,1014270028,10021.0,In Compliance,44.9,105.2,71.51,Manual,80.0,538.4,166432.0,Multifamily Housing,1.0,33931000
3,1015180024,10128.0,In Compliance,91.3,180.7,64.86,ABS,10.0,699.3,114939.0,Multifamily Housing,1.0,24268000
49,1000260021,10005.0,In Compliance,63.5,128.4,34.22,ABS,39.0,2163.1,493187.0,Multifamily Housing,1.0,114278000
87,1000520021,10006.0,In Compliance,84.6,131.1,23.11,ABS,24.0,379.0,57945.0,Multifamily Housing,1.0,6108000
108,1000680028,10038.0,In Compliance,107.7,231.7,88.05,ABS,40.0,534.2,71539.0,Hotel,1.0,30130000


# 2. PCA

In [531]:
X_num = merged16[list(merged16.columns[3:6])+list(merged16.columns[7:10])+['Number_of_Buildings','VALUE']]

### Normalization

### In order to do log transformation later, here we use feature scaling normalization in case of negative numbers

In [532]:
columns = list(merged16.columns[3:6])+list(merged16.columns[7:10])+['Number_of_Buildings','VALUE']

In [533]:
def norm(data,columns):
    for x in columns:
        data[x] = (data[x]-np.min(data[x]))/(np.max(data[x])-np.min(data[x]))
    return data

In [534]:
merged16 = norm(merged16,columns)

### Remove outliers

In [535]:
def clean(data,columns):
    for x in columns:
        data = data[(data[x]<np.mean(data[x])+2*np.std(data[x]))&(data[x]>np.mean(data[x])-2*np.std(data[x]))]
    return data

In [536]:
merged16 = clean(merged16,columns)

In [537]:
X_num = merged16[list(merged16.columns[3:6])+list(merged16.columns[7:10])+['Number_of_Buildings']]
pca = PCA(0.95)
Xproj = pca.fit_transform(X_num)

In [538]:
pca.explained_variance_ratio_

array([ 0.99131268])

In [539]:
pca.components_

array([[  1.10501753e-03,   4.48459790e-04,   3.11993995e-03,
         -9.99977005e-01,   1.18143080e-03,  -5.77801806e-03,
          2.30074394e-04]])

In [540]:
components_explanation = pd.DataFrame(pca.components_,columns = X_num.columns)
components_explanation

Unnamed: 0,Site_EUI,Weather_Normalized_Source_EUI,Indoor_Water_Intensity(gal/ft2),ENERGY_STAR_Score,GHG,Property_Floor_Area(ft2),Number_of_Buildings
0,0.001105,0.000448,0.00312,-0.999977,0.001181,-0.005778,0.00023


## Here we could find the impact of each original variable on only 1 principle component which explains almost all of the information
## Based on the coefficients, we could explain each component as:
## Component 1: Energy efficiency

# 3. Analysis

In [541]:
merged16['Energy_efficiency'] = Xproj[:,0]

### Hierarchical linear regression ---- to check if energy efficiency actually works

In [308]:
lm_no_energy = smf.ols(formula='VALUE~C(Benchmarking_Submission)+C(Reported_Water_Method)+\
                       C(Reported_Facility_Type)'
             ,data=merged16).fit()
lm_no_energy.summary()

0,1,2,3
Dep. Variable:,VALUE,R-squared:,0.091
Model:,OLS,Adj. R-squared:,0.087
Method:,Least Squares,F-statistic:,20.61
Date:,"Mon, 20 Nov 2017",Prob (F-statistic):,3.8900000000000003e-47
Time:,12:35:58,Log-Likelihood:,6547.1
No. Observations:,2685,AIC:,-13070.0
Df Residuals:,2671,BIC:,-12980.0
Df Model:,13,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.0126,0.021,0.596,0.551,-0.029,0.054
C(Reported_Water_Method)[T.Manual],0.0088,0.003,3.076,0.002,0.003,0.014
C(Reported_Facility_Type)[T.Distribution Center],0.0070,0.023,0.311,0.756,-0.037,0.051
C(Reported_Facility_Type)[T.Financial Office],0.0358,0.030,1.189,0.235,-0.023,0.095
C(Reported_Facility_Type)[T.Hotel],0.0341,0.023,1.469,0.142,-0.011,0.080
C(Reported_Facility_Type)[T.K-12 School],0.0334,0.024,1.411,0.158,-0.013,0.080
C(Reported_Facility_Type)[T.Medical Office],0.0359,0.023,1.548,0.122,-0.010,0.081
C(Reported_Facility_Type)[T.Multifamily Housing],0.0106,0.021,0.500,0.617,-0.031,0.052
C(Reported_Facility_Type)[T.Non-Refrigerated Warehouse],0.0117,0.022,0.535,0.593,-0.031,0.054

0,1,2,3
Omnibus:,635.367,Durbin-Watson:,0.802
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1225.698
Skew:,1.441,Prob(JB):,6.97e-267
Kurtosis:,4.627,Cond. No.,260.0


In [309]:
lm_with_energy = smf.ols(formula='VALUE~Energy_efficiency+C(Benchmarking_Submission)+C(Reported_Water_Method)+\
                       C(Reported_Facility_Type)'
             ,data=merged16).fit()
lm_with_energy.summary()

0,1,2,3
Dep. Variable:,VALUE,R-squared:,0.096
Model:,OLS,Adj. R-squared:,0.092
Method:,Least Squares,F-statistic:,20.32
Date:,"Mon, 20 Nov 2017",Prob (F-statistic):,1.37e-49
Time:,12:35:58,Log-Likelihood:,6554.6
No. Observations:,2685,AIC:,-13080.0
Df Residuals:,2670,BIC:,-12990.0
Df Model:,14,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.0121,0.021,0.572,0.567,-0.029,0.054
C(Reported_Water_Method)[T.Manual],0.0086,0.003,3.037,0.002,0.003,0.014
C(Reported_Facility_Type)[T.Distribution Center],0.0083,0.023,0.368,0.713,-0.036,0.053
C(Reported_Facility_Type)[T.Financial Office],0.0341,0.030,1.137,0.256,-0.025,0.093
C(Reported_Facility_Type)[T.Hotel],0.0348,0.023,1.504,0.133,-0.011,0.080
C(Reported_Facility_Type)[T.K-12 School],0.0339,0.024,1.436,0.151,-0.012,0.080
C(Reported_Facility_Type)[T.Medical Office],0.0355,0.023,1.533,0.125,-0.010,0.081
C(Reported_Facility_Type)[T.Multifamily Housing],0.0112,0.021,0.529,0.597,-0.030,0.053
C(Reported_Facility_Type)[T.Non-Refrigerated Warehouse],0.0124,0.022,0.572,0.567,-0.030,0.055

0,1,2,3
Omnibus:,635.108,Durbin-Watson:,0.822
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1228.057
Skew:,1.438,Prob(JB):,2.14e-267
Kurtosis:,4.645,Cond. No.,260.0


In [310]:
anova = sm.stats.anova_lm(lm_no_energy,lm_with_energy)
anova

Unnamed: 0,df_resid,ssr,df_diff,ss_diff,F,Pr(>F)
0,2671.0,1.198162,0.0,,,
1,2670.0,1.19144,1.0,0.006722,15.064481,0.000106


In [311]:
anova['Pr(>F)'][1] < 0.05

True

## As p here is way less than our significance level 5%, We could conclude that energy efficiency actually has a significant impact on property market value

### Linear Regression
### Optimize the model using feature selection (maximizing adj-R2)

In [312]:
merged16.head()

Unnamed: 0,BBL,Zip,Benchmarking_Submission,Site_EUI,Weather_Normalized_Source_EUI,Indoor_Water_Intensity(gal/ft2),Reported_Water_Method,ENERGY_STAR_Score,GHG,Property_Floor_Area(ft2),Reported_Facility_Type,Number_of_Buildings,VALUE,Energy_efficiency
3,1015180024,10128.0,In Compliance,0.001415,0.000889,0.00461,ABS,0.090909,0.002558,0.048184,Multifamily Housing,0.022222,0.089241,0.555036
87,1000520021,10006.0,In Compliance,0.001311,0.000643,0.001642,ABS,0.232323,0.001378,0.005862,Multifamily Housing,0.022222,0.020492,0.413859
118,1000710001,10038.0,In Compliance,0.001486,0.000743,0.002553,ABS,0.212121,0.001588,0.002168,Multifamily Housing,0.022222,0.044111,0.434085
121,1000720027,10038.0,In Compliance,0.00055,0.000331,0.001922,ABS,0.989899,0.000429,0.001452,Multifamily Housing,0.022222,0.016309,-0.343675
126,1000760024,10038.0,In Compliance,0.000993,0.00058,0.001784,ABS,0.868687,0.000988,0.008894,Multifamily Housing,0.022222,0.065353,-0.222508


In [380]:
train,test = train_test_split(merged16, test_size = 0.2)

In [381]:
lm1 = smf.ols(formula='VALUE~Energy_efficiency+C(Benchmarking_Submission)+C(Reported_Water_Method)+C(Reported_Facility_Type)'
             ,data=train).fit()
lm1.summary()

0,1,2,3
Dep. Variable:,VALUE,R-squared:,0.096
Model:,OLS,Adj. R-squared:,0.09
Method:,Least Squares,F-statistic:,16.22
Date:,"Mon, 20 Nov 2017",Prob (F-statistic):,2.47e-38
Time:,12:37:21,Log-Likelihood:,5269.3
No. Observations:,2148,AIC:,-10510.0
Df Residuals:,2133,BIC:,-10420.0
Df Model:,14,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.0122,0.021,0.583,0.560,-0.029,0.053
C(Reported_Water_Method)[T.Manual],0.0082,0.003,2.657,0.008,0.002,0.014
C(Reported_Facility_Type)[T.Distribution Center],0.0081,0.022,0.363,0.717,-0.036,0.052
C(Reported_Facility_Type)[T.Financial Office],0.0348,0.030,1.171,0.242,-0.023,0.093
C(Reported_Facility_Type)[T.Hotel],0.0347,0.023,1.516,0.130,-0.010,0.080
C(Reported_Facility_Type)[T.K-12 School],0.0330,0.024,1.370,0.171,-0.014,0.080
C(Reported_Facility_Type)[T.Medical Office],0.0372,0.023,1.590,0.112,-0.009,0.083
C(Reported_Facility_Type)[T.Multifamily Housing],0.0110,0.021,0.526,0.599,-0.030,0.052
C(Reported_Facility_Type)[T.Non-Refrigerated Warehouse],0.0111,0.022,0.516,0.606,-0.031,0.053

0,1,2,3
Omnibus:,524.775,Durbin-Watson:,1.924
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1038.111
Skew:,1.461,Prob(JB):,3.78e-226
Kurtosis:,4.748,Cond. No.,232.0


In [382]:
pvalues = lm1.pvalues

In [383]:
pvalues.sort_values(ascending=True)

Energy_efficiency                                          0.002845
C(Reported_Water_Method)[T.Manual]                         0.007936
C(Reported_Facility_Type)[T.Senior Care Community]         0.068001
C(Reported_Facility_Type)[T.Office]                        0.074174
C(Reported_Facility_Type)[T.Medical Office]                0.111972
C(Reported_Facility_Type)[T.Hotel]                         0.129684
C(Reported_Facility_Type)[T.Worship Facility]              0.149244
C(Reported_Facility_Type)[T.K-12 School]                   0.170968
C(Reported_Facility_Type)[T.Residence Hall/Dormitory]      0.211700
C(Reported_Facility_Type)[T.Financial Office]              0.241824
Intercept                                                  0.560130
C(Reported_Facility_Type)[T.Multifamily Housing]           0.599052
C(Reported_Facility_Type)[T.Non-Refrigerated Warehouse]    0.606063
C(Reported_Facility_Type)[T.Retail Store]                  0.677296
C(Reported_Facility_Type)[T.Distribution Center]

In [384]:
def AdjR2(features):
    lm = smf.ols(formula = ('VALUE~'+features), data = train).fit()
    lmy = lm.predict(test)
    y_err = lmy-test.VALUE
    y_norm = test.VALUE-np.mean(test.VALUE)
    # Adjusted R^2
    R2 = 1 - y_err.dot(y_err) / y_norm.dot(y_norm) * (len(merged16.index)-1) / (len(merged16.index)-(features.count('+')+1+features.count('-'))-1)
    return R2, lm

In [385]:
AdjR2('Energy_efficiency-1')[0]

-1.2042357790208142

In [386]:
AdjR2('Energy_efficiency+C(Reported_Water_Method)-1')[0]

0.015938411503404804

In [387]:
AdjR2('Energy_efficiency+C(Reported_Water_Method)+C(Reported_Facility_Type)-1')[0]

0.090844851246921543

In [388]:
AdjR2('Energy_efficiency+C(Reported_Water_Method)+C(Reported_Facility_Type)')[0]

0.091183961708969918

In [389]:
lm = smf.ols(formula='VALUE~Energy_efficiency+C(Reported_Water_Method)+C(Reported_Facility_Type)'
             ,data=train).fit()
lm.summary()

0,1,2,3
Dep. Variable:,VALUE,R-squared:,0.096
Model:,OLS,Adj. R-squared:,0.09
Method:,Least Squares,F-statistic:,16.22
Date:,"Mon, 20 Nov 2017",Prob (F-statistic):,2.47e-38
Time:,12:37:25,Log-Likelihood:,5269.3
No. Observations:,2148,AIC:,-10510.0
Df Residuals:,2133,BIC:,-10420.0
Df Model:,14,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.0122,0.021,0.583,0.560,-0.029,0.053
C(Reported_Water_Method)[T.Manual],0.0082,0.003,2.657,0.008,0.002,0.014
C(Reported_Facility_Type)[T.Distribution Center],0.0081,0.022,0.363,0.717,-0.036,0.052
C(Reported_Facility_Type)[T.Financial Office],0.0348,0.030,1.171,0.242,-0.023,0.093
C(Reported_Facility_Type)[T.Hotel],0.0347,0.023,1.516,0.130,-0.010,0.080
C(Reported_Facility_Type)[T.K-12 School],0.0330,0.024,1.370,0.171,-0.014,0.080
C(Reported_Facility_Type)[T.Medical Office],0.0372,0.023,1.590,0.112,-0.009,0.083
C(Reported_Facility_Type)[T.Multifamily Housing],0.0110,0.021,0.526,0.599,-0.030,0.052
C(Reported_Facility_Type)[T.Non-Refrigerated Warehouse],0.0111,0.022,0.516,0.606,-0.031,0.053

0,1,2,3
Omnibus:,524.775,Durbin-Watson:,1.924
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1038.111
Skew:,1.461,Prob(JB):,3.78e-226
Kurtosis:,4.748,Cond. No.,232.0


## We could see that the model is still unsatisfying

### Log-log Linear Regression

### Logarithm of both x and y

In [481]:
columns = ['VALUE','Energy_efficiency']

### As here energy_efficiency has negative numbers, we would add 1 to each of them

In [542]:
loglog = merged16.copy()
loglog['Energy_efficiency'] = loglog['Energy_efficiency']+1

In [543]:
for column in columns:
    loglog[column] = np.log(loglog[column])

  


### remove inf and 0

In [545]:
loglog = loglog.sort_values(by='VALUE',ascending=True).iloc[1:,:]

In [546]:
train, test = train_test_split(loglog,test_size = 0.2)

In [547]:
loglog1 = smf.ols(formula='VALUE~Energy_efficiency+C(Benchmarking_Submission)+C(Reported_Water_Method)+C(Reported_Facility_Type)'
             ,data=train).fit()
loglog1.summary()

0,1,2,3
Dep. Variable:,VALUE,R-squared:,0.08
Model:,OLS,Adj. R-squared:,0.074
Method:,Least Squares,F-statistic:,13.25
Date:,"Mon, 20 Nov 2017",Prob (F-statistic):,1.4e-30
Time:,13:18:24,Log-Likelihood:,-2765.6
No. Observations:,2147,AIC:,5561.0
Df Residuals:,2132,BIC:,5646.0
Df Model:,14,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-4.0735,0.883,-4.611,0.000,-5.806,-2.341
C(Reported_Water_Method)[T.Manual],0.2726,0.131,2.085,0.037,0.016,0.529
C(Reported_Facility_Type)[T.Distribution Center],0.3739,0.965,0.388,0.698,-1.518,2.265
C(Reported_Facility_Type)[T.Financial Office],1.1489,1.252,0.918,0.359,-1.307,3.605
C(Reported_Facility_Type)[T.Hotel],1.1193,0.984,1.137,0.256,-0.811,3.050
C(Reported_Facility_Type)[T.K-12 School],1.2492,0.984,1.269,0.205,-0.681,3.180
C(Reported_Facility_Type)[T.Medical Office],1.3159,1.018,1.293,0.196,-0.680,3.311
C(Reported_Facility_Type)[T.Multifamily Housing],0.2633,0.881,0.299,0.765,-1.464,1.990
C(Reported_Facility_Type)[T.Non-Refrigerated Warehouse],0.5023,0.910,0.552,0.581,-1.281,2.286

0,1,2,3
Omnibus:,21.958,Durbin-Watson:,1.958
Prob(Omnibus):,0.0,Jarque-Bera (JB):,23.275
Skew:,-0.215,Prob(JB):,8.83e-06
Kurtosis:,3.274,Cond. No.,289.0


In [548]:
pvalues = loglog1.pvalues

In [549]:
pvalues.sort_values(ascending=True)

Intercept                                                  0.000004
Energy_efficiency                                          0.000036
C(Reported_Water_Method)[T.Manual]                         0.037223
C(Reported_Facility_Type)[T.Office]                        0.167535
C(Reported_Facility_Type)[T.Senior Care Community]         0.171057
C(Reported_Facility_Type)[T.Medical Office]                0.196116
C(Reported_Facility_Type)[T.Residence Hall/Dormitory]      0.196515
C(Reported_Facility_Type)[T.K-12 School]                   0.204574
C(Reported_Facility_Type)[T.Worship Facility]              0.233267
C(Reported_Facility_Type)[T.Hotel]                         0.255651
C(Reported_Facility_Type)[T.Financial Office]              0.358973
C(Reported_Facility_Type)[T.Non-Refrigerated Warehouse]    0.580818
C(Reported_Facility_Type)[T.Retail Store]                  0.593886
C(Reported_Facility_Type)[T.Distribution Center]           0.698325
C(Reported_Facility_Type)[T.Multifamily Housing]

In [550]:
AdjR2('Energy_efficiency')[0]

0.01400133066561704

In [551]:
AdjR2('Energy_efficiency+C(Reported_Water_Method)')[0]

0.018888723646621752

In [552]:
AdjR2('Energy_efficiency+C(Reported_Water_Method)+C(Reported_Facility_Type)')[0]

0.096349950639519233

In [553]:
lm = smf.ols(formula='VALUE~Energy_efficiency+C(Reported_Water_Method)+C(Reported_Facility_Type)'
             ,data=train).fit()
lm.summary()

0,1,2,3
Dep. Variable:,VALUE,R-squared:,0.08
Model:,OLS,Adj. R-squared:,0.074
Method:,Least Squares,F-statistic:,13.25
Date:,"Mon, 20 Nov 2017",Prob (F-statistic):,1.4e-30
Time:,13:18:33,Log-Likelihood:,-2765.6
No. Observations:,2147,AIC:,5561.0
Df Residuals:,2132,BIC:,5646.0
Df Model:,14,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-4.0735,0.883,-4.611,0.000,-5.806,-2.341
C(Reported_Water_Method)[T.Manual],0.2726,0.131,2.085,0.037,0.016,0.529
C(Reported_Facility_Type)[T.Distribution Center],0.3739,0.965,0.388,0.698,-1.518,2.265
C(Reported_Facility_Type)[T.Financial Office],1.1489,1.252,0.918,0.359,-1.307,3.605
C(Reported_Facility_Type)[T.Hotel],1.1193,0.984,1.137,0.256,-0.811,3.050
C(Reported_Facility_Type)[T.K-12 School],1.2492,0.984,1.269,0.205,-0.681,3.180
C(Reported_Facility_Type)[T.Medical Office],1.3159,1.018,1.293,0.196,-0.680,3.311
C(Reported_Facility_Type)[T.Multifamily Housing],0.2633,0.881,0.299,0.765,-1.464,1.990
C(Reported_Facility_Type)[T.Non-Refrigerated Warehouse],0.5023,0.910,0.552,0.581,-1.281,2.286

0,1,2,3
Omnibus:,21.958,Durbin-Watson:,1.958
Prob(Omnibus):,0.0,Jarque-Bera (JB):,23.275
Skew:,-0.215,Prob(JB):,8.83e-06
Kurtosis:,3.274,Cond. No.,289.0


### Semi-log Linear Regression

In [560]:
semilog = merged16.copy()

In [561]:
semilog.VALUE = np.log(semilog.VALUE)

  """Entry point for launching an IPython kernel.


In [562]:
semilog = semilog.sort_values(by='VALUE').iloc[1:,:]

In [563]:
train, test = train_test_split(semilog,test_size = 0.2)

In [564]:
semilog1 = smf.ols(formula='VALUE~Energy_efficiency+C(Benchmarking_Submission)+C(Reported_Water_Method)+C(Reported_Facility_Type)'
             ,data=train).fit()
semilog1.summary()

0,1,2,3
Dep. Variable:,VALUE,R-squared:,0.073
Model:,OLS,Adj. R-squared:,0.067
Method:,Least Squares,F-statistic:,12.02
Date:,"Mon, 20 Nov 2017",Prob (F-statistic):,2.28e-27
Time:,13:22:35,Log-Likelihood:,-2745.8
No. Observations:,2147,AIC:,5522.0
Df Residuals:,2132,BIC:,5607.0
Df Model:,14,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-4.3981,0.872,-5.041,0.000,-6.109,-2.687
C(Reported_Water_Method)[T.Manual],0.3492,0.126,2.762,0.006,0.101,0.597
C(Reported_Facility_Type)[T.Distribution Center],0.4013,0.976,0.411,0.681,-1.512,2.314
C(Reported_Facility_Type)[T.Financial Office],1.0738,1.240,0.866,0.387,-1.359,3.506
C(Reported_Facility_Type)[T.Hotel],1.1404,0.975,1.169,0.242,-0.772,3.053
C(Reported_Facility_Type)[T.K-12 School],1.2249,1.007,1.216,0.224,-0.751,3.200
C(Reported_Facility_Type)[T.Medical Office],1.2437,1.007,1.235,0.217,-0.732,3.219
C(Reported_Facility_Type)[T.Multifamily Housing],0.2645,0.873,0.303,0.762,-1.447,1.976
C(Reported_Facility_Type)[T.Non-Refrigerated Warehouse],0.4943,0.903,0.547,0.584,-1.277,2.266

0,1,2,3
Omnibus:,9.757,Durbin-Watson:,1.992
Prob(Omnibus):,0.008,Jarque-Bera (JB):,8.476
Skew:,-0.093,Prob(JB):,0.0144
Kurtosis:,2.755,Cond. No.,233.0


In [565]:
pvalues = semilog1.pvalues

In [566]:
pvalues.sort_values(ascending=True)

Intercept                                                  5.009120e-07
Energy_efficiency                                          5.023224e-05
C(Reported_Water_Method)[T.Manual]                         5.800131e-03
C(Reported_Facility_Type)[T.Worship Facility]              1.746069e-01
C(Reported_Facility_Type)[T.Office]                        1.818384e-01
C(Reported_Facility_Type)[T.Senior Care Community]         2.010179e-01
C(Reported_Facility_Type)[T.Medical Office]                2.171121e-01
C(Reported_Facility_Type)[T.K-12 School]                   2.241410e-01
C(Reported_Facility_Type)[T.Hotel]                         2.424479e-01
C(Reported_Facility_Type)[T.Residence Hall/Dormitory]      2.848254e-01
C(Reported_Facility_Type)[T.Financial Office]              3.867367e-01
C(Reported_Facility_Type)[T.Non-Refrigerated Warehouse]    5.842424e-01
C(Reported_Facility_Type)[T.Retail Store]                  5.899962e-01
C(Reported_Facility_Type)[T.Distribution Center]           6.808

In [567]:
AdjR2('Energy_efficiency')[0]

0.015137896948479224

In [568]:
AdjR2('Energy_efficiency+C(Reported_Water_Method)')[0]

0.017012855637926871

In [569]:
AdjR2('Energy_efficiency+C(Reported_Water_Method)+C(Reported_Facility_Type)')[0]

0.11724082504310751

In [570]:
lm = smf.ols(formula='VALUE~Energy_efficiency+C(Reported_Water_Method)+C(Reported_Facility_Type)'
             ,data=train).fit()
lm.summary()

0,1,2,3
Dep. Variable:,VALUE,R-squared:,0.073
Model:,OLS,Adj. R-squared:,0.067
Method:,Least Squares,F-statistic:,12.02
Date:,"Mon, 20 Nov 2017",Prob (F-statistic):,2.28e-27
Time:,13:24:43,Log-Likelihood:,-2745.8
No. Observations:,2147,AIC:,5522.0
Df Residuals:,2132,BIC:,5607.0
Df Model:,14,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-4.3981,0.872,-5.041,0.000,-6.109,-2.687
C(Reported_Water_Method)[T.Manual],0.3492,0.126,2.762,0.006,0.101,0.597
C(Reported_Facility_Type)[T.Distribution Center],0.4013,0.976,0.411,0.681,-1.512,2.314
C(Reported_Facility_Type)[T.Financial Office],1.0738,1.240,0.866,0.387,-1.359,3.506
C(Reported_Facility_Type)[T.Hotel],1.1404,0.975,1.169,0.242,-0.772,3.053
C(Reported_Facility_Type)[T.K-12 School],1.2249,1.007,1.216,0.224,-0.751,3.200
C(Reported_Facility_Type)[T.Medical Office],1.2437,1.007,1.235,0.217,-0.732,3.219
C(Reported_Facility_Type)[T.Multifamily Housing],0.2645,0.873,0.303,0.762,-1.447,1.976
C(Reported_Facility_Type)[T.Non-Refrigerated Warehouse],0.4943,0.903,0.547,0.584,-1.277,2.266

0,1,2,3
Omnibus:,9.757,Durbin-Watson:,1.992
Prob(Omnibus):,0.008,Jarque-Bera (JB):,8.476
Skew:,-0.093,Prob(JB):,0.0144
Kurtosis:,2.755,Cond. No.,233.0
