In [56]:
import pandas as pd
import numpy as np
from functools import reduce
import statsmodels.api as sm

## Read in 2003 Data

In [57]:
#Read in files 2003 Consumption
consump_03_01 = pd.read_csv('2003_Consumption/FILE01.csv')
consump_03_01.columns = [col.strip() for col in consump_03_01.columns]
consump_03_01 = consump_03_01[['PUBID8', 'REGION8', 'CENDIV8', 'SQFT8', 'PBA8', 'YRCON8', 'FREESTN8'
                              ,'GLSSPC8', 'NELVTR8', 'NESLTR8', 'OPEN248', 'WKHRS8',
                              'NWKER8']]

consump_03_02 = pd.read_csv('2003_Consumption/FILE02.csv')
consump_03_02.columns = [col.strip() for col in consump_03_02.columns]
consump_03_02 = consump_03_02[['PUBID8', 'ONEACT8']]

consump_03_04 = pd.read_csv('2003_Consumption/FILE04.csv')
consump_03_04.columns = [col.strip() for col in consump_03_04.columns]
consump_03_04 = consump_03_04[['PUBID8','RFGEQP8']]

consump_03_05 = pd.read_csv('2003_Consumption/FILE05.csv')
consump_03_05.columns = [col.strip() for col in consump_03_05.columns]
consump_03_05 = consump_03_05[['PUBID8','ELHT18', 'ELCOOL8','ELWATR8','ELCOOK8','ELMANU8']]

consump_03_15 = pd.read_csv('2003_Consumption/FILE15.csv')
consump_03_15.columns = [col.strip() for col in consump_03_15.columns]
consump_03_15 = consump_03_15[['PUBID8','ELCNS8']]

#Merge Dataframes
dfs = [consump_03_01, consump_03_02, consump_03_04, consump_03_05, consump_03_15]
consump_03 = reduce(lambda left,right: pd.merge(left,right,on='PUBID8'), dfs)

consump_03['YEAR'] = 2003

## Read in 2012 Data

In [58]:
consump_12 = pd.read_csv('2012_public_use_data_aug2016.csv')

consump_12 = consump_12[['PUBID', 'REGION', 'CENDIV', 'SQFT', 'PBA','YRCON','FREESTN', 'NELVTR','NESLTR', 'GLSSPC',
          'OPEN24', 'WKHRS', 'NWKER', 'ONEACT', 'RFGEQP', 'ELHT1', 'ELCOOL', 'ELWATR', 'ELCOOK', 'ELMANU',
          'ELCNS']]

consump_12['YEAR'] = 2012

There are climate fields PUBCLIM in 2012 and CLIMAT in 2003 that refer to a climate type a building is located.  These are based on the number of heating and cooling days.  We could potentially use this as a consumption feature by tying zipcodes to NOAA data which has the heating/cooling days for each of the stations.

GlassPercent Categories are different, might need to standardize these if we actually use them

## Merge Years & Data Standarization 

In [59]:
for i in consump_03.columns:
    consump_03[i] = pd.to_numeric(consump_03[i], errors = 'coerce')
    
consump_03.columns = list(consump_12.columns)

consump_all = pd.concat([consump_12,consump_03])

In [60]:
PBA_Dict = {
1:'Vacant',
2:'Office',
4:'Laboratory',
5:'Nonrefrigerated warehouse',
6:'Food sales',
7:'Public order and safety',
8:'Outpatient health care',
11:'Refrigerated warehouse',
12:'Religious worship',
13:'Public assembly',
14:'Education',
15:'Food service',
16:'Inpatient health care',
17:'Nursing',
18:'Lodging',
23:'Strip shopping mall',
24:'Enclosed mall',
25:'Retail other than mall',
26:'Service',
91: 'Other'}

In [61]:
pba_list = []
for i in consump_all['PBA']:
    pba_list.append(PBA_Dict[i])

consump_all['PBA_Detail'] = pd.Series(pba_list)

In [62]:
consump_all['NELVTR'] = consump_all['NELVTR'].fillna(value=0)
consump_all['NESLTR'] = consump_all['NESLTR'].fillna(value=0)

In [63]:
#Filter for freestanding building with a single primary activity
consump_filtered = consump_all.loc[(consump_all['FREESTN'] == 1) & (consump_all['ONEACT'] == 1)]
consump_filtered = consump_filtered.dropna(axis=0, how = 'any')

binary_fix = ['OPEN24','RFGEQP', 'ELHT1','ELCOOL','ELWATR', 'ELCOOK', 'ELMANU']
for column in binary_fix:
    consump_filtered[column] = consump_filtered[column].replace(to_replace = 2, value = 0)
    
max_val_fix = ['NELVTR', 'NESLTR']
for column in binary_fix:
    consump_filtered[column] = consump_filtered[column].replace(to_replace = 995, value = 51)

# Regression Modeling 

In [64]:
consump_filtered.columns

Index(['PUBID', 'REGION', 'CENDIV', 'SQFT', 'PBA', 'YRCON', 'FREESTN',
       'NELVTR', 'NESLTR', 'GLSSPC', 'OPEN24', 'WKHRS', 'NWKER', 'ONEACT',
       'RFGEQP', 'ELHT1', 'ELCOOL', 'ELWATR', 'ELCOOK', 'ELMANU', 'ELCNS',
       'YEAR', 'PBA_Detail'],
      dtype='object')

In [70]:
X = consump_filtered[['SQFT', 'WKHRS', 'NWKER','OPEN24','NELVTR', 'NESLTR','RFGEQP', 'ELHT1',
                      'ELCOOL', 'ELWATR', 'ELCOOK', 'ELMANU']]
y = consump_filtered['ELCNS']
model = sm.OLS(y, X).fit()
predictions = model.predict(X)
model.summary()

0,1,2,3
Dep. Variable:,ELCNS,R-squared:,0.667
Model:,OLS,Adj. R-squared:,0.666
Method:,Least Squares,F-statistic:,947.4
Date:,"Sat, 03 Mar 2018",Prob (F-statistic):,0.0
Time:,15:09:30,Log-Likelihood:,-95871.0
No. Observations:,5687,AIC:,191800.0
Df Residuals:,5675,BIC:,191800.0
Df Model:,12,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
SQFT,15.8491,0.409,38.774,0.000,15.048 16.650
WKHRS,-6002.3730,2578.976,-2.327,0.020,-1.11e+04 -946.594
NWKER,3588.2452,180.144,19.919,0.000,3235.094 3941.397
OPEN24,2.611e+06,3.41e+05,7.661,0.000,1.94e+06 3.28e+06
NELVTR,1.673e+04,1634.235,10.237,0.000,1.35e+04 1.99e+04
NESLTR,3926.7169,1849.149,2.124,0.034,301.678 7551.756
RFGEQP,-2.317e+05,1.95e+05,-1.189,0.235,-6.14e+05 1.5e+05
ELHT1,8.06e+04,1.55e+05,0.521,0.603,-2.23e+05 3.84e+05
ELCOOL,2.913e+04,1.95e+05,0.150,0.881,-3.53e+05 4.11e+05

0,1,2,3
Omnibus:,11496.26,Durbin-Watson:,1.989
Prob(Omnibus):,0.0,Jarque-Bera (JB):,60122678.417
Skew:,16.306,Prob(JB):,0.0
Kurtosis:,505.656,Cond. No.,1590000.0


# Business Specific Models

In [71]:
#Office Model

office_df = consump_filtered.loc[consump_filtered['PBA_Detail'] == 'Office']

X = office_df[['SQFT', 'WKHRS', 'NWKER','OPEN24','NELVTR', 'NESLTR','RFGEQP', 'ELHT1',
                      'ELCOOL', 'ELWATR', 'ELCOOK', 'ELMANU']]
y = office_df['ELCNS']
model = sm.OLS(y, X).fit()
predictions = model.predict(X)
model.summary()

0,1,2,3
Dep. Variable:,ELCNS,R-squared:,0.743
Model:,OLS,Adj. R-squared:,0.74
Method:,Least Squares,F-statistic:,271.4
Date:,"Sat, 03 Mar 2018",Prob (F-statistic):,4.45e-322
Time:,15:11:50,Log-Likelihood:,-18729.0
No. Observations:,1139,AIC:,37480.0
Df Residuals:,1127,BIC:,37540.0
Df Model:,12,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
SQFT,11.4132,0.752,15.174,0.000,9.937 12.889
WKHRS,-5895.2605,6524.186,-0.904,0.366,-1.87e+04 6905.657
NWKER,1892.8153,283.906,6.667,0.000,1335.772 2449.859
OPEN24,1.933e+06,8.26e+05,2.341,0.019,3.13e+05 3.55e+06
NELVTR,1.708e+04,2241.630,7.619,0.000,1.27e+04 2.15e+04
NESLTR,-1065.5614,3448.050,-0.309,0.757,-7830.881 5699.758
RFGEQP,1.161e+05,2.74e+05,0.423,0.672,-4.22e+05 6.54e+05
ELHT1,4.28e+05,2.25e+05,1.906,0.057,-1.26e+04 8.69e+05
ELCOOL,3.132e+05,3.69e+05,0.848,0.396,-4.11e+05 1.04e+06

0,1,2,3
Omnibus:,2098.639,Durbin-Watson:,1.962
Prob(Omnibus):,0.0,Jarque-Bera (JB):,3509767.843
Skew:,12.732,Prob(JB):,0.0
Kurtosis:,273.751,Cond. No.,2750000.0


In [74]:
#Food Service Model

food_serve_df = consump_filtered.loc[consump_filtered['PBA_Detail'] == 'Food service']

X = food_serve_df[['SQFT', 'WKHRS', 'NWKER','OPEN24','NELVTR', 'NESLTR','RFGEQP', 'ELHT1',
                      'ELCOOL', 'ELWATR', 'ELCOOK', 'ELMANU']]
y = food_serve_df['ELCNS']
model = sm.OLS(y, X).fit()
predictions = model.predict(X)
model.summary()

0,1,2,3
Dep. Variable:,ELCNS,R-squared:,0.99
Model:,OLS,Adj. R-squared:,0.99
Method:,Least Squares,F-statistic:,2596.0
Date:,"Sat, 03 Mar 2018",Prob (F-statistic):,4.99e-280
Time:,15:14:13,Log-Likelihood:,-4257.5
No. Observations:,299,AIC:,8539.0
Df Residuals:,287,BIC:,8583.0
Df Model:,11,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
SQFT,-0.8842,0.770,-1.148,0.252,-2.400 0.631
WKHRS,3053.8953,750.847,4.067,0.000,1576.030 4531.760
NWKER,-1.117e+04,350.793,-31.841,0.000,-1.19e+04 -1.05e+04
OPEN24,-8.215e+04,1.16e+05,-0.711,0.478,-3.1e+05 1.45e+05
NELVTR,3.268e+05,9.84e+04,3.319,0.001,1.33e+05 5.21e+05
NESLTR,1.76e+06,3.97e+04,44.311,0.000,1.68e+06 1.84e+06
RFGEQP,2.505e+05,9.85e+04,2.543,0.012,5.66e+04 4.44e+05
ELHT1,1.717e+04,5.08e+04,0.338,0.736,-8.29e+04 1.17e+05
ELCOOL,-1.502e+05,8.73e+04,-1.720,0.087,-3.22e+05 2.17e+04

0,1,2,3
Omnibus:,227.695,Durbin-Watson:,2.08
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2946.702
Skew:,3.08,Prob(JB):,0.0
Kurtosis:,17.092,Cond. No.,810000.0


In [77]:
#Nonrefrigerated warehouse Model

ware_nofridge_df = consump_filtered.loc[consump_filtered['PBA_Detail'] == 'Nonrefrigerated warehouse']

X = ware_nofridge_df[['SQFT', 'WKHRS', 'NWKER','OPEN24','NELVTR', 'NESLTR','RFGEQP', 'ELHT1',
                      'ELCOOL', 'ELWATR', 'ELCOOK', 'ELMANU']]
y = ware_nofridge_df['ELCNS']
model = sm.OLS(y, X).fit()
predictions = model.predict(X)
model.summary()

0,1,2,3
Dep. Variable:,ELCNS,R-squared:,0.785
Model:,OLS,Adj. R-squared:,0.78
Method:,Least Squares,F-statistic:,181.6
Date:,"Sat, 03 Mar 2018",Prob (F-statistic):,2.3099999999999997e-190
Time:,15:17:37,Log-Likelihood:,-9743.4
No. Observations:,610,AIC:,19510.0
Df Residuals:,598,BIC:,19560.0
Df Model:,12,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
SQFT,3.7029,0.469,7.904,0.000,2.783 4.623
WKHRS,1070.2875,2583.374,0.414,0.679,-4003.301 6143.876
NWKER,8703.9733,837.298,10.395,0.000,7059.572 1.03e+04
OPEN24,5.89e+04,4.22e+05,0.139,0.889,-7.71e+05 8.89e+05
NELVTR,-2.892e+05,1.59e+05,-1.820,0.069,-6.01e+05 2.29e+04
NESLTR,2.324e+05,9.48e+04,2.452,0.014,4.63e+04 4.19e+05
RFGEQP,7.26e+04,2.18e+05,0.333,0.739,-3.55e+05 5.01e+05
ELHT1,2.824e+05,2.02e+05,1.400,0.162,-1.14e+05 6.79e+05
ELCOOL,-2.327e+05,2.43e+05,-0.956,0.340,-7.11e+05 2.45e+05

0,1,2,3
Omnibus:,932.134,Durbin-Watson:,2.076
Prob(Omnibus):,0.0,Jarque-Bera (JB):,456497.375
Skew:,8.361,Prob(JB):,0.0
Kurtosis:,135.97,Cond. No.,1400000.0
