# Import Data

In [3]:
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split, GridSearchCV

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import shap

In [5]:
data = pd.read_json('data.json').dropna()
data_with_energy_score = pd.read_json('data_with_energy_score.json').dropna()

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1649 entries, 0 to 3375
Data columns (total 53 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   PropUseTypes1_12                 1649 non-null   int64  
 1   PropUseTypes2_center             1649 non-null   int64  
 2   PropUseTypes3_data               1649 non-null   int64  
 3   PropUseTypes4_distribution       1649 non-null   int64  
 4   PropUseTypes5_facility           1649 non-null   int64  
 5   PropUseTypes6_grocery            1649 non-null   int64  
 6   PropUseTypes7_hotel              1649 non-null   int64  
 7   PropUseTypes8_medical            1649 non-null   int64  
 8   PropUseTypes9_non                1649 non-null   int64  
 9   PropUseTypes10_office            1649 non-null   int64  
 10  PropUseTypes11_parking           1649 non-null   int64  
 11  PropUseTypes12_refrigerated      1649 non-null   int64  
 12  PropUseTypes13_resta

# Train Test Split

In [7]:
X = data.drop(['TotalGHGEmissions', 'SiteEnergyUse(kBtu)'], axis=1)
y = data['SiteEnergyUse(kBtu)']

In [8]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [9]:
y_train = np.log(y_train)
y_test = np.log(y_test)

# Dummy Regressor

In [10]:
dummy = DummyRegressor(strategy='mean')

dummy.fit(X_train, y_train)
y_pred_dummy = dummy.predict(X_test)
r2_dummy = r2_score(y_test, y_pred_dummy)

print(f'Dummy Regressor - Best R^2 Score : {r2_dummy:.2f}')

Dummy Regressor - Best R^2 Score : -0.00


# Linear Regressor

## Fitting and Predictions

In [11]:
lin_reg = LinearRegression()

param_grid_lr = {
    'fit_intercept': [True, False],
}

grid_search_lr = GridSearchCV(
    lin_reg,
    param_grid_lr,
    scoring='r2',
    return_train_score=True,
    n_jobs=-1
)

In [12]:
%%timeit
grid_search_lr.fit(X_train, y_train)

122 ms ± 25 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


### Results Cross Validation

In [13]:
# Get mean test scores from cross validation
rslt = grid_search_lr.cv_results_.get("mean_test_score")

# Sort the best estimators based on these mean scores
ranked_indices = np.argsort(rslt)[::-1]

for i, (param, score) in enumerate(zip(
    np.array(grid_search_lr.cv_results_.get("params"))[ranked_indices],
    np.round(rslt[ranked_indices], 5)
)):
    print(f"Estimator {i + 1}  - ", *[f"{key}={val} " for key, val in param.items()], f"-->  r2={round(score, 5)}")

Estimator 1  -  fit_intercept=False  -->  r2=0.70319
Estimator 2  -  fit_intercept=True  -->  r2=0.7023


In [14]:
best_lr_model = grid_search_lr.best_estimator_
y_pred_lr = best_lr_model.predict(X_test)
r2_lr = r2_score(y_test, y_pred_lr)

print(f'Linear Regressor - Best R^2 Score : {r2_lr:.2f}')

Linear Regressor - Best R^2 Score : 0.69


# Random Forest Regressor

In [15]:
rf_reg = RandomForestRegressor(random_state=42)

param_grid_rf = [{
    'n_estimators': [10, 20, 50, 100, 200],
    'max_depth': [None, 2, 5, 10],
    'min_samples_split': [2, 5, 10],
}]

grid_search_rf = GridSearchCV(
    rf_reg,
    param_grid_rf,
    scoring='r2',
    return_train_score=True,
    n_jobs=-1
)

In [16]:
%%timeit
grid_search_rf.fit(X_train, y_train)

24.4 s ± 333 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


### Results Cross Validation

In [17]:
# Get mean test scores from cross validation
rslt = grid_search_rf.cv_results_.get("mean_test_score")

# Sort the best estimators based on these mean scores
rank = 3
ranked_indices = np.argsort(rslt)[-rank:][::-1]

print(f"Top {rank} estimators:")

for i, (param, score) in enumerate(zip(
    np.array(grid_search_rf.cv_results_.get("params"))[ranked_indices],
    np.round(rslt[ranked_indices], 5)
)):
    print(f"Estimator {i + 1}  - ", *[f"{key}={val} " for key, val in param.items()], f"-->  r2={round(score, 5)}")

Top 3 estimators:
Estimator 1  -  max_depth=None  min_samples_split=2  n_estimators=200  -->  r2=0.7079
Estimator 2  -  max_depth=None  min_samples_split=2  n_estimators=100  -->  r2=0.7079
Estimator 3  -  max_depth=None  min_samples_split=5  n_estimators=200  -->  r2=0.70718


In [18]:
best_rf_model = grid_search_rf.best_estimator_
y_pred_rf = best_rf_model.predict(X_test)
r2_rf = r2_score(y_test, y_pred_rf)

print(f'Random Forest Regressor - Best R^2 Score : {r2_rf:.5f}')

Random Forest Regressor - Best R^2 Score : 0.70190


# With ENERGY STAR Score

## Train Test Split

In [19]:
X_ESS = data_with_energy_score.drop(['TotalGHGEmissions', 'SiteEnergyUse(kBtu)'], axis=1)
y_ESS = data_with_energy_score['SiteEnergyUse(kBtu)']

X_train_ESS, X_test_ESS, y_train_ESS, y_test_ESS = train_test_split(
    X_ESS, y_ESS, test_size=0.2, random_state=42
)

## Random Forest Regressor

In [20]:
rf_reg_ESS = RandomForestRegressor(random_state=42)

grid_search_rf_ESS = GridSearchCV(
    rf_reg_ESS,
    param_grid_rf,
    scoring='r2',
    return_train_score=True,
    n_jobs=-1
)

grid_search_rf_ESS.fit(X_train_ESS, y_train_ESS)

### Results Cross Validation

In [21]:
# Get mean test scores from cross validation
rslt = grid_search_rf_ESS.cv_results_.get("mean_test_score")

# Sort the best estimators based on these mean scores
rank = 3
ranked_indices = np.argsort(rslt)[-rank:][::-1]

print(f"Top {rank} estimators:")

for i, (param, score) in enumerate(zip(
    np.array(grid_search_rf.cv_results_.get("params"))[ranked_indices],
    np.round(rslt[ranked_indices], 5)
)):
    print(f"Estimator {i + 1}  - ", *[f"{key}={val} " for key, val in param.items()], f"-->  r2={score:.5f}")

Top 3 estimators:
Estimator 1  -  max_depth=None  min_samples_split=2  n_estimators=200  -->  r2=0.62448
Estimator 2  -  max_depth=None  min_samples_split=5  n_estimators=200  -->  r2=0.61515
Estimator 3  -  max_depth=None  min_samples_split=10  n_estimators=200  -->  r2=0.61370


In [22]:
best_rf_model_ESS = grid_search_rf_ESS.best_estimator_
y_pred_rf_ESS = best_rf_model_ESS.predict(X_test_ESS)
r2_rf = r2_score(y_test_ESS, y_pred_rf_ESS)

print(f'Random Forest Regressor with ENERGY STAR Score - Best R^2 Score : {r2_rf:.2f}')

Random Forest Regressor with ENERGY STAR Score - Best R^2 Score : 0.74
