In [43]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error 
from sklearn.metrics import mean_absolute_error 
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import Lasso
import matplotlib.pyplot as plt
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [44]:
# Workbooks holding the simulation results, one per scenario, listed in the
# same order as the names they are unpacked into below.
_scenario_files = (
    '../DSSG2023-Heating-Loads-Data/alanMitchellData/simulation_data/results_baseCase.xlsx',
    '../DSSG2023-Heating-Loads-Data/alanMitchellData/simulation_data/results_medRebate.xlsx',
    '../DSSG2023-Heating-Loads-Data/alanMitchellData/simulation_data/results_highRebate.xlsx',
    '../DSSG2023-Heating-Loads-Data/alanMitchellData/simulation_data/results_medFuelEsc.xlsx',
    '../DSSG2023-Heating-Loads-Data/alanMitchellData/simulation_data/results_highFuelEsc.xlsx',
)


def _keep_column(col_name):
    """Column filter for read_excel: keep everything except 'Unnamed: 0'."""
    return col_name not in ['Unnamed: 0']


baseCase, medRebate, highRebate, medFuelEsc, highFuelEsc = (
    pd.read_excel(workbook, usecols=_keep_column) for workbook in _scenario_files
)

FileNotFoundError: [Errno 2] No such file or directory: '../DSSG2023-Heating-Loads-Data/alanMitchellData/simulation_data/results_baseCase.xlsx'

In [3]:
# Tag every scenario frame with the two values that distinguish it:
# (frame, value for 'rebate_dol', value for 'fuel_esc_rate').
_scenario_params = [
    (baseCase, 0, 0.03),
    (medRebate, 2000, 0.03),
    (highRebate, 8000, 0.03),
    (medFuelEsc, 0, 0.06),
    (highFuelEsc, 0, 0.12),
]

for _frame, _rebate, _esc_rate in _scenario_params:
    _frame['rebate_dol'] = _rebate
    _frame['fuel_esc_rate'] = _esc_rate

In [4]:
# Stack the five scenario frames into one table. ignore_index=True renumbers
# the rows 0..n-1; without it each frame keeps its own row labels, so the same
# label would appear once per scenario and label-based lookups become ambiguous.
allScens = pd.concat(
    [baseCase, medRebate, highRebate, medFuelEsc, highFuelEsc],
    ignore_index=True,
)

In [5]:
# Row count of the combined scenario table.
allScens.shape[0]

13393

In [None]:
# One-hot encode the categorical columns. The listed columns are replaced by
# indicator columns, and `allScens` is rebound to the encoded frame.
_categorical_cols = ['City', 'Census_Area', 'Exist_Fuel']
allScens = pd.get_dummies(data=allScens, columns=_categorical_cols)

In [None]:
# Display the column labels of allScens as they stand at this point.
allScens.columns

## Building regression models

In [None]:
# Response variable for the regression models: the 'NPV' column of allScens.
_target_col = 'NPV'
Y = allScens[_target_col]

In [None]:
# Predictor matrix: a fixed list of numeric columns plus the indicator columns
# whose names match the pattern below, joined side by side.
_numeric_cols = [
    'PCE',
    'Sq_Ft',
    'Capital_Cost',
    'Elec_Use_Jan',
    'Elec_Use_May',
    'Design_Heat_Load',
    'Design_Heat_Temp',
    'COP',
    'HP_Load_Frac',
    'Fuel_Use_Chg',
    'Elec_Use_Chg',
    'Elec_Rate_Incremental',
    'rebate_dol',
    'fuel_esc_rate',
]
allScens_numerics = allScens[_numeric_cols]

# The negative lookahead drops any column starting with 'Exist_Fuel_Type'.
allScens_dummies = allScens.filter(regex='Census_Area_|Exist_Fuel_(?!Type)')

X = pd.concat([allScens_dummies, allScens_numerics], axis='columns')

In [None]:
# Hold out 33% of the rows for evaluation; the fixed seed makes the split
# repeatable from run to run.
_split_parts = train_test_split(
    X,
    Y,
    test_size=0.33,
    random_state=42,
)
X_train, X_test, Y_train, Y_test = _split_parts

In [None]:
# Gradient-boosted trees with hand-picked hyperparameters.
# random_state pins the estimator's internal randomness so that re-running the
# notebook reproduces the same fitted model; 42 matches the seed used for the
# train/test split.
model = GradientBoostingRegressor(
    max_depth=5,
    n_estimators=250,
    learning_rate=0.5,
    random_state=42,
)
# Alternative linear baseline, kept for quick swapping:
#model = Lasso(alpha=1, max_iter=10000)

In [None]:
# Fit the current `model` on the training portion of the data.
model.fit(X=X_train, y=Y_train)

In [None]:
from numpy import mean
from numpy import std
from sklearn.model_selection import cross_val_score
# RepeatedKFold is the splitter actually used below; it was previously missing
# from the imports, so the cell failed with a NameError.
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from matplotlib import pyplot

# Template estimator for the search. GridSearchCV fits clones of it, so it is
# kept under its own name rather than overwriting `model` with an unfitted
# object. The seed makes the subsampled fits (subsample < 1.0) repeatable.
gbr_candidate = GradientBoostingRegressor(random_state=1)

# define the grid of values to search
# NOTE: 4 * 5 * 3 * 3 = 180 combinations, each scored on 10 * 3 = 30 folds,
# so this cell performs 5400 fits and can take a long time.
grid = {
    'n_estimators': [10, 50, 100, 500],
    'learning_rate': [0.0001, 0.001, 0.01, 0.1, 1.0],
    'subsample': [0.5, 0.7, 1.0],
    'max_depth': [3, 7, 9],
}

# define the evaluation procedure
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# define the grid search procedure
grid_search = GridSearchCV(
    estimator=gbr_candidate,
    param_grid=grid,
    n_jobs=-1,
    cv=cv,
    scoring='neg_mean_absolute_error',
)
# execute the grid search on the training rows only, so the held-out test set
# plays no part in choosing the hyperparameters
grid_result = grid_search.fit(X_train, Y_train)

# With the default refit=True the best configuration is refitted on the data
# passed to fit(); expose it as `model` so the prediction cell below evaluates
# a fitted, tuned estimator.
model = grid_result.best_estimator_

# summarize the best score and configuration
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# summarize all scores that were evaluated
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
# Loop names are chosen so they do not shadow the `mean` imported from numpy.
for mean_score, std_score, param_set in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean_score, std_score, param_set))



In [None]:
# Predict the target for the held-out rows using the current `model`.
Y_pred = model.predict(X=X_test)

In [None]:
# Error of the predictions on the held-out set.
# RMSE is computed as the square root of the MSE instead of passing
# `squared=False`: that keyword is deprecated and later removed in newer
# scikit-learn releases, whereas this form works on every version.
rmse = np.sqrt(mean_squared_error(Y_test, Y_pred))
mae = mean_absolute_error(Y_test, Y_pred)
print("The RMSE of the model is", rmse)
print("The MAE of the model is", mae)

In [None]:
# Average NPV over all rows, for comparison with the error figures.
allScens['NPV'].mean()

In [None]:
# Standardise the predictors, then fit a cross-validated ridge regression.
# The rest of this cell reads `coef_` and labels the plot "Ridge model", so the
# final step must be a linear estimator: RandomForestRegressor has no `coef_`
# attribute and made the cell fail with an AttributeError.
model = make_pipeline(StandardScaler(), RidgeCV())
model.fit(X_train, Y_train)

# model[1] is the second pipeline step, i.e. the fitted ridge regressor;
# it holds one coefficient per predictor column.
coefs = pd.DataFrame(
    model[1].coef_, columns=["Coefficients"], index=X_train.columns
)

# Horizontal bar chart of the coefficients with a reference line at zero so
# the sign of each coefficient is easy to read.
ax = coefs.plot(kind="barh", figsize=(9, 7))
ax.set_title("Ridge model")
ax.axvline(x=0, color=".5")
# Widen the left margin so the long feature names are not cut off.
plt.subplots_adjust(left=0.3)