In [10]:
import numpy as np
from numpy import mean
from numpy import std
import pandas as pd
from sklearn.metrics import mean_squared_error 
from sklearn.metrics import mean_absolute_error 
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import RidgeCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import xgboost as xgb

## Loading simulated data with AM's original code and TMY3 data

In [32]:
allScens = pd.read_csv('../Output/allScens_augmented.csv', usecols=lambda x: x not in ['Unnamed: 0'])

In [33]:
allScens = pd.get_dummies(allScens, columns=['Exist_Fuel', 'Scenario'])

In [34]:
allScens.columns

Index(['City', 'Exist_Fuel_Type', 'Census_Area', 'ANCSA_Region', 'Util_Name',
       'PCE', 'Sq_Ft', 'Capital_Cost', 'Elec_Use_Jan', 'Elec_Use_May',
       'Design_Heat_Load', 'Design_Heat_Temp', 'COP', 'Max_HP_Cap_Reached',
       'HP_Load_Frac', 'IRR', 'NPV', 'CO2_lbs_saved',
       'CO2_driving_miles_saved', 'Fuel_Use_Chg', 'Fuel_Price_Incremental',
       'Elec_Use_Chg', 'Elec_Rate_Incremental', 'Econ', 'rebate_dol',
       'fuel_esc_rate', 'TMYid', 'Longitude', 'Latitude', 'Oil1Price',
       'PropanePrice', 'GasPrice', 'avg_elec_usage1', 'avg_elec_usage2',
       'avg_elec_usage3', 'avg_elec_usage4', 'avg_elec_usage5',
       'avg_elec_usage6', 'avg_elec_usage7', 'avg_elec_usage8',
       'avg_elec_usage9', 'avg_elec_usage10', 'avg_elec_usage11',
       'avg_elec_usage12', 'avgTemp_1', 'avgTemp_2', 'avgTemp_3', 'avgTemp_4',
       'avgTemp_5', 'avgTemp_6', 'avgTemp_7', 'avgTemp_8', 'avgTemp_9',
       'avgTemp_10', 'avgTemp_11', 'avgTemp_12', 'freezing_days', 'Railbelt',
       '

## Building regression models

In [35]:
# Separating the df into input and output components
allScens_numerics1 = allScens.filter(regex = 'Exist_Fuel_(?!Type)|avgTemp_[1,2,3,10,11,12]|avg_elec_usage[1,2,3,10,11,12]')
allScens_numerics2 = allScens[['freezing_days', 'Oil1Price', 'PropanePrice', 'GasPrice', 'PCE', 'Sq_Ft', 'Capital_Cost', 'Design_Heat_Load', 'Design_Heat_Temp', 'rebate_dol', 'fuel_esc_rate']]

X = pd.concat([allScens_numerics1, allScens_numerics2], axis=1)
Y = allScens['NPV']
cities = allScens['City']

In [36]:
X.head()

Unnamed: 0,avg_elec_usage1,avg_elec_usage2,avg_elec_usage3,avg_elec_usage10,avg_elec_usage11,avg_elec_usage12,avgTemp_1,avgTemp_2,avgTemp_3,avgTemp_10,...,Oil1Price,PropanePrice,GasPrice,PCE,Sq_Ft,Capital_Cost,Design_Heat_Load,Design_Heat_Temp,rebate_dol,fuel_esc_rate
0,471.936998,423.455318,460.815493,425.89382,443.33996,463.328994,32.287177,33.65,35.370403,42.440242,...,7.3,0.0,0.0,0.7597,1130.0,6400.0,12197.89368,22.6,0,0.03
1,471.936998,423.455318,460.815493,425.89382,443.33996,463.328994,32.287177,33.65,35.370403,42.440242,...,7.3,0.0,0.0,0.7597,678.0,6400.0,8148.60648,22.6,0,0.03
2,471.936998,423.455318,460.815493,425.89382,443.33996,463.328994,32.287177,33.65,35.370403,42.440242,...,7.3,0.0,0.0,0.7597,1695.0,6400.0,17259.50268,22.6,0,0.03
3,471.936998,423.455318,460.815493,425.89382,443.33996,463.328994,32.287177,33.65,35.370403,42.440242,...,7.3,0.0,0.0,0.7597,1130.0,6400.0,12197.89368,22.6,0,0.03
4,471.936998,423.455318,460.815493,425.89382,443.33996,463.328994,32.287177,33.65,35.370403,42.440242,...,7.3,0.0,0.0,0.7597,1130.0,6400.0,12197.89368,22.6,0,0.03


In [37]:
# Compare to the mean
np.mean(allScens['NPV'])

28107.264750234277

## AM Original Data :: Extreme Gradient Boosting Model (XGB)

In [45]:
# Naive split (disregard cities)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33)

# Naive model
xgb_reg = xgb.XGBRegressor()
xgb_reg.fit(X_train, Y_train)

# Making predictions
Y_pred = xgb_reg.predict(X_test)

# Assess performance
rmse = mean_squared_error(Y_test, Y_pred, squared=False)
mae = mean_absolute_error(Y_test, Y_pred)
print("The RMSE of the model is", rmse)
print("The MAE of the model is", mae)

The RMSE of the model is 4194.033544014309
The MAE of the model is 2901.260632387525


In [47]:
# Group based split
from sklearn.model_selection import GroupKFold 
split = GroupKFold(n_splits=5).split(allScens, groups=allScens['City'])
train_inds, test_inds = next(split)

X_train_byCity = X.iloc[train_inds]
Y_train_byCity = Y.iloc[train_inds]
X_test_byCity = X.iloc[test_inds]
Y_test_byCity = Y.iloc[test_inds]

# Naive model
xgb_reg = xgb.XGBRegressor()
xgb_reg.fit(X_train_byCity, Y_train_byCity)

# Making predictions
Y_pred_byCity = xgb_reg.predict(X_test_byCity)

# Assess performance
rmse_byCity = mean_squared_error(Y_test_byCity, Y_pred_byCity, squared=False)
mae_byCity = mean_absolute_error(Y_test_byCity, Y_pred_byCity)
print("The RMSE of the model is", rmse_byCity)
print("The MAE of the model is", mae_byCity)

The RMSE of the model is 11699.056505365146
The MAE of the model is 7057.575274930493


In [None]:
## Alternative splitting method:
#from sklearn.model_selection import GroupShuffleSplit 
# splitter = GroupShuffleSplit(test_size=.33, n_splits=2, random_state = 7)
# split = splitter.split(allScens, groups=allScens['City'])
# train_inds, test_inds = next(split)

## Loading simulated data with extra randomness

In [50]:
allScensR = pd.read_csv('../Output/allScens_wRandomness_augmented.csv', usecols=lambda x: x not in ['Unnamed: 0'])

In [51]:
allScensR = pd.get_dummies(allScensR, columns=['Exist_Fuel', 'Scenario'])

In [53]:
# Separating the df into input and output components
allScensR_numerics1 = allScensR.filter(regex = 'Exist_Fuel_(?!Type)|Avg_Temp_[1,2,3,10,11,12]|Elec_Use_')
allScensR_numerics2 = allScensR[['Freezing_days', 'Exist_Unit_Fuel_Cost', 'Elec_Rate_Avg_Base', 'PCE', 'Sq_Ft', 'Capital_Cost', 'Design_Heat_Load', 'Design_Heat_Temp', 'Rebate_dol', 'Fuel_Esc_Rate']]

Xr = pd.concat([allScensR_numerics1, allScensR_numerics2], axis=1)
Yr = allScensR['NPV']
citiesR = allScensR['City']

In [55]:
# Compare to the mean
np.mean(allScensR['NPV'])

33683.30427644264

## Simulated Data w/ Extra Randomness :: XGB

In [56]:
# Naive split (disregard cities)
Xr_train, Xr_test, Yr_train, Yr_test = train_test_split(Xr, Yr, test_size=0.33)

# Naive model
xgb_reg = xgb.XGBRegressor()
xgb_reg.fit(Xr_train, Yr_train)

# Making predictions
Yr_pred = xgb_reg.predict(Xr_test)

# Assess performance
rmseR = mean_squared_error(Yr_test, Yr_pred, squared=False)
maeR = mean_absolute_error(Yr_test, Yr_pred)
print("The RMSE of the model is", rmseR)
print("The MAE of the model is", maeR)

The RMSE of the model is 17244.921723924017
The MAE of the model is 10536.626256032125


In [57]:
# Group based split
from sklearn.model_selection import GroupKFold 
splitR = GroupKFold(n_splits=5).split(allScensR, groups=allScensR['City'])
trainR_inds, testR_inds = next(splitR)

Xr_train_byCity = Xr.iloc[trainR_inds]
Yr_train_byCity = Yr.iloc[trainR_inds]
Xr_test_byCity = Xr.iloc[testR_inds]
Yr_test_byCity = Yr.iloc[testR_inds]

# Naive model
xgb_reg = xgb.XGBRegressor()
xgb_reg.fit(Xr_train_byCity, Yr_train_byCity)

# Making predictions
Yr_pred_byCity = xgb_reg.predict(Xr_test_byCity)

# Assess performance
rmseR_byCity = mean_squared_error(Yr_test_byCity, Yr_pred_byCity, squared=False)
maeR_byCity = mean_absolute_error(Yr_test_byCity, Yr_pred_byCity)
print("The RMSE of the model is", rmseR_byCity)
print("The MAE of the model is", maeR_byCity)

The RMSE of the model is 16588.076477921913
The MAE of the model is 10098.849507728251


## Code dump tab

### Random Search CV

In [24]:
params = {
 'learning_rate' : [0.05,0.10,0.15,0.20,0.25,0.30],
 'n_estimators': [50, 100, 200, 500],
 'max_depth' : [ 3, 4, 5, 6, 8, 10, 12, 15],
 'subsample' : [0.2, 0.4, 0.8, 1.0],
 'min_child_weight' : [ 1, 3, 5, 7 ],
 'gamma': [ 0.0, 0.1, 0.2 , 0.3, 0.4 ],
 'colsample_bytree' : [ 0.3, 0.4, 0.5 , 0.7 ]
}

In [25]:
xgb_reg = xgb.XGBRegressor()

In [26]:
cv = GroupKFold(n_splits=5)
rs_model=RandomizedSearchCV(xgb_reg,
                            param_distributions=params,
                            n_iter=10,
                            cv=cv,
                            scoring='neg_mean_absolute_error',
                            n_jobs=-1,
                            verbose=2)

In [27]:
rs_model.fit(X_train, Y_train, groups = cities.iloc[train_inds])

ValueError: Found input variables with inconsistent numbers of samples: [13229, 13229, 15800]

In [28]:
rs_model.best_params_

AttributeError: 'RandomizedSearchCV' object has no attribute 'best_params_'

In [78]:
best_model = rs_model.best_estimator_

In [79]:
Y_pred = best_model.predict(X_test)
mean_absolute_error(Y_test, Y_pred)

9540.84480758613

In [80]:
cv = GroupKFold(n_splits=5)
score=cross_val_score(best_model,X,Y,cv=cv,groups=cities,scoring='neg_mean_absolute_error')

In [81]:
score.mean()

-10477.075601656821

## Random Forest

In [29]:
# Fitting a random forest 
model = RandomForestRegressor()
model.fit(X_train, Y_train)

# Making predictions
Y_pred = model.predict(X_test)

# Assess performance
rmse = mean_squared_error(Y_test, Y_pred, squared=False)
mae = mean_absolute_error(Y_test, Y_pred)
print("The RMSE of the model is", rmse)
print("The MAE of the model is", mae)

The RMSE of the model is 18536.060103982018
The MAE of the model is 11032.333956888468


### Random Forest -- Random Search CV

In [None]:
n_estimators = [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]
max_depth = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None]
min_samples_leaf =  [1, 2, 4]
min_samples_split = [2, 5, 10]
max_features = ['sqrt', 'log2', None]

hyperparameter_grid = {
    'n_estimators': n_estimators,
    'max_depth': max_depth,
    'min_samples_leaf': min_samples_leaf,
    'min_samples_split': min_samples_split,
    'max_features': max_features}


In [None]:
hyperparameter_grid

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'max_features': ['sqrt', 'log2']}

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33)

In [None]:
rf = RandomForestRegressor()
rf_random = RandomizedSearchCV(estimator=rf, param_distributions=hyperparameter_grid, n_iter=50, cv=5, verbose=2, random_state=42, n_jobs = -1)
rf_random.fit(X_train, Y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=1400; total time=  11.9s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=1400; total time=  12.3s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=1400; total time=  12.6s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=4, min_samples_split=10, n_estimators=400; total time=   4.0s
[CV] END max_depth=80, max_features=log2, min_samples_leaf=4, min_samples_split=5, n_estimators=1800; total time=  16.6s
[CV] END max_depth=80, max_features=log2, min_samples_leaf=4, min_samples_split=5, n_estimators=1800; total time=  16.8s
[CV] END max_depth=80, max_features=log2, min_samples_leaf=4, min_samples_split=5, n_estimators=1800; total time=  17.1s
[CV] END max_depth=80, max_features=log2, min_samples_leaf=4, min_samples_split=5, n_est



[CV] END max_depth=70, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=1400; total time=  22.0s
[CV] END max_depth=80, max_features=log2, min_samples_leaf=1, min_samples_split=10, n_estimators=2000; total time=  22.9s
[CV] END max_depth=80, max_features=log2, min_samples_leaf=1, min_samples_split=10, n_estimators=2000; total time=  23.1s
[CV] END max_depth=80, max_features=log2, min_samples_leaf=1, min_samples_split=10, n_estimators=2000; total time=  23.0s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=1600; total time=  22.2s
[CV] END max_depth=80, max_features=log2, min_samples_leaf=1, min_samples_split=10, n_estimators=2000; total time=  24.5s
[CV] END max_depth=80, max_features=log2, min_samples_leaf=1, min_samples_split=10, n_estimators=2000; total time=  24.1s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=1600; total time=  22.1s
[CV] END max_depth=20, max_

In [None]:
rf_random.best_params_

{'n_estimators': 1800,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 40}

In [None]:
best_rf = RandomForestRegressor(**rf_random.best_params_)

In [None]:
cv = RepeatedKFold(n_splits=5, n_repeats=5)
scores = cross_val_score(best_rf, X, Y, cv = cv, scoring='neg_mean_absolute_error')

In [None]:
scores.mean()

-3985.506493899842

## Gradient boosting model 

In [120]:
# Fitting a gradient boosting model 
model = GradientBoostingRegressor(**rs_model.best_params_)

TypeError: GradientBoostingRegressor.__init__() got an unexpected keyword argument 'min_child_weight'

In [112]:
# K-fold cross validation 
cv = RepeatedKFold(n_splits=3, n_repeats=3)
n_scores = cross_val_score(model, X, Y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
print('MAE: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

MAE: -2551.385 (69.356)


In [64]:
n_estimators = [100, 500, 1000, 1500]
max_depth = [3, 5, 10, 15]
min_samples_leaf = [2, 4, 6, 10] 
min_samples_split = [2, 4, 6, 10]
max_features = ['auto', 'sqrt']

hyperparameter_grid = {
    'n_estimators': n_estimators,
    'max_depth': max_depth,
    'min_samples_leaf': min_samples_leaf,
    'min_samples_split': min_samples_split,
    'max_features': max_features}

In [65]:
random_cv = RandomizedSearchCV(estimator=model,
                               param_distributions=hyperparameter_grid,
                               cv=4, n_iter=10,
                               scoring = 'neg_mean_absolute_error',n_jobs = -1,
                               verbose = 2, 
                               random_state=42)

In [66]:
random_cv.fit(X, Y)

Fitting 4 folds for each of 10 candidates, totalling 40 fits
[CV] END max_depth=10, max_features=auto, min_samples_leaf=10, min_samples_split=2, n_estimators=100; total time=   0.0s
[CV] END max_depth=10, max_features=auto, min_samples_leaf=10, min_samples_split=2, n_estimators=100; total time=   0.0s
[CV] END max_depth=10, max_features=auto, min_samples_leaf=10, min_samples_split=2, n_estimators=100; total time=   0.0s
[CV] END max_depth=10, max_features=auto, min_samples_leaf=10, min_samples_split=2, n_estimators=100; total time=   0.0s
[CV] END max_depth=15, max_features=auto, min_samples_leaf=10, min_samples_split=4, n_estimators=1500; total time=   0.0s
[CV] END max_depth=15, max_features=auto, min_samples_leaf=10, min_samples_split=4, n_estimators=1500; total time=   0.0s
[CV] END max_depth=15, max_features=auto, min_samples_leaf=10, min_samples_split=4, n_estimators=1500; total time=   0.0s
[CV] END max_depth=15, max_features=auto, min_samples_leaf=10, min_samples_split=4, n_est

16 fits failed out of a total of 40.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
7 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/brianleung/miniconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/brianleung/miniconda3/lib/python3.11/site-packages/sklearn/base.py", line 1144, in wrapper
    estimator._validate_params()
  File "/Users/brianleung/miniconda3/lib/python3.11/site-packages/sklearn/base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "/Users/brianleung/miniconda3/lib/python3.11/site-packages/sklearn/utils/_param_validation.py", line 95, i

In [68]:
random_cv.best_params_

{'n_estimators': 1500,
 'min_samples_split': 4,
 'min_samples_leaf': 10,
 'max_features': 'sqrt',
 'max_depth': 5}

In [71]:
best_model = GradientBoostingRegressor(n_estimators= 1500,
                                       min_samples_split= 4,
                                       min_samples_leaf= 10,
                                       max_features='sqrt',
                                       max_depth=5)

score=cross_val_score(best_model,X,Y,cv=3, scoring='neg_mean_absolute_error')

In [72]:
score

array([-8792.47329762, -8229.639407  , -9513.83353661])

## Grid search for hyper-parameters

In [509]:
from numpy import mean
from numpy import std
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from matplotlib import pyplot

model = GradientBoostingRegressor()
# define the grid of values to search
grid = dict()
grid['n_estimators'] = [100, 250, 500, 1000]
grid['learning_rate'] = [0.01, 0.1, 0.5, 1]
grid['subsample'] = [0.5, 0.7, 1.0]
grid['max_depth'] = [3, 7, 9]

# define the evaluation procedure
cv = RepeatedKFold(n_splits=5, n_repeats=10, random_state=1)
# define the grid search procedure
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='neg_mean_absolute_error')
# execute the grid search
grid_result = grid_search.fit(X, Y)
# summarize the best score and configuration
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# summarize all scores that were evaluated
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

This version of python seems to be incorrectly compiled
(internal generated filenames are not absolute).
This may make the debugger miss breakpoints.
Related bug: http://bugs.python.org/issue1666807
This version of python seems to be incorrectly compiled
(internal generated filenames are not absolute).
This may make the debugger miss breakpoints.
Related bug: http://bugs.python.org/issue1666807
This version of python seems to be incorrectly compiled
(internal generated filenames are not absolute).
This may make the debugger miss breakpoints.
Related bug: http://bugs.python.org/issue1666807
This version of python seems to be incorrectly compiled
(internal generated filenames are not absolute).
This may make the debugger miss breakpoints.
Related bug: http://bugs.python.org/issue1666807
This version of python seems to be incorrectly compiled
(internal generated filenames are not absolute).
This may make the debugger miss breakpoints.
Related bug: http://bugs.python.org/issue1666807
This 