In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error 
from sklearn.metrics import mean_absolute_error 
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GroupKFold 
import xgboost as xgb

## Loading simulated data with AM's original code and TMY3 data

In [2]:
allScens = pd.read_csv('../Output/allScens_augmented.csv', usecols=lambda x: x not in ['Unnamed: 0'])

In [3]:
allScens = pd.get_dummies(allScens, columns=['Exist_Fuel', 'Scenario'])

In [4]:
allScens.columns

Index(['City', 'Exist_Fuel_Type', 'Census_Area', 'ANCSA_Region', 'Util_Name',
       'PCE', 'Sq_Ft', 'Capital_Cost', 'Elec_Use_Jan', 'Elec_Use_May',
       'Design_Heat_Load', 'Design_Heat_Temp', 'COP', 'Max_HP_Cap_Reached',
       'HP_Load_Frac', 'IRR', 'NPV', 'CO2_lbs_saved',
       'CO2_driving_miles_saved', 'Fuel_Use_Chg', 'Fuel_Price_Incremental',
       'Elec_Use_Chg', 'Elec_Rate_Incremental', 'Econ', 'rebate_dol',
       'fuel_esc_rate', 'TMYid', 'Longitude', 'Latitude', 'Oil1Price',
       'PropanePrice', 'GasPrice', 'avg_elec_usage1', 'avg_elec_usage2',
       'avg_elec_usage3', 'avg_elec_usage4', 'avg_elec_usage5',
       'avg_elec_usage6', 'avg_elec_usage7', 'avg_elec_usage8',
       'avg_elec_usage9', 'avg_elec_usage10', 'avg_elec_usage11',
       'avg_elec_usage12', 'avgTemp_1', 'avgTemp_2', 'avgTemp_3', 'avgTemp_4',
       'avgTemp_5', 'avgTemp_6', 'avgTemp_7', 'avgTemp_8', 'avgTemp_9',
       'avgTemp_10', 'avgTemp_11', 'avgTemp_12', 'freezing_days', 'Railbelt',
       '

In [5]:
# getting a sense of the outcome (NPV)
np.mean(allScens['NPV'])

28107.264750234277

## AM Original Data :: Preparing Features and Outcome

In [6]:
# Separating the df into input and output components
allScens_X1 = allScens.filter(regex = 'Exist_Fuel_(?!Type)|avgTemp_[1,2,3,10,11,12]|avg_elec_usage[1,2,3,10,11,12]')
allScens_X2 = allScens[['freezing_days', 'Oil1Price', 'PropanePrice', 'GasPrice', 'PCE', 'Sq_Ft', 'Capital_Cost', 'Design_Heat_Load', 'Design_Heat_Temp', 'rebate_dol', 'fuel_esc_rate']]

X = pd.concat([allScens_X1, allScens_X2], axis=1)
Y = allScens['NPV']
cities = allScens['City']

In [7]:
X.head()

Unnamed: 0,avg_elec_usage1,avg_elec_usage2,avg_elec_usage3,avg_elec_usage10,avg_elec_usage11,avg_elec_usage12,avgTemp_1,avgTemp_2,avgTemp_3,avgTemp_10,...,Oil1Price,PropanePrice,GasPrice,PCE,Sq_Ft,Capital_Cost,Design_Heat_Load,Design_Heat_Temp,rebate_dol,fuel_esc_rate
0,471.936998,423.455318,460.815493,425.89382,443.33996,463.328994,32.287177,33.65,35.370403,42.440242,...,7.3,0.0,0.0,0.7597,1130.0,6400.0,12197.89368,22.6,0,0.03
1,471.936998,423.455318,460.815493,425.89382,443.33996,463.328994,32.287177,33.65,35.370403,42.440242,...,7.3,0.0,0.0,0.7597,678.0,6400.0,8148.60648,22.6,0,0.03
2,471.936998,423.455318,460.815493,425.89382,443.33996,463.328994,32.287177,33.65,35.370403,42.440242,...,7.3,0.0,0.0,0.7597,1695.0,6400.0,17259.50268,22.6,0,0.03
3,471.936998,423.455318,460.815493,425.89382,443.33996,463.328994,32.287177,33.65,35.370403,42.440242,...,7.3,0.0,0.0,0.7597,1130.0,6400.0,12197.89368,22.6,0,0.03
4,471.936998,423.455318,460.815493,425.89382,443.33996,463.328994,32.287177,33.65,35.370403,42.440242,...,7.3,0.0,0.0,0.7597,1130.0,6400.0,12197.89368,22.6,0,0.03


## AM Original Data :: Simple Random Forest

In [8]:
# Naive split (disregard cities)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33)

# Fitting a random forest 
model = RandomForestRegressor()
model.fit(X_train, Y_train)

# Making predictions
Y_pred = model.predict(X_test)

# Assess performance
rmse = mean_squared_error(Y_test, Y_pred, squared=False)
mae = mean_absolute_error(Y_test, Y_pred)
print("The RMSE of the model is", rmse)
print("The MAE of the model is", mae)

The RMSE of the model is 5802.487617533115
The MAE of the model is 3114.3351434556293


In [9]:
# Split by city (some cities only appear in training, but not testing sets)
split = GroupKFold(n_splits=5).split(allScens, groups=allScens['City'])
train_inds, test_inds = next(split)

X_train_byCity = X.iloc[train_inds]
Y_train_byCity = Y.iloc[train_inds]
X_test_byCity = X.iloc[test_inds]
Y_test_byCity = Y.iloc[test_inds]

# Naive model
model = RandomForestRegressor()
model.fit(X_train_byCity, Y_train_byCity)

# Making predictions
Y_pred_byCity = model.predict(X_test_byCity)

# Assess performance
rmse_byCity = mean_squared_error(Y_test_byCity, Y_pred_byCity, squared=False)
mae_byCity = mean_absolute_error(Y_test_byCity, Y_pred_byCity)
print("The RMSE of the model is", rmse_byCity)
print("The MAE of the model is", mae_byCity)

The RMSE of the model is 11701.980226834643
The MAE of the model is 6778.082774000571


## AM Original Data :: Extreme Gradient Boosting Model (XGB)

In [10]:
# Naive split (disregard cities)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33)

# Naive model
xgb_reg = xgb.XGBRegressor()
xgb_reg.fit(X_train, Y_train)

# Making predictions
Y_pred = xgb_reg.predict(X_test)

# Assess performance
rmse = mean_squared_error(Y_test, Y_pred, squared=False)
mae = mean_absolute_error(Y_test, Y_pred)
print("The RMSE of the model is", rmse)
print("The MAE of the model is", mae)

The RMSE of the model is 4484.902173913567
The MAE of the model is 3061.823319073927


In [11]:
# Group based split
split = GroupKFold(n_splits=5).split(allScens, groups=allScens['City'])
train_inds, test_inds = next(split)

X_train_byCity = X.iloc[train_inds]
Y_train_byCity = Y.iloc[train_inds]
X_test_byCity = X.iloc[test_inds]
Y_test_byCity = Y.iloc[test_inds]

# Naive model
xgb_reg = xgb.XGBRegressor()
xgb_reg.fit(X_train_byCity, Y_train_byCity)

# Making predictions
Y_pred_byCity = xgb_reg.predict(X_test_byCity)

# Assess performance
rmse_byCity = mean_squared_error(Y_test_byCity, Y_pred_byCity, squared=False)
mae_byCity = mean_absolute_error(Y_test_byCity, Y_pred_byCity)
print("The RMSE of the model is", rmse_byCity)
print("The MAE of the model is", mae_byCity)

The RMSE of the model is 11699.056505365146
The MAE of the model is 7057.575274930493


## Loading simulated data with extra randomness

In [12]:
allScensR = pd.read_csv('../Output/allScens_wRandomness_augmented.csv', usecols=lambda x: x not in ['Unnamed: 0'])

In [13]:
allScensR = pd.get_dummies(allScensR, columns=['Exist_Fuel', 'Scenario'])

In [14]:
# Separating the df into input and output components
allScensR_X1 = allScensR.filter(regex = 'Exist_Fuel_(?!Type)|Avg_Temp_[1,2,3,10,11,12]|Elec_Use_')
allScensR_X2 = allScensR[['Freezing_days', 'Exist_Unit_Fuel_Cost', 'Elec_Rate_Avg_Base', 'PCE', 'Sq_Ft', 'Capital_Cost', 'Design_Heat_Load', 'Design_Heat_Temp', 'Rebate_dol', 'Fuel_Esc_Rate']]

Xr = pd.concat([allScensR_X1, allScensR_X2], axis=1)
Yr = allScensR['NPV']
citiesR = allScensR['City']

In [15]:
# getting a sense of the outcome (NPV)
np.mean(allScensR['NPV'])

33683.30427644264

## Simulated Data w/ Extra Randomness :: XGB

In [16]:
# Naive split (disregard cities)
Xr_train, Xr_test, Yr_train, Yr_test = train_test_split(Xr, Yr, test_size=0.33)

# Naive model
xgb_reg = xgb.XGBRegressor()
xgb_reg.fit(Xr_train, Yr_train)

# Making predictions
Yr_pred = xgb_reg.predict(Xr_test)

# Assess performance
rmseR = mean_squared_error(Yr_test, Yr_pred, squared=False)
maeR = mean_absolute_error(Yr_test, Yr_pred)
print("The RMSE of the model is", rmseR)
print("The MAE of the model is", maeR)

The RMSE of the model is 17594.844098330377
The MAE of the model is 10765.983139977405


In [17]:
# Group based split
splitR = GroupKFold(n_splits=5).split(allScensR, groups=allScensR['City'])
trainR_inds, testR_inds = next(splitR)

Xr_train_byCity = Xr.iloc[trainR_inds]
Yr_train_byCity = Yr.iloc[trainR_inds]
Xr_test_byCity = Xr.iloc[testR_inds]
Yr_test_byCity = Yr.iloc[testR_inds]

# Naive model
xgb_reg = xgb.XGBRegressor()
xgb_reg.fit(Xr_train_byCity, Yr_train_byCity)

# Making predictions
Yr_pred_byCity = xgb_reg.predict(Xr_test_byCity)

# Assess performance
rmseR_byCity = mean_squared_error(Yr_test_byCity, Yr_pred_byCity, squared=False)
maeR_byCity = mean_absolute_error(Yr_test_byCity, Yr_pred_byCity)
print("The RMSE of the model is", rmseR_byCity)
print("The MAE of the model is", maeR_byCity)

The RMSE of the model is 16588.076477921913
The MAE of the model is 10098.849507728251


## Simulated Data w/ Extra Randomness :: XGB with Random Search CV

In [18]:
random_grid = {
 'learning_rate' : [0.05,0.10,0.15,0.20,0.25,0.30],
 'n_estimators': [50, 100, 200, 500],
 'max_depth' : [ 3, 4, 5, 6, 8, 10, 12, 15],
 'subsample' : [0.2, 0.4, 0.8, 1.0],
 'min_child_weight' : [ 1, 3, 5, 7 ],
 'gamma': [ 0.0, 0.1, 0.2 , 0.3, 0.4 ],
 'colsample_bytree' : [ 0.3, 0.4, 0.5 , 0.7 ]
}

In [20]:
rs_model=RandomizedSearchCV(xgb_reg,
                            param_distributions=random_grid,
                            n_iter=5,
                            cv=GroupKFold(n_splits=5).split(allScensR, groups=allScensR['City']),
                            scoring='neg_mean_absolute_error',
                            n_jobs=-1,
                            verbose=2)

In [21]:
rs_model.fit(Xr, Yr, groups=allScensR['City'])

Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV] END colsample_bytree=0.3, gamma=0.4, learning_rate=0.1, max_depth=4, min_child_weight=5, n_estimators=100, subsample=1.0; total time=   1.0s
[CV] END colsample_bytree=0.3, gamma=0.4, learning_rate=0.1, max_depth=4, min_child_weight=5, n_estimators=100, subsample=1.0; total time=   1.0s
[CV] END colsample_bytree=0.3, gamma=0.4, learning_rate=0.1, max_depth=4, min_child_weight=5, n_estimators=100, subsample=1.0; total time=   1.0s
[CV] END colsample_bytree=0.3, gamma=0.4, learning_rate=0.1, max_depth=4, min_child_weight=5, n_estimators=100, subsample=1.0; total time=   1.0s
[CV] END colsample_bytree=0.3, gamma=0.4, learning_rate=0.1, max_depth=4, min_child_weight=5, n_estimators=100, subsample=1.0; total time=   1.1s
[CV] END colsample_bytree=0.4, gamma=0.0, learning_rate=0.2, max_depth=4, min_child_weight=3, n_estimators=100, subsample=0.8; total time=   1.3s
[CV] END colsample_bytree=0.4, gamma=0.0, learning_rate=0.2, max

In [22]:
rs_model.best_params_

{'subsample': 0.8,
 'n_estimators': 100,
 'min_child_weight': 3,
 'max_depth': 4,
 'learning_rate': 0.2,
 'gamma': 0.0,
 'colsample_bytree': 0.4}

In [23]:
best_model = rs_model.best_estimator_

In [24]:
Yr_pred_byCity = best_model.predict(Xr_test_byCity)
mean_absolute_error(Y_test_byCity, Yr_pred_byCity)

16290.284062436445