# Fit feedforward Neural Network model

In [1]:
import os
import time

import pandas as pd 
import matplotlib.pyplot as plt
import plotly
import chart_studio
import chart_studio.plotly as py
import plotly.graph_objs as go
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from besos import eppy_funcs as ef
import besos.sampling as sampling
from besos.problem import EPProblem
from besos.evaluator import EvaluatorEP, EvaluatorGeneric
from besos.parameters import wwr, RangeParameter, FieldSelector, FilterSelector, GenericSelector, Parameter, expand_plist
# NOTE: this import must stay after the besos.parameters one; it rebinds
# RangeParameter and expand_plist to the versions from the local `parameters` module.
from parameters import RangeParameter, CategoryParameter, expand_plist

from parameter_sets import parameter_set

We begin by:
+ getting a predefined list of 7 parameters from `parameter_sets.py`
+ making these into a `problem` with electricity use as the objective
+ and making an `evaluator` using the default EnergyPlus building.

In [2]:
# Predefined parameter list from parameter_sets.py (the argument presumably
# selects the 7-parameter set described above -- confirm in parameter_sets.py).
parameters = parameter_set(7)
# Problem with the 'Electricity:Facility' output as its only objective.
problem = EPProblem(parameters, ['Electricity:Facility'])
# Called without arguments, so the building is whatever besos provides by default.
building = ef.get_building()
# Evaluator that applies the problem's parameters to this building.
evaluator = EvaluatorEP(problem, building)


Duplicate names found. (duplicate, repetitions): [('Watts per Zone Floor Area', 2)]
Attempting to fix automatically



Then we get 20 samples across this design space and evaluate them.

In [3]:
# Draw 20 samples across the problem's design space using the `sampling.lhs` sampler.
inputs = sampling.dist_sampler(sampling.lhs, problem, 20)
# Evaluate every sampled row with the EnergyPlus-backed evaluator.
outputs = evaluator.df_apply(inputs)
# Last expression of the cell: display the sampled inputs.
inputs

HBox(children=(IntProgress(value=0, description='Executing', max=20, style=ProgressStyle(description_width='in…




Unnamed: 0,Wall conductivity,Attic thickness,U-Factor,Solar Heat Gain Coefficient,Watts per Zone Floor Area_0,Watts per Zone Floor Area_1,Window to Wall Ratio
0,0.177185,0.122577,4.578711,0.110993,13.727691,11.989335,0.981828
1,0.078358,0.27816,1.143532,0.897574,13.317177,13.489183,0.880788
2,0.17101,0.179611,2.880863,0.697901,14.519081,13.603152,0.841685
3,0.14084,0.137919,1.012747,0.559023,11.928699,12.048367,0.37847
4,0.095568,0.219453,1.380731,0.274665,10.862745,10.708088,0.013529
5,0.157559,0.116493,1.595703,0.859288,11.610623,10.012001,0.906483
6,0.065818,0.196541,3.478323,0.978255,12.542769,11.330675,0.219611
7,0.127309,0.269509,4.460128,0.102731,13.13422,12.350357,0.102476
8,0.135575,0.163192,2.276359,0.355152,14.163955,10.969172,0.594398
9,0.199411,0.295189,2.036119,0.602306,12.218168,14.834366,0.75198


## Train-test split
Next we split the data into a training set (80%) and a testing set (20%).

In [4]:
# Hold out 20% of the samples for testing. `random_state` is fixed so that
# re-running this cell on the same samples always yields the same partition,
# which keeps the scores reported further down comparable between runs.
train_in, test_in, train_out, test_out = train_test_split(
    inputs, outputs, test_size=0.2, random_state=0
)

## Normalization of inputs

To ensure an equal weighting of inputs and outputs in the backpropagation algorithm fitting the neural network, we have to normalize the input values.  
For example, the window-to-wall ratio is in the range of 0 to 1, while the values in $W/m^2$ are in the range of 10 to 15.  
Different options for normalization exist.  
Here we rescale all features (input variables) to have zero mean and a standard deviation of 1.  
Note that we fit the normalizer on training data only.

In [5]:
# Input scaler: its statistics are learned from the training inputs only,
# and the same training inputs are then transformed with it.
scaler = StandardScaler().fit(train_in)
inputs = scaler.transform(train_in)

# Output scaler: a separate instance, fitted and applied the same way
# to the training outputs.
scaler_out = StandardScaler().fit(train_out)
outputs = scaler_out.transform(train_out)

## Hyper-parameters

Before we start fitting the NN model we define the set of hyperparameters we want to analyse in our cross-validation to optimize the model.  
Here, we vary the number of hidden layers of the network and the regularization parameter `alpha`.  
A larger number of layers and a lower value of the regularizer lead to higher variance of the network.  
This may lead to overfitting.  
The best selection may be found using an optimizer like Bayesian Optimization.  
In this example we use a simple grid search.

In [6]:
# Every hidden layer has 16 units per design parameter.
layer_width = len(parameters) * 16

# Grid explored by the cross-validation: one or two hidden layers of that width,
# combined with three strengths of the regularization term `alpha`.
hyperparameters = {
    'hidden_layer_sizes': ((layer_width,), (layer_width, layer_width)),
    'alpha': [1, 10, 10**3],
}

# `random_state` fixes the weight initialisation so that repeated fits on the
# same data give the same model and the same scores.
neural_net = MLPRegressor(max_iter=1000, early_stopping=False, random_state=0)
# Number of cross-validation folds used by the grid search.
folds = 3

## Model fitting

Here, we use the NN model from ScikitLearn.  
In a [different example](FitNNTF.ipynb) we use TensorFlow (with and without the Keras wrapper).

In [7]:
# Grid search over `hyperparameters` using `folds`-fold cross-validation.
# The former `iid=True` argument is no longer passed: scikit-learn deprecated it
# in 0.22 and removed it in 0.24, where supplying it raises a TypeError.
clf = GridSearchCV(neural_net, hyperparameters, cv=folds)
# The scaled training output is a single column; flatten it to a 1-D target.
clf.fit(inputs, outputs.ravel())

# NOTE: `best_score_` is the mean cross-validated score of the best candidate
# (computed on the held-out folds of the training data), not a score obtained
# by re-predicting the full training set.
print(f'Best performing model $R^2$ score on training set: {clf.best_score_}')
print(f'Model $R^2$ parameters: {clf.best_params_}')
# The test data is scaled with the scalers that were fitted on the training data only.
print(f'Best performing model $R^2$ score on a separate test set: {clf.best_estimator_.score(scaler.transform(test_in), scaler_out.transform(test_out))}')

Best performing model $R^2$ score on training set: 0.5810710281739011
Model $R^2$ parameters: {'alpha': 1, 'hidden_layer_sizes': (112,)}
Best performing model $R^2$ score on a separate test set: 0.9719171887910188


## Surrogate Modelling Evaluator object
We can wrap the fitted model in a BESOS `Evaluator`.  
The resulting object is called in the same way as the original EnergyPlus Evaluator object, but returns the surrogate's prediction instead of running a simulation.

In [8]:
def evaluation_func(ind, scaler=scaler):
    """Predict the objective for one set of parameter values with the fitted network.

    Args:
        ind: the parameter values of a single design, in unscaled units.
        scaler: fitted input scaler; defaults to the `scaler` object that exists
            when this function is defined.

    Returns:
        A pair of tuples: the first holds the single predicted value converted
        back to unscaled units, the second is empty (presumably objectives and
        constraints, as expected by EvaluatorGeneric -- confirm in besos).
    """
    # Wrap the design in a list so the scaler receives a single-row 2-D input.
    scaled_row = scaler.transform(X=[ind])
    scaled_prediction = clf.predict(scaled_row)
    # Undo the output scaling to get back to the original units.
    prediction = scaler_out.inverse_transform(scaled_prediction)
    first_result = (prediction[0],)
    second_result = ()
    return (first_result, second_result)

NN_SM = EvaluatorGeneric(evaluation_func, problem)

This has identical behaviour to the original EnergyPlus Evaluator object.  
In the next cells we generate a single input sample and evaluate it using the surrogate model and EnergyPlus.

In [9]:
# Draw a single sample from the design space.
sample = sampling.dist_sampler(sampling.lhs, problem, 1)
# Take the first (and only) row as a plain array of parameter values.
values = sample.values[0]
print(values)

[ 0.03308963  0.13257495  2.43329987  0.64595155 10.58985017 11.71719755
  0.39190868]


In [10]:
# Surrogate model result for the sample; [0] picks the first returned value.
NN_SM(values)[0]

1878672498.4816747

In [11]:
# EnergyPlus evaluator result for the same sample, for comparison with the surrogate.
evaluator(values)[0]

1847093206.2648058

## Running a large surrogate evaluation

In [12]:
# NOTE: `inputs` and `outputs` are rebound here. From this cell on they hold the
# 5000-sample surrogate sweep, not the scaled training arrays assigned earlier.
inputs = sampling.dist_sampler(sampling.lhs, problem, 5000)
# Evaluate all samples with the surrogate evaluator instead of EnergyPlus.
outputs = NN_SM.df_apply(inputs)
# Put the sampled inputs and the surrogate outputs side by side in one table.
results = inputs.join(outputs)
results.head()

HBox(children=(IntProgress(value=0, description='Executing', max=5000, style=ProgressStyle(description_width='…




Unnamed: 0,Wall conductivity,Attic thickness,U-Factor,Solar Heat Gain Coefficient,Watts per Zone Floor Area_0,Watts per Zone Floor Area_1,Window to Wall Ratio,Electricity:Facility
0,0.077926,0.267425,4.191725,0.273934,11.199979,13.946797,0.520118,2049474000.0
1,0.064114,0.170255,0.746887,0.576243,10.549722,11.307147,0.638999,1859380000.0
2,0.143723,0.229507,3.442186,0.986898,14.423547,14.338437,0.27241,2251536000.0
3,0.024438,0.247755,3.575314,0.820355,10.540995,11.930376,0.551959,1915107000.0
4,0.069923,0.210096,1.778862,0.127571,10.834494,13.909494,0.546333,2003575000.0


## Visualization

In [13]:
# Chart Studio credentials are read from the environment instead of being stored
# in the notebook. Any key that was previously committed here should be treated
# as compromised and regenerated.
# If the variables are not set, nothing is written and chart_studio falls back
# to whatever credentials file already exists on this machine.
cs_username = os.environ.get('CHART_STUDIO_USERNAME')
cs_api_key = os.environ.get('CHART_STUDIO_API_KEY')
if cs_username and cs_api_key:
    chart_studio.tools.set_credentials_file(username=cs_username, api_key=cs_api_key)

df = inputs.round(3)

# generate list of dictionaries: one parallel-coordinates axis per input column
l = [dict(label=column, values=df[column]) for column in df.columns]

# Last axis: the first output column, rounded to the nearest 100,000. It is
# selected as a 1-D column so that it has the same form as the input axes.
objective_name = outputs.columns[0]
l.append(dict(label=objective_name, values=outputs[objective_name].round(-5)))

data = [
    go.Parcoords(
        # Each line is coloured by its electricity use.
        line = dict(color = outputs['Electricity:Facility'],
                    colorscale = [[0,'#D7C16B'],[0.5,'#23D8C3'],[1,'#F3F10F']]),
        dimensions = l
    )
]

layout = go.Layout(
    plot_bgcolor = '#E5E5E5',
    paper_bgcolor = '#E5E5E5'
)

fig = go.Figure(data = data, layout = layout)
py.iplot(fig, filename = 'parcoords-basic')