# Fit a Gaussian Process surrogate model

Here we define a surrogate model using Gaussian Processes.  
We use the GP model from ScikitLearn - we compared it to other models like GPFlow but observed better speed and better code maintenance in this model.

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import chart_studio.plotly as py
import plotly.graph_objs as go
import chart_studio
import pandas as pd 
import time
from sklearn.gaussian_process.kernels import (RBF, Matern, RationalQuadratic, ExpSineSquared, DotProduct, ConstantKernel)
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from besos import eppy_funcs as ef
import besos.sampling as sampling
from besos.problem import EPProblem
from besos.evaluator import EvaluatorEP
from besos.evaluator import EvaluatorGeneric
from besos.parameters import wwr, RangeParameter, FieldSelector, FilterSelector, GenericSelector, Parameter, expand_plist
from parameters import RangeParameter, CategoryParameter, expand_plist

from parameter_sets import parameter_set

We begin by:
+ getting a predefined list of 7 parameters from `parameter_sets.py`
+ making these into a `problem` with electricty use as the objective
+ and making an `evaluator` using the default EnergyPlus building.

In [2]:
parameters = parameter_set(7)
problem = EPProblem(parameters, ['Electricity:Facility'])
building = ef.get_building()
evaluator = EvaluatorEP(problem, building)


Duplicate names found. (duplicate, repetitions): [('Watts per Zone Floor Area', 2)]
Attempting to fix automatically



Then we get 50 samples across this design space and evaluate them.

In [3]:
inputs = sampling.dist_sampler(sampling.lhs, problem, 5)
outputs = evaluator.df_apply(inputs)
inputs

HBox(children=(IntProgress(value=0, description='Executing', max=5, style=ProgressStyle(description_width='ini…




Unnamed: 0,Wall conductivity,Attic thickness,U-Factor,Solar Heat Gain Coefficient,Watts per Zone Floor Area_0,Watts per Zone Floor Area_1,Window to Wall Ratio
0,0.024417,0.115414,4.386398,0.678482,13.522931,11.615422,0.28557
1,0.142775,0.200433,1.021673,0.596208,14.098853,13.675973,0.137351
2,0.086577,0.226899,2.569318,0.182461,10.209917,12.322224,0.419022
3,0.092048,0.152761,3.376769,0.333602,12.368767,10.125682,0.813589
4,0.178369,0.281291,1.361262,0.964262,11.249753,14.025684,0.749941


## Train-test split

Next we split the data into a training set (80%) and a testing set (20%).

In [4]:
train_in, test_in, train_out, test_out = train_test_split(inputs, outputs, test_size=0.2)

## Hyper-parameters

Before fitting the GP model we define the set of hyperparameters we want to optimize.  
Here we use \textit{3} folds in the k-fold cross validation scheme.  
We select a set of Kernel functions, which must fit the characteristics of a problem - details and examples may be found in the [Kernel cookbook](https://www.cs.toronto.edu/~duvenaud/cookbook/).  
Note that the parameters of the Kernel itself are [optimized during each model fitting run](https://scikit-learn.org/stable/modules/generated/sklearn.gaussian_process.GaussianProcessRegressor.html).

In [5]:
hyperparameters = {'kernel':[None,1.0 * RBF(length_scale=1.0, length_scale_bounds=(1e-1, 10.0)),
                             1.0 * RationalQuadratic(length_scale=1.0, alpha=0.5),
                             #ConstantKernel(0.1, (0.01, 10.0))*(DotProduct(sigma_0=1.0, sigma_0_bounds=(0.1, 10.0))**2),
                             1.0 * Matern(length_scale=1.0, length_scale_bounds=(1e-1, 10.0)),]}

folds = 3

## Model fitting

Here we fit the model using these hyperparameters.

In [6]:
gp = GaussianProcessRegressor(normalize_y=True)

clf = GridSearchCV(gp, hyperparameters, iid=True, cv=folds)

clf.fit(inputs, outputs)

print(f'Best performing model $R^2$ score on training set: {clf.best_score_}')
print(f'Model $R^2$ parameters: {clf.best_params_}')
print(f'Best performing model $R^2$ score on a separate test set: {clf.best_estimator_.score(test_in, test_out)}')

Best performing model $R^2$ score on training set: nan
Model $R^2$ parameters: {'kernel': None}
Best performing model $R^2$ score on a separate test set: nan



R^2 score is not well-defined with less than two samples.


R^2 score is not well-defined with less than two samples.


R^2 score is not well-defined with less than two samples.


R^2 score is not well-defined with less than two samples.


R^2 score is not well-defined with less than two samples.



## Surrogate Modelling Evaluator object
We can wrap the fitted model in a BESOS `Evaluator`.

In [7]:
def evaluation_func(ind):
    return ((clf.predict([ind])[0][0],),())

GP_SM = EvaluatorGeneric(evaluation_func, problem)

This has identical behaviour to the original EnergyPlus Evaluator object.  
In the next cells we generate a single input sample and evaluate it using the surrogate model and EnergyPlus.

In [8]:
sample = sampling.dist_sampler(sampling.lhs, problem, 1)
values = sample.values[0]
print(values)

[ 0.06845257  0.2652998   2.92197754  0.47021658 11.81732468 10.07129678
  0.51577527]


In [9]:
GP_SM(values)[0]

1906643216.1133249

In [10]:
evaluator(values)[0]

1832590841.9414

## Running a large surrogate evaluation

In [11]:
inputs = sampling.dist_sampler(sampling.lhs, problem, 5000)
outputs = GP_SM.df_apply(inputs)
results = inputs.join(outputs)
results.head()

HBox(children=(IntProgress(value=0, description='Executing', max=5000, style=ProgressStyle(description_width='…




Unnamed: 0,Wall conductivity,Attic thickness,U-Factor,Solar Heat Gain Coefficient,Watts per Zone Floor Area_0,Watts per Zone Floor Area_1,Window to Wall Ratio,Electricity:Facility
0,0.124309,0.167863,2.052279,0.916026,13.235416,12.586014,0.64582,2059788000.0
1,0.197177,0.229878,3.801748,0.14532,13.499957,10.687017,0.604197,1969611000.0
2,0.045592,0.15011,2.921662,0.387904,13.789488,12.042757,0.256709,2026943000.0
3,0.167353,0.161779,0.880584,0.933286,12.526225,13.269125,0.851882,2083242000.0
4,0.089238,0.231894,3.745927,0.574841,12.832366,14.499098,0.131026,2021454000.0


## Generate an idf/epJSON file with data in dataframe

Generate an idf/epJSON file with selected row of data in dataframe and save it in current directory.

In [12]:
#generate_building(dataframe, index, filename)
evaluator.generate_building(results, 2, 'output')

## Visualization

In [13]:
chart_studio.tools.set_credentials_file(username='westerm', api_key='CcjiMWsrVX8c6ZU4Ny1x')
df = inputs.round(3)

# generate list if dictionaries
l = list()
for i in df.columns:
    l.extend([dict(label = i, values = df[i])])

l.extend([dict(label = outputs.columns[0], values = outputs.round(-5))])

data = [
    go.Parcoords(
        line = dict(color = outputs['Electricity:Facility'],
                    colorscale = [[0,'#D7C16B'],[0.5,'#23D8C3'],[1,'#F3F10F']]),
        dimensions = l
    )
]

layout = go.Layout(
    plot_bgcolor = '#E5E5E5',
    paper_bgcolor = '#E5E5E5'
)

fig = go.Figure(data = data, layout = layout)
py.iplot(fig, filename = 'parcoords-basic')