# External calibration of DFT prediction with Gaussian Process

In [1]:
# load packages
import numpy as np 
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import WhiteKernel, RBF, ConstantKernel as C
import utils

In [2]:
# GP train setting

# basis function
Basis = ['outer_electron', 'inner_shell', \
         'coord_num', 'valence', 'num_non_hydro', \
         'M-C', 'M-O', 'M-H', 'PW91RelEnergy', '#CH3OH', '#H2O', '#H2']
Nstart = 1000    # GP restart optimization

limit_C = 100;    # lower bound for noise level
limit_l = 1e-6    # lower bound for lengthscale
isotropic_kernel = False                           # ARD kernel

Scale = 'MinMax'  # standardize method
Functional = 'Err_RelEnergy'
WGSFxn = 'PW91BindEnergy'

gp_trainfile = './Benchmark_data/Campbell_Mavrikakis_rel_v1.csv'    # gp training file
metal_file = './Benchmark_data/metal_constant.csv'

In [3]:
# Load and preprocessing the data
sorp_data = utils.read_file(gp_trainfile)
utils.outer_electron(sorp_data, metal_file)
utils.inner_shell(sorp_data, metal_file)
utils.coordination_num(sorp_data, metal_file)

# build training data
Xtrain = [sorp_data[base] for base in Basis]
Xtrain = np.array(Xtrain).T 
Ndata, Nfeature = Xtrain.shape

# Scaling
if Scale == 'MinMax':
    mms = MinMaxScaler()
    Xtrain = mms.fit_transform(Xtrain)
elif Scale == 'Std':
    stdsc = StandardScaler()
    Xtrain = stdsc.fit_transform(Xtrain)
    
# Output
Ytrain = -np.array(sorp_data[Functional])

In [4]:
# define kernel and GP model
if isotropic_kernel:
    k1 = C(1, (1e-6, 1e20)) * RBF(1, (limit_l, 1e20))
else:  
    k1 = C(1, (1e-6, 1e20)) * RBF(np.ones(Nfeature), (limit_l, 1e20))
k2 = WhiteKernel(noise_level=1,
                  noise_level_bounds=(limit_C, 1000))  # noise terms

kernel = k1+k2
gpr = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=Nstart)
gpr.fit(Xtrain, Ytrain)  

GaussianProcessRegressor(alpha=1e-10, copy_X_train=True,
             kernel=1**2 * RBF(length_scale=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]) + WhiteKernel(noise_level=1),
             n_restarts_optimizer=1000, normalize_y=False,
             optimizer='fmin_l_bfgs_b', random_state=None)

In [5]:
# print the optmized hyperparameters
print 'Hyperparameter:'
print gpr.kernel_.theta.tolist()

Hyperparameter:
[6.716045712171986, 28.519853399302583, 18.2959333460509, -13.815510557964274, -4.549859151440912, 29.583951063276416, 26.568455258521425, -11.589373421550459, -5.821606744357051, -2.0827426719773108, 37.27201584259565, 31.52862947167489, 0.27006970870086416, 4.605170185988092]
