# Initial Design

This notebook contains the code used to select the reaction conditions tested for the initial training of the Gaussian process (GP). A modified Latin hypercube sampling (LHS) strategy is used to select both discrete and continuous variables. We previously demonstrated that several different designs work well for solvent selection, so LHS was chosen because it is already implemented in GPyOpt.

## 1. Setup

Let's get everything loaded and ready to go.

In [1]:
#Autoreload automatically reloads any dependencies as you change them
%load_ext autoreload
%autoreload 2

In [3]:
#Import all the necessary packages
import summit
from summit.data import solvent_ds
from summit.domain import Domain, ContinuousVariable, DiscreteVariable, DescriptorsVariable
import GPyOpt
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

In [16]:
# Define the optimization space: six continuous variables, the choice of
# co-catalyst, and the solvent descriptor set.

# (name, description, [lower bound, upper bound]) for each continuous variable,
# listed in the order in which they are added to the domain.
continuous_specs = [
    ("temperature", "reaction temperature", [20, 50]),
    ("acid_conc", "propionic acid concentration", [1, 40]),
    ("cat_load", "catalyst loading", [0.1, 10]),
    ("co_cat_load", "co-catalyst loading", [15, 1500]),
    ("acrylate_amine_ratio", "molar ratio of acrylate to amine", [0.8, 2]),
    ("aldehyde_amine_ratio", "molar ratio of aldehyde to amine", [0.8, 2]),
]

domain = Domain()
for var_name, var_description, var_bounds in continuous_specs:
    domain += ContinuousVariable(name=var_name,
                                 description=var_description,
                                 bounds=var_bounds)

# The co-catalyst is a discrete choice between two candidates.
domain += DiscreteVariable(name="co_cat",
                           description="enumeration of the two potential cocatalysts",
                           levels=["co_cat_1", "co_cat_2"])

# The solvent is described by the solvent_ds descriptor table.
# NOTE(review): the description string says 18 descriptors, while the recorded
# output of this cell reports 17 descriptors — confirm which is correct.
domain += DescriptorsVariable(name="solvent",
                              description="18 descriptors of the solvents",
                              df=solvent_ds)

domain  # last expression of the cell, so the domain is displayed as a table

0,1,2,3
Name,Type,Description,Values
temperature,continuous,reaction temperature,"[20,50]"
acid_conc,continuous,propionic acid concentration,"[1,40]"
cat_load,continuous,catalyst loading,"[0.1,10]"
co_cat_load,continuous,co-catalyst loading,"[15,1500]"
acrylate_amine_ratio,continuous,molar ratio of acrylate to amine,"[0.8,2]"
aldehyde_amine_ratio,continuous,molar ratio of aldehyde to amine,"[0.8,2]"
co_cat,discrete,enumeration of the two potential cocatalysts,2 levels
solvent,descriptors,18 descriptors of the solvents,459 examples of 17 descriptors


In [None]:
#Specify the optimization space
# NOTE(review): this cell has no execution count (In [None]) and relies on
# OptimizationDomain, units and SolventDescriptorSet, none of which are brought
# into scope by the import cell at the top of the notebook. It looks like a
# draft written against a different API — confirm the intended imports before
# running it. It also passes `values=` to DiscreteVariable, whereas the executed
# domain cell earlier in the notebook passes `levels=`.

domain = OptimizationDomain()

# Same continuous variables and bounds as the executed domain cell, with a
# `units=` argument added to the first four.
domain += ContinuousVariable(name='temperature',
                             description = "reaction temperature",
                             bounds=[20, 50], 
                             units=units.degC)

domain += ContinuousVariable(name="acid_conc",
                             description = "propionic acid concentration",
                             bounds=[1,40],
                             units=units.millimolar)

domain += ContinuousVariable(name="cat_load",
                             description = "catalyst loading",
                             bounds=[0.1, 10],
                             units=units.millimolar)

domain += ContinuousVariable(name="co_cat_load",
                             description = "co-catalyst loading",
                             bounds=[15, 1500],
                             units=units.millimolar)

# The two molar ratios are given no units argument.
domain += ContinuousVariable(name="acrylate_amine_ratio",
                             description = "molar ratio of acrylate to amine",
                             bounds = [0.8, 2])


domain += ContinuousVariable(name="aldehyde_amine_ratio",
                             description = "molar ratio of aldehyde to amine",
                             bounds=[0.8, 2])

domain += DiscreteVariable(name="co_cat",
                           description="Enumeration of the two potential cocatalysts",
                           values = ['co_cat_1', 'co_cat_2'])


# presumably selects a subset of solvent descriptors identified by
# summit.data.UCB_CPRD_GUIDE — TODO confirm against the summit API
domain += SolventDescriptorSet(select_subset=summit.data.UCB_CPRD_GUIDE)

domain #The domain should display as a pandas dataframe

In [13]:
# Describe the solvent to GPyOpt as a single variable of type 'bandit' whose
# domain is `solvent_ds.values`.
# Note: this rebinds `domain`, replacing the summit Domain built earlier in the
# notebook with a GPyOpt Design_space.
variables = [
    dict(name='solvent', type='bandit', domain=solvent_ds.values),
]
domain = GPyOpt.Design_space(variables)

## 2. Construct Initial Design

In [14]:
# Seed NumPy's global random generator so the sampled design is the same on
# every Restart & Run All (assumes GPyOpt samples through NumPy's global
# generator — TODO confirm for the installed version).
# NOTE: the output stored with this cell was produced before the seed was
# added, so a re-run will not reproduce that exact array.
INITIAL_DESIGN_SEED = 0
np.random.seed(INITIAL_DESIGN_SEED)

# Draw 10 initial experiments from the design space using GPyOpt's Latin design
# with the 'maximin' criterion.
lhs = GPyOpt.experiment_design.latin_design.LatinDesign(domain)
exps = lhs.get_samples(10, criterion='maximin')
exps

array([[ 1.00000e+02,  6.73000e-01,  1.48900e+02,  1.38200e+00,
         3.46900e+01,  1.91000e+00,  0.00000e+00, -1.19000e+02,
         8.10000e+01,  3.60000e-01,  4.09000e+00,  1.55220e+02,
         0.00000e+00,  1.68045e+01,  0.00000e+00,  0.00000e+00,
         1.23900e-01],
       [ 1.24000e+02,  1.05700e+00,  1.17500e+02,  1.58500e+00,
         3.93800e+01,  4.76000e+00,  1.31000e+00, -1.50000e+01,
         1.88000e+02,  1.64000e+00,  2.64000e+00,  6.30000e-01,
         1.41900e-01,  1.59746e+01,  1.87600e-01,  0.00000e+00,
         4.03710e+00],
       [ 7.30000e+01,  9.46000e-01,  7.73000e+01,  1.43300e+00,
         2.00800e+01,  1.78900e+02,  4.12000e+00,  2.80000e+01,
         2.05000e+02,  2.68000e+00, -1.37000e+00,  4.30000e-01,
         8.13200e-01,  8.93270e+00,  1.92720e+00,  3.55500e-01,
         2.73180e+00],
       [ 1.82000e+02,  9.66000e-01,  1.88700e+02,  1.57000e+00,
         6.19300e+01,  2.38000e+00,  0.00000e+00,  5.20000e+01,
         2.85000e+02,  6.74000e+00,

In [None]:
# Plot the sampled experiments with n_components=2 (presumably a projection onto
# two principal components).
# NOTE(review): this cell has not been executed (In [None]); confirm that the
# LatinDesign object actually provides a pca_plot method.
lhs.pca_plot(exps, n_components=2)

In [None]:
# Presumably reports how well the sampled design covers the space.
# NOTE(review): this cell has not been executed (In [None]); confirm that the
# LatinDesign object actually provides a design_coverage method.
lhs.design_coverage()