In [1]:
# Enable IPython's autoreload extension in mode 2 for this kernel session.
%load_ext autoreload
%autoreload 2

In [2]:
from surrogate_model_functions import plot_3d_model, loo_error
from summit.strategies import TSEMO
from summit.models import GPyModel
from summit.data import solvent_ds, ucb_ds, DataSet
from summit.domain import Domain, DescriptorsVariable,ContinuousVariable
from summit.initial_design import LatinDesigner
# from summit.optimizers import EnumerationOptimizer
# from summit.objective import HV

import GPy
from sklearn.decomposition import PCA
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
# Read in solubility data and index it by the 'cas_number' column
solubilities = pd.read_csv('inputs/solubilities.csv')
solubilities = solubilities.set_index('cas_number')
solubilities = DataSet.from_df(solubilities)

# Merge data sets on their indices
solvent_ds_full = solvent_ds.join(solubilities)
solvent_ds_final = pd.merge(solvent_ds_full, ucb_ds, left_index=True, right_index=True)
print(f"{solvent_ds_final.shape[0]} solvents for optimization")

# Double check that there are no NaNs in the descriptors.
# The check has to fail if ANY entry is NaN; testing `.all()` would only
# fail when every single entry is NaN.
values = solvent_ds_final.data_to_numpy()
values = values.astype(np.float64)
check = np.isnan(values)
assert not check.any(), "NaN values found in the solvent descriptors"

# Transform to principal components.
# fit_transform() fits the PCA on the data it is given, so a separate
# fit() call beforehand would simply be overwritten.
num_components = 3
pca = PCA(n_components=num_components)
pcs = pca.fit_transform(solvent_ds_final.standardize())
explained_var = round(pca.explained_variance_ratio_.sum()*100)
expl = f"{explained_var}% of variance is explained by {num_components} principal components."
print(expl)

# Create a new dataset with just the metadata and the principal components
metadata_df = solvent_ds_final.loc[:, solvent_ds_final.metadata_columns]
pc_df = pd.DataFrame(pcs, columns=[f'PC_{i}' for i in range(num_components)],
                     index=metadata_df.index)
pc_ds = DataSet.from_df(pc_df)
solvent_ds_pcs = pd.concat([metadata_df, pc_ds], axis=1)

80 solvents for optimization
68.0% of variance is explained by 3 principal components.


In [4]:
# Synthetic test problem: constants and closed-form expressions
AD1 = 3.1
AD2 = 0.7
EAD1 = 50
EAD2 = 70
R = 8.314  # not referenced by the expressions defined in this cell


def cd1(t, T, Es):
    """Return AD1 * t * exp(-(EAD1 + Es) / T)."""
    exponent = -(EAD1 + Es) / T
    return AD1 * t * np.exp(exponent)


def cd2(t, T, Es):
    """Return AD2 * t * exp(-(EAD2 + Es) / T)."""
    exponent = -(EAD2 + Es) / T
    return AD2 * t * np.exp(exponent)


def Es1(pc1, pc2):
    """Return -50*pc1 + 0.009*pc2**2 - 0.005*pc1**3."""
    return -50*pc1 + 0.009*pc2**2 - 0.005*pc1**3


def Es2(pc1, pc2):
    """Return 10 * (-0.001*pc1 - exp(pc2))."""
    return (-0.001*pc1 - np.exp(pc2))*10

def experiment(solvent_cas, random_state=None):
    """Simulate one noisy run of the test problem for a single solvent.

    The first two data columns of the solvent's row in ``solvent_ds_pcs``
    are passed to ``Es1``/``Es2``; ``T`` and ``t`` are drawn from
    ``random_state`` and fed, together with those values, into
    ``cd1``/``cd2``.

    Parameters
    ----------
    solvent_cas
        Index label of the solvent's row in ``solvent_ds_pcs``.
    random_state : optional
        Random generator exposing ``randn`` (e.g. ``np.random.RandomState``).
        When omitted, a new unseeded ``np.random.RandomState`` is created
        for the call. A ``None`` sentinel is used because a generator built
        in the signature would be created once at definition time and then
        silently shared by every call.

    Returns
    -------
    numpy.ndarray
        Two values: ``[conversion, de * 100]``.
    """
    if random_state is None:
        random_state = np.random.RandomState()
    pc_solvent = solvent_ds_pcs.loc[solvent_cas][solvent_ds_pcs.data_columns].to_numpy()
    es1 = Es1(pc_solvent[0], pc_solvent[1])
    es2 = Es2(pc_solvent[0], pc_solvent[1])
    # randn(1) returns length-1 arrays, hence the [0] indexing on the results
    T = 5 * random_state.randn(1) + 393
    t = 0.1 * random_state.randn(1) + 7
    exper_cd1 = cd1(t, T, es1)
    exper_cd2 = cd2(t, T, es2)
    conversion = exper_cd1 + exper_cd2
    de = abs(exper_cd1-exper_cd2)/(exper_cd1 +exper_cd2)
    return np.array([conversion[0], de[0]*100])

In [5]:
# Optimization domain: one descriptors-based input and two continuous outputs
solvent_var = DescriptorsVariable(
    name='solvent',
    description='solvent for the borrowing hydrogen reaction',
    ds=solvent_ds_pcs,
)
conversion_var = ContinuousVariable(
    name='conversion',
    description='relative conversion to triphenylphosphine oxide determined by LCMS',
    bounds=[0, 100],
    is_output=True,
)
de_var = ContinuousVariable(
    name='de',
    description='diastereomeric excess determined by ratio of LCMS peaks',
    bounds=[0, 100],
    is_output=True,
)

# Add the variables in the same order: solvent, conversion, de
domain = Domain()
for domain_variable in (solvent_var, conversion_var, de_var):
    domain += domain_variable
domain

0,1,2,3
Name,Type,Description,Values
solvent,"descriptors, input",solvent for the borrowing hydrogen reaction,80 examples of 3 descriptors
conversion,"continuous, output",relative conversion to triphenylphosphine oxide determined by LCMS,"[0,100]"
de,"continuous, output",diastereomeric excess determined by ratio of LCMS peaks,"[0,100]"


In [6]:
# Initial design: 8 experiments from a LatinDesigner driven by a seeded generator
random_state = np.random.RandomState(seed=1000)
lhs = LatinDesigner(domain, random_state)
num_initial_experiments = 8
initial_design = lhs.generate_experiments(num_initial_experiments)
initial_design

Unnamed: 0,cas_number
0,141-78-6
1,107-21-1
2,76-05-1
3,96-49-1
4,75-52-5
5,105-58-8
6,100-51-6
7,111-87-5


In [7]:
# Initial experiments: simulate one run for every solvent in the initial design
random_state = np.random.RandomState(seed=1000)
design_frame = initial_design.to_frame()

# Run the simulated experiment row by row, sharing one seeded generator
outcome_rows = []
for cas in design_frame['cas_number']:
    outcome_rows.append(experiment(cas, random_state))
outcome_df = pd.DataFrame(outcome_rows, columns=['conversion', 'de'])
outcome_ds = DataSet.from_df(outcome_df)

# Attach the solvent identifier to each result via the shared integer index
design_df = design_frame.rename(index=int, columns={'cas_number': 'solvent'})
design_ds = DataSet.from_df(design_df)
initial_experiments = outcome_ds.merge(design_ds, left_index=True, right_index=True)
initial_experiments

Unnamed: 0,conversion,de,solvent
0,22.349157,62.676888,141-78-6
1,40.803339,79.004945,107-21-1
2,31.622528,73.791094,76-05-1
3,42.847986,72.092186,96-49-1
4,34.900218,76.412596,75-52-5
5,20.627487,57.66712,105-58-8
6,26.382769,60.097645,100-51-6
7,20.934565,48.885554,111-87-5


In [27]:
# Run the optimization with two GP models, each with its own ARD Matern 5/2 kernel
num_continuous = domain.num_continuous_dimensions()
num_discrete = domain.num_discrete_variables()
input_dim = num_continuous + num_discrete

kernels = [GPy.kern.Matern52(input_dim=input_dim, ARD=True) for _ in range(2)]
models = [GPyModel(kernel=kernel) for kernel in kernels]

# Alternative setup (requires the commented-out imports):
# tsemo = TSEMO(domain, models, acquisition=HV(), optimizer=EnumerationOptimizer())
tsemo = TSEMO(domain, models)
design = tsemo.generate_experiments(
    initial_experiments,
    8,
    normalize=False,
    num_spectral_samples=500,
)

In [28]:
# Display the result returned by tsemo.generate_experiments
design

Unnamed: 0_level_0,stenutz_name,cosmo_name,chemical_formula,solvent_class,solvent_name,PC_0,PC_1,PC_2
cas_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
123-25-1,diethyl butanedioate,ethylsuccinate,C8H14O4,Ether,Dimetyl succinate,-1.949775,4.843216,-0.31796
126-33-0,sulfolane,"tetrahydrothiophene-1,1-dioxide",C4H8O2S,Dipolar aprotic,Sulfolane,3.556021,4.48936,-1.491517
102-76-1,triacetin,glycerol-triacetate,C9H14O6,Ester,Glycol triacetate,-1.09089,7.36731,-0.289705
540-84-1,"2,2,4-trimethylpentane","2,2,4-trimethylpentane",C8H18,Alkane,Isooctane,-5.249607,-0.72305,1.303672
7226-23-5,"1,3-dimethyltetrahydropyrimidin-2(1H)-one","1,3-dimethyltetrahydropyrimidin-2(1h)-one",C6H12N2O,Dipolar aprotic,Dimethyl phosphonyl urea (DMPU),0.420531,3.248744,-0.349683
56-81-5,"1,2,3-propanetriol",glycerol,C3H8O3,Alcohol,Glycerol,5.691078,3.500596,4.144049
111-70-6,heptanol,1-heptanol,C7H16O,Alcohol,1-Heptanol,-1.062291,1.485131,2.298888
108-32-7,"4-methyl-2-oxo-1,3-dioxolane",propylenecarbonate,C4H6O3,Dipolar aprotic,Propylene carbonate,3.253236,2.655611,-1.765168
