The strategy for the initial design is to choose several solvents using a Latin hypercube in PCA space and then generate a Latin hypercube design for each solvent.

In [18]:
#Import all the necessary packages
from summit.data import solvent_ds, ucb_ds, DataSet
from summit.domain import Domain, ContinuousVariable, DiscreteVariable, DescriptorsVariable
from summit.initial_design.latin_designer import LatinDesigner, lhs
from summit.initial_design.base import _closest_point_indices
from sklearn.decomposition import PCA
import numpy as np
import plotly.graph_objects as go

Create the principal components dataset.

In [2]:
# Number of principal components kept for the solvent descriptors; also used
# as the dimensionality of the Latin hypercube that selects the solvents.
NUM_COMPONENTS = 3

In [57]:
def create_pca_ds(solvent_ds, num_components):
    """Build a dataset of principal-component scores joined with the original columns.

    The standardized solvent data is reduced with PCA and the scores are
    stored in columns ``('PC_0', 'DATA')`` ... ``('PC_<n-1>', 'DATA')``.
    A copy of ``solvent_ds`` whose second column level has every code set
    to 1 is then joined onto those score columns.

    Parameters
    ----------
    solvent_ds
        Solvent dataset with two-level columns; must provide
        ``standardize()``, ``copy()`` and an ``index``.
    num_components : int
        Number of principal components to compute.

    Returns
    -------
    The principal-component columns followed by the re-labelled original
    columns, indexed like ``solvent_ds``.
    """
    # Reduce the standardized solvent descriptors to `num_components` principal components
    pca = PCA(n_components=num_components)
    solvent_arr_std = solvent_ds.standardize()
    solvent_arr_pca = pca.fit_transform(solvent_arr_std)

    # One DATA column per principal component, indexed like the input
    solvent_ds_pcs = DataSet(
        {(f'PC_{i}', 'DATA'): solvent_arr_pca[:, i] for i in range(num_components)},
        index=solvent_ds.index)
    solvent_ds_pcs.columns.names = ['NAME', 'TYPE']

    # Point every original column at code 1 of the second column level
    # (presumably this re-labels the descriptors as metadata so that only the
    # PCs count as data -- confirm against the TYPE levels of solvent_ds).
    # The number of codes must equal the number of columns, so it is taken
    # from the existing codes rather than from the count of distinct labels.
    num_columns = len(solvent_ds.columns.codes[0])
    solvent_ds_new = solvent_ds.copy()
    solvent_ds_new.columns = solvent_ds.columns.set_codes([1] * num_columns, level=1)

    return solvent_ds_pcs.join(solvent_ds_new)

# Build the principal-component dataset, align it to the index of ucb_ds and
# drop every row that contains missing values after the alignment.
solvent_ds_pcs = (
    create_pca_ds(solvent_ds, num_components=NUM_COMPONENTS)
    .reindex(ucb_ds.index)
    .dropna()
)

Now, we need to specify the optimization space.

In [104]:


# Input variables of the optimization space, in the order they are added to
# the domain: the co-catalyst choice, five continuous reaction conditions and
# the solvent (represented by the principal-component dataset).
input_variables = [
    DiscreteVariable(name="co_cat",
                     description="enumeration of the two potential cocatalysts",
                     levels=['co_cat_1', 'co_cat_2']),
    ContinuousVariable(name="catalyst_conc",
                       description="Concentration of the catalyst in mM",
                       units="mM",
                       bounds=[0.1, 10]),
    ContinuousVariable(name="cocatalyst_conc",
                       description="co-catalyst loading",
                       bounds=[15, 1500]),
    # NOTE(review): the description speaks of a molar ratio while the name and
    # units say concentration in mM -- looks like a copy-paste slip; confirm.
    ContinuousVariable(name="acrylate_concentration",
                       description="molar ratio of acrylate to amine",
                       units="mM",
                       bounds=[10, 1000]),
    ContinuousVariable(name="aldehyde_amine_ratio",
                       description="molar ratio of aldehyde to amine",
                       bounds=[0.8, 2]),
    ContinuousVariable(name='temperature',
                       description="reaction temperature",
                       bounds=[20, 50]),
    DescriptorsVariable(name="solvent",
                        description="Descriptors of the solvent",
                        ds=solvent_ds_pcs),
]

domain = Domain()
for variable in input_variables:
    domain += variable

domain #The domain should display as an html table 

0,1,2,3
Name,Type,Description,Values
co_cat,"discrete, input",enumeration of the two potential cocatalysts,2 levels
catalyst_conc,"continuous, input",Concentration of the catalyst in mM,"[0.1,10]"
cocatalyst_conc,"continuous, input",co-catalyst loading,"[15,1500]"
acrylate_concentration,"continuous, input",molar ratio of acrylate to amine,"[10,1000]"
aldehyde_amine_ratio,"continuous, input",molar ratio of aldehyde to amine,"[0.8,2]"
temperature,"continuous, input",reaction temperature,"[20,50]"


Set up the initial design. Start with the solvents.

In [76]:
# Latin hypercube sample used to pick the solvents
# (presumably 8 points in NUM_COMPONENTS dimensions, one row per point, on the
# unit cube -- TODO confirm against summit's lhs signature)
continuous_design = lhs(NUM_COMPONENTS, 8)
# Solvent data rescaled so it can be compared with the hypercube points
# (presumably zero_to_one() min-max scales the data columns to [0, 1] -- verify in DataSet)
solvent_arr_pcs = solvent_ds_pcs.zero_to_one()
# Going by the helper's name, finds for each hypercube point the indices of the
# closest solvent rows -- confirm in summit.initial_design.base
design_indices = _closest_point_indices(continuous_design, solvent_arr_pcs)
# Use the first column of indices to pull out the scaled rows of the chosen solvents
design_arr_pcs = solvent_arr_pcs[design_indices[:, 0]] 

In [93]:
# Every solvent in the scaled principal-component space, drawn faintly as background
all_solvents_plot = go.Scatter3d(x=solvent_arr_pcs[:, 0],
                                 y=solvent_arr_pcs[:, 1],
                                 z=solvent_arr_pcs[:, 2],
                                 mode='markers',
                                 marker=dict(opacity=0.2),
                                 name='All Solvents')

# One line segment per design point, joining the selected solvent to the
# hypercube point it was selected for. All segments share a legend group and
# only the first one contributes a legend entry.
lines = [go.Scatter3d(x=[solvent_pt[0], design_pt[0]],
                      y=[solvent_pt[1], design_pt[1]],
                      z=[solvent_pt[2], design_pt[2]],
                      legendgroup='connections',
                      name='Connections',
                      showlegend=(i == 0),
                      mode='lines',
                      line=dict(color='darkblue', width=2))
         for i, (solvent_pt, design_pt) in enumerate(zip(design_arr_pcs, continuous_design))]

# The raw Latin hypercube points
continuous_design_plot = go.Scatter3d(x=continuous_design[:, 0],
                                      y=continuous_design[:, 1],
                                      z=continuous_design[:, 2],
                                      mode='markers',
                                      marker=dict(opacity=1.0, color='lightblue'),
                                      name='LHS Design Points')

# The solvents that were selected for the initial design
design_solvents_plot = go.Scatter3d(x=design_arr_pcs[:, 0],
                                    y=design_arr_pcs[:, 1],
                                    z=design_arr_pcs[:, 2],
                                    mode='markers',
                                    marker=dict(color='darkblue'),
                                    name='Initial Design Solvents')

fig = go.Figure([all_solvents_plot, continuous_design_plot, design_solvents_plot] + lines)

# Tight layout. 3D traces are drawn in `layout.scene`, so the axis titles are
# set there; the top-level `xaxis` only applies to 2D cartesian plots.
fig.update_layout(margin=dict(l=0, r=0, b=0, t=0),
                  title=dict(text='Connection between Continuous Points and Initial Design',
                             y=0.95),
                  scene=dict(xaxis=dict(title=dict(text='PC_0')),
                             yaxis=dict(title=dict(text='PC_1')),
                             zaxis=dict(title=dict(text='PC_2'))))
fig.show()

In [116]:
# Table of the solvents chosen for the initial design: name plus
# principal-component scores. The PC column names are derived from
# NUM_COMPONENTS so they always match the columns create_pca_ds produced.
pc_columns = [f'PC_{i}' for i in range(NUM_COMPONENTS)]
listed = solvent_ds_pcs.iloc[design_indices[:, 0]][['stenutz_name'] + pc_columns]
listed.to_csv('outputs/solvents.csv')
listed

Unnamed: 0_level_0,stenutz_name,PC_0,PC_1,PC_2
cas_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
100-51-6,benzyl alcohol,-0.53317,2.017986,0.633553
67-64-1,2-propanone,-3.51016,-1.058576,-1.611283
107-21-1,ethanediol,-4.529107,2.642788,0.375805
75-52-5,nitromethane,-4.320855,1.226416,-0.027927
67-66-3,trichloromethane,-1.506288,-0.730936,2.556629
75-05-8,acetonitrile,-4.851335,0.196809,-1.286102
110-86-1,pyridine,-1.669022,0.310181,-0.195536
628-63-7,pentyl acetate,0.797974,-0.688031,-1.575729


In [115]:
# Fixed seed so the same design is produced on every run
rs = np.random.RandomState(100)
# Deliberately not named `lhs`: that would rebind the lhs() function imported
# from summit.initial_design.latin_designer and break the solvent-selection
# cell when it is re-run afterwards.
designer = LatinDesigner(domain, random_state=rs)
design = designer.generate_experiments(num_experiments=10, criterion='center', unique=False)
# Group the experiments by co-catalyst before saving and displaying them
design = design.to_frame().sort_values(by=['co_cat'])
design.to_csv('outputs/continuous.csv')
design

Unnamed: 0,co_cat,catalyst_conc,cocatalyst_conc,acrylate_concentration,aldehyde_amine_ratio,temperature
1,co_cat_1,6.535,237.75,851.5,1.7,39.5
3,co_cat_1,1.585,89.25,554.5,1.94,45.5
5,co_cat_1,8.515,386.25,950.5,1.82,42.5
7,co_cat_1,7.525,1128.75,257.5,1.58,36.5
0,co_cat_2,5.545,534.75,455.5,0.86,30.5
2,co_cat_2,0.595,1425.75,356.5,0.98,33.5
4,co_cat_2,9.505,1277.25,59.5,1.22,27.5
6,co_cat_2,2.575,980.25,653.5,1.1,24.5
8,co_cat_2,4.555,831.75,158.5,1.34,48.5
9,co_cat_2,3.565,683.25,752.5,1.46,21.5
