# Initial Design

In [1]:
# Reload imported modules automatically before each cell runs, so edits to the
# summit package are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from summit.data import solvent_ds, ucb_list, DataSet
from summit.domain import Domain, DescriptorsVariable
from summit.experiment_design import LatinDesign
import pandas as pd
import numpy as np

## 1. Merge datasets

The relative solubilities (log<sub>10</sub> mole fraction) of potassium hydroxide in the solvents in the database were calculated using COSMO-RS and are provided in [solubilities.csv](solubilities.csv). These solubilities now need to be integrated into the database.

Additionally, Paul Deutsche from UCB Pharma has given us a subset of pharmaceutically relevant solvents that we will use for the optimization.

In [12]:
# Preview the first five solvents in the descriptor database
solvent_ds.head(n=5)

Unnamed: 0_level_0,stenutz_name,cosmo_name,chemical_formula,molecular_weight,density,molar_volume,refractive_index,molecular_refractive_power,dielectric_constant,dipole_moment,...,boiling_point,viscosity,partition_coefficient,vapour_pressure,henry_constant,sigma_1,sigma_2,sigma_3,sigma_4,sigma_5
cas_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
98-07-7,(trichloromethyl)benzene,(trichloromethyl)-benzene,C7H5Cl3,195,1.173,166.6,1.557,53.67,6.9,2.14,...,220,1.85,3.68,0.2,2015.87,0.2054,18.7984,0.0,0.0,3.2973
111-78-4,"(1Z,5Z)-cycloocta-1,5-diene","1,5-cyclooctadiene",C8H12,108,0.88,122.9,1.493,35.73,2.38,0.17,...,148,1.0,2.98,8.87,1884.53,0.0,15.2165,0.4711,0.0,0.9742
5194-51-4,"(2E,4E)-2,4-hexadiene","trans,trans-2,4-hexadiene",C6H10,82,0.727,112.9,1.456,30.7,2.22,0.36,...,79,0.37,3.05,225.18,2089.16,0.0,14.9023,0.0451,0.0,1.3634
98-87-3,(dichloromethyl)benzene,(dichloromethyl)-benzene,C7H6Cl2,161,1.25,128.8,1.55,41.04,6.9,2.05,...,205,1.99,2.94,0.24,2235.31,0.8684,16.7703,0.0,0.0081,3.8876
2004-70-8,"(E)-1,3-pentadiene",1-trans-3-pentadiene,C5H8,68,0.683,99.7,1.43,25.77,2.32,0.68,...,42,0.3,2.47,702.08,2326.13,0.0,12.8734,0.0397,0.0,1.8346


In [13]:
# Load the solubility table, index it by CAS number (the same key the solvent
# database is indexed by), and wrap it in a summit DataSet.
solubilities = DataSet.from_df(
    pd.read_csv('solubilities.csv').set_index('cas_number')
)
solubilities.head(5)

Unnamed: 0_level_0,solubility
cas_number,Unnamed: 1_level_1
98-07-7,-5.34825
111-78-4,-6.310797
5194-51-4,-6.592425
98-87-3,-0.808447
2004-70-8,-6.135296


In [7]:
# Not all of the solvents on the UCB list are in the database, so we take the subset that are.
# Membership is tested directly against the database index (CAS numbers) rather than
# by attempting a row lookup and catching KeyError; the order of ucb_list is preserved.
known_cas_numbers = solvent_ds.index
missed_cas_numbers = [cas for cas in ucb_list if cas not in known_cas_numbers]
successful_cas_numbers = np.array(
    [cas for cas in ucb_list if cas in known_cas_numbers]
)
print(f"{len(missed_cas_numbers)} out of {len(ucb_list)} solvents from the UCB list are not in our solvent database.")

35 out of 115 solvents from the UCB list are not in our solvent database.


In [16]:
# Attach the solubility column to the solvent database (joined on the CAS-number
# index), then keep only the UCB solvents that were found in the database.
solvent_ds_new = solvent_ds.join(solubilities).loc[successful_cas_numbers, :]
print(f"{len(solvent_ds_new)} solvents for optimization")
solvent_ds_new.head(5)  # preview the first 5 rows of the merged dataset

80 solvents for optimization


Unnamed: 0_level_0,stenutz_name,cosmo_name,chemical_formula,molecular_weight,density,molar_volume,refractive_index,molecular_refractive_power,dielectric_constant,dipole_moment,...,viscosity,partition_coefficient,vapour_pressure,henry_constant,sigma_1,sigma_2,sigma_3,sigma_4,sigma_5,solubility
cas_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
76-05-1,trifluoroacetic acid,trifluoroaceticacid,C2HF3O2,114,1.489,76.6,1.285,13.66,8.55,2.26,...,0.91,1.37,106.21,3442.8,1.2863,9.5322,0.5483,0.9011,1.2518,0.0
108-24-7,acetic anhydride,aceticanhydride,C4H6O3,102,1.082,94.4,1.39,22.37,20.0,2.8,...,0.78,0.35,1.6,3069.35,0.3781,11.2705,2.1978,0.0,4.3632,-1.845329
64-18-6,formic acid,formicacid,CH2O2,46,1.22,37.7,1.37,8.53,58.5,1.42,...,2.9,-0.9,17.02,7808.98,1.5164,4.0381,1.6768,0.7389,1.5975,0.0
79-09-4,propanoic acid,propionicacid,C3H6O2,74,0.99,74.8,1.386,17.58,3.1,1.76,...,3.92,0.17,4.44,3383.22,0.9013,8.4421,1.7836,0.6082,1.8287,0.0
107-21-1,ethanediol,glycol,C2H6O2,62,1.115,55.7,1.429,14.35,37.7,2.27,...,13.55,-1.34,0.22,4546.93,1.2765,6.4309,2.0458,0.5717,2.6165,0.0


In [9]:
# Sanity check: locate any missing descriptor values in the merged dataset.
# An empty pair of index arrays means no NaNs are present.
values = solvent_ds_new.descriptors_to_numpy()
values = values.astype(np.float64)
check = np.isnan(values)  # boolean mask, True where a value is NaN
np.where(check)

(array([], dtype=int64), array([], dtype=int64))

## 2. Design Space

We now need to set up the design space, which is just the descriptors for the solvents.

In [10]:
# The design space has a single variable: the solvent, described by the
# descriptor columns of the merged dataset.
solvent_variable = DescriptorsVariable(
    name='solvent',
    description='solvent for the borrowing hydrogen reaction',
    ds=solvent_ds_new,
)
domain = Domain()
domain += solvent_variable
domain

0,1,2,3
Name,Type,Description,Values
solvent,descriptors,solvent for the borrowing hydrogen reaction,80 examples of 19 descriptors


## 3. Initial Design

In [11]:
# Fixed RandomState so the Latin design below is generated from the same seed on every run
seed = np.random.RandomState(100)
lhs = LatinDesign(domain, random_state=seed)
experiments = lhs.generate_experiments(10)

# Row positions in solvent_ds_new of the solvent chosen for each experiment
indices = experiments.get_indices('solvent')[:,0]
# NOTE(review): the recorded output lists 107-21-1 (ethanediol) twice, so the design
# can pick the same solvent more than once -- confirm whether duplicates are acceptable.
solvent_ds_new.iloc[indices, :][solvent_ds_new.metadata_columns]

Unnamed: 0_level_0,stenutz_name,cosmo_name,chemical_formula
cas_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
75-52-5,nitromethane,nitromethane,CH3NO2
108-24-7,acetic anhydride,aceticanhydride,C4H6O3
98-08-8,trifluoromethylbenzene,trifluoromethylbenzene,C7H5F3
108-32-7,"4-methyl-2-oxo-1,3-dioxolane",propylenecarbonate,C4H6O3
120-92-3,cyclopentanone,cyclopentanone,C5H8O
107-06-2,"1,2-dichloroethane","1,2-dichloroethane",C2H4Cl2
107-21-1,ethanediol,glycol,C2H6O2
100-51-6,benzyl alcohol,benzylalcohol,C7H8O
57-55-6,"1,2-propanediol",propyleneglycol,C3H8O2
107-21-1,ethanediol,glycol,C2H6O2
