# Import Packages and Set Parameters

In [1]:
# Packages for modeling

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF
from sklearn.gaussian_process.kernels import Matern
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C ,WhiteKernel as Wht,Matern as matk
from tqdm import tqdm
from scipy.stats import norm



In [2]:
# Uncertainty Weighting

# epsilon (default 0): additive allowance on the minimum Expected Improvement that will
# be recommended. Increasing it means EI can be lower and still recommend points, which
# favors exploration.
epsilon = 0

# weight (default 1): multiplicative weight on the prediction-uncertainty term of the
# expected improvement.
weight = 1

# Desired number of recommendations.
Num_of_recs = 5

# Data Creation

In [None]:
# Read in data here. Add more data below and concatenate in the next cell as needed.

def _read_experiment(path):
    """Load one experimental-data workbook into a DataFrame using the openpyxl engine."""
    return pd.read_excel(path, engine='openpyxl')


# One DataFrame per experimental campaign, in chronological order.
data_in_0 = _read_experiment('../Data/Experimental Data/Initial.xlsx')
data_in_1 = _read_experiment('../Data/Experimental Data/1st_iteration_CEs.xlsx')
data_in_2 = _read_experiment('../Data/Experimental Data/2nd_iteration_CEs.xlsx')
data_in_3 = _read_experiment('../Data/Experimental Data/3rd_iteration_CE.xlsx')
data_in_4 = _read_experiment('../Data/Experimental Data/4th_iteration_CE.xlsx')
data_in_5 = _read_experiment('../Data/Experimental Data/5th_iteration_CE.xlsx')
data_in_6 = _read_experiment('../Data/Experimental Data/6th_iteration_CE.xlsx')
#data_in_7 = _read_experiment('../Data/Experimental Data/7th_iteration_CE.xlsx')

#all_exptl_data = pd.read_excel('Data/exptl_data_241217.xlsx', engine='openpyxl')
#print(data_in_alltest)


In [25]:
# Columns kept from every input workbook: experiment number, measured CE, and the
# ten solvent fractions.
gaus_columns = ['#', '<CE> (%)', 'DME', 'DEGDME', 'TEGDME', 'CH3DME', 'THF', 'MTHF', 'THP', 'diethyl ether', 'DEE', 'DOL']


def _complete_rows(frame):
    """Select the modeling columns and drop any row with incomplete data in them."""
    return frame[gaus_columns].dropna()


Gaus_data_0 = _complete_rows(data_in_0)
#Gaus_data = Gaus_data_0 # Comment this out if you have more than 1 input file
Gaus_data_1 = _complete_rows(data_in_1) # Uncomment to add more data.
Gaus_data_2 = _complete_rows(data_in_2)
Gaus_data_3 = _complete_rows(data_in_3)
Gaus_data_4 = _complete_rows(data_in_4)
Gaus_data_5 = _complete_rows(data_in_5)
Gaus_data_6 = _complete_rows(data_in_6)
#Gaus_data_7 = _complete_rows(data_in_7)

# Combine multiple data files into 1 frame with a fresh 0..n-1 index.
#Gaus_data = pd.concat((Gaus_data_0, Gaus_data_1)).reset_index(drop=True)
gaus_parts = (Gaus_data_0, Gaus_data_1, Gaus_data_2, Gaus_data_3, Gaus_data_4, Gaus_data_5, Gaus_data_6)
Gaus_data = pd.concat(gaus_parts).reset_index(drop=True)

# print(Gaus_data) # Uncomment to see data

# Log scaling: replace CE (%) with log10(1 / (1 - CE/100)).
ce_fraction = Gaus_data['<CE> (%)']/100
Gaus_data['<CE> (%)'] = np.log10(np.reciprocal(1 - ce_fraction))


In [26]:
# Read in Data Space

# Both grids are read with openpyxl and use the first spreadsheet column as the row index.
_space_opts = {'engine': 'openpyxl', 'index_col': 0}

# 0.05 mole-interval grid, used for plotting.
Plotting_points = pd.read_excel('../Data/Data Space/0.05 Mole Interval.xlsx', **_space_opts)
# 0.1 mole-interval grid, used for recommendations.
Rec_points = pd.read_excel('../Data/Data Space/0.1 Mole Interval.xlsx', **_space_opts)

In [27]:
# X_scale is fitted once on the plotting grid and then reused, unchanged, for the
# recommendation grid and the collected data. Y_scale is fitted on the regression target.
X_scale = StandardScaler()
Y_scale = StandardScaler()

# The ten solvent-fraction columns that form the GP input.
solvent_columns = ['DME', 'DEGDME', 'TEGDME', 'CH3DME', 'THF', 'MTHF', 'THP', 'diethyl ether', 'DEE', 'DOL']

# Scale data space for GP modeling
Plotting_points_scaled = X_scale.fit_transform(Plotting_points)
Rec_points_scaled = X_scale.transform(Rec_points)

# Collected data without CE %, and the same data transformed for GP modeling
X_unscaled = Gaus_data[solvent_columns]
X = X_scale.transform(X_unscaled)

# Scale for GP regression target (kept 2-D: one column)
Y = Y_scale.fit_transform(Gaus_data[['<CE> (%)']])

# GP Model Creation

In [28]:
# Isotropic Matern + white-noise kernel. It is constructed here but the model below
# is fitted with kernel_lit instead.
Kernel = 1 * Matern(length_scale=1, length_scale_bounds=(0.1, 2), nu=1.5) + Wht(noise_level=1.0, noise_level_bounds=(1e-6, 1000))

# kernel_lit = constant amplitude * Matern(nu=1.5) with one length scale per input
# column (10 entries, matching the ten solvent features) + white noise.
amplitude_term = C(constant_value=1.0, constant_value_bounds=(1e-3, 1e3))
matern_term = matk(length_scale=[1.0]*10, length_scale_bounds=[[1e-12, 1e8]]*10, nu=1.5)
noise_term = Wht(noise_level=1.0, noise_level_bounds=(1e-6, 1e3))
kernel_lit = amplitude_term * matern_term + noise_term

# 30 optimizer restarts; random_state is fixed so the fit is repeatable.
Gaus_model = GaussianProcessRegressor(
    kernel=kernel_lit,
    n_restarts_optimizer=30,
    random_state=15,
)
Gaus_model.fit(X, Y)

# Predicted mean and standard deviation (in scaled target units) on both grids.
Pred, Std = Gaus_model.predict(Plotting_points_scaled, return_std=True)
Pred_Next, Std_Next = Gaus_model.predict(Rec_points_scaled, return_std=True)



In [29]:
# Model mean and standard deviation at the training inputs themselves.
Pred_x, Std_x = Gaus_model.predict(X, return_std=True)

# Best (largest) observed log-scaled CE, mapped into the scaled target space used by
# the GP. The value is reshaped to (1, 1) because the scaler expects a 2-D input.
best_observed = Gaus_data['<CE> (%)'].to_numpy().max()
ybest = Y_scale.transform(np.reshape(best_observed, (-1, 1)))



In [30]:
# Expected Improvement (EI) on the plotting grid and on the recommendation grid.

def _expected_improvement(pred, std, best, xi, std_weight):
    """Return ``(z, ei)`` for a set of GP predictions.

    Parameters
    ----------
    pred, std : array-like
        Predicted mean and standard deviation. Both are flattened to 1-D so that a
        column-shaped mean cannot broadcast against ``std`` into an (n, n) matrix.
    best : scalar or array broadcastable against ``pred``
        Incumbent (best observed) target value, in the same units as ``pred``.
    xi : float
        Additive allowance subtracted from the improvement (the notebook's ``epsilon``).
    std_weight : float
        Multiplicative weight on the uncertainty term (the notebook's ``weight``).

    Returns
    -------
    z : ndarray
        Standardized improvement ``(pred - best - xi) / std``.
    ei : ndarray
        ``(pred - best - xi) * cdf(z) + std * pdf(z) * std_weight``, forced to 0
        wherever ``std`` is not strictly positive (where ``z`` is undefined).
    """
    pred = np.ravel(pred)
    std = np.ravel(std)
    improvement = pred - best - xi
    # A zero std yields inf/nan here; those entries are overwritten with 0 below.
    with np.errstate(divide='ignore', invalid='ignore'):
        z = improvement / std
    ei = improvement * norm.cdf(z) + std * norm.pdf(z) * std_weight
    # Element-wise guard. The previous loop tested Std < 0 (never true) and iterated
    # over the first axis of expI, which has length 1 when `best` has shape (1, 1).
    return z, np.where(std > 0, ei, 0.0)


zzval, expI = _expected_improvement(Pred, Std, ybest, epsilon, weight)

# epsilon is now applied to zzval_Next as well, so both grids use the same definition.
zzval_Next, expI_Next = _expected_improvement(Pred_Next, Std_Next, ybest, epsilon, weight)

In [31]:
# Names of the three prediction columns appended to each grid.
prediction_columns = ['Predicted CE', 'Log Predicted CE StD', 'Expected Improvement']


def _as_column(values):
    """Return ``values`` as an (n, 1) array, whatever 1-D/2-D shape it arrived in."""
    return np.asarray(values).reshape(-1, 1)


def _unscale_predictions(pred_scaled, std_scaled):
    """Convert scaled GP output to (log-scale mean, CE in percent, log-scale std), all 1-D."""
    log_ce = Y_scale.inverse_transform(_as_column(pred_scaled))
    # Inverts the log10(1 / (1 - CE/100)) scaling applied to the training target.
    ce_percent = (1 - np.reciprocal(10**log_ce))*100
    # inverse_transform adds the target mean back; subtracting it leaves std * scale.
    log_std = Y_scale.inverse_transform(_as_column(std_scaled)) - Y_scale.mean_[0]
    return log_ce.ravel(), ce_percent.ravel(), log_std.ravel()


# Copying data, predicting, and creating a sorted dataframe for plotting points
_, plot_ce, plot_log_std = _unscale_predictions(Pred, Std)
Plotting_points_pred = Plotting_points.copy()
Plotting_points_pred['Predicted CE'] = plot_ce
Plotting_points_pred['Log Predicted CE StD'] = plot_log_std
Plotting_points_pred['Expected Improvement'] = _as_column(expI).ravel()
Plotting_points_pred_sorted = Plotting_points_pred.copy().sort_values(by='Expected Improvement', ascending=True)

# Copying data, predicting, and creating a sorted dataframe for recommending points
rec_log_ce, rec_ce, rec_log_std = _unscale_predictions(Pred_Next, Std_Next)
Rec_points_pred = Rec_points.copy()
Rec_points_pred['Predicted CE'] = rec_ce
Rec_points_pred['Log Predicted CE StD'] = rec_log_std
Rec_points_pred['Expected Improvement'] = _as_column(expI_Next).ravel()
Rec_points_pred_sorted = Rec_points_pred.copy().sort_values(by='Expected Improvement', ascending=True)

# Creating dataset for farthest point sampling if needed: the scaled recommendation
# grid, row-for-row in the same order as Rec_points_pred. Note that 'Predicted CE'
# here holds the log-scale prediction, not a percentage.
FPS_Select_Scaled = pd.DataFrame(Rec_points_scaled, columns=Rec_points.columns)
FPS_Select_Scaled['Predicted CE'] = rec_log_ce
FPS_Select_Scaled['Log Predicted CE StD'] = rec_log_std
FPS_Select_Scaled['Expected Improvement'] = _as_column(expI_Next).ravel()

# Check how many points are tied (to 3 decimals) for the maximum EI and create data frames.
# The mask is converted to a plain array so it is applied by position: Rec_points_pred
# keeps the index read from the spreadsheet while FPS_Select_Scaled has a default
# 0..n-1 index, and a label-aligned boolean Series would fail if the two differ.
rounded_ei = np.round(Rec_points_pred['Expected Improvement'], 3)
at_max_ei = (rounded_ei == np.round(Rec_points_pred['Expected Improvement'].max(), 3)).to_numpy()

Maximum_EI_df = FPS_Select_Scaled.loc[at_max_ei].copy().drop(prediction_columns, axis=1).reset_index(drop=True)
Maximum_EI_df_full = Rec_points_pred.loc[at_max_ei].copy().reset_index(drop=True)

# Print how many data points are tied for the maximum
print(Maximum_EI_df.shape[0], Maximum_EI_df_full.shape[0])

1 1


In [32]:
# Choose Num_of_recs recommendations. If more candidates are tied for the maximum EI
# than were requested, pick among the tied candidates by farthest point sampling (FPS)
# in scaled space; otherwise take the highest-EI rows of the recommendation grid.
tied_count = Maximum_EI_df.shape[0]

if not tied_count > Num_of_recs:
    Recs = (
        Rec_points_pred_sorted.iloc[-Num_of_recs:]
        .sort_values(by='Expected Improvement', ascending=False)
        .reset_index(drop=True)
    )

else:
    # Seed row for the output frame; it is sliced away again when Recs is taken below.
    top_ei = Rec_points_pred['Expected Improvement'].max()
    FPS_points_full = Rec_points_pred.loc[Rec_points_pred['Expected Improvement'] == top_ei].copy().iloc[0:1]

    # Reference set for the distances: starts as the scaled training inputs and grows
    # by one selected candidate per iteration.
    FPS_points = pd.DataFrame(X, columns=Plotting_points.columns)

    for _ in tqdm(range(0, Num_of_recs)):
        distances = np.zeros(tied_count)
        for row_pos in range(0, tied_count):
            candidate = Maximum_EI_df.iloc[row_pos].to_numpy().reshape(1,10)
            offsets = candidate - FPS_points
            # Squared distance from this candidate to its nearest reference point.
            distances[row_pos] = np.min(np.sum(np.square(offsets), axis=1))

        # The candidate farthest from everything already in the reference set wins.
        max_idx = np.argmax(distances)
        picked = slice(max_idx, max_idx+1)
        FPS_points = pd.concat((FPS_points, Maximum_EI_df.iloc[picked]), axis=0).reset_index(drop=True)
        FPS_points_full = pd.concat((FPS_points_full, Maximum_EI_df_full.iloc[picked]), axis=0).reset_index(drop=True)

    Recs = (
        FPS_points_full.reset_index(drop=True)
        .iloc[-Num_of_recs:]
        .sort_values(by='Expected Improvement', ascending=False)
        .reset_index(drop=True)
    )

In [33]:
# Export Recs; the file name records the epsilon and weight settings that produced them.
recs_path = ''.join(['../Data/Recs/Recs - ', 'epsilon ', str(epsilon), ' weight ', str(weight), '.xlsx'])
Recs.to_excel(recs_path)
# print(Recs) # Uncomment to see recommended solvents

# Export all points: once sorted by Expected Improvement, once in the unsorted
# order of the recommendation grid.
Rec_points_pred_sorted.to_excel('../Data/Recs/all_predictions_EIsorted.xlsx')
Rec_points_pred.to_excel('../Data/Recs/all_predictions_byNumber.xlsx')

In [34]:
# Generate top 3 predictions:
n_top = 3
ce_prediction_columns = ['Predicted CE', 'Log Predicted CE StD', 'Expected Improvement']

# All points sorted by predicted CE (the CE-ranked counterpart of Rec_points_pred_sorted,
# which is ranked by Expected Improvement).
CE_sortlist = Rec_points_pred.copy().sort_values(by='Predicted CE', ascending=True)

# Positional mask of the compositions tied (to 3 decimals) for the maximum predicted CE.
rounded_ce = np.round(Rec_points_pred['Predicted CE'], 3)
at_max_ce = (rounded_ce == np.round(Rec_points_pred['Predicted CE'].max(), 3)).to_numpy()

# Tied compositions in original units, with their predictions.
Maximum_CE_df_full = Rec_points_pred.loc[at_max_ce].copy().reset_index(drop=True)

# The same rows in SCALED parameter space, without prediction columns. They are taken
# from Rec_points_scaled so that the distances below compare like with like: FPS_points
# is built from the scaled training inputs X, and mixing unscaled candidates with it
# would make the farthest-point distances meaningless.
Maximum_CE_df = pd.DataFrame(Rec_points_scaled[at_max_ce], columns=Rec_points.columns)

if Maximum_CE_df.shape[0] > n_top: # if there are more than n_top tied entries, proceed with FPS

    # Seed row for the output frame; it is sliced away again when top3Recs is taken below.
    FPS_points_full = Rec_points_pred.loc[Rec_points_pred['Predicted CE'] == Rec_points_pred['Predicted CE'].max()].copy().iloc[0:1]

    # Reference set: scaled training inputs, grown by one selected candidate per iteration.
    FPS_points = pd.DataFrame(X, columns=Plotting_points.columns)

    for i in tqdm(range(0, n_top)):
        distances = np.zeros(Maximum_CE_df.shape[0])
        for j in range(0, Maximum_CE_df.shape[0]):
            # reshape(1, -1) adapts to however many feature columns the grid has.
            temp = Maximum_CE_df.iloc[j].to_numpy().reshape(1, -1) - FPS_points
            # Squared distance from candidate j to its nearest reference point.
            distances[j] = np.min(np.sum(np.square(temp), axis=1))
        max_idx = np.argmax(distances)
        FPS_points = pd.concat((FPS_points, Maximum_CE_df.iloc[max_idx:max_idx+1]), axis=0).reset_index(drop=True)

        FPS_points_full = pd.concat((FPS_points_full, Maximum_CE_df_full.iloc[max_idx:max_idx+1]), axis=0).reset_index(drop=True)

    top3Recs = FPS_points_full.reset_index(drop=True).iloc[-n_top:].sort_values(by='Predicted CE', ascending=False).reset_index(drop=True)

else:
    top3Recs = CE_sortlist.iloc[-n_top:].sort_values(by='Predicted CE', ascending=False).reset_index(drop=True)

# print(top3Recs) Uncomment to see top 3 recommendations by CE

# Export Recs
top3Recs.to_excel('../Data/Recs/top3.xlsx')