sustainable-processes · marcosfelt · Jul 13, 2020 · Jun 22, 2020 · Jun 23, 2020 · Jun 23, 2020
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -22,6 +22,8 @@ tqdm = "^4.46.1"
 ipywidgets = "^7.5.1"
 gpyopt = "^1.2.6"
 matplotlib = "^3.2.2"
+torch = "1.4.0"
+blitz-bayesian-pytorch = "^0.2.3"
 
 [tool.poetry.dev-dependencies]
 pytest = "^3.0"

diff --git a/summit/benchmarks/__init__.py b/summit/benchmarks/__init__.py
@@ -1,2 +1,3 @@
 from .snar import SnarBenchmark
 from .test_functions import Himmelblau, Hartmann3D
+from .reizman_suzuki_emulator import ReizmanSuzukiEmulator
diff --git a/summit/benchmarks/experiment_emulator/__init__.py b/summit/benchmarks/experiment_emulator/__init__.py
@@ -0,0 +1,2 @@
+# Append the directory that contains this file to sys.path.
+# NOTE(review): presumably this lets modules in this folder import each other by
+# bare name (bnn.py does `from experimental_datasets import ...`) - confirm.
+import os, sys
+sys.path.append(os.path.dirname(os.path.realpath(__file__)))
diff --git a/summit/benchmarks/experiment_emulator/bnn.py b/summit/benchmarks/experiment_emulator/bnn.py
@@ -0,0 +1,223 @@
+import os
+import os.path as osp
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.optim as optim
+import numpy as np
+import csv
+import matplotlib.pyplot as plt
+
+from blitz.modules import BayesianLinear
+from blitz.utils import variational_estimator
+
+from sklearn.model_selection import train_test_split
+
+from experimental_datasets import load_reizman_suzuki
+#=======================================================================
+
+# set dataset
+dataset_name = "reizman_suzuki"
+case = 2
+target = "TON"
+X, y = load_reizman_suzuki(return_X_y=True, case=case)
+
+# set hyperparameters
+epochs = 300
+initial_lr = 0.001
+early_stopping_epochs = 20
+
+# adapt target (only if multiple targets), comment out for single-objective dataset
+target_dim = 0
+if target == "TON":
+    target_dim = 0
+elif target == "yield":
+    target_dim = 1
+
+y = y[:,target_dim]
+
+# adapt model name
+model_name = str(dataset_name) + "_case" + str(case) + "_" + str(target)
+
+# adapt save directory
+save_path = osp.join(osp.dirname(osp.realpath(__file__)), "trained_models/BNN")
+
+print("<---- Dataset: {} case {}, Target property: {} ---->".format(dataset_name, case, target))
+
+#=======================================================================
+
+# convert categorical variables to one-hot tensors
+tmp_ligand_type = torch.tensor(X[:,0]).int()
+tmp_ligand_type = torch.unique(tmp_ligand_type, True, True)[1]
+num_types = int(tmp_ligand_type.max().item() + 1)
+tmp_ligand_type = F.one_hot(tmp_ligand_type, num_classes=num_types).to(torch.float)
+
+# standardize continuous input variables
+tmp_inp_cont = torch.tensor(X[:,1:]).float()
+inp_mean = tmp_inp_cont.mean(axis=0)
+inp_std = tmp_inp_cont.std(axis=0)
+tmp_inp_cont = (tmp_inp_cont - inp_mean) / inp_std 
+
+# X - input: concatenate one-hot caterogical variables and continuous variables
+X = torch.cat((tmp_ligand_type, tmp_inp_cont), axis=1)
+inp_dim = X.shape[1]
+
+# divide target variable by average
+y = torch.tensor(y).float()
+out_mean = y.mean(axis=0)
+y = y / out_mean 
+
+# split data into training and test set
+## random split
+#X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=.1,shuffle=True)
+## predefined split (takes the #<test_size> last points of the dataset csv file)
+test_size = 8
+X_train, X_test, y_train, y_test = X[:-test_size], X[-test_size:], y[:-test_size], y[-test_size:]
+
+ds_train = torch.utils.data.TensorDataset(X_train, y_train)
+dataloader_train = torch.utils.data.DataLoader(ds_train, batch_size=4, shuffle=True)
+
+ds_test = torch.utils.data.TensorDataset(X_test, y_test)
+dataloader_test = torch.utils.data.DataLoader(ds_test, batch_size=16, shuffle=True)
+
+print("<---- Length of train dataset: {} ---->".format(X_train.shape[0]))
+print("<---- Length of test dataset: {} ---->".format(X_test.shape[0]))
+
+#=======================================================================
+
+@variational_estimator
+class BayesianRegressor(nn.Module):
+    def __init__(self, input_dim, output_dim):
+        super().__init__()
+
+        self.blinear1 = BayesianLinear(input_dim, 24)
+        self.blinear2 = BayesianLinear(24, 24)
+        self.blinear3 = BayesianLinear(24, output_dim)
+        #self.linear = nn.Linear(24, output_dim)
+
+    def forward(self, x):
+        x = F.leaky_relu(self.blinear1(x))
+        #x = F.dropout(x, p=0.1, training=self.training)
+        x = F.leaky_relu(self.blinear2(x))
+        x = F.dropout(x, p=0.1, training=self.training)
+        x = F.relu(self.blinear3(x))
+        #x = self.linear(x)
+        y = x
+        return y.view(-1)
+
+#=======================================================================
+
+# Training of model on given dataloader
+def train(loader):
+    regressor.train()
+
+    for i, (datapoints, labels) in enumerate(loader):
+        data = datapoints.to(device)
+        optimizer.zero_grad()
+        loss = regressor.sample_elbo(inputs=datapoints.to(device),
+                           labels=labels.to(device),
+                           criterion=criterion,
+                           sample_nbr=3,
+                           complexity_cost_weight=1/X_train.shape[0])
+        loss.backward()
+        optimizer.step()
+
+
+# Evaluate model for given dataloader
+def evaluate_regression(loader):
+    regressor.eval()
+
+    mae = 0
+    for i, (datapoints, labels) in enumerate(loader):
+        data = datapoints.to(device)
+        tmp_pred_data = regressor(data) * out_mean
+        tmp_real_data = labels * out_mean
+        mae += (tmp_pred_data - tmp_real_data).abs().mean()
+
+    return mae
+
+#=======================================================================
+
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+regressor = BayesianRegressor(inp_dim, 1).to(device)
+optimizer = optim.Adam(regressor.parameters(), lr=initial_lr)
+scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
+    optimizer, factor=0.7, patience=3, min_lr=0.00001)
+criterion = torch.nn.MSELoss()
+
+#=======================================================================
+
+print("\n<---- Start training of BNN model ---->\n")
+
+max_iter_stop = early_stopping_epochs   # maximum number of consecutive iteration w/o improvement after which training is stopped
+tmp_iter_stop = 0
+best_train_mae = float("inf")
+for epoch in range(300):
+
+    lr = scheduler.optimizer.param_groups[0]["lr"]
+
+    # train model
+    train(dataloader_train)
+
+    # TODO: define stopping criterion! To use training mae is not the best way to do this (-> overfitting, usually we have an extra validation set, problem: small dataset size, cross-validation is exhaustive at this point)
+    train_mae = evaluate_regression(dataloader_train)
+    scheduler.step(train_mae)
+
+    # if prediction accuracy was improved in current epoch, reset <tmp_iter_stop> and save model
+    if best_train_mae > train_mae:
+        best_train_mae = train_mae
+        tmp_iter_stop = 0
+        save_model_weights = osp.join(save_path, model_name + "_BNN_model.pt")
+        torch.save(regressor.state_dict(), save_model_weights)
+    # if prediction accuracy was not imporved in current epoch, increase <tmp_iter_stop> and stop training if <max_iter_stop> is reached
+    else:
+        tmp_iter_stop += 1
+        if tmp_iter_stop >= max_iter_stop:
+            break
+
+    # print mean absolute error (MAE) on training set for current epoch (same for test set every 100th epoch)
+    print("   -- Epoch: {:03d}, LR: {:7f}, Train MAE: {:4f}".format(epoch, lr, train_mae))
+    if epoch%100==0:
+        test_mae = evaluate_regression(dataloader_test)
+        print("   -> Epoch: {:03d}, Test MAE: {:4f}".format(epoch, test_mae))
+
+print("\n<---- End training of BNN model ---->\n")
+
+#=======================================================================
+
+print("<---- Postprocessing ---->\n")
+
+# load final model from epoch with lowest prediction accuracy 
+regressor.load_state_dict(torch.load(osp.join(save_path, model_name + "_BNN_model.pt")))
+# freeze the model, in order to predict using only their weight distribution means
+regressor.freeze_()
+
+# get final model predictions for training and test data
+y_train_pred = regressor(X_train) * out_mean
+y_train = y_train * out_mean
+y_test_pred = regressor(X_test) * out_mean
+y_test = y_test * out_mean
+
+# Write model performance to general csv file
+path_csv = (osp.join(save_path, model_name + "_exp_pred.csv"))
+with open(path_csv,"w+", newline="") as result_file:
+    wr = csv.writer(result_file, quoting=csv.QUOTE_ALL)
+    wr.writerow(["Exp", "Pred"])
+    wr.writerow(["Training set"])
+    for i in range(y_train.shape[0]):
+        wr.writerow([y_train[i].item(), y_train_pred[i].item()]) 
+    wr.writerow(["Test set"])
+    for i in range(y_test.shape[0]):
+        wr.writerow([y_test[i].item(), y_test_pred[i].item()]) 
+
+# Create parity plots
+plt.figure(figsize=(5,5))
+plt.plot(y_train.detach().numpy(), y_train_pred.detach().numpy(), "o")
+plt.plot(y_test.detach().numpy(), y_test_pred.detach().numpy(), "x")
+plt.xlabel("Experimental y", fontsize=16)
+plt.ylabel("Predicted y", fontsize=16)
+plt.savefig(osp.join(save_path, model_name + "_ParityPlot"))
+
+print("<---- Finished! ---->\n")
+
diff --git a/summit/benchmarks/experiment_emulator/data/README.md b/summit/benchmarks/experiment_emulator/data/README.md
@@ -0,0 +1,15 @@
+# data
+
+This folder contains experimental datasets. The datasets are used to build probabilistic models that mimic the experiments, so that the strategies within Summit can be tested on these models. This avoids the need to run a large number of experiments. However, keep in mind that these models only reproduce the experiments (with errors) and do not replace costly experiments.
+
+The "<dataset_name>_train_test.csv" files include the same data points as the original dataset files but are ordered according to training (first) and test set (second). Please refer to our paper for the numbers of data points included in the training and test set, respectively.
+
+The following experimental datasets are included so far:
+
+* **reizman_2016_suzuki - Suzuki-Miyaura cross-coupling reaction** datasets for 4 different cases - by Reizman et al. (2016)
+
+```
+Reizman, B. J., Wang, Y. M., Buchwald, S. L., & Jensen, K. F. (2016). Suzuki–Miyaura cross-coupling optimization enabled by automated feedback. Reaction Chemistry & Engineering, 1(6), 658-666.
+```
+
+
diff --git a/summit/benchmarks/experiment_emulator/data/reizman_2016_suzuki.xlsx b/summit/benchmarks/experiment_emulator/data/reizman_2016_suzuki.xlsx
diff --git a/summit/benchmarks/experiment_emulator/data/reizman_suzuki_case1.csv b/summit/benchmarks/experiment_emulator/data/reizman_suzuki_case1.csv
@@ -0,0 +1,98 @@
+96,4,,,,
+ Catalyst, tres (s), T  (ºC),Cat. Loading (mol%),TON,Yield (%)
+P1-L3,600,30,0.498,1.1,0.6
+P1-L6,600,30,2.515,0.2,0.6
+P1-L4,60,30,2.508,0.2,0.6
+P1-L1,60,30,0.513,1.1,0.6
+P1-L2,600,30,2.513,0.2,0.6
+P1-L5,60,30,0.508,1.1,0.6
+P1-L7,600,30,0.506,1.1,0.6
+P2-L1,60,30,2.509,0.2,0.6
+P2-L1,600,110,0.496,8.5,4.3
+P1-L4,600,110,0.512,84.7,43.4
+P1-L6,60,110,0.498,1.1,0.6
+P1-L1,600,110,2.509,24,60.2
+P1-L5,600,110,2.512,16.7,42
+P1-L7,60,110,2.499,33.8,84.6
+P1-L2,60,110,0.508,16.9,8.5
+P1-L3,60,110,2.489,21.8,54.4
+P1-L7,189.7,65.3,1.123,0.5,0.6
+P1-L1,189.7,65.3,1.106,22.5,24.9
+P1-L6,600,65.3,2.515,0.2,0.6
+P2-L1,189.7,65.3,2.509,8.4,21.1
+P1-L4,189.7,65.3,2.508,21.2,53.1
+P1-L5,189.7,65.3,1.106,0.5,0.6
+P1-L3,600,65.3,1.106,5.1,5.6
+P1-L2,600,65.3,1.129,6.7,7.5
+P1-L4,600,110,1.106,82.6,91.3
+P1-L1,600,110,2.509,27.4,68.6
+P1-L2,189.7,110,2.513,21.1,52.9
+P1-L6,189.7,110,1.127,6.6,7.4
+P1-L7,600,110,2.499,30,75
+P1-L3,189.7,110,2.489,21.6,53.8
+P1-L5,600,110,2.512,18.9,47.6
+P2-L1,600,110,1.131,15,17
+P1-L4,600,30,0.512,1.1,0.6
+P1-L5,600,30,0.508,1.1,0.6
+P1-L1,600,30,0.513,1.1,0.6
+P2-L1,600,30,0.496,1.1,0.6
+P1-L2,60,110,2.513,16.6,41.9
+P1-L3,60,110,2.489,31.2,77.5
+P1-L6,60,110,2.515,3.4,8.5
+P1-L7,60,110,2.499,34.2,85.4
+P1-L4,60,67.5,2.508,13.4,33.4
+P1-L5,60,66.7,2.512,0.2,0.6
+P1-L3,60,67,2.489,9.8,24.4
+P1-L7,60,66.8,2.499,1.4,3.6
+P1-L1,60,66.3,2.509,4.9,12.4
+P2-L1,60,67,2.509,4.5,11.2
+P1-L7,155.6,110,2.499,32.5,81.3
+P1-L5,109.3,110,2.482,15.8,39.1
+P2-L1,104.5,110,2.509,11.3,28.4
+P1-L1,109.2,110,2.482,29.8,74.1
+P1-L3,166.5,110,2.489,24.5,60.9
+P1-L4,60,110,0.512,20.1,10.3
+P1-L7,60,110,0.506,1.1,0.6
+P1-L3,60,110,0.498,43.7,21.7
+P1-L1,60,110,0.513,40,20.5
+P1-L4,60,110,0.512,17,8.7
+P1-L3,600,110,0.968,24.5,23.7
+P1-L1,600,110,0.971,42.6,41.4
+P1-L5,600,110,0.957,40.6,38.8
+P1-L4,600,110,1.268,72.4,91.8
+P1-L7,600,110,0.814,46.6,38
+P1-L4,161.7,110,2.104,39.9,84
+P1-L1,146.1,110,2.509,27.7,69.6
+P1-L3,185.5,110,2.489,25.2,62.8
+P1-L7,176.5,110,2.499,27.5,68.6
+P1-L1,60,110,2.266,37.1,83.9
+P1-L4,60,110,1.915,34.4,65.7
+P1-L7,60,110,2.303,35,80.7
+P1-L3,60,110,2.323,27.7,64.4
+P1-L4,600,110,2.508,30.8,82.1
+P1-L4,600,110,2.508,31.9,80
+P1-L4,600,67.3,1.214,24.4,29.6
+P1-L4,600,68.7,1.268,16.1,20.4
+P1-L4,199.3,110,1.241,57.8,71.6
+P1-L1,202.8,110,1.592,44.6,70.9
+P1-L4,600,110,1.16,65.1,75.5
+P1-L4,600,110,1.16,71.4,82.7
+P1-L4,600,110,1.106,65.3,72.2
+P1-L4,600,110,1.106,83.9,92.8
+P1-L4,600,110,1.187,60.7,72.1
+P1-L4,600,110,1.106,73,80.7
+P1-L4,600,66.3,0.998,20.2,20.1
+P1-L4,600,67.6,0.998,18,17.9
+P1-L4,600,110,1.241,70.5,87.5
+P1-L4,600,110,1.268,63.2,80.2
+P1-L4,600,110,1.187,65.8,78.1
+P1-L4,189.1,110,2.508,39.4,98.7
+P1-L4,600,110,1.268,65.6,83.1
+P1-L4,199.8,110,2.508,33.5,83.9
+P1-L4,199.6,110,2.508,33.2,83.2
+P1-L4,600,110,1.241,62,76.9
+P1-L4,600,110,1.025,73.3,75
+P1-L4,600,110,1.079,81.2,87.6
+P1-L4,600,110,1.133,67,75.9
+P1-L4,600,110,1.052,65.4,68.7
+P1-L4,600,110,1.106,71.2,78.7