# Test Uni-MOl repre

In [None]:
import numpy as np
from matplotlib import pyplot as plt
import matplotlib.patches as mpatches
import pandas as pd
import torch
import torch.nn as nn

# Import relevant scikit-learn modules
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import LinearSVR
from sklearn.ensemble import AdaBoostRegressor
from scipy import stats
from sklearn.decomposition import PCA

import os

from utils import train_and_evaluate_mlp_multiple_splits
import random
import utils

# 19 science

In [7]:
# Load Uni_mol repres of 19_science data
np.seterr(divide='ignore',invalid='ignore')

# Define path and csv files
DATA_DIR = './data/19_science/'
# OUT_DIR = 'out/models_unimol_infer'+datetime.now().strftime('%y%m%d%H%M')+'/'
OUT_DIR = 'out/'
if not os.path.exists(OUT_DIR):
    os.makedirs(OUT_DIR)
# 分子特征
INPUTS_Catlyst_repr = 'Catalyst_smirepr.csv'  # Unscaled  data 
INPUTS_Imine_repr = 'Imine_smirepr.csv'
INPUTS_Thiol_repr = 'Thiol_smirepr.csv'
INPUTS_Origin_DF = '19_science_total.csv'

inputs_Catlyst_repr = pd.read_csv(DATA_DIR + INPUTS_Catlyst_repr)
inputs_Imine_repr = pd.read_csv(DATA_DIR + INPUTS_Imine_repr)
inputs_Thiol_repr = pd.read_csv(DATA_DIR + INPUTS_Thiol_repr)
delta_G = pd.read_csv(DATA_DIR + INPUTS_Origin_DF)['Output']

inputs = np.concatenate([inputs_Catlyst_repr,inputs_Imine_repr,inputs_Thiol_repr],axis=1)


In [26]:
# 数据划分

# 是否读取之前的划分
new_split = 1

if new_split == 1:
    # 随机生成train和test的indices
    total_cleaned_len = len(inputs_Catlyst_repr)
    train_index = random.sample(range(total_cleaned_len), 600)#论文中用了600个作为训练集 int(1075*0.7)
    train_indices = np.zeros((total_cleaned_len)).astype(np.bool_)
    for tcl in range(total_cleaned_len):
        if tcl in train_index:
            train_indices[tcl] = True
    test_indices = ~ train_indices
    np.savetxt(OUT_DIR + '19_data_train_indices.csv', train_indices, delimiter = ',')
    np.savetxt(OUT_DIR + '19_data_test_indices.csv', test_indices, delimiter = ',')

elif new_split == 0:
    # read saved train indices and test indices
    train_indices = np.loadtxt(OUT_DIR + 'clean_data_train_indices.csv',dtype='bool', delimiter=',')
    test_indices = np.loadtxt(OUT_DIR + 'clean_data_test_indices.csv',dtype='bool', delimiter=',')

    # Load yield data
    delta2G = np.array(delta_G)
    delta2G = delta2G.flatten()
    delta2G = np.nan_to_num(delta2G, nan=0)
    print('len(inputs): ', len(inputs))
    # Use the indices to generate train/test sets
    X_train = inputs[train_indices]
    #X_train = inputs[:2]
    y_train = delta2G[train_indices]
    featuresTrain = torch.from_numpy(X_train)
    targetsTrain = torch.from_numpy(y_train)
    batch_size = 32 # len(X_train)
    print('batch_size: ', batch_size)

    X_test = inputs[test_indices]
    y_test = delta2G[test_indices]
    featuresTest = torch.from_numpy(X_test)
    targetsTest = torch.from_numpy(y_test)#.type(torch.LongTensor)
    batch_size_test = len(X_test)
    print('batch_size_test: ', batch_size_test)

    train = torch.utils.data.TensorDataset(featuresTrain,targetsTrain)
    test = torch.utils.data.TensorDataset(featuresTest,targetsTest)

    train_loader = torch.utils.data.DataLoader(train, batch_size = batch_size, shuffle = False)
    test_loader = torch.utils.data.DataLoader(test, batch_size = batch_size_test, shuffle = False)

In [9]:
train_and_evaluate_mlp_multiple_splits(
    inputs=inputs,
    labels=delta_G,
    save_path='out/19_mlp_lr_0.0005_split_30_70-30.csv',
    num_splits=30,
    train_size=int(1075*0.70),
    num_epochs=1000,
    batch_size=64,
    lr=0.0001,
    patience=50
)

Using device: cuda

==== Split 1/30 ====
Early stopping at epoch 384, best MAE = 0.1447
Final for split 1: R2 = 0.9160, MAE = 0.1447

==== Split 2/30 ====
Early stopping at epoch 469, best MAE = 0.1420
Final for split 2: R2 = 0.9118, MAE = 0.1420

==== Split 3/30 ====
Early stopping at epoch 436, best MAE = 0.1441
Final for split 3: R2 = 0.9108, MAE = 0.1441

==== Split 4/30 ====
Early stopping at epoch 301, best MAE = 0.1442
Final for split 4: R2 = 0.9215, MAE = 0.1442

==== Split 5/30 ====
Early stopping at epoch 270, best MAE = 0.1500
Final for split 5: R2 = 0.9109, MAE = 0.1500

==== Split 6/30 ====
Early stopping at epoch 413, best MAE = 0.1413
Final for split 6: R2 = 0.9023, MAE = 0.1413

==== Split 7/30 ====
Early stopping at epoch 662, best MAE = 0.1474
Final for split 7: R2 = 0.9155, MAE = 0.1474

==== Split 8/30 ====
Early stopping at epoch 430, best MAE = 0.1513
Final for split 8: R2 = 0.9107, MAE = 0.1513

==== Split 9/30 ====
Early stopping at epoch 308, best MAE = 0.1472


In [41]:
# pca
pca = PCA(n_components=100)
inputs_reduced = pca.fit_transform(inputs)
train_and_evaluate_mlp_multiple_splits(
    inputs=inputs_reduced,
    labels=delta_G,
    save_path='out/19_mlp_lr_0.0005_split_10_600-475_pca.csv',
    num_splits=10,
    num_epochs=1000,
    train_size=600,
    batch_size=64,
    lr=0.0001,
    patience=50
)

Using device: cuda

==== Split 1/10 ====
Early stopping at epoch 169, best MAE = 0.1692
Final for split 1: R2 = 0.8890, MAE = 0.1692

==== Split 2/10 ====
Early stopping at epoch 167, best MAE = 0.1672
Final for split 2: R2 = 0.8936, MAE = 0.1672

==== Split 3/10 ====
Early stopping at epoch 155, best MAE = 0.1616
Final for split 3: R2 = 0.8910, MAE = 0.1616

==== Split 4/10 ====
Early stopping at epoch 169, best MAE = 0.1635
Final for split 4: R2 = 0.8886, MAE = 0.1635

==== Split 5/10 ====
Early stopping at epoch 185, best MAE = 0.1605
Final for split 5: R2 = 0.8952, MAE = 0.1605

==== Split 6/10 ====
Early stopping at epoch 141, best MAE = 0.1646
Final for split 6: R2 = 0.9017, MAE = 0.1646

==== Split 7/10 ====
Early stopping at epoch 162, best MAE = 0.1709
Final for split 7: R2 = 0.8879, MAE = 0.1709

==== Split 8/10 ====
Early stopping at epoch 163, best MAE = 0.1648
Final for split 8: R2 = 0.8911, MAE = 0.1648

==== Split 9/10 ====
Early stopping at epoch 169, best MAE = 0.1504


## 18_science

In [42]:
DATA_DIR = './data/18_science/'
# OUT_DIR = 'out/models_unimol_infer'+datetime.now().strftime('%y%m%d%H%M')+'/'
OUT_DIR = 'out/'

INPUTS_Aryl_halide_repr = 'Aryl haliderepr.csv'  # Unscaled  data 
INPUTS_Additive_repr = 'Additiverepr.csv'
INPUTS_Base_repr = 'Baserepr.csv'
INPUTS_Ligand_repr = 'Ligandrepr.csv'
INPUTS_Origin_DF = '18 science_original_chem.xlsx'

inputs_Aryl_halide_repr = pd.read_csv(DATA_DIR + INPUTS_Aryl_halide_repr)
inputs_Additive_repr = pd.read_csv(DATA_DIR + INPUTS_Additive_repr)
inputs_Base_repr = pd.read_csv(DATA_DIR + INPUTS_Base_repr)
inputs_Ligand_repr = pd.read_csv(DATA_DIR + INPUTS_Ligand_repr)
yields = pd.read_excel(DATA_DIR + INPUTS_Origin_DF)['Output']

inputs = np.concatenate([inputs_Aryl_halide_repr ,inputs_Additive_repr,inputs_Base_repr,inputs_Ligand_repr],axis=1)

In [None]:
train_and_evaluate_mlp_multiple_splits(
    inputs=inputs,
    labels=yields,
    save_path='out/18_mlp_lr_0.0005.csv',
    num_splits=30,
    train_size=int(len(inputs)*0.7),
    num_epochs=1000,
    batch_size=32,
    lr=0.0005,
    patience=60
)

Using device: cuda

==== Split 1/30 ====
Early stopping at epoch 618, best MAE = 3.8814
Final for split 1: R2 = 0.9545, MAE = 3.8814

==== Split 2/30 ====
Early stopping at epoch 499, best MAE = 4.1260
Final for split 2: R2 = 0.9564, MAE = 4.1260

==== Split 3/30 ====
Early stopping at epoch 362, best MAE = 3.9903
Final for split 3: R2 = 0.9525, MAE = 3.9903

==== Split 4/30 ====
Early stopping at epoch 409, best MAE = 3.8576
Final for split 4: R2 = 0.9573, MAE = 3.8576

==== Split 5/30 ====
Early stopping at epoch 642, best MAE = 3.7816
Final for split 5: R2 = 0.9572, MAE = 3.7816

==== Split 6/30 ====
Early stopping at epoch 437, best MAE = 3.8867
Final for split 6: R2 = 0.9590, MAE = 3.8867

==== Split 7/30 ====
Early stopping at epoch 524, best MAE = 4.0746
Final for split 7: R2 = 0.9509, MAE = 4.0746

==== Split 8/30 ====
Early stopping at epoch 502, best MAE = 3.8250
Final for split 8: R2 = 0.9587, MAE = 3.8250

==== Split 9/30 ====
Early stopping at epoch 557, best MAE = 3.6965


In [43]:
# pca
pca = PCA(n_components=100)
inputs_reduced = pca.fit_transform(inputs)
train_and_evaluate_mlp_multiple_splits(
    inputs=inputs,
    labels=yields,
    save_path='out/18_mlp_lr_0.0005_pca.csv',
    num_splits=30,
    train_size=int(len(inputs)*0.7),
    num_epochs=1000,
    batch_size=32,
    lr=0.0005,
    patience=60
)

Using device: cuda

==== Split 1/30 ====
Early stopping at epoch 500, best MAE = 3.9183
Final for split 1: R2 = 0.9565, MAE = 3.9183

==== Split 2/30 ====
Early stopping at epoch 573, best MAE = 3.8597
Final for split 2: R2 = 0.9562, MAE = 3.8597

==== Split 3/30 ====
Early stopping at epoch 485, best MAE = 4.0720
Final for split 3: R2 = 0.9514, MAE = 4.0720

==== Split 4/30 ====
Early stopping at epoch 391, best MAE = 4.3981
Final for split 4: R2 = 0.9460, MAE = 4.3981

==== Split 5/30 ====
Early stopping at epoch 704, best MAE = 4.0508
Final for split 5: R2 = 0.9532, MAE = 4.0508

==== Split 6/30 ====
Early stopping at epoch 385, best MAE = 4.2225
Final for split 6: R2 = 0.9516, MAE = 4.2225

==== Split 7/30 ====
Early stopping at epoch 530, best MAE = 3.8957
Final for split 7: R2 = 0.9556, MAE = 3.8957

==== Split 8/30 ====
Early stopping at epoch 493, best MAE = 3.9361
Final for split 8: R2 = 0.9470, MAE = 3.9361

==== Split 9/30 ====
Early stopping at epoch 551, best MAE = 3.7476


## Suzuki dataset

In [44]:
DATA_DIR = './data/suzuki/'
OUT_DIR = 'out/'
if not os.path.exists(OUT_DIR):
    os.makedirs(OUT_DIR)
# 分子特征
INPUTS_reactant1_repr = 'reactant_1repr.csv'  # Unscaled  data 
INPUTS_reactant2_repr = 'reactant_2repr.csv'
INPUTS_reagent1_repr = 'reagent_1repr.csv'
INPUTS_solvent1_repr = 'solvent_1repr.csv'
INPUTS_ligand_repr = 'ligandrepr.csv'
INPUTS_Origin_DF = 'suzuki.xlsx'

inputs_reactant1_repr = pd.read_csv(DATA_DIR + INPUTS_reactant1_repr)
inputs_reactant2_repr = pd.read_csv(DATA_DIR + INPUTS_reactant2_repr)
inputs_reagent1_repr = pd.read_csv(DATA_DIR + INPUTS_reagent1_repr)
inputs_solvent1_repr = pd.read_csv(DATA_DIR + INPUTS_solvent1_repr)
inputs_Ligand_repr = pd.read_csv(DATA_DIR + INPUTS_ligand_repr)
df_ori = pd.read_excel(DATA_DIR + INPUTS_Origin_DF)
df = df_ori.dropna(axis=0, how='any')
inputs = np.concatenate([inputs_reactant1_repr ,inputs_reactant2_repr,inputs_reagent1_repr,inputs_solvent1_repr,inputs_Ligand_repr],axis=1)
yields = df['Output']

In [21]:
train_and_evaluate_mlp_multiple_splits(
    inputs=inputs,
    labels=yields,
    save_path='out/suzuki_mlp.csv',
    num_splits=30,
    train_size=int(len(inputs)*0.7),
    num_epochs=1000,
    batch_size=64,
    lr=0.0005,
    patience=40
)

Using device: cuda

==== Split 1/30 ====
Early stopping at epoch 143, best MAE = 0.0838
Final for split 1: R2 = 0.8302, MAE = 0.0838

==== Split 2/30 ====
Early stopping at epoch 129, best MAE = 0.0760
Final for split 2: R2 = 0.8631, MAE = 0.0760

==== Split 3/30 ====
Early stopping at epoch 153, best MAE = 0.0803
Final for split 3: R2 = 0.8492, MAE = 0.0803

==== Split 4/30 ====
Early stopping at epoch 133, best MAE = 0.0754
Final for split 4: R2 = 0.8579, MAE = 0.0754

==== Split 5/30 ====
Early stopping at epoch 163, best MAE = 0.0791
Final for split 5: R2 = 0.8530, MAE = 0.0791

==== Split 6/30 ====
Early stopping at epoch 134, best MAE = 0.0804
Final for split 6: R2 = 0.8455, MAE = 0.0804

==== Split 7/30 ====
Early stopping at epoch 151, best MAE = 0.0769
Final for split 7: R2 = 0.8661, MAE = 0.0769

==== Split 8/30 ====
Early stopping at epoch 155, best MAE = 0.0788
Final for split 8: R2 = 0.8484, MAE = 0.0788

==== Split 9/30 ====
Early stopping at epoch 156, best MAE = 0.0801


In [45]:
# pca
pca = PCA(n_components=100)
inputs_reduced = pca.fit_transform(inputs)
train_and_evaluate_mlp_multiple_splits(
    inputs=inputs,
    labels=yields,
    save_path='out/suzuki_mlp_lr_0.0005_pca.csv',
    num_splits=30,
    train_size=int(len(inputs)*0.7),
    num_epochs=1000,
    batch_size=32,
    lr=0.0005,
    patience=60
)

Using device: cuda

==== Split 1/30 ====
Early stopping at epoch 204, best MAE = 0.0844
Final for split 1: R2 = 0.8192, MAE = 0.0844

==== Split 2/30 ====
Early stopping at epoch 182, best MAE = 0.0783
Final for split 2: R2 = 0.8533, MAE = 0.0783

==== Split 3/30 ====
Early stopping at epoch 201, best MAE = 0.0822
Final for split 3: R2 = 0.8253, MAE = 0.0822

==== Split 4/30 ====
Early stopping at epoch 209, best MAE = 0.0814
Final for split 4: R2 = 0.8438, MAE = 0.0814

==== Split 5/30 ====
Early stopping at epoch 216, best MAE = 0.0819
Final for split 5: R2 = 0.8385, MAE = 0.0819

==== Split 6/30 ====
Early stopping at epoch 174, best MAE = 0.0806
Final for split 6: R2 = 0.8466, MAE = 0.0806

==== Split 7/30 ====
Early stopping at epoch 211, best MAE = 0.0776
Final for split 7: R2 = 0.8510, MAE = 0.0776

==== Split 8/30 ====
Early stopping at epoch 157, best MAE = 0.0812
Final for split 8: R2 = 0.8320, MAE = 0.0812

==== Split 9/30 ====
Early stopping at epoch 183, best MAE = 0.0802


# Try with tripeptide catalyzed conjugate addition reaction

In [5]:
DATA_DIR = './data/peptide_data/'
# OUT_DIR = 'out/models_unimol_infer'+datetime.now().strftime('%y%m%d%H%M')+'/'
OUT_DIR = 'out/'

INPUTS_tripep_repr = 'peptide_SMILES.csv'  # Unscaled  data 
INPUTS_reactant1_repr = 'Reactant_1_smi.csv'
INPUTS_reactant2_repr = 'Reactant_2_smi.csv'
INPUTS_Origin_DF = 'conjugate_addition.csv'

inputs_tripep_repr = pd.read_csv(DATA_DIR + INPUTS_tripep_repr)
inputs_reactant1_repr = pd.read_csv(DATA_DIR + INPUTS_reactant1_repr)
inputs_reactant2_repr = pd.read_csv(DATA_DIR + INPUTS_reactant2_repr)

yields = pd.read_csv(DATA_DIR + INPUTS_Origin_DF)['Reaction I eesyn (%)']

inputs = np.concatenate([inputs_tripep_repr ,inputs_reactant1_repr,inputs_reactant2_repr],axis=1)

In [6]:
# inputs_reduced = pca.fit_transform(inputs)
train_and_evaluate_mlp_multiple_splits(
    inputs=inputs,
    labels=yields,
    save_path='out/peptide_mlp_30_lr_0.0005.csv',
    num_splits=30,
    train_size=int(len(inputs)*0.70),
    num_epochs=1000,
    batch_size=32,
    lr=0.0005,
    patience=60
)

Using device: cuda

==== Split 1/30 ====
Early stopping at epoch 590, best MAE = 24.2555
Final for split 1: R2 = 0.6718, MAE = 24.2555

==== Split 2/30 ====
Early stopping at epoch 761, best MAE = 22.0514
Final for split 2: R2 = 0.7730, MAE = 22.0514

==== Split 3/30 ====
Early stopping at epoch 66, best MAE = 67.9260
Final for split 3: R2 = -0.0100, MAE = 67.9260

==== Split 4/30 ====
Early stopping at epoch 756, best MAE = 25.0309
Final for split 4: R2 = 0.6380, MAE = 25.0309

==== Split 5/30 ====
Early stopping at epoch 801, best MAE = 17.0314
Final for split 5: R2 = 0.8856, MAE = 17.0314

==== Split 6/30 ====
Early stopping at epoch 971, best MAE = 30.0721
Final for split 6: R2 = 0.6177, MAE = 30.0721

==== Split 7/30 ====
Early stopping at epoch 861, best MAE = 22.3007
Final for split 7: R2 = 0.7612, MAE = 22.3007

==== Split 8/30 ====
Early stopping at epoch 66, best MAE = 69.2699
Final for split 8: R2 = -0.0272, MAE = 69.2699

==== Split 9/30 ====
Early stopping at epoch 920, be