### Preprocessing of the features for subsequent modelling

In [None]:
# import all required packages 
import math
import numpy as np
import pandas as pd

# Install the package for Excel support (note: xlrd >= 2.0 reads only .xls; reading .xlsx requires openpyxl)
#!pip install xlrd --user
import xlrd

In [None]:
# Load the HTE data from the Excel sheet and shuffle its rows.
# The labels in `column_names` are passed to read_excel via `names=` and are
# the identifiers the rest of the notebook uses for column access.
column_names = ['Ligand','Ligand_SMILES_chemdraw','Pd_Species','Pd_Species_SMILES','Boronic_acid','Boronic_acid_SMILES',
                'HOMO','LUMO','totale_dipole','E_RB3LYP',
                'N_sterimol_L','N_sterimol_B1','N_sterimol_B5',
                'R_sterimol_L','R_sterimol_B1','R_sterimol_B5',
                'R_only_sterimol_L','R_only_sterimol_B1','R_only_sterimol_B5',
                'NBO_N','NBO_OR','NBO__O','NBO_AcO',
                'chelpg_N','chelpg_OR','chelpg_O','chelpg_AcO',
                'V_buried',
                'N_H_proton',
                'Yield']

raw_dataset = pd.read_excel('HTE_Dataset_all_new_2.xlsx', names=column_names)

# Work on a copy so the data as read from disk stays untouched in raw_dataset.
dataset = raw_dataset.copy()

# Randomly permute the rows. `shuffle` was never imported in this notebook, so
# the pandas-native equivalent is used: sampling 100 % of the rows without
# replacement returns every row once in random order, keeping the index labels.
dataset = dataset.sample(frac=1)
print('Full dataset shape:',dataset.shape)

## Feature processing

In [None]:
# Morgan fingerprints (radius 2, 1024 bits) for ligands, Pd species and boronic acids.
# NOTE(review): Chem, AllChem and DataStructs are not imported in the visible
# import cell -- presumably they come from rdkit; confirm they are in scope.


def _bitvect_to_array(bit_vect):
    """Return `bit_vect` copied into a NumPy array via DataStructs.ConvertToNumpyArray."""
    target = np.zeros((1,))
    DataStructs.ConvertToNumpyArray(bit_vect, target)
    return target


# Ligands: SMILES -> molecule -> bit-vector fingerprint -> array (one row per dataset row)
ligands_mol_list = [Chem.MolFromSmiles(smiles) for smiles in dataset.Ligand_SMILES_chemdraw]
ligands_fp_list = [
    AllChem.GetMorganFingerprintAsBitVect(molecule, 2, nBits=1024)
    for molecule in ligands_mol_list
]
ligands_fp_list_2 = np.array([_bitvect_to_array(bits) for bits in ligands_fp_list])

# Pd species: same pipeline
Pd_mol_list = [Chem.MolFromSmiles(smiles) for smiles in dataset.Pd_Species_SMILES]
Pd_fp_list = [
    AllChem.GetMorganFingerprintAsBitVect(molecule, 2, nBits=1024)
    for molecule in Pd_mol_list
]
Pd_fp_list_2 = np.array([_bitvect_to_array(bits) for bits in Pd_fp_list])

# Boronic acids: same pipeline
Boronic_acid_mol_list = [Chem.MolFromSmiles(smiles) for smiles in dataset.Boronic_acid_SMILES]
Boronic_acid_fp_list = [
    AllChem.GetMorganFingerprintAsBitVect(molecule, 2, nBits=1024)
    for molecule in Boronic_acid_mol_list
]
Boronic_acid_fp_list_2 = np.array([_bitvect_to_array(bits) for bits in Boronic_acid_fp_list])

# Column-wise concatenation: ligand | Pd species | boronic acid
x = np.concatenate((ligands_fp_list_2, Pd_fp_list_2), axis=1)
fingerprints_input = np.concatenate((x, Boronic_acid_fp_list_2), axis=1)

# Reduce each fingerprint matrix to 30 principal components.
# The same PCA object is re-fitted by every fit_transform call, so after this
# block `pca` was last fitted on Boronic_acid_fp_list_2.
# NOTE(review): PCA is not imported in the visible import cell -- presumably
# sklearn.decomposition.PCA; confirm it is in scope.
pca = PCA(n_components=30)

pca_ligands, pca_precat, pca_boronic = [
    pca.fit_transform(fingerprint_matrix)
    for fingerprint_matrix in (ligands_fp_list_2, Pd_fp_list_2, Boronic_acid_fp_list_2)
]

# Column-wise concatenation: ligand PCs | precatalyst PCs | boronic acid PCs
fp_pca_data = np.concatenate((pca_ligands, pca_precat, pca_boronic), axis=1)

# One-hot encoding (OHE) of the three categorical reaction components
ligand_onehot = pd.get_dummies(dataset.Ligand, prefix="ligand")
precat_onehot = pd.get_dummies(dataset.Pd_Species, prefix="precat")
boronic_acid_onehot = pd.get_dummies(dataset.Boronic_acid, prefix="boronic_acid")

# "small" variant leaves out the ligand columns
combined_small = [precat_onehot, boronic_acid_onehot]
combined = [ligand_onehot, *combined_small]

dataset_one_hot_small = pd.concat(combined_small, axis=1)
dataset_one_hot = pd.concat(combined, axis=1)

# DFT descriptors, each stored as a single-column 2-D array
homo, lumo, total_dipole, E_RB3LYP = [
    np.array(dataset[column]).reshape(-1, 1)
    for column in ('HOMO', 'LUMO', 'totale_dipole', 'E_RB3LYP')
]

# Only HOMO and LUMO are joined into dft_data; total_dipole and E_RB3LYP are
# prepared above but not appended here.
dft_data = np.concatenate((homo, lumo), axis=1)

# Sterimol parameters (L, B1, B5).
# Every variable reads its own column of `dataset`: the *_b1 and *_b5 variables
# take the *_B1 and *_B5 columns, so each group contributes three distinct
# descriptors rather than the L column three times.

# Group prefixed "N_"
N_sterimol_l = dataset.N_sterimol_L
N_sterimol_b1 = dataset.N_sterimol_B1
N_sterimol_b5 = dataset.N_sterimol_B5

# Group prefixed "R_"
R_sterimol_l = dataset.R_sterimol_L
R_sterimol_b1 = dataset.R_sterimol_B1
R_sterimol_b5 = dataset.R_sterimol_B5

sterimol_combined = [N_sterimol_l,N_sterimol_b1,N_sterimol_b5,R_sterimol_l,R_sterimol_b1,R_sterimol_b5]
data_2_sterimol = pd.concat(sterimol_combined, axis=1)

# Group prefixed "R_only_"
R_only_sterimol_l = dataset.R_only_sterimol_L
R_only_sterimol_b1 = dataset.R_only_sterimol_B1
R_only_sterimol_b5 = dataset.R_only_sterimol_B5

sterimol_only_residue_combined = [R_only_sterimol_l,R_only_sterimol_b1,R_only_sterimol_b5]
data_only_R_sterimol = pd.concat(sterimol_only_residue_combined, axis=1)

# NBO descriptors, each stored as a single-column 2-D array
# (the source column for NBO_O__ is spelled 'NBO__O' in the dataset)
NBO_N = np.array(dataset['NBO_N'])[:, np.newaxis]
NBO_OR = np.array(dataset['NBO_OR'])[:, np.newaxis]
NBO_O__ = np.array(dataset['NBO__O'])[:, np.newaxis]
NBO_AcO = np.array(dataset['NBO_AcO'])[:, np.newaxis]

# Column order: N, OR, O, AcO
NBO_data = np.concatenate((NBO_N, NBO_OR, NBO_O__, NBO_AcO), axis=1)

# CHELPG descriptors, kept as pandas Series and joined column-wise
chelpg_N = dataset['chelpg_N']
chelpg_OR = dataset['chelpg_OR']
chelpg_O = dataset['chelpg_O']
chelpg_AcO = dataset['chelpg_AcO']

# Column order: N, OR, O, AcO
chelpg_combined = [chelpg_N, chelpg_OR, chelpg_O, chelpg_AcO]
chelpg_data = pd.concat(chelpg_combined, axis=1)

# Remaining descriptors and the yield, each stored as a single-column 2-D array
vol_bur, n_proton, yield_numerical = [
    np.array(dataset[column]).reshape(-1, 1)
    for column in ('V_buried', 'N_H_proton', 'Yield')
]


In [None]:
# Assemble all feature groups and the yield into one labelled table


def _numbered_names(prefix, numbers, space_padded=()):
    """Return '<prefix>_<n>' for each n in `numbers`.

    Names whose number is in `space_padded` get one leading space.
    """
    return [(' ' if n in space_padded else '') + f'{prefix}_{n}' for n in numbers]


# Column-wise concatenation, in the same order as feature_column_names below.
# data_only_R_sterimol is not part of the table.
feature_blocks = [
    data_2_sterimol,
    dft_data,
    fp_pca_data,
    dataset_one_hot,
    NBO_data,
    chelpg_data,
    vol_bur,
    n_proton,
    yield_numerical,
]
full_dataset = np.concatenate(feature_blocks, axis=1)

# NOTE(review): a few labels carry a leading space (PCA components 2 and 3 of
# every group; OHE_lig 2, 3, 31 and 32) and there is no 'OHE_lig_8'. Both look
# accidental but are reproduced exactly, since column lookups elsewhere may
# rely on these exact strings -- confirm before normalising.
feature_column_names = (
    ['N_sterimol_l', 'N_sterimol_b1', 'N_sterimol_b5', 'R_sterimol_l', 'R_sterimol_b1', 'R_sterimol_b5']
    + ['homo', 'lumo']
    + _numbered_names('ligands_PCA', range(1, 31), space_padded=(2, 3))
    + _numbered_names('precat_PCA', range(1, 31), space_padded=(2, 3))
    + _numbered_names('boronic_PCA', range(1, 31), space_padded=(2, 3))
    + _numbered_names('OHE_lig', [n for n in range(1, 33) if n != 8], space_padded=(2, 3, 31, 32))
    + ['OHE_precat_1', 'OHE_precat_2', 'OHE_precat_3']
    + ['OHE_bor_1', 'OHE_bor_2']
    + ['NBO_N', 'NBO_OR', 'NBO_O', 'NBO_AcO']
    + ['chelpg_N', 'chelpg_OR', 'chelpg_O', 'chelpg_AcO']
    + ['vol_bur', 'n_proton', 'Yield']
)

# Assigning the labels fails if the number of columns does not match the name list.
full_dataset_input = pd.DataFrame(full_dataset)
full_dataset_input.columns = feature_column_names