In [10]:
import pandas as pd 
import numpy as np
import sklearn 
import rdkit
from rdkit import Chem
import rdkit.Chem.Descriptors
import rdkit.Chem.Fragments
import rdkit.Chem.Lipinski 
import pyepal
from pyepal import PALSklearn 
import sklearn
from sklearn.metrics import mean_absolute_error as mae
from sklearn.gaussian_process import GaussianProcessRegressor 
from sklearn.gaussian_process.kernels import RBF
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [11]:
df = pd.read_csv('AllProps_1400BzNSN.csv')
df

Unnamed: 0,Entry Number,File Name,SMILES,Ered,HOMO,Gsol,Absorption Wavelength
0,0,BTZ-Neutral-wb97xd-1000,CCOCN(C(C)=O)c1c(OC)c(C#N)c([N+](=O)[O-])c2nsnc12,3.006284,-8.811686,-0.790614,381.65
1,1,BTZ-Neutral-wb97xd-1001,CCOCN(C(C)=O)c1c(OC)c(C#N)c(S(=O)(=O)N(C)C)c2n...,2.511715,-8.794816,-1.096917,354.74
2,2,BTZ-Neutral-wb97xd-1002,CCOCN(C(C)=O)c1c(OC)c(OC)c(C)c2nsnc12,1.715196,-8.254153,-0.780220,334.28
3,3,BTZ-Neutral-wb97xd-1003,CCOCN(C(C)=O)c1c(OC)c(OC)c(C(F)(F)F)c2nsnc12,1.932468,-8.745838,-0.737337,319.72
4,4,BTZ-Neutral-wb97xd-1004,CCOCN(C(C)=O)c1c(OC)c(OC)c(C#N)c2nsnc12,2.198147,-8.643801,-0.828164,336.77
...,...,...,...,...,...,...,...
1402,1402,BTZ-Neutral-wb97xd-995,CCOCN(C(C)=O)c1c(OC)c(C#N)c(OC(F)(F)F)c2nsnc12,2.416752,-8.942294,-0.854448,328.27
1403,1403,BTZ-Neutral-wb97xd-996,CCOCN(C(C)=O)c1c(OC)c(C#N)c(SC)c2nsnc12,2.322877,-8.362449,-0.763050,385.59
1404,1404,BTZ-Neutral-wb97xd-997,CCOCN(C(C)=O)c1c(OC)c(C#N)c(SC(F)(F)F)c2nsnc12,2.546326,-9.005422,-0.746996,344.75
1405,1405,BTZ-Neutral-wb97xd-998,CCOCN(C(C)=O)c1c(OC)c(C#N)c(Br)c2nsnc12,2.372209,-8.532240,-0.800137,362.34


In [13]:
smiles = df['SMILES']
# reduction potential 
y_red = df['Ered']
# solvation free energy 
y_sol = df['Gsol']
# absorption wavelength 
y_wav = df['Absorption Wavelength']

In [14]:
def generate_descriptors(smile): 
    mol = Chem.MolFromSmiles(smile)
    #sssr = rdkit.Chem.rdmolops.GetSSSR(mol)
    clogp = rdkit.Chem.Crippen.MolLogP(mol)
    molar_refractivity = rdkit.Chem.Crippen.MolMR(mol)
    molecular_weight = rdkit.Chem.rdMolDescriptors.CalcExactMolWt(mol)
    tpsa = rdkit.Chem.rdMolDescriptors.CalcTPSA(mol)
    chi0n = rdkit.Chem.rdMolDescriptors.CalcChi0n(mol)
    chi1n = rdkit.Chem.rdMolDescriptors.CalcChi1n(mol)
    chi2n = rdkit.Chem.rdMolDescriptors.CalcChi2n(mol)
    chi3n = rdkit.Chem.rdMolDescriptors.CalcChi3n(mol)
    chi4n = rdkit.Chem.rdMolDescriptors.CalcChi4n(mol)
    chi0v = rdkit.Chem.rdMolDescriptors.CalcChi0v(mol)
    chi1v = rdkit.Chem.rdMolDescriptors.CalcChi1v(mol)
    chi2v = rdkit.Chem.rdMolDescriptors.CalcChi2v(mol)
    chi3v = rdkit.Chem.rdMolDescriptors.CalcChi3v(mol)
    chi4v = rdkit.Chem.rdMolDescriptors.CalcChi4v(mol)
    fracsp3 = rdkit.Chem.rdMolDescriptors.CalcFractionCSP3(mol)
    hall_kier_alpha = rdkit.Chem.rdMolDescriptors.CalcHallKierAlpha(mol)
    kappa1 = rdkit.Chem.rdMolDescriptors.CalcKappa1(mol)
    kappa2 = rdkit.Chem.rdMolDescriptors.CalcKappa2(mol)
    kappa3 = rdkit.Chem.rdMolDescriptors.CalcKappa3(mol)
    # Labute's approximate surface area (labuteasa) 
    labuteasa = rdkit.Chem.rdMolDescriptors.CalcLabuteASA(mol)
    n_aliphatic_rings = rdkit.Chem.rdMolDescriptors.CalcNumAliphaticRings(mol)
    n_aromatic_rings = rdkit.Chem.rdMolDescriptors.CalcNumAromaticRings(mol)
    n_amide_bonds = rdkit.Chem.rdMolDescriptors.CalcNumAmideBonds(mol)
    n_atom_stereocenters = rdkit.Chem.rdMolDescriptors.CalcNumAtomStereoCenters(mol)
    n_bridgehead_atoms = rdkit.Chem.rdMolDescriptors.CalcNumBridgeheadAtoms(mol)
    n_HBA = rdkit.Chem.rdMolDescriptors.CalcNumLipinskiHBA(mol)
    n_HBD = rdkit.Chem.rdMolDescriptors.CalcNumLipinskiHBD(mol)
    n_hetero_atoms = rdkit.Chem.rdMolDescriptors.CalcNumHeteroatoms(mol)
    n_hetero_cycles = rdkit.Chem.rdMolDescriptors.CalcNumHeterocycles(mol)
    n_rings = rdkit.Chem.rdMolDescriptors.CalcNumRings(mol)
    n_rotatable_bonds = rdkit.Chem.rdMolDescriptors.CalcNumRotatableBonds(mol)
    n_spiro = rdkit.Chem.rdMolDescriptors.CalcNumSpiroAtoms(mol)
    n_saturated_rings = rdkit.Chem.rdMolDescriptors.CalcNumSaturatedRings(mol)
    n_heavy_atoms = rdkit.Chem.rdMolDescriptors.CalcNumHeavyAtoms(mol)
    n_nh_oh = rdkit.Chem.Lipinski.NHOHCount(mol)
    n_n_o = rdkit.Chem.Lipinski.NOCount(mol)
    n_valence_electrons = rdkit.Chem.Descriptors.NumValenceElectrons(mol)
    max_partial_charge = rdkit.Chem.Descriptors.MaxPartialCharge(mol)
    min_partial_charge = rdkit.Chem.Descriptors.MinPartialCharge(mol) 
    fr_C_O = rdkit.Chem.Fragments.fr_C_O(mol)
    fr_C_O_noCOO = rdkit.Chem.Fragments.fr_C_O_noCOO(mol)
    fr_Al_OH = rdkit.Chem.Fragments.fr_Al_OH(mol)
    fr_Ar_OH = rdkit.Chem.Fragments.fr_Ar_OH(mol) 
    fr_methoxy = rdkit.Chem.Fragments.fr_methoxy(mol) 
    fr_oxime = rdkit.Chem.Fragments.fr_oxime(mol)
    fr_ester = rdkit.Chem.Fragments.fr_ester(mol) 
    fr_Al_COO = rdkit.Chem.Fragments.fr_Al_COO(mol)
    fr_Ar_COO = rdkit.Chem.Fragments.fr_Ar_COO(mol)
    fr_COO = rdkit.Chem.Fragments.fr_COO(mol)
    fr_COO2 = rdkit.Chem.Fragments.fr_COO2(mol)
    fr_ketone = rdkit.Chem.Fragments.fr_ketone(mol) 
    fr_ether = rdkit.Chem.Fragments.fr_ether(mol) 
    fr_phenol = rdkit.Chem.Fragments.fr_phenol(mol) 
    fr_aldehyde = rdkit.Chem.Fragments.fr_aldehyde(mol) 
    fr_quatN = rdkit.Chem.Fragments.fr_quatN(mol) 
    fr_NH2 = rdkit.Chem.Fragments.fr_NH2(mol) 
    fr_NH1 = rdkit.Chem.Fragments.fr_NH1(mol) 
    fr_NH0 = rdkit.Chem.Fragments.fr_NH0(mol)
    fr_Ar_N = rdkit.Chem.Fragments.fr_Ar_N(mol)
    fr_Ar_NH = rdkit.Chem.Fragments.fr_Ar_NH(mol)
    fr_aniline = rdkit.Chem.Fragments.fr_aniline(mol)
    fr_Imine = rdkit.Chem.Fragments.fr_Imine(mol)
    fr_nitrile = rdkit.Chem.Fragments.fr_nitrile(mol)
    fr_hdrzine = rdkit.Chem.Fragments.fr_hdrzine(mol)
    fr_hdrzone = rdkit.Chem.Fragments.fr_hdrzone(mol)
    fr_nitroso = rdkit.Chem.Fragments.fr_nitroso(mol)
    fr_N_O = rdkit.Chem.Fragments.fr_N_O(mol)
    fr_nitro = rdkit.Chem.Fragments.fr_nitro(mol)
    fr_azo = rdkit.Chem.Fragments.fr_azo(mol)
    fr_diazo = rdkit.Chem.Fragments.fr_diazo(mol)
    fr_azide = rdkit.Chem.Fragments.fr_azide(mol)
    fr_amide = rdkit.Chem.Fragments.fr_amide(mol)
    fr_priamide = rdkit.Chem.Fragments.fr_priamide(mol) 
    fr_amidine = rdkit.Chem.Fragments.fr_amidine(mol)
    fr_guanido = rdkit.Chem.Fragments.fr_guanido(mol)
    fr_Nhpyrrole = rdkit.Chem.Fragments.fr_Nhpyrrole(mol)
    fr_imide = rdkit.Chem.Fragments.fr_imide(mol)
    fr_isocyan = rdkit.Chem.Fragments.fr_isocyan(mol) 
    fr_isothiocyan = rdkit.Chem.Fragments.fr_isothiocyan(mol)
    fr_thiocyan = rdkit.Chem.Fragments.fr_thiocyan(mol)
    fr_halogen = rdkit.Chem.Fragments.fr_halogen(mol)
    fr_alkyl_halide = rdkit.Chem.Fragments.fr_alkyl_halide(mol) 
    fr_sulfide = rdkit.Chem.Fragments.fr_sulfide(mol)
    fr_SH = rdkit.Chem.Fragments.fr_SH(mol)
    fr_C_S = rdkit.Chem.Fragments.fr_C_S(mol)
    fr_sulfone = rdkit.Chem.Fragments.fr_sulfone(mol) 
    fr_sulfonamd = rdkit.Chem.Fragments.fr_sulfonamd(mol)
    fr_barbitur = rdkit.Chem.Fragments.fr_barbitur(mol)
    fr_urea = rdkit.Chem.Fragments.fr_urea(mol)
    fr_term_acetylene = rdkit.Chem.Fragments.fr_term_acetylene(mol)
    fr_imidazole = rdkit.Chem.Fragments.fr_imidazole(mol)
    fr_furan = rdkit.Chem.Fragments.fr_furan(mol)
    fr_thiophene = rdkit.Chem.Fragments.fr_thiophene(mol) 
    fr_thiazole = rdkit.Chem.Fragments.fr_thiazole(mol)
    fr_oxazole = rdkit.Chem.Fragments.fr_oxazole(mol)
    fr_pyridine = rdkit.Chem.Fragments.fr_pyridine(mol)
    fr_piperdine = rdkit.Chem.Fragments.fr_piperdine(mol)
    fr_piperzine = rdkit.Chem.Fragments.fr_piperzine(mol)
    fr_morpholine = rdkit.Chem.Fragments.fr_morpholine(mol)
    fr_lactam = rdkit.Chem.Fragments.fr_lactam(mol)
    fr_lactone = rdkit.Chem.Fragments.fr_lactone(mol)
    fr_tetrazole = rdkit.Chem.Fragments.fr_tetrazole(mol)
    fr_epoxide = rdkit.Chem.Fragments.fr_epoxide(mol)
    fr_ubrch_alkane = rdkit.Chem.Fragments.fr_unbrch_alkane(mol) 
    fr_bicyclic = rdkit.Chem.Fragments.fr_bicyclic(mol)
    fr_benzene = rdkit.Chem.Fragments.fr_benzene(mol)
    fr_phos_acid = rdkit.Chem.Fragments.fr_phos_acid(mol)
    fr_phos_ester = rdkit.Chem.Fragments.fr_phos_ester(mol)
    fr_nitro_arom = rdkit.Chem.Fragments.fr_nitro_arom(mol)
    fr_nitro_arom_nonortho = rdkit.Chem.Fragments.fr_nitro_arom_nonortho(mol)
    fr_dihydropyridine = rdkit.Chem.Fragments.fr_dihydropyridine(mol)
    fr_phenol_noOrthoHbond = rdkit.Chem.Fragments.fr_phenol_noOrthoHbond(mol)
    fr_Al_OH_noTert = rdkit.Chem.Fragments.fr_Al_OH_noTert(mol)
    fr_benzodiazepine = rdkit.Chem.Fragments.fr_benzodiazepine(mol)
    fr_para_hydroxylation = rdkit.Chem.Fragments.fr_para_hydroxylation(mol)
    fr_allylic_oxid = rdkit.Chem.Fragments.fr_allylic_oxid(mol)
    fr_aryl_methyl = rdkit.Chem.Fragments.fr_aryl_methyl(mol)
    fr_Ndealkylation1 = rdkit.Chem.Fragments.fr_Ndealkylation1(mol) 
    fr_Ndealkylation2 = rdkit.Chem.Fragments.fr_Ndealkylation2(mol)
    fr_alkyl_carbamate = rdkit.Chem.Fragments.fr_alkyl_carbamate(mol)
    fr_ketone_Topliss = rdkit.Chem.Fragments.fr_ketone_Topliss(mol)
    fr_ArN = rdkit.Chem.Fragments.fr_ArN(mol)
    fr_HOCCN = rdkit.Chem.Fragments.fr_HOCCN(mol)
    
    #sssr, 
    return clogp, molar_refractivity, molecular_weight, tpsa, chi0n, chi1n, chi2n, chi3n, chi4n, chi0v, chi1v, chi2v, chi3v, chi4v, fracsp3, hall_kier_alpha, kappa1, kappa2, kappa3, labuteasa, n_aliphatic_rings, n_aromatic_rings, n_amide_bonds, n_atom_stereocenters, n_bridgehead_atoms, n_HBA, n_HBD, n_hetero_atoms, n_hetero_cycles, n_rings, n_rotatable_bonds, n_spiro, n_saturated_rings, n_heavy_atoms, n_nh_oh, n_n_o, n_valence_electrons, max_partial_charge, min_partial_charge, fr_C_O, fr_C_O_noCOO, fr_Al_OH, fr_Ar_OH, fr_methoxy, fr_oxime, fr_ester, fr_Al_COO, fr_Ar_COO, fr_COO, fr_COO2, fr_ketone, fr_ether, fr_phenol, fr_aldehyde, fr_quatN, fr_NH2, fr_NH1, fr_NH0, fr_Ar_N, fr_Ar_NH, fr_aniline, fr_Imine, fr_nitrile, fr_hdrzine, fr_hdrzone, fr_nitroso, fr_N_O, fr_nitro, fr_azo, fr_diazo, fr_azide, fr_amide, fr_priamide, fr_amidine, fr_guanido, fr_Nhpyrrole, fr_imide, fr_isocyan, fr_isothiocyan, fr_thiocyan, fr_halogen, fr_alkyl_halide, fr_sulfide, fr_SH, fr_C_S, fr_sulfone, fr_sulfonamd, fr_barbitur, fr_urea,   fr_imidazole, fr_furan, fr_thiophene, fr_thiazole, fr_oxazole , fr_pyridine , fr_piperdine , fr_piperzine, fr_morpholine, fr_lactam , fr_lactone , fr_tetrazole , fr_epoxide , fr_ubrch_alkane , fr_bicyclic, fr_benzene, fr_phos_acid, fr_phos_ester, fr_nitro_arom, fr_nitro_arom_nonortho , fr_dihydropyridine , fr_phenol_noOrthoHbond, fr_Al_OH_noTert , fr_benzodiazepine , fr_para_hydroxylation , fr_allylic_oxid , fr_aryl_methyl, fr_Ndealkylation1, fr_Ndealkylation2, fr_alkyl_carbamate , fr_ketone_Topliss, fr_ArN, fr_HOCCN, fr_term_acetylene, 

In [15]:
# create lists 
#mol_sssr = []
mol_clogp = []
mol_molar_refractivity = [] 
mol_molecular_weight = []  
mol_tpsa = [] 
mol_chi0n = []
mol_chi1n = []
mol_chi2n = []
mol_chi3n = []
mol_chi4n = []
mol_chi0v = []
mol_chi1v = []
mol_chi2v = []
mol_chi3v = []
mol_chi4v = []
mol_fracsp3 = []
mol_hall_kier_alpha = [] 
mol_kappa1 = []
mol_kappa2 = []
mol_kappa3 = []
mol_labuteasa = []
mol_n_aliphatic_rings = [] 
mol_n_aromatic_rings = []
mol_n_amide_bonds = []
mol_n_atom_stereocenters = []
mol_n_bridgehead_atoms = []
mol_n_HBA = []
mol_n_HBD = []
mol_n_hetero_atoms = [] 
mol_n_hetero_cycles = []
mol_n_rings = []
mol_n_rotatable_bonds = [] 
mol_n_spiro = []
mol_n_saturated_rings = [] 
mol_n_heavy_atoms = []
mol_n_nh_oh = []
mol_n_n_o = []
mol_n_valence_electrons = [] 
mol_max_partial_charge = []
mol_min_partial_charge = []
mol_fr_C_O = []
mol_fr_C_O_noCOO = []
mol_fr_Al_OH = []
mol_fr_Ar_OH = []
mol_fr_methoxy = []
mol_fr_oxime = []
mol_fr_ester = []
mol_fr_Al_COO = []
mol_fr_Ar_COO = []
mol_fr_COO = []
mol_fr_COO2 = []
mol_fr_ketone = []
mol_fr_ether  = []
mol_fr_phenol = []
mol_fr_aldehyde = []
mol_fr_quatN = []
mol_fr_NH2 = []
mol_fr_NH1 = []
mol_fr_NH0 = []
mol_fr_Ar_N = []
mol_fr_Ar_NH = []
mol_fr_aniline  = []
mol_fr_Imine  = []
mol_fr_nitrile  = []
mol_fr_hdrzine  = []
mol_fr_hdrzone  = []
mol_fr_nitroso  = []
mol_fr_N_O = []
mol_fr_nitro = []
mol_fr_azo = []
mol_fr_diazo = []
mol_fr_azide = []
mol_fr_amide = []
mol_fr_priamide = []
mol_fr_amidine = []
mol_fr_guanido = []
mol_fr_Nhpyrrole = []
mol_fr_imide = []
mol_fr_isocyan  = []
mol_fr_isothiocyan = []
mol_fr_thiocyan = []
mol_fr_halogen = []
mol_fr_alkyl_halide = []
mol_fr_sulfide = []
mol_fr_SH = []
mol_fr_C_S = []
mol_fr_sulfone = []
mol_fr_sulfonamd = []
mol_fr_barbitur = []
mol_fr_urea = []
mol_fr_term_acetylene = []
mol_fr_imidazole  = []
mol_fr_furan = []
mol_fr_thiophene  = []
mol_fr_thiazole  = []
mol_fr_oxazole  = []
mol_fr_pyridine  = []
mol_fr_piperdine = []
mol_fr_piperzine = []
mol_fr_morpholine = []
mol_fr_lactam  = []
mol_fr_lactone  = []
mol_fr_tetrazole  = []
mol_fr_epoxide  = []
mol_fr_ubrch_alkane  = []
mol_fr_bicyclic = []
mol_fr_benzene = []
mol_fr_phos_acid  = []
mol_fr_phos_ester = []
mol_fr_nitro_arom = []
mol_fr_nitro_arom_nonortho  = []
mol_fr_dihydropyridine  = []
mol_fr_phenol_noOrthoHbond = []
mol_fr_Al_OH_noTert = []
mol_fr_benzodiazepine = []
mol_fr_para_hydroxylation = [] 
mol_fr_allylic_oxid  = []
mol_fr_aryl_methyl = []
mol_fr_Ndealkylation1 = []
mol_fr_Ndealkylation2 = []
mol_fr_alkyl_carbamate = [] 
mol_fr_ketone_Topliss = []
mol_fr_ArN = []
mol_fr_HOCCN = []

#sssr,
for molecule in smiles: 
    clogp, molar_refractivity, molecular_weight, tpsa, chi0n, chi1n, chi2n, chi3n, chi4n, chi0v, chi1v, chi2v, chi3v, chi4v, fracsp3, hall_kier_alpha, kappa1, kappa2, kappa3, labuteasa, n_aliphatic_rings, n_aromatic_rings, n_amide_bonds, n_atom_stereocenters, n_bridgehead_atoms, n_HBA, n_HBD, n_hetero_atoms, n_hetero_cycles, n_rings, n_rotatable_bonds, n_spiro, n_saturated_rings, n_heavy_atoms, n_nh_oh, n_n_o, n_valence_electrons, max_partial_charge, min_partial_charge, fr_C_O, fr_C_O_noCOO, fr_Al_OH, fr_Ar_OH, fr_methoxy, fr_oxime, fr_ester, fr_Al_COO, fr_Ar_COO, fr_COO, fr_COO2, fr_ketone, fr_ether, fr_phenol, fr_aldehyde, fr_quatN, fr_NH2, fr_NH1, fr_NH0, fr_Ar_N, fr_Ar_NH, fr_aniline, fr_Imine, fr_nitrile, fr_hdrzine, fr_hdrzone, fr_nitroso, fr_N_O, fr_nitro, fr_azo, fr_diazo, fr_azide, fr_amide, fr_priamide, fr_amidine, fr_guanido, fr_Nhpyrrole, fr_imide, fr_isocyan, fr_isothiocyan, fr_thiocyan, fr_halogen, fr_alkyl_halide, fr_sulfide, fr_SH, fr_C_S, fr_sulfone, fr_sulfonamd, fr_barbitur, fr_urea,   fr_imidazole, fr_furan, fr_thiophene, fr_thiazole, fr_oxazole , fr_pyridine , fr_piperdine , fr_piperzine, fr_morpholine, fr_lactam , fr_lactone , fr_tetrazole , fr_epoxide , fr_ubrch_alkane , fr_bicyclic, fr_benzene, fr_phos_acid, fr_phos_ester, fr_nitro_arom, fr_nitro_arom_nonortho , fr_dihydropyridine , fr_phenol_noOrthoHbond, fr_Al_OH_noTert , fr_benzodiazepine , fr_para_hydroxylation , fr_allylic_oxid , fr_aryl_methyl, fr_Ndealkylation1, fr_Ndealkylation2, fr_alkyl_carbamate , fr_ketone_Topliss, fr_ArN, fr_HOCCN, fr_term_acetylene = generate_descriptors(molecule) 
    
    #mol_sssr.append(sssr)
    mol_clogp.append(clogp)
    mol_molar_refractivity.append(molar_refractivity)
    mol_molecular_weight.append(molecular_weight)
    mol_tpsa.append(tpsa)
    mol_chi0n.append(chi0n)
    mol_chi1n.append(chi1n)
    mol_chi2n.append(chi2n)
    mol_chi3n.append(chi3n)
    mol_chi4n.append(chi4n)
    mol_chi0v.append(chi0v)
    mol_chi1v.append(chi1v)
    mol_chi2v.append(chi2v)
    mol_chi3v.append(chi3v)
    mol_chi4v.append(chi4v)
    mol_fracsp3.append(fracsp3)
    mol_hall_kier_alpha.append(hall_kier_alpha)
    mol_kappa1.append(kappa1)
    mol_kappa2.append(kappa2)
    mol_kappa3.append(kappa3)
    mol_labuteasa.append(labuteasa)
    mol_n_aliphatic_rings.append(n_aliphatic_rings)
    mol_n_aromatic_rings.append(n_aromatic_rings)
    mol_n_amide_bonds.append(n_amide_bonds)
    mol_n_atom_stereocenters.append(n_atom_stereocenters)
    mol_n_bridgehead_atoms.append(n_bridgehead_atoms)
    mol_n_HBA.append(n_HBA)
    mol_n_HBD.append(n_HBD)
    mol_n_hetero_atoms.append(n_hetero_atoms)
    mol_n_hetero_cycles.append(n_hetero_cycles)
    mol_n_rings.append(n_rings)
    mol_n_rotatable_bonds.append(n_rotatable_bonds)
    mol_n_spiro.append(n_spiro)
    mol_n_saturated_rings.append(n_saturated_rings)
    mol_n_heavy_atoms.append(n_heavy_atoms)
    mol_n_nh_oh.append(n_nh_oh)
    mol_n_n_o.append(n_n_o)
    mol_n_valence_electrons.append(n_valence_electrons)
    mol_max_partial_charge.append(max_partial_charge)
    mol_min_partial_charge.append(min_partial_charge)
    mol_fr_C_O.append(fr_C_O)
    mol_fr_C_O_noCOO.append(fr_C_O_noCOO)
    mol_fr_Al_OH.append(fr_Al_OH)
    mol_fr_Ar_OH.append(fr_Ar_OH)
    mol_fr_methoxy.append(fr_methoxy)
    mol_fr_oxime.append(fr_oxime)
    mol_fr_ester.append(fr_ester)
    mol_fr_Al_COO.append(fr_Al_COO)
    mol_fr_Ar_COO.append(fr_Ar_COO)
    mol_fr_COO.append(fr_COO)
    mol_fr_COO2.append(fr_COO2)
    mol_fr_ketone.append(fr_ketone)
    mol_fr_ether.append(fr_ether)
    mol_fr_phenol.append(fr_phenol)
    mol_fr_aldehyde.append(fr_aldehyde)
    mol_fr_quatN.append(fr_quatN)
    mol_fr_NH2.append(fr_NH2)
    mol_fr_NH1.append(fr_NH1)
    mol_fr_NH0.append(fr_NH0)
    mol_fr_Ar_N.append(fr_Ar_N)
    mol_fr_Ar_NH.append(fr_Ar_NH)
    mol_fr_aniline.append(fr_aniline)
    mol_fr_Imine.append(fr_Imine)
    mol_fr_nitrile.append(fr_nitrile)
    mol_fr_hdrzine.append(fr_hdrzine)
    mol_fr_hdrzone.append(fr_hdrzone)
    mol_fr_nitroso.append(fr_nitroso)
    mol_fr_N_O.append(fr_N_O)
    mol_fr_nitro.append(fr_nitro)
    mol_fr_azo.append(fr_azo)
    mol_fr_diazo.append(fr_diazo)
    mol_fr_azide.append(fr_azide)
    mol_fr_amide.append(fr_amide)
    mol_fr_priamide.append(fr_priamide)
    mol_fr_amidine.append(fr_amidine)
    mol_fr_guanido.append(fr_guanido)
    mol_fr_Nhpyrrole.append(fr_Nhpyrrole)
    mol_fr_imide.append(fr_imide)
    mol_fr_isocyan.append(fr_isocyan)
    mol_fr_isothiocyan.append(fr_isothiocyan)
    mol_fr_thiocyan.append(fr_thiocyan)
    mol_fr_halogen.append(fr_halogen)
    mol_fr_alkyl_halide.append(fr_alkyl_halide)
    mol_fr_sulfide.append(fr_sulfide)
    mol_fr_SH.append(fr_SH)
    mol_fr_C_S.append(fr_C_S)
    mol_fr_sulfone.append(fr_sulfone)
    mol_fr_sulfonamd.append(fr_sulfonamd)
    mol_fr_barbitur.append(fr_barbitur)
    mol_fr_urea.append(fr_urea)
    mol_fr_term_acetylene.append(fr_term_acetylene)
    mol_fr_imidazole.append(fr_imidazole)
    mol_fr_furan.append(fr_furan)
    mol_fr_thiophene.append(fr_thiophene)
    mol_fr_thiazole.append(fr_thiazole)
    mol_fr_oxazole.append(fr_oxazole)
    mol_fr_pyridine.append(fr_pyridine)
    mol_fr_piperdine.append(fr_piperdine)
    mol_fr_piperzine.append(fr_piperzine)
    mol_fr_morpholine.append(fr_morpholine)
    mol_fr_lactam.append(fr_lactam)
    mol_fr_lactone.append(fr_lactone)
    mol_fr_tetrazole.append(fr_tetrazole)
    mol_fr_epoxide.append(fr_epoxide)
    mol_fr_ubrch_alkane.append(fr_ubrch_alkane)
    mol_fr_bicyclic.append(fr_bicyclic)
    mol_fr_benzene.append(fr_benzene)
    mol_fr_phos_acid.append(fr_phos_acid)
    mol_fr_phos_ester.append(fr_phos_ester)
    mol_fr_nitro_arom.append(fr_nitro_arom)
    mol_fr_nitro_arom_nonortho.append(fr_nitro_arom_nonortho)
    mol_fr_dihydropyridine.append(fr_dihydropyridine)
    mol_fr_phenol_noOrthoHbond.append(fr_phenol_noOrthoHbond)
    mol_fr_Al_OH_noTert.append(fr_Al_OH_noTert)
    mol_fr_benzodiazepine.append(fr_benzodiazepine)
    mol_fr_para_hydroxylation.append(fr_para_hydroxylation)
    mol_fr_allylic_oxid.append(fr_allylic_oxid)
    mol_fr_aryl_methyl.append(fr_aryl_methyl)
    mol_fr_Ndealkylation1.append(fr_Ndealkylation1)
    mol_fr_Ndealkylation2.append(fr_Ndealkylation2)
    mol_fr_alkyl_carbamate.append(fr_alkyl_carbamate)
    mol_fr_ketone_Topliss.append(fr_ketone_Topliss)
    mol_fr_ArN.append(fr_ArN)
    mol_fr_HOCCN.append(fr_HOCCN)
    
    #'sssr':mol_sssr,
X = pd.DataFrame({'clogp': mol_clogp, 
                  'molar_refractivity':mol_molar_refractivity,
                  'molecular_weight': mol_molecular_weight,
                  'tpsa': mol_tpsa,
                  'chi0n': mol_chi0n,
                  'chi1n': mol_chi1n,
                  'chi2n': mol_chi2n,
                  'chi3n': mol_chi3n,
                  'chi4n': mol_chi4n,
                  'chi0v': mol_chi0v,
                  'chi1v': mol_chi1v,
                  'chi2v': mol_chi2v,
                  'chi3v': mol_chi3v,
                  'chi4v': mol_chi4v, 
                  'fracsp3': mol_fracsp3,
                  'hall_kier_alpha': mol_hall_kier_alpha, 
                  'kappa1': mol_kappa1,
                  'kappa2': mol_kappa2,
                  'kappa3': mol_kappa3,
                  'labuteasa': mol_labuteasa,  
                  'n_aliphatic_rings': mol_n_aliphatic_rings, 
                  'n_aromatic_rings': mol_n_aromatic_rings,
                  'n_amide_bonds': mol_n_amide_bonds,
                  'n_atom_stereocenters': mol_n_atom_stereocenters, 
                  'n_bridgehead_atoms': mol_n_bridgehead_atoms,
                  'n_HBA': mol_n_HBA,
                  'n_HBD': mol_n_HBD,
                  'n_hetero_atoms': mol_n_hetero_atoms, 
                  'n_hetero_cycles': mol_n_hetero_cycles, 
                  'n_rings': mol_n_rings,
                  'n_rotatable_bonds':mol_n_rotatable_bonds,
                  'n_spiro': mol_n_spiro,
                  'n_saturated_rings': mol_n_saturated_rings, 
                  'n_heavy_atoms': mol_n_heavy_atoms,
                  'n_nh_oh': mol_n_nh_oh,
                  'n_n_o': mol_n_n_o,  
                  'n_valence_electrons': mol_n_valence_electrons,
                  'max_partial_charge': mol_max_partial_charge,
                  'min_partial_charge': mol_min_partial_charge,
                  'fr_C_O': mol_fr_C_O,
                  'fr_C_O_noCOO': mol_fr_C_O_noCOO,  
                  'fr_Al_OH': mol_fr_Al_OH,
                  'fr_Ar_OH': mol_fr_Ar_OH,
                  'fr_methoxy': mol_fr_methoxy,  
                  'fr_oxime': mol_fr_oxime,
                  'fr_ester': mol_fr_ester,
                  'fr_Al_COO': mol_fr_Al_COO,  
                  'fr_Ar_COO': mol_fr_Ar_COO,
                  'fr_COO': mol_fr_COO,
                  'fr_COO2': mol_fr_COO2,  
                  'fr_ketone': mol_fr_ketone, 
                  'fr_ether': mol_fr_ether,
                  'fr_phenol': mol_fr_phenol, 
                  'fr_aldehyde': mol_fr_aldehyde, 
                  'fr_quatN': mol_fr_quatN,
                  'fr_NH2': mol_fr_NH2,
                  'fr_NH1': mol_fr_NH1, 
                  'fr_NH0':  mol_fr_NH0, 
                  'fr_Ar_N': mol_fr_Ar_N,  
                  'fr_Ar_NH': mol_fr_Ar_NH, 
                  'fr_aniline': mol_fr_aniline,  
                  'fr_Imine': mol_fr_Imine,
                  'fr_nitrile': mol_fr_nitrile,  
                  'fr_hdrzine': mol_fr_hdrzine, 
                  'fr_hdrzone': mol_fr_hdrzone, 
                  'fr_nitroso': mol_fr_nitroso, 
                  'fr_N_O': mol_fr_N_O,
                  'fr_nitro': mol_fr_nitro, 
                  'fr_azo': mol_fr_azo,
                  'fr_diazo': mol_fr_diazo, 
                  'fr_azide': mol_fr_azide,
                  'fr_amide': mol_fr_amide,
                  'fr_priamide': mol_fr_priamide,  
                  'fr_amidine': mol_fr_amidine,
                  'fr_guanido': mol_fr_guanido, 
                  'fr_Nhpyrrole': mol_fr_Nhpyrrole,  
                  'fr_imide': mol_fr_imide,
                  'fr_isocyan': mol_fr_isocyan,  
                  'fr_isothiocyan': mol_fr_isothiocyan, 
                  'fr_thiocyan': mol_fr_thiocyan,
                  'fr_halogen': mol_fr_halogen,
                  'fr_alkyl_halide': mol_fr_alkyl_halide,
                  'fr_sulfide': mol_fr_sulfide,
                  'fr_SH': mol_fr_SH,
                  'fr_C_S': mol_fr_C_S,  
                  'fr_sulfone': mol_fr_sulfone,
                  'fr_sulfonamd': mol_fr_sulfonamd, 
                  'fr_barbitur': mol_fr_barbitur, 
                  'fr_urea': mol_fr_urea,
                  'fr_term_acetylene': mol_fr_term_acetylene, 
                  'fr_imidazole': mol_fr_imidazole,
                  'fr_furan': mol_fr_furan,
                  'fr_thiophene': mol_fr_thiophene, 
                  'fr_thiazole': mol_fr_thiazole,
                  'fr_oxazole': mol_fr_oxazole,
                  'fr_pyridine': mol_fr_pyridine, 
                  'fr_piperdine': mol_fr_piperdine, 
                  'fr_piperzine': mol_fr_piperzine,
                  'fr_morpholine': mol_fr_morpholine,  
                  'fr_lactam': mol_fr_lactam,  
                  'fr_lactone': mol_fr_lactone, 
                  'fr_tetrazole': mol_fr_tetrazole,  
                  'fr_epoxide': mol_fr_epoxide,
                  'fr_ubrch_alkane': mol_fr_ubrch_alkane,
                  'fr_bicyclic': mol_fr_bicyclic,
                  'fr_benzene': mol_fr_benzene,
                  'fr_phos_acid': mol_fr_phos_acid, 
                  'fr_phos_ester': mol_fr_phos_ester, 
                  'fr_nitro_arom': mol_fr_nitro_arom, 
                  'fr_nitro_arom_nonortho': mol_fr_nitro_arom_nonortho,
                  'fr_dihydropyridine': mol_fr_dihydropyridine,
                  'fr_phenol_noOrthoHbond': mol_fr_phenol_noOrthoHbond,
                  'fr_Al_OH_noTert': mol_fr_Al_OH_noTert,
                  'fr_benzodiazepine': mol_fr_benzodiazepine,
                  'fr_para_hydroxylation': mol_fr_para_hydroxylation, 
                  'fr_allylic_oxid': mol_fr_allylic_oxid,
                  'fr_aryl_methyl': mol_fr_aryl_methyl,
                  'fr_Ndealkylation1': mol_fr_Ndealkylation1,  
                  'fr_Ndealkylation2': mol_fr_Ndealkylation2,
                  'fr_alkyl_carbamate': mol_fr_alkyl_carbamate, 
                  'fr_ketone_Topliss': mol_fr_ketone_Topliss,
                  'fr_ArN': mol_fr_ArN,
                  'fr_HOCCN': mol_fr_HOCCN 
            })

In [16]:
pca = PCA(n_components=22)
X = pca.fit_transform(X)

scaler = StandardScaler()
X = scaler.fit_transform(X)

y_red = -y_red 
y_wav = pd.DataFrame([-np.abs(i - 375) for i in y_wav])
y = pd.concat([y_red, y_sol, y_wav], axis=1)
print(y)

          Ered      Gsol      0
0    -3.006284 -0.790614  -6.65
1    -2.511715 -1.096917 -20.26
2    -1.715196 -0.780220 -40.72
3    -1.932468 -0.737337 -55.28
4    -2.198147 -0.828164 -38.23
...        ...       ...    ...
1402 -2.416752 -0.854448 -46.73
1403 -2.322877 -0.763050 -10.59
1404 -2.546326 -0.746996 -30.25
1405 -2.372209 -0.800137 -12.66
1406 -1.925666 -0.913494 -35.50

[1407 rows x 3 columns]


In [17]:
n_data = len(smiles)
indices = np.arange(0, n_data)

def MBO_GPR():
    objective_1_models = GaussianProcessRegressor(kernel=RBF(), n_restarts_optimizer=10, normalize_y=True, alpha=1e-5)
    objective_2_models = GaussianProcessRegressor(kernel=RBF(), n_restarts_optimizer=10, normalize_y=True, alpha=1e-5)
    objective_3_models = GaussianProcessRegressor(kernel=RBF(), n_restarts_optimizer=10, normalize_y=True, alpha=1e-5)
    
    #objective_1_models = build_gbdt_tuple(n_estimators=50, num_leaves=10)
    #objective_2_models = build_gbdt_tuple(n_estimators=50, num_leaves=10)
    #objective_3_models = build_gbdt_tuple(n_estimators=50, num_leaves=10)

    palinstance = PALSklearn(X, [objective_1_models, objective_2_models, objective_3_models], 3, coef_var_threshold=10, beta_scale=1/50)

    init_train_indices = np.random.choice(indices, 10)

    palinstance.update_train_set(init_train_indices, y.iloc[init_train_indices])

    # palinstance.cross_validation_points = 0

    while palinstance.number_unclassified_points > 0:  
        new_index = palinstance.run_one_step()
        print(palinstance)
        
        if new_index is not None: 
            palinstance.update_train_set(new_index, y.iloc[new_index])
        else: 
            break

    opt_ind = palinstance.pareto_optimal_indices

    return smiles[opt_ind], y.iloc[opt_ind], palinstance.sampled_indices, objective_1_models, objective_2_models, objective_3_models

In [18]:
par_smiles, par_y, sampled, model1, model2, model3 = MBO_GPR() 

# X is a np array 
#print(X) 
# y is a pd array 
#print(y)

# generate a set of unsampled molecules
X_new = np.delete(X, sampled, axis=0)
y_new = y.drop(labels=sampled, axis=0)

n_unsampled = len(y_new) 
# generate indices to randomly select molecules from X_new and y_new 
test_indices = np.random.choice(np.arange(0, n_unsampled), size=10)

# select 10 random molecules to use as test set 
X_test = X_new[test_indices]
y_test = y_new.iloc[test_indices]

yred_test = y_test['Ered'].to_numpy()
ysol_test = y_test['Gsol'].to_numpy()
# the column lable for Absorption Wavelength in y is 0 
ywav_test = y_test[0].to_numpy()

red_maes = [] 
sol_maes = [] 
wav_maes = [] 

for model in [model1, model2, model3]: 
    prediction = model.predict(X_test) 
    red_mae = mae(yred_test, prediction)
    sol_mae = mae(ysol_test, prediction)
    wav_mae = mae(ywav_test, prediction)
    
    red_maes.append(red_mae)
    sol_maes.append(sol_mae)
    wav_maes.append(wav_mae)
    
print("red score: ", np.amin(red_maes))
print("sol score: ", np.amin(sol_maes))
print("wav score: ", np.amin(wav_maes))

will automatically expand to use the same value in every dimension
the mean standard deviation is 5.23.
Your model might not be predictive and/or overconfident.
In the docs, you find hints on how to make models more robust.


pyepal at iteration 2.         3 Pareto optimal points,         11 discarded points,         1393 unclassified points.
pyepal at iteration 3.         3 Pareto optimal points,         18 discarded points,         1386 unclassified points.
pyepal at iteration 4.         3 Pareto optimal points,         18 discarded points,         1386 unclassified points.
pyepal at iteration 5.         4 Pareto optimal points,         19 discarded points,         1384 unclassified points.
pyepal at iteration 6.         4 Pareto optimal points,         20 discarded points,         1383 unclassified points.
pyepal at iteration 7.         4 Pareto optimal points,         133 discarded points,         1270 unclassified points.
pyepal at iteration 8.         5 Pareto optimal points,         153 discarded points,         1249 unclassified points.
pyepal at iteration 9.         5 Pareto optimal points,         202 discarded points,         1200 unclassified points.
pyepal at iteration 10.         5 Pareto opti

PALLogger - INFO - Done. No unclassified point left.


pyepal at iteration 57.         35 Pareto optimal points,         1372 discarded points,         0 unclassified points.
red score:  2.5342827740000002
sol score:  0.61287804
wav score:  38.135000000000005


In [19]:
%load_ext watermark
%watermark -v -p pandas,numpy,rdkit,pyepal,sklearn

The watermark extension is already loaded. To reload it, use:
  %reload_ext watermark
Python implementation: CPython
Python version       : 3.9.13
IPython version      : 7.31.1

pandas : 1.4.4
numpy  : 1.21.5
rdkit  : 2022.9.1
pyepal : 0.6.1
sklearn: 0.24.2

