In [1]:
from sklearn import svm
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
import optuna

from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import ConstantKernel, RBF, WhiteKernel, DotProduct, Matern
from sklearn.model_selection import KFold, cross_validate
import random
from tqdm import trange
import pickle 

  from .autonotebook import tqdm as notebook_tqdm


In [18]:
# Load the raw table, then shuffle the rows once.
# A fixed random_state makes the shuffled order identical on every
# Restart & Run All; without it the row order (and everything derived from
# it further down) differs between runs.
df1 = pd.read_csv('data.csv')
df = df1.sample(frac=1.0, random_state=42)

In [20]:
# Input columns, in the exact order the model is trained on.
x_name = [
    # 'PN_'-prefixed descriptor columns
    'PN_nN', 'PN_nP', 'PN_ATS8v', 'PN_ATSC8c', 'PN_ATSC1are', 'PN_ATSC1p',
    'PN_AATSC8c', 'PN_MATS2c', 'PN_GATS5p', 'PN_C1SP1', 'PN_NdsssP',
    'PN_SaaaC', 'PN_StN', 'PN_SdsN', 'PN_SdsssP', 'PN_EState_VSA5',
    'PN_EState_VSA9', 'PN_VSA_EState3', 'PN_MDEO-22', 'PN_n10FRing',
    'PN_n10FaRing', 'PN_GGI10',
    # 'ini_'-prefixed descriptor columns
    'ini_AATS1pe', 'ini_AATS2i', 'ini_AATS3i', 'ini_AATSC1se',
    'ini_AATSC2pe', 'ini_MATS6v', 'ini_MATS1i', 'ini_BCUTare-1l',
    'ini_NaaN', 'ini_SaaN', 'ini_GhoseFilter', 'ini_n6aHRing',
    # remaining columns
    'dsc_rate', 'PN_pre_homo', 'ini_pre_hl',
]

# Feature matrix (forced to float) and target vector taken from column 'tp'.
X = np.array(df[x_name], dtype=float)
Y = np.array(df['tp'])

# NaN-aware per-column statistics; these are reused later to store the
# scaling alongside the fitted model.
x_mean, x_std = np.nanmean(X, axis=0), np.nanstd(X, axis=0)
y_mean, y_std = np.nanmean(Y, axis=0), np.nanstd(Y, axis=0)

# Z-score both arrays. The small constant added to the standard deviation
# keeps the division finite for columns with zero spread.
X_std = (X - x_mean) / (x_std + 1e-9)
Y_std = (Y - y_mean) / (y_std + 1e-9)

# Entries that are still NaN (missing values in the input) are replaced by 0,
# which is the mean in standardised units.
X_std[np.isnan(X_std)] = 0
Y_std[np.isnan(Y_std)] = 0

In [21]:
def objective(trial):
    """Optuna objective: overfitting-penalised loss of an RBF-kernel SVR.

    Samples ``gamma``, ``C`` and ``epsilon`` from the trial, then fits an SVR
    on ten different 90/10 train/test splits of the module-level ``X_std`` /
    ``Y_std`` arrays (split seeds 0..9).

    For every split the loss is the negated test score returned by
    ``SVR.score`` plus, when the test loss is worse than the training loss,
    the size of that gap as an extra penalty.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to draw the hyper-parameters.

    Returns
    -------
    float
        Mean penalised loss over the ten splits (lower is better).
    """
    gamma = trial.suggest_categorical('gamma', ['scale', 'auto'])
    C = trial.suggest_float("C", 1, 100, step=1)
    epsilon = trial.suggest_float("epsilon", 0.01, 0.20, step=0.01)

    penalised_losses = []
    for split_seed in range(10):
        x_train, x_test, y_train, y_test = train_test_split(
            X_std, Y_std, random_state=split_seed, test_size=0.1
        )
        regressor = svm.SVR(kernel='rbf', C=C, gamma=gamma, epsilon=epsilon)
        regressor.fit(x_train, y_train)

        # Scores are negated so that smaller means better.
        train_loss = -regressor.score(x_train, y_train)
        test_loss = -regressor.score(x_test, y_test)

        # Only a test loss that exceeds the training loss is penalised.
        overfit_gap = max(0, (test_loss - train_loss))
        penalised_losses.append(test_loss + overfit_gap)
    return np.mean(penalised_losses)

In [22]:
# Hyper-parameter search for the SVR.
# The sampler is seeded so the same sequence of trials is proposed on every
# Restart & Run All; with the unseeded default the selected hyper-parameters
# (and therefore the model fitted from them) can change between runs.
study = optuna.create_study(
    direction="minimize",  # `objective` returns a loss: lower is better
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=256)

[32m[I 2023-07-14 14:25:00,341][0m A new study created in memory with name: no-name-e7f6edf9-841a-45c8-92f2-45d8a5d49b75[0m
[32m[I 2023-07-14 14:25:00,406][0m Trial 0 finished with value: -0.6802290077721739 and parameters: {'gamma': 'scale', 'C': 17.0, 'epsilon': 0.16}. Best is trial 0 with value: -0.6802290077721739.[0m
[32m[I 2023-07-14 14:25:00,491][0m Trial 1 finished with value: -0.7278799836541647 and parameters: {'gamma': 'auto', 'C': 50.0, 'epsilon': 0.05}. Best is trial 1 with value: -0.7278799836541647.[0m
[32m[I 2023-07-14 14:25:00,581][0m Trial 2 finished with value: -0.7387836822643017 and parameters: {'gamma': 'auto', 'C': 85.0, 'epsilon': 0.06999999999999999}. Best is trial 2 with value: -0.7387836822643017.[0m
[32m[I 2023-07-14 14:25:00,647][0m Trial 3 finished with value: -0.7198077930035118 and parameters: {'gamma': 'scale', 'C': 73.0, 'epsilon': 0.17}. Best is trial 2 with value: -0.7387836822643017.[0m
[32m[I 2023-07-14 14:25:00,699][0m Trial 4 fin

[32m[I 2023-07-14 14:25:04,845][0m Trial 40 finished with value: -0.7135013185085011 and parameters: {'gamma': 'scale', 'C': 62.0, 'epsilon': 0.17}. Best is trial 32 with value: -0.7578935336762708.[0m
[32m[I 2023-07-14 14:25:04,988][0m Trial 41 finished with value: -0.7578935336762708 and parameters: {'gamma': 'auto', 'C': 95.0, 'epsilon': 0.02}. Best is trial 32 with value: -0.7578935336762708.[0m
[32m[I 2023-07-14 14:25:05,125][0m Trial 42 finished with value: -0.7562567069427171 and parameters: {'gamma': 'auto', 'C': 87.0, 'epsilon': 0.02}. Best is trial 32 with value: -0.7578935336762708.[0m
[32m[I 2023-07-14 14:25:05,267][0m Trial 43 finished with value: -0.7578842854408298 and parameters: {'gamma': 'auto', 'C': 94.0, 'epsilon': 0.02}. Best is trial 32 with value: -0.7578935336762708.[0m
[32m[I 2023-07-14 14:25:05,388][0m Trial 44 finished with value: -0.7485638858987647 and parameters: {'gamma': 'auto', 'C': 95.0, 'epsilon': 0.04}. Best is trial 32 with value: -0.7

[32m[I 2023-07-14 14:25:10,054][0m Trial 80 finished with value: -0.7485638858987647 and parameters: {'gamma': 'auto', 'C': 95.0, 'epsilon': 0.04}. Best is trial 74 with value: -0.7582278567813925.[0m
[32m[I 2023-07-14 14:25:10,200][0m Trial 81 finished with value: -0.7578842854408298 and parameters: {'gamma': 'auto', 'C': 94.0, 'epsilon': 0.02}. Best is trial 74 with value: -0.7582278567813925.[0m
[32m[I 2023-07-14 14:25:10,346][0m Trial 82 finished with value: -0.7559532871832804 and parameters: {'gamma': 'auto', 'C': 86.0, 'epsilon': 0.02}. Best is trial 74 with value: -0.7582278567813925.[0m
[32m[I 2023-07-14 14:25:10,519][0m Trial 83 finished with value: -0.7571626968829801 and parameters: {'gamma': 'auto', 'C': 97.0, 'epsilon': 0.01}. Best is trial 74 with value: -0.7582278567813925.[0m
[32m[I 2023-07-14 14:25:10,649][0m Trial 84 finished with value: -0.7538609763098403 and parameters: {'gamma': 'auto', 'C': 90.0, 'epsilon': 0.03}. Best is trial 74 with value: -0.75

[32m[I 2023-07-14 14:25:15,818][0m Trial 120 finished with value: -0.7539933570939426 and parameters: {'gamma': 'auto', 'C': 100.0, 'epsilon': 0.03}. Best is trial 74 with value: -0.7582278567813925.[0m
[32m[I 2023-07-14 14:25:15,968][0m Trial 121 finished with value: -0.7578842854408298 and parameters: {'gamma': 'auto', 'C': 94.0, 'epsilon': 0.02}. Best is trial 74 with value: -0.7582278567813925.[0m
[32m[I 2023-07-14 14:25:16,119][0m Trial 122 finished with value: -0.7580851996904178 and parameters: {'gamma': 'auto', 'C': 98.0, 'epsilon': 0.02}. Best is trial 74 with value: -0.7582278567813925.[0m
[32m[I 2023-07-14 14:25:16,271][0m Trial 123 finished with value: -0.7580851996904178 and parameters: {'gamma': 'auto', 'C': 98.0, 'epsilon': 0.02}. Best is trial 74 with value: -0.7582278567813925.[0m
[32m[I 2023-07-14 14:25:16,449][0m Trial 124 finished with value: -0.7575029258239591 and parameters: {'gamma': 'auto', 'C': 99.0, 'epsilon': 0.01}. Best is trial 74 with value:

[32m[I 2023-07-14 14:25:21,670][0m Trial 160 finished with value: -0.7557840105394497 and parameters: {'gamma': 'auto', 'C': 90.0, 'epsilon': 0.01}. Best is trial 74 with value: -0.7582278567813925.[0m
[32m[I 2023-07-14 14:25:21,829][0m Trial 161 finished with value: -0.7580851996904178 and parameters: {'gamma': 'auto', 'C': 98.0, 'epsilon': 0.02}. Best is trial 74 with value: -0.7582278567813925.[0m
[32m[I 2023-07-14 14:25:21,981][0m Trial 162 finished with value: -0.7580851996904178 and parameters: {'gamma': 'auto', 'C': 98.0, 'epsilon': 0.02}. Best is trial 74 with value: -0.7582278567813925.[0m
[32m[I 2023-07-14 14:25:22,133][0m Trial 163 finished with value: -0.7582278567813925 and parameters: {'gamma': 'auto', 'C': 100.0, 'epsilon': 0.02}. Best is trial 74 with value: -0.7582278567813925.[0m
[32m[I 2023-07-14 14:25:22,266][0m Trial 164 finished with value: -0.7539933570939426 and parameters: {'gamma': 'auto', 'C': 100.0, 'epsilon': 0.03}. Best is trial 74 with value

[32m[I 2023-07-14 14:25:27,742][0m Trial 200 finished with value: -0.7247845147919005 and parameters: {'gamma': 'auto', 'C': 40.0, 'epsilon': 0.03}. Best is trial 74 with value: -0.7582278567813925.[0m
[32m[I 2023-07-14 14:25:27,896][0m Trial 201 finished with value: -0.7582278567813925 and parameters: {'gamma': 'auto', 'C': 100.0, 'epsilon': 0.02}. Best is trial 74 with value: -0.7582278567813925.[0m
[32m[I 2023-07-14 14:25:28,048][0m Trial 202 finished with value: -0.7582278567813925 and parameters: {'gamma': 'auto', 'C': 100.0, 'epsilon': 0.02}. Best is trial 74 with value: -0.7582278567813925.[0m
[32m[I 2023-07-14 14:25:28,201][0m Trial 203 finished with value: -0.7580851996904178 and parameters: {'gamma': 'auto', 'C': 98.0, 'epsilon': 0.02}. Best is trial 74 with value: -0.7582278567813925.[0m
[32m[I 2023-07-14 14:25:28,317][0m Trial 204 finished with value: -0.7249604125421485 and parameters: {'gamma': 'auto', 'C': 46.0, 'epsilon': 0.01}. Best is trial 74 with value

[32m[I 2023-07-14 14:25:33,512][0m Trial 240 finished with value: -0.7580851996904178 and parameters: {'gamma': 'auto', 'C': 98.0, 'epsilon': 0.02}. Best is trial 74 with value: -0.7582278567813925.[0m
[32m[I 2023-07-14 14:25:33,659][0m Trial 241 finished with value: -0.7582278567813925 and parameters: {'gamma': 'auto', 'C': 100.0, 'epsilon': 0.02}. Best is trial 74 with value: -0.7582278567813925.[0m
[32m[I 2023-07-14 14:25:33,811][0m Trial 242 finished with value: -0.7582278567813925 and parameters: {'gamma': 'auto', 'C': 100.0, 'epsilon': 0.02}. Best is trial 74 with value: -0.7582278567813925.[0m
[32m[I 2023-07-14 14:25:33,968][0m Trial 243 finished with value: -0.7580851996904178 and parameters: {'gamma': 'auto', 'C': 98.0, 'epsilon': 0.02}. Best is trial 74 with value: -0.7582278567813925.[0m
[32m[I 2023-07-14 14:25:34,139][0m Trial 244 finished with value: -0.7575577590541425 and parameters: {'gamma': 'auto', 'C': 100.0, 'epsilon': 0.01}. Best is trial 74 with valu

In [27]:
from sklearn.ensemble import BaggingRegressor

# Base learner: RBF-kernel SVR configured with the best hyper-parameters
# found by the Optuna study.
best_params = study.best_trial.params
svr = svm.SVR(kernel='rbf',
              C=best_params['C'],
              gamma=best_params['gamma'],
              epsilon=best_params['epsilon'],
              )

# The base learner is passed positionally on purpose: the keyword was renamed
# from `base_estimator` to `estimator` in scikit-learn 1.2 and the old name
# was removed in 1.4, whereas the first positional argument is accepted by
# both old and new releases.
regr = BaggingRegressor(svr,
                        n_estimators=64,
                        random_state=42,
                        max_samples=1.,
                        ).fit(X_std, Y_std)

# NOTE: this is the score on the same data the ensemble was fitted on, so it
# is a training-set figure, not an estimate of generalisation performance.
regr.score(X_std, Y_std)

0.9394248426157601

In [28]:
# Attach the preprocessing information to the fitted ensemble so that it is
# stored together with the model when the object is pickled.

# 'PN_'-prefixed descriptor columns (same entries as the start of x_name).
PN_x_name = [
    'PN_nN', 'PN_nP', 'PN_ATS8v', 'PN_ATSC8c', 'PN_ATSC1are', 'PN_ATSC1p',
    'PN_AATSC8c', 'PN_MATS2c', 'PN_GATS5p', 'PN_C1SP1', 'PN_NdsssP',
    'PN_SaaaC', 'PN_StN', 'PN_SdsN', 'PN_SdsssP', 'PN_EState_VSA5',
    'PN_EState_VSA9', 'PN_VSA_EState3', 'PN_MDEO-22', 'PN_n10FRing',
    'PN_n10FaRing', 'PN_GGI10',
]
# Bare descriptor names: the leading 'PN_' (3 characters) is cut off.
PN_x_names = [column[len('PN_'):] for column in PN_x_name]

# 'ini_'-prefixed descriptor columns.
ini_x_name = [
    'ini_AATS1pe', 'ini_AATS2i', 'ini_AATS3i', 'ini_AATSC1se',
    'ini_AATSC2pe', 'ini_MATS6v', 'ini_MATS1i', 'ini_BCUTare-1l',
    'ini_NaaN', 'ini_SaaN', 'ini_GhoseFilter', 'ini_n6aHRing',
]
# Bare descriptor names: the leading 'ini_' (4 characters) is cut off.
ini_x_names = [column[len('ini_'):] for column in ini_x_name]

# Mean / standard deviation pairs used to standardise inputs and target.
regr.x_scaler = [x_mean, x_std]
regr.y_scaler = [y_mean, y_std]

# Column bookkeeping: full ordered input list, the two prefix-stripped
# descriptor lists, and the three inputs that belong to neither list.
regr.x_name = x_name
regr.PN_xnames = PN_x_names
regr.ini_xnames = ini_x_names
regr.other_name = ['dsc_rate', 'PN_pre_homo', 'ini_pre_hl']

# Lookup from short name to structure string.
# NOTE(review): the values look like SMILES strings - confirm with the
# code that consumes this attribute.
regr.ini_dict = {
    '4APN': 'N#Cc1ccc(Oc2ccc(N)cc2)cc1C#N',
    '3APN': 'N#Cc1ccc(Oc2cccc(N)c2)cc1C#N',
    'p_BAPS': 'Nc1ccc(Oc2ccc(S(=O)(=O)c3ccc(Oc4ccc(N)cc4)cc3)cc2)cc1',
    'm_APB': 'Nc1cccc(Oc2cccc(Oc3cccc(N)c3)c2)c1',
    'DDE': 'Nc1ccc(Oc2ccc(N)cc2)cc1',
    'BDB': 'N#Cc1c(Oc2ccc(N)cc2)cccc1Oc1ccc(N)cc1',
    'm_BAPS': 'Nc1cccc(Oc2ccc(S(=O)(=O)c3ccc(Oc4cccc(N)c4)cc3)cc2)c1',
    'DDS': 'Nc1ccc(S(=O)(=O)c2ccc(N)cc2)cc1',
    'MDA': 'Nc1ccc(Cc2ccc(N)cc2)cc1',
    'HPPN': 'N#Cc1ccc(Oc2ccc(O)cc2)cc1C#N',
    'p_APB': 'Nc1ccc(Oc2ccc(Oc3ccc(N)cc3)cc2)cc1',
    'MI': 'Nc1nc(N)nc(N)n1',
    'a_APB': 'Nc1ccc(Oc2cccc(Oc3ccc(N)cc3)c2)cc1',
    'BPA': 'CC(C)(c1ccc(O)cc1)c1ccc(O)cc1',
    'TCP': 'O=C1NC(Cc2ccc(O)cc2)C(=O)NC1Cc1ccc(O)cc1',
}

In [29]:
# Persist the fitted ensemble, including the extra attributes set on it,
# to a pickle file in the current working directory.
with open('Tp_brgr_svr.pkl', 'wb') as model_file:
    pickle.dump(regr, model_file)