In [2]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

train = pd.read_csv('./data/finger_Pharmacophore_fp_train.csv')
train.drop(columns=['id','SMILES'],inplace=True)
train = train.rename(columns = {'Num_H_Acceptors': 'Acceptors','Num_H_Donors': 'Donors','Num_RotatableBonds':'RotatableBonds'})


In [16]:
import optuna
import optuna.logging
from tqdm import tqdm
from sklearn.model_selection import train_test_split
def hype_tune(col):
    x_train, x_valid, y_train, y_valid = train_test_split(train.drop(columns=['HLM',"MLM"]),train[f'{col}'], test_size=0.33, random_state=42)

    dtrain = xgb.DMatrix(data=x_train, label=y_train, enable_categorical=True)
    dvalid = xgb.DMatrix(data=x_valid, label=y_valid, enable_categorical=True)

    def objective(trial):
        param = {
            'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0),
            'gamma': trial.suggest_float('gamma', 1e-3, 10),
            'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0),
            'colsample_bytree': trial.suggest_categorical('colsample_bytree',[0.5, 0.6, 0.7, 0.8, 0.9, 1.0]),
            'subsample': trial.suggest_categorical('subsample', [0.6, 0.7, 0.8, 1.0]),
            'max_depth': trial.suggest_categorical('max_depth', [3, 4, 5, 6]),
            'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
            'eta' : 0.1,
            'tree_method' : "hist",
            'gpu_id': 0
        }

        model = xgb.train(params=param, dtrain=dtrain, num_boost_round=1000,
                        evals=[(dvalid, 'valid')], early_stopping_rounds=50, verbose_eval=False)

        preds = model.predict(dvalid)
        rmse = mean_squared_error(y_valid, preds, squared=False)
        return rmse
        

        

    study = optuna.create_study(direction='minimize', study_name=None)
    with tqdm(total=500) as pbar:  
        def callback(study, trial):
            pbar.update(1)  

        study.optimize(objective, n_trials=500, callbacks=[callback])
        
    df = study.trials_dataframe().sort_values(by=['value'], ascending=[True]).reset_index(drop=True)
    best = study.best_trial
    print("best:",best)
    #df.to_csv('parameters.csv', index=False)
    df.head(5)
    return best

MLM

In [17]:
mlm_best = hype_tune("MLM")

[I 2023-09-12 23:53:05,567] A new study created in memory with name: no-name-1c83019e-bce5-4c13-8ff9-f8fa4755252f
  0%|          | 0/500 [00:00<?, ?it/s][I 2023-09-12 23:53:06,253] Trial 0 finished with value: 32.42421042482447 and parameters: {'reg_lambda': 3.598221995151246, 'gamma': 7.143928261248063, 'reg_alpha': 3.5406860614847755, 'colsample_bytree': 1.0, 'subsample': 0.7, 'max_depth': 3, 'min_child_weight': 251}. Best is trial 0 with value: 32.42421042482447.
  0%|          | 1/500 [00:00<05:41,  1.46it/s][I 2023-09-12 23:53:06,695] Trial 1 finished with value: 32.50107621139295 and parameters: {'reg_lambda': 2.169876271567572, 'gamma': 3.4836914755766566, 'reg_alpha': 3.4676668550548997, 'colsample_bytree': 0.5, 'subsample': 0.8, 'max_depth': 6, 'min_child_weight': 80}. Best is trial 0 with value: 32.42421042482447.
  0%|          | 2/500 [00:01<04:29,  1.85it/s][I 2023-09-12 23:53:07,425] Trial 2 finished with value: 32.25197728239128 and parameters: {'reg_lambda': 0.925715217

best: FrozenTrial(number=389, state=1, values=[32.08521803346823], datetime_start=datetime.datetime(2023, 9, 12, 23, 57, 29, 14341), datetime_complete=datetime.datetime(2023, 9, 12, 23, 57, 29, 609996), params={'reg_lambda': 9.373619279832548, 'gamma': 7.5017827573564535, 'reg_alpha': 6.356655282477824, 'colsample_bytree': 0.5, 'subsample': 0.7, 'max_depth': 4, 'min_child_weight': 216}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'reg_lambda': FloatDistribution(high=10.0, log=False, low=0.001, step=None), 'gamma': FloatDistribution(high=10.0, log=False, low=0.001, step=None), 'reg_alpha': FloatDistribution(high=10.0, log=False, low=0.001, step=None), 'colsample_bytree': CategoricalDistribution(choices=(0.5, 0.6, 0.7, 0.8, 0.9, 1.0)), 'subsample': CategoricalDistribution(choices=(0.6, 0.7, 0.8, 1.0)), 'max_depth': CategoricalDistribution(choices=(3, 4, 5, 6)), 'min_child_weight': IntDistribution(high=300, log=False, low=1, step=1)}, trial_id=389, value=None)





HLM

In [18]:
hlm_best = hype_tune("HLM")

[I 2023-09-12 23:58:46,883] A new study created in memory with name: no-name-5bb0034b-86bd-4297-ae8a-cf27202a53ee
  0%|          | 0/500 [00:00<?, ?it/s][I 2023-09-12 23:58:47,570] Trial 0 finished with value: 32.84022186758119 and parameters: {'reg_lambda': 2.7375720426115366, 'gamma': 2.7057679585413146, 'reg_alpha': 7.669444156308054, 'colsample_bytree': 0.7, 'subsample': 1.0, 'max_depth': 5, 'min_child_weight': 181}. Best is trial 0 with value: 32.84022186758119.
  0%|          | 1/500 [00:00<05:41,  1.46it/s][I 2023-09-12 23:58:48,113] Trial 1 finished with value: 32.676257354229605 and parameters: {'reg_lambda': 2.4490820651760092, 'gamma': 5.065472495269545, 'reg_alpha': 7.586310273249244, 'colsample_bytree': 1.0, 'subsample': 0.7, 'max_depth': 5, 'min_child_weight': 123}. Best is trial 1 with value: 32.676257354229605.
  0%|          | 2/500 [00:01<04:59,  1.66it/s][I 2023-09-12 23:58:49,480] Trial 2 finished with value: 32.88789834124719 and parameters: {'reg_lambda': 6.715958

best: FrozenTrial(number=307, state=1, values=[32.32004021975805], datetime_start=datetime.datetime(2023, 9, 13, 0, 1, 36, 438550), datetime_complete=datetime.datetime(2023, 9, 13, 0, 1, 37, 104478), params={'reg_lambda': 9.383690143515622, 'gamma': 2.021944370520871, 'reg_alpha': 8.481307589427017, 'colsample_bytree': 0.5, 'subsample': 0.7, 'max_depth': 4, 'min_child_weight': 68}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'reg_lambda': FloatDistribution(high=10.0, log=False, low=0.001, step=None), 'gamma': FloatDistribution(high=10.0, log=False, low=0.001, step=None), 'reg_alpha': FloatDistribution(high=10.0, log=False, low=0.001, step=None), 'colsample_bytree': CategoricalDistribution(choices=(0.5, 0.6, 0.7, 0.8, 0.9, 1.0)), 'subsample': CategoricalDistribution(choices=(0.6, 0.7, 0.8, 1.0)), 'max_depth': CategoricalDistribution(choices=(3, 4, 5, 6)), 'min_child_weight': IntDistribution(high=300, log=False, low=1, step=1)}, trial_id=307, value=None)



