In [1]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

train = pd.read_csv('./data/train_pca.csv')
train.drop(columns=['SMILES'],inplace=True)



In [5]:
import optuna
import optuna.logging
from tqdm import tqdm
from sklearn.model_selection import train_test_split
def hype_tune(col):
    x_train, x_valid, y_train, y_valid = train_test_split(train.drop(columns=['HLM',"MLM"]),train[f'{col}'], test_size=0.33, random_state=42)

    dtrain = xgb.DMatrix(data=x_train, label=y_train, enable_categorical=True)
    dvalid = xgb.DMatrix(data=x_valid, label=y_valid, enable_categorical=True)

    def objective(trial):
        param = {
            'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0),
            'gamma': trial.suggest_float('gamma', 1e-3, 10),
            'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0),
            'colsample_bytree': trial.suggest_categorical('colsample_bytree',[0.5, 0.6, 0.7, 0.8, 0.9, 1.0]),
            'subsample': trial.suggest_categorical('subsample', [0.6, 0.7, 0.8, 1.0]),
            'max_depth': trial.suggest_categorical('max_depth', [3, 4, 5, 6]),
            'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
            'eta' : 0.1,
            'tree_method' : "hist",
            'gpu_id': 0,
        }

        model = xgb.train(params=param, dtrain=dtrain, num_boost_round=1000,
                        evals=[(dvalid, 'valid')], early_stopping_rounds=50, verbose_eval=False)

        preds = model.predict(dvalid)
        rmse = mean_squared_error(y_valid, preds, squared=False)
        return rmse
        

        

    study = optuna.create_study(direction='minimize', study_name=None)
    with tqdm(total=500) as pbar:  
        def callback(study, trial):
            pbar.update(1)  

        study.optimize(objective, n_trials=500, callbacks=[callback])
        
    df = study.trials_dataframe().sort_values(by=['value'], ascending=[True]).reset_index(drop=True)
    best = study.best_trial
    print("best:",best)
    #df.to_csv('parameters.csv', index=False)
    df.head(5)
    return best

MLM

In [6]:
mlm_best = hype_tune("MLM")

[I 2023-09-21 19:48:44,925] A new study created in memory with name: no-name-3f6b6ee5-2d5d-41b7-b559-c5e295244fec
  0%|          | 0/500 [00:00<?, ?it/s][I 2023-09-21 19:48:45,110] Trial 0 finished with value: 33.217201174768704 and parameters: {'reg_lambda': 3.5471534205138497, 'gamma': 5.530133427896877, 'reg_alpha': 9.203942521381695, 'colsample_bytree': 0.7, 'subsample': 1.0, 'max_depth': 5, 'min_child_weight': 128}. Best is trial 0 with value: 33.217201174768704.
  0%|          | 1/500 [00:00<01:31,  5.45it/s][I 2023-09-21 19:48:45,393] Trial 1 finished with value: 33.0830868792231 and parameters: {'reg_lambda': 1.5044433533979642, 'gamma': 6.300169605076662, 'reg_alpha': 3.326263443293287, 'colsample_bytree': 0.5, 'subsample': 0.8, 'max_depth': 5, 'min_child_weight': 300}. Best is trial 1 with value: 33.0830868792231.
  0%|          | 2/500 [00:00<02:00,  4.12it/s][I 2023-09-21 19:48:45,508] Trial 2 finished with value: 33.184794847099504 and parameters: {'reg_lambda': 0.16097141

best: FrozenTrial(number=215, state=1, values=[32.84134819829901], datetime_start=datetime.datetime(2023, 9, 21, 19, 49, 25, 540856), datetime_complete=datetime.datetime(2023, 9, 21, 19, 49, 25, 714365), params={'reg_lambda': 7.82951409523434, 'gamma': 4.9873883068002405, 'reg_alpha': 7.54907779273947, 'colsample_bytree': 1.0, 'subsample': 0.8, 'max_depth': 4, 'min_child_weight': 291}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'reg_lambda': FloatDistribution(high=10.0, log=False, low=0.001, step=None), 'gamma': FloatDistribution(high=10.0, log=False, low=0.001, step=None), 'reg_alpha': FloatDistribution(high=10.0, log=False, low=0.001, step=None), 'colsample_bytree': CategoricalDistribution(choices=(0.5, 0.6, 0.7, 0.8, 0.9, 1.0)), 'subsample': CategoricalDistribution(choices=(0.6, 0.7, 0.8, 1.0)), 'max_depth': CategoricalDistribution(choices=(3, 4, 5, 6)), 'min_child_weight': IntDistribution(high=300, log=False, low=1, step=1)}, trial_id=215, value=None)





HLM

In [7]:
hlm_best = hype_tune("HLM")

[I 2023-09-21 19:50:25,627] A new study created in memory with name: no-name-dfbf7f93-b748-4ec6-b99b-bafa4390e69c
  0%|          | 0/500 [00:00<?, ?it/s]

[I 2023-09-21 19:50:25,841] Trial 0 finished with value: 33.275736026486115 and parameters: {'reg_lambda': 7.232497124710596, 'gamma': 7.635028822738516, 'reg_alpha': 4.731180143133632, 'colsample_bytree': 0.9, 'subsample': 0.6, 'max_depth': 6, 'min_child_weight': 235}. Best is trial 0 with value: 33.275736026486115.
  0%|          | 1/500 [00:00<01:45,  4.72it/s][I 2023-09-21 19:50:26,041] Trial 1 finished with value: 33.24974194035999 and parameters: {'reg_lambda': 3.035824550727837, 'gamma': 5.090949333538461, 'reg_alpha': 0.2516907803145133, 'colsample_bytree': 0.5, 'subsample': 0.8, 'max_depth': 5, 'min_child_weight': 291}. Best is trial 1 with value: 33.24974194035999.
  0%|          | 2/500 [00:00<01:42,  4.88it/s][I 2023-09-21 19:50:26,147] Trial 2 finished with value: 33.24020808704199 and parameters: {'reg_lambda': 0.009346401587657958, 'gamma': 5.798398661194316, 'reg_alpha': 1.7192497585600628, 'colsample_bytree': 0.9, 'subsample': 0.8, 'max_depth': 5, 'min_child_weight': 1

best: FrozenTrial(number=41, state=1, values=[33.03524654070839], datetime_start=datetime.datetime(2023, 9, 21, 19, 50, 30, 704049), datetime_complete=datetime.datetime(2023, 9, 21, 19, 50, 30, 840065), params={'reg_lambda': 0.5940874042694309, 'gamma': 5.657703439375733, 'reg_alpha': 9.066908326467557, 'colsample_bytree': 0.9, 'subsample': 1.0, 'max_depth': 3, 'min_child_weight': 19}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'reg_lambda': FloatDistribution(high=10.0, log=False, low=0.001, step=None), 'gamma': FloatDistribution(high=10.0, log=False, low=0.001, step=None), 'reg_alpha': FloatDistribution(high=10.0, log=False, low=0.001, step=None), 'colsample_bytree': CategoricalDistribution(choices=(0.5, 0.6, 0.7, 0.8, 0.9, 1.0)), 'subsample': CategoricalDistribution(choices=(0.6, 0.7, 0.8, 1.0)), 'max_depth': CategoricalDistribution(choices=(3, 4, 5, 6)), 'min_child_weight': IntDistribution(high=300, log=False, low=1, step=1)}, trial_id=41, value=None)





In [8]:
print("mlm",mlm_best.values)
print(mlm_best.params)
print("hlm",hlm_best.values)
print(hlm_best.params)

mlm [32.84134819829901]
{'reg_lambda': 7.82951409523434, 'gamma': 4.9873883068002405, 'reg_alpha': 7.54907779273947, 'colsample_bytree': 1.0, 'subsample': 0.8, 'max_depth': 4, 'min_child_weight': 291}
hlm [33.03524654070839]
{'reg_lambda': 0.5940874042694309, 'gamma': 5.657703439375733, 'reg_alpha': 9.066908326467557, 'colsample_bytree': 0.9, 'subsample': 1.0, 'max_depth': 3, 'min_child_weight': 19}
