In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import VarianceThreshold
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import optuna

In [3]:
df = pd.read_csv('Datasets/sars_cov_ML_dataset.csv')

X = df.drop(columns=['pIC50'])
y = df['pIC50']

selection = VarianceThreshold(threshold=(.8 * (1 - .8)))
X_var = selection.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_var, y, test_size=0.2, random_state=21)

In [10]:
def objective(trial):

    param = {
        'verbosity': 0,
        'objective': 'reg:squarederror',
        'booster': 'gbtree',
        'lambda': trial.suggest_float('lambda', 1e-3, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-3, 10.0, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.1, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10)
    }
    
    model = xgb.XGBRegressor(**param, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred, squared=False)
    
    return mse

# Create an Optuna study
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)

# Print the best hyperparameters
print("Best hyperparameters: ", study.best_params)
print("Best MSE: ", study.best_value)

[I 2024-06-24 18:24:36,022] A new study created in memory with name: no-name-c2a4b9d3-af1c-4cce-b063-ed86bf9a0bcc
[I 2024-06-24 18:24:36,743] Trial 0 finished with value: 0.5519526616212762 and parameters: {'lambda': 0.3720793732734532, 'alpha': 0.10349734136056177, 'subsample': 0.655091087341006, 'colsample_bytree': 0.5105513290263439, 'learning_rate': 0.01808983795792593, 'n_estimators': 664, 'max_depth': 6, 'min_child_weight': 5}. Best is trial 0 with value: 0.5519526616212762.
[I 2024-06-24 18:24:36,927] Trial 1 finished with value: 0.8514215902985016 and parameters: {'lambda': 4.799698077693233, 'alpha': 0.09451686481764342, 'subsample': 0.5646210817000777, 'colsample_bytree': 0.8290566294917222, 'learning_rate': 0.009594621775629149, 'n_estimators': 147, 'max_depth': 5, 'min_child_weight': 7}. Best is trial 0 with value: 0.5519526616212762.
[I 2024-06-24 18:24:37,599] Trial 2 finished with value: 0.5376840505524102 and parameters: {'lambda': 0.05965524026636002, 'alpha': 0.019530

Best hyperparameters:  {'lambda': 0.001666122493049674, 'alpha': 0.016923626892242607, 'subsample': 0.6933768006550779, 'colsample_bytree': 0.9598975110378186, 'learning_rate': 0.02167991402872176, 'n_estimators': 725, 'max_depth': 6, 'min_child_weight': 4}
Best MSE:  0.5181710453788757


In [11]:
# Best hyperparameters:  {'lambda': 0.06040498791964853, 
#                         'alpha': 0.006371376654991307, 
#                         'subsample': 0.6203569193718739, 
#                         'colsample_bytree': 0.9738322428904451, 
#                         'learning_rate': 0.07052530547088731, 
#                         'n_estimators': 466, 
#                         'max_depth': 8, 
#                         'min_child_weight': 6}


best_params = study.best_params
best_model = xgb.XGBRegressor(**best_params, random_state=42)

best_model.fit(X_train, y_train)
y_pred = best_model.predict(X_test)
rmse = mean_squared_error(y_test, y_pred, squared=False)
r2 = best_model.score(X=X_test, y=y_test)

print("R2 with best hyperparameters: ", r2)
print("RMSE with best hyperparameters: ", rmse)

R2 with best hyperparameters:  0.7762469320942736
RMSE with best hyperparameters:  0.5181710453788757
