In [1]:
import pandas as pd 
import numpy as np 

df_train = pd.read_csv('clean_data/train.csv')
df_test = pd.read_csv('clean_data/test.csv')

target_cols = ['EC1', 'EC2']
num_cols = ['BertzCT', 'Chi1', 'Chi1n', 'Chi1v', 'Chi2n', 'Chi2v', 'Chi3v',
            'Chi4n', 'EState_VSA1', 'EState_VSA2', 'ExactMolWt', 'FpDensityMorgan1',
            'FpDensityMorgan2', 'FpDensityMorgan3', 'HallKierAlpha',
            'HeavyAtomMolWt', 'Kappa3', 'MaxAbsEStateIndex', 'MinEStateIndex',
            'NumHeteroatoms', 'PEOE_VSA10', 'PEOE_VSA14', 'PEOE_VSA6', 'PEOE_VSA7',
            'PEOE_VSA8', 'SMR_VSA10', 'SMR_VSA5', 'SlogP_VSA3', 'VSA_EState9',
            'fr_COO', 'fr_COO2']
binary_cols = ['EC3', 'EC4', 'EC5', 'EC6']

x_train = df_train[num_cols].to_numpy()
y_train = df_train[target_cols].to_numpy()

x_test = df_test[num_cols].to_numpy()

In [2]:
import xgboost as xgb
import optuna
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import numpy as np

# Set a fixed random seed for reproducibility
np.random.seed(42)

def train_model(x_train, y_train, x_eval, y_eval):
    def objective(trial):
        param = {
            'objective': 'binary:logistic',
            'eval_metric': 'auc',
            'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
            'max_depth': trial.suggest_int('max_depth', 3, 6),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
            'subsample': trial.suggest_float('subsample', 0.5, 1),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1),
            'reg_alpha': trial.suggest_float('reg_alpha', 0, 10),
            'reg_lambda': trial.suggest_float('reg_lambda', 0, 10),
            'gamma': trial.suggest_float('gamma', 0.01, 1, log=True),
            'random_state': 42,
            'early_stopping_rounds': 10
        }

        model = xgb.XGBClassifier(**param)

        model.fit(x_train, y_train, eval_set=[(x_eval, y_eval)], verbose=False)
        y_pred = model.predict_proba(x_eval)[:, 1]
        auc_roc = roc_auc_score(y_eval, y_pred)

        return auc_roc

    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=100)

    return study.best_trial.params, study.best_trial.value

# Split the data into train and evaluation sets
x_train, x_eval, y_train, y_eval = train_test_split(x_train, y_train, test_size=0.2, random_state=42)

# For EC1
best_params_1, best_auc_1 = train_model(x_train, y_train[:, 0], x_eval, y_eval[:, 0])
classifier_1 = xgb.XGBClassifier(**best_params_1)
classifier_1.fit(x_train, y_train[:, 0])
y_pred_1 = classifier_1.predict_proba(x_eval)[:, 1]

# For EC2
best_params_2, best_auc_2 = train_model(x_train, y_train[:, 1], x_eval, y_eval[:, 1])
classifier_2 = xgb.XGBClassifier(**best_params_2)
classifier_2.fit(x_train, y_train[:, 1])
y_pred_2 = classifier_2.predict_proba(x_eval)[:, 1]

auc_score_1 = roc_auc_score(y_eval[:, 0], y_pred_1)
auc_score_2 = roc_auc_score(y_eval[:, 1], y_pred_2)

print("AUC ROC score 1:", auc_score_1)
print("AUC ROC score 2:", auc_score_2)
print("Avg AUC ROC score:", (auc_score_1 + auc_score_2) / 2)


[I 2023-07-10 10:23:08,074] A new study created in memory with name: no-name-0a20a849-5708-4cf7-8e78-65e46ad573c4
[I 2023-07-10 10:23:08,450] Trial 0 finished with value: 0.7056255246395418 and parameters: {'n_estimators': 289, 'max_depth': 6, 'learning_rate': 0.016046162112628666, 'subsample': 0.5302764436298877, 'colsample_bytree': 0.6857593991533015, 'reg_alpha': 9.078306118615618, 'reg_lambda': 8.106054996299497, 'gamma': 0.02658516980643513}. Best is trial 0 with value: 0.7056255246395418.
[I 2023-07-10 10:23:08,800] Trial 1 finished with value: 0.7059001888702351 and parameters: {'n_estimators': 256, 'max_depth': 3, 'learning_rate': 0.019024088292623136, 'subsample': 0.5980410118782618, 'colsample_bytree': 0.7037567776886573, 'reg_alpha': 8.657568257244419, 'reg_lambda': 5.724257504289324, 'gamma': 0.47656170248597984}. Best is trial 1 with value: 0.7059001888702351.
[I 2023-07-10 10:23:09,048] Trial 2 finished with value: 0.7085318733952202 and parameters: {'n_estimators': 990, 

AUC ROC score 1: 0.7065598459411416
AUC ROC score 2: 0.5656470070422535
Avg AUC ROC score: 0.6361034264916976


In [3]:
print(best_auc_1)
print(best_auc_2)

0.7109184689577985
0.6030927230046949
