In [1]:
import pandas as pd 
import numpy as np 

df_train = pd.read_csv('clean_data/train.csv')
df_test = pd.read_csv('clean_data/test.csv')

In [2]:
target_cols = ['EC1', 'EC2']
num_cols = ['BertzCT', 'Chi1', 'Chi1n', 'Chi1v', 'Chi2n', 'Chi2v', 'Chi3v',
            'Chi4n', 'EState_VSA1', 'EState_VSA2', 'ExactMolWt', 'FpDensityMorgan1',
            'FpDensityMorgan2', 'FpDensityMorgan3', 'HallKierAlpha',
            'HeavyAtomMolWt', 'Kappa3', 'MaxAbsEStateIndex', 'MinEStateIndex',
            'NumHeteroatoms', 'PEOE_VSA10', 'PEOE_VSA14', 'PEOE_VSA6', 'PEOE_VSA7',
            'PEOE_VSA8', 'SMR_VSA10', 'SMR_VSA5', 'SlogP_VSA3', 'VSA_EState9',
            'fr_COO', 'fr_COO2']
binary_cols = ['EC3', 'EC4', 'EC5', 'EC6']

In [3]:
x_train = df_train[num_cols].to_numpy()
y_train = df_train[target_cols].to_numpy()

x_test = df_test[num_cols].to_numpy()

In [4]:
from sklearn.model_selection import train_test_split

x_train, x_cv, y_train, y_cv = train_test_split(x_train, y_train, test_size=0.2, random_state=42)

In [5]:
import xgboost as xgb
from sklearn.metrics import roc_auc_score

regressor_1 = xgb.XGBRegressor(
    n_estimators=100,
    reg_lambda=1,
    gamma=0,
    max_depth=3
)

regressor_2 = xgb.XGBRegressor(
    n_estimators=100,
    reg_lambda=1,
    gamma=0,
    max_depth=3
)

regressor_1.fit(x_train, y_train[:, 0])
regressor_2.fit(x_train, y_train[:, 1])

y_pred_1 = regressor_1.predict(x_cv)
y_pred_2 = regressor_2.predict(x_cv)

auc_score_1 = roc_auc_score(y_cv[:, 0], y_pred_1)
auc_score_2 = roc_auc_score(y_cv[:, 1], y_pred_2)
print("AUC ROC score 1:", auc_score_1)
print("AUC ROC score 2:", auc_score_2)
print("Avg AUC ROC score:", (auc_score_1 + auc_score_2)/2)

AUC ROC score 1: 0.6889777347093291
AUC ROC score 2: 0.5591666666666667
Avg AUC ROC score: 0.624072200687998


## Optuna on EC1: 

In [6]:
import xgboost as xgb
import optuna
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

def objective(trial):
    param = {
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 6),  # Decrease the max_depth parameter
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 10),  # Add regularization parameters with appropriate bounds
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 10),  # Add regularization parameters with appropriate bounds
        'gamma': trial.suggest_float('gamma', 0.01, 1, log=True),
        'random_state': 42,
        'early_stopping_rounds': 10  # Add early_stopping_rounds parameter to the XGBClassifier constructor
    }

    model = xgb.XGBClassifier(**param)

    model.fit(x_train, y_train[:, 0], eval_set=[(x_cv, y_cv[:, 0])], verbose=False)
    y_pred = model.predict_proba(x_cv)[:, 1]
    auc_roc = roc_auc_score(y_cv[:, 0], y_pred)

    return auc_roc

study_1 = optuna.create_study(direction='maximize')
study_1.optimize(objective, n_trials=100)

print('Best trial:', study_1.best_trial.params)
print('Best AUC-ROC:', study_1.best_trial.value)


[I 2023-07-08 13:56:28,623] A new study created in memory with name: no-name-29899c3e-1e49-406c-9dbf-2803369898eb
[I 2023-07-08 13:56:28,773] Trial 0 finished with value: 0.7057013916321022 and parameters: {'n_estimators': 211, 'max_depth': 5, 'learning_rate': 0.053250267923287596, 'subsample': 0.8982807829301442, 'colsample_bytree': 0.5281165315884767, 'reg_alpha': 6.881372493099553, 'reg_lambda': 1.4527792078983248, 'gamma': 0.8131354545278617}. Best is trial 0 with value: 0.7057013916321022.
[I 2023-07-08 13:56:28,902] Trial 1 finished with value: 0.6984181603463032 and parameters: {'n_estimators': 995, 'max_depth': 6, 'learning_rate': 0.028533641272142755, 'subsample': 0.8870189735963625, 'colsample_bytree': 0.9461010773909503, 'reg_alpha': 8.797133336491122, 'reg_lambda': 5.510913352023917, 'gamma': 0.3683263373653865}. Best is trial 0 with value: 0.7057013916321022.
[I 2023-07-08 13:56:29,037] Trial 2 finished with value: 0.7035807163078543 and parameters: {'n_estimators': 499, '

Best trial: {'n_estimators': 413, 'max_depth': 3, 'learning_rate': 0.029945309708004565, 'subsample': 0.6417617939478569, 'colsample_bytree': 0.5506652209185866, 'reg_alpha': 7.875415666651623, 'reg_lambda': 1.5447015338947907, 'gamma': 0.032720904849729913}
Best AUC-ROC: 0.7109753049081572


## Optuna on EC2

In [7]:
import xgboost as xgb
import optuna
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

def objective(trial):
    param = {
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 6),  # Decrease the max_depth parameter
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 10),  # Add regularization parameters with appropriate bounds
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 10),  # Add regularization parameters with appropriate bounds
        'gamma': trial.suggest_float('gamma', 0.01, 1, log=True),
        'random_state': 42,
        'early_stopping_rounds': 10  # Add early_stopping_rounds parameter to the XGBClassifier constructor
    }

    model = xgb.XGBClassifier(**param)

    model.fit(x_train, y_train[:, 1], eval_set=[(x_cv, y_cv[:, 1])], verbose=False)
    y_pred = model.predict_proba(x_cv)[:, 1]
    auc_roc = roc_auc_score(y_cv[:, 1], y_pred)

    return auc_roc

study_2 = optuna.create_study(direction='maximize')
study_2.optimize(objective, n_trials=100)

print('Best trial:', study_2.best_trial.params)
print('Best AUC-ROC:', study_2.best_trial.value)


[I 2023-07-08 13:56:47,693] A new study created in memory with name: no-name-d534cb4b-74cd-4757-a9d1-0798c5779c7e
[I 2023-07-08 13:56:47,807] Trial 0 finished with value: 0.5948980340375587 and parameters: {'n_estimators': 855, 'max_depth': 4, 'learning_rate': 0.06868526538660243, 'subsample': 0.8927891396242273, 'colsample_bytree': 0.5260772959075153, 'reg_alpha': 3.958336181142519, 'reg_lambda': 4.395443977795521, 'gamma': 0.02652139664444022}. Best is trial 0 with value: 0.5948980340375587.
[I 2023-07-08 13:56:47,901] Trial 1 finished with value: 0.5868320862676057 and parameters: {'n_estimators': 323, 'max_depth': 6, 'learning_rate': 0.015401066288159848, 'subsample': 0.5813262027504315, 'colsample_bytree': 0.514867998726682, 'reg_alpha': 4.839359258885638, 'reg_lambda': 6.98951071444676, 'gamma': 0.0441315797345445}. Best is trial 0 with value: 0.5948980340375587.
[I 2023-07-08 13:56:48,114] Trial 2 finished with value: 0.5929368397887325 and parameters: {'n_estimators': 468, 'max

Best trial: {'n_estimators': 353, 'max_depth': 5, 'learning_rate': 0.09038555072447328, 'subsample': 0.8643154043008117, 'colsample_bytree': 0.5575545047682585, 'reg_alpha': 6.753713266029619, 'reg_lambda': 2.848176550878209, 'gamma': 0.015236947758981444}
Best AUC-ROC: 0.6044373532863849


In [11]:
import xgboost as xgb
import optuna
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

def train_model(x_train, y_train, x_cv, y_cv):
    def objective(trial):
        param = {
            'objective': 'binary:logistic',
            'eval_metric': 'auc',
            'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
            'max_depth': trial.suggest_int('max_depth', 3, 6),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
            'subsample': trial.suggest_float('subsample', 0.5, 1),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1),
            'reg_alpha': trial.suggest_float('reg_alpha', 0, 10),
            'reg_lambda': trial.suggest_float('reg_lambda', 0, 10),
            'gamma': trial.suggest_float('gamma', 0.01, 1, log=True),
            'random_state': 42,
            'early_stopping_rounds': 10
        }

        model = xgb.XGBClassifier(**param)

        model.fit(x_train, y_train, eval_set=[(x_cv, y_cv)], verbose=False)
        y_pred = model.predict_proba(x_cv)[:, 1]
        auc_roc = roc_auc_score(y_cv, y_pred)

        return auc_roc

    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=100)

    return study.best_trial.params, study.best_trial.value

# For EC1
best_params_1, best_auc_1 = train_model(x_train, y_train[:, 0], x_cv, y_cv[:, 0])
regressor_1 = xgb.XGBRegressor(**best_params_1)
regressor_1.fit(x_train, y_train[:, 0])
y_pred_1 = regressor_1.predict(x_cv)

# For EC2
best_params_2, best_auc_2 = train_model(x_train, y_train[:, 1], x_cv, y_cv[:, 1])
regressor_2 = xgb.XGBRegressor(**best_params_2)
regressor_2.fit(x_train, y_train[:, 1])
y_pred_2 = regressor_2.predict(x_cv)

auc_score_1 = roc_auc_score(y_cv[:, 0], y_pred_1)
auc_score_2 = roc_auc_score(y_cv[:, 1], y_pred_2)

print("AUC ROC score 1:", auc_score_1)
print("AUC ROC score 2:", auc_score_2)
print("Avg AUC ROC score:", (auc_score_1 + auc_score_2) / 2)


[I 2023-07-08 14:01:17,202] A new study created in memory with name: no-name-3ad7df1c-e424-469c-bc84-ac69c57b6985
[I 2023-07-08 14:01:17,322] Trial 0 finished with value: 0.7054015241293041 and parameters: {'n_estimators': 470, 'max_depth': 5, 'learning_rate': 0.04065458048024234, 'subsample': 0.5425410505013771, 'colsample_bytree': 0.5543850501999128, 'reg_alpha': 0.3660823531967339, 'reg_lambda': 9.237435531451993, 'gamma': 0.2100923963062714}. Best is trial 0 with value: 0.7054015241293041.
[I 2023-07-08 14:01:17,502] Trial 1 finished with value: 0.7052091562973205 and parameters: {'n_estimators': 655, 'max_depth': 3, 'learning_rate': 0.017420159154010923, 'subsample': 0.9642927690482062, 'colsample_bytree': 0.6774076660206876, 'reg_alpha': 1.2561894804799278, 'reg_lambda': 0.5709543892080204, 'gamma': 0.02446073956222002}. Best is trial 0 with value: 0.7054015241293041.
[I 2023-07-08 14:01:17,655] Trial 2 finished with value: 0.7034616437059714 and parameters: {'n_estimators': 694,

AUC ROC score 1: 0.7047122917901112
AUC ROC score 2: 0.5532504401408451
Avg AUC ROC score: 0.6289813659654782


## Output

In [8]:
import xgboost as xgb
from sklearn.metrics import roc_auc_score

# For EC1
regressor_1 = xgb.XGBRegressor(
    n_estimators=study_1.best_trial.params['n_estimators'],
    max_depth=study_1.best_trial.params['max_depth'],
    learning_rate=study_1.best_trial.params['learning_rate'],
    subsample=study_1.best_trial.params['subsample'],
    colsample_bytree=study_1.best_trial.params['colsample_bytree'],
    reg_alpha=study_1.best_trial.params['reg_alpha'],
    reg_lambda=study_1.best_trial.params['reg_lambda'],
    gamma=study_1.best_trial.params['gamma']
)

regressor_1.fit(x_train, y_train[:, 0])

# For EC2
regressor_2 = xgb.XGBRegressor(
    n_estimators=study_2.best_trial.params['n_estimators'],
    max_depth=study_2.best_trial.params['max_depth'],
    learning_rate=study_2.best_trial.params['learning_rate'],
    subsample=study_2.best_trial.params['subsample'],
    colsample_bytree=study_2.best_trial.params['colsample_bytree'],
    reg_alpha=study_2.best_trial.params['reg_alpha'],
    reg_lambda=study_2.best_trial.params['reg_lambda'],
    gamma=study_2.best_trial.params['gamma']
)

regressor_2.fit(x_train, y_train[:, 1])

y_pred_1 = regressor_1.predict(x_cv)
y_pred_2 = regressor_2.predict(x_cv)

auc_score_1 = roc_auc_score(y_cv[:, 0], y_pred_1)
auc_score_2 = roc_auc_score(y_cv[:, 1], y_pred_2)

print("AUC ROC score 1:", auc_score_1)
print("AUC ROC score 2:", auc_score_2)
print("Avg AUC ROC score:", (auc_score_1 + auc_score_2) / 2)


AUC ROC score 1: 0.70695152536375
AUC ROC score 2: 0.550631602112676
Avg AUC ROC score: 0.6287915637382131


In [9]:
y_pred_1_train = regressor_1.predict(x_train)
y_pred_2_train = regressor_2.predict(x_train)

auc_score_1_train = roc_auc_score(y_train[:, 0], y_pred_1_train)
auc_score_2_train = roc_auc_score(y_train[:, 1], y_pred_2_train)

print("AUC ROC score 1:", auc_score_1_train)
print("AUC ROC score 2:", auc_score_2_train)
print("Avg AUC ROC score:", (auc_score_1_train + auc_score_2_train) / 2)

AUC ROC score 1: 0.749340891704833
AUC ROC score 2: 0.9316604642454779
Avg AUC ROC score: 0.8405006779751554


Since the model has higher auc roc for EC2 on train set than cv, its overfitting. 

## Submission file

In [12]:
y_pred_1 = regressor_1.predict(x_test)
y_pred_2 = regressor_2.predict(x_test)

ids = df_test['id']

# creating submission file
df_y_pred_1 = pd.DataFrame({'EC1': y_pred_1})

df_y_pred_2 = pd.DataFrame({'EC2': y_pred_2})

df_ids = pd.DataFrame({'id': ids})

result = pd.concat([df_ids, df_y_pred_1, df_y_pred_2], axis=1)

result.to_csv('submissions/submission_1_xgboost_2.csv', index=False)