In [1]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_validate

In [2]:
import optuna

def objective(trial):
    """Optuna objective scored on the fixed train/test split.

    Samples the hyper-parameters of a ``GradientBoostingRegressor`` from
    ``trial``, fits it on the module-level ``X_train``/``y_train`` and
    returns ``|MAE_test - MAE_train| - R2_test`` (lower is better), where the
    test metrics are computed on the module-level ``X_test``/``y_test``.

    Parameters
    ----------
    trial : optuna trial object
        Used only through ``trial.suggest_float``.

    Returns
    -------
    float
        Train/test MAE gap minus the test R^2.
    """
    # Dict values are evaluated in order, so the parameters are suggested in
    # the sequence n_estimators, learning_rate, max_depth, subsample.
    hyperparams = {
        # Sampled in units of 32 trees: 2..16 -> 64..512 estimators.
        "n_estimators": int(32 * trial.suggest_float("n_estimators", 2, 16, step=1)),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.20, step=0.01),
        "max_depth": int(trial.suggest_float("max_depth", 1, 5, step=1)),
        "subsample": trial.suggest_float("subsample", 0.5, 1, step=0.05),
    }

    regressor = GradientBoostingRegressor(
        random_state=42, loss='squared_error', **hyperparams)
    regressor.fit(X_train, y_train)

    test_pred = regressor.predict(X_test)
    train_pred = regressor.predict(X_train)

    mae_gap = np.abs(
        mean_absolute_error(y_test, test_pred)
        - mean_absolute_error(y_train, train_pred))
    return mae_gap - r2_score(y_test, test_pred)

def objective_cv(trial):
    """Optuna objective: negated mean 5-fold cross-validation score.

    Samples the hyper-parameters of a ``GradientBoostingRegressor`` from
    ``trial`` and evaluates an *unfitted* estimator with
    ``cross_validate(..., cv=5)`` on the module-level ``X_std``/``Y_std``.
    No ``scoring`` is passed, so the estimator's default score is used
    (R^2 for scikit-learn regressors).

    Parameters
    ----------
    trial : optuna trial object
        Used only through ``trial.suggest_float``.

    Returns
    -------
    float
        ``-mean(test_score)`` over the 5 folds, so that minimising this
        value maximises the cross-validated score.
    """
    # Sampled in units of 32 trees: 2..16 -> 64..512 estimators.
    n_estimators = int(32*trial.suggest_float("n_estimators", 2, 16, step=1))
    learning_rate = trial.suggest_float("learning_rate", 0.01, 0.20, step=0.01)
    max_depth = int(trial.suggest_float("max_depth", 1, 5, step=1))
    subsample = trial.suggest_float("subsample", 0.5, 1, step=0.05)

    # The estimator is intentionally NOT fitted here: cross_validate fits its
    # own clone on every fold, so a prior .fit(X_train, y_train) only cost an
    # extra full training run per trial without affecting the score.
    model = GradientBoostingRegressor(
        n_estimators=n_estimators, learning_rate=learning_rate, max_depth=max_depth, random_state=42,
        loss='squared_error', subsample=subsample)

    # NOTE(review): X_std/Y_std are the full data set, i.e. they include the
    # rows that are later held out as X_test/y_test, so hyper-parameters are
    # tuned with knowledge of the hold-out rows -- confirm this is intended.
    fold_scores = cross_validate(model, X_std, Y_std, cv=5)['test_score']
    return -1 * fold_scores.mean()

  from .autonotebook import tqdm as notebook_tqdm


In [28]:
# cross valid
path = './sp_p/'
df_name = 'SP_P_clean_des_977_256_41_15.csv'
df = pd.read_csv(path+df_name)
X = np.array(df.iloc[:,:-1], dtype=float)
Y = np.array(df.iloc[:,-1], dtype=float)

x_mean = np.nanmean(X, axis=0)
x_std = np.nanstd(X, axis=0)
y_mean = np.nanmean(Y, axis=0)
y_std = np.nanstd(Y, axis=0)

X_std = (X-x_mean)/(1e-9+x_std)
Y_std = (Y-y_mean)/(1e-9+y_std)
X_std[np.isnan(X_std)] = 0
Y_std[np.isnan(Y_std)] = 0

ratio = 0.8
size = int(ratio* len(df))

X_train, X_test = X_std[:size], X_std[size:]
y_train, y_test = Y_std[:size], Y_std[size:]

est = GradientBoostingRegressor(
    n_estimators=256, learning_rate=0.1, max_depth=3, random_state=42,
    loss='squared_error', subsample=0.9).fit(X_train, y_train)

study = optuna.create_study()
study.optimize(objective_cv, n_trials=64)

est = GradientBoostingRegressor(
        n_estimators=int(study.best_params['n_estimators']*32),
        learning_rate=study.best_params['learning_rate'],
        max_depth=int(study.best_params['max_depth']),
        random_state=42,
        loss='squared_error',
        subsample=study.best_params['subsample']
).fit(X_train, y_train)
print(mean_absolute_error(y_train, est.predict(X_train)), mean_absolute_error(y_test, est.predict(X_test)))
print(mean_squared_error(y_train, est.predict(X_train)), mean_squared_error(y_test, est.predict(X_test)))
print(np.sqrt(mean_squared_error(y_train, est.predict(X_train))), np.sqrt(mean_squared_error(y_test, est.predict(X_test))))
print(r2_score(y_train, est.predict(X_train)), r2_score(y_test, est.predict(X_test)))

[I 2024-07-25 11:18:46,271] A new study created in memory with name: no-name-0bd2efa2-4a5d-4f1d-aa59-851f18e87958
[I 2024-07-25 11:18:47,652] Trial 0 finished with value: -0.5211468191111048 and parameters: {'n_estimators': 8.0, 'learning_rate': 0.09, 'max_depth': 2.0, 'subsample': 0.75}. Best is trial 0 with value: -0.5211468191111048.
[I 2024-07-25 11:18:49,134] Trial 1 finished with value: -0.5231615645104772 and parameters: {'n_estimators': 6.0, 'learning_rate': 0.14, 'max_depth': 3.0, 'subsample': 0.8500000000000001}. Best is trial 1 with value: -0.5231615645104772.
[I 2024-07-25 11:18:50,360] Trial 2 finished with value: -0.49751193219606443 and parameters: {'n_estimators': 4.0, 'learning_rate': 0.08, 'max_depth': 5.0, 'subsample': 0.7}. Best is trial 1 with value: -0.5231615645104772.
[I 2024-07-25 11:18:51,030] Trial 3 finished with value: -0.3173645670835995 and parameters: {'n_estimators': 4.0, 'learning_rate': 0.01, 'max_depth': 2.0, 'subsample': 0.7}. Best is trial 1 with v

[I 2024-07-25 11:19:45,870] Trial 35 finished with value: -0.5189668084502097 and parameters: {'n_estimators': 3.0, 'learning_rate': 0.06999999999999999, 'max_depth': 3.0, 'subsample': 0.65}. Best is trial 27 with value: -0.5277112654486132.
[I 2024-07-25 11:19:46,709] Trial 36 finished with value: -0.5141687706182356 and parameters: {'n_estimators': 5.0, 'learning_rate': 0.09999999999999999, 'max_depth': 2.0, 'subsample': 0.7}. Best is trial 27 with value: -0.5277112654486132.
[I 2024-07-25 11:19:48,521] Trial 37 finished with value: -0.49234969483815494 and parameters: {'n_estimators': 7.0, 'learning_rate': 0.08, 'max_depth': 5.0, 'subsample': 0.55}. Best is trial 27 with value: -0.5277112654486132.
[I 2024-07-25 11:19:49,391] Trial 38 finished with value: -0.5146693351823333 and parameters: {'n_estimators': 4.0, 'learning_rate': 0.12, 'max_depth': 3.0, 'subsample': 0.7}. Best is trial 27 with value: -0.5277112654486132.
[I 2024-07-25 11:19:50,026] Trial 39 finished with value: -0.49

0.11223992115986102 0.2903794770959174
0.019557755023546022 0.16634566036792608
0.13984904369907583 0.40785495015743783
0.9813813932336299 0.7908770035018319


In [30]:
# 5-fold cross-validation of the tuned estimator on the full standardized
# data set; prints the mean and standard deviation of the per-fold scores.
cv_result = cross_validate(est, X_std, Y_std, cv=5)
scores = cv_result['test_score']
score_mean, score_spread = scores.mean(), scores.std()
print(score_mean, score_spread)

0.5327097188702405 0.20829122205320896
