In [2]:
import os
import pickle

import numpy as np
import optuna
import pandas as pd
import sklearn
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import cross_validate

def objective(trial):
    """Optuna objective: fit a GBDT on the train split and score it on the test split.

    Hyper-parameters sampled from ``trial``:
      * ``n_estimators``  - integer multiplier in [2, 16]; the model uses 32x this value
      * ``learning_rate`` - 0.01 .. 0.20 in steps of 0.01
      * ``max_depth``     - integer in [1, 5]
      * ``subsample``     - 0.5 .. 1.0 in steps of 0.05

    Reads the notebook-level globals ``X_train``, ``y_train``, ``X_test`` and
    ``y_test``.

    Returns:
        ``|MAE_test - MAE_train| - R2_test``. Smaller values mean a smaller
        train/test gap and a higher test R2, so the study should minimize it.

    NOTE(review): the score is computed on the test split, so any test metric
    reported for the selected model is optimistically biased. ``objective_cv``
    is the cross-validated alternative.
    """
    # suggest_int samples integers directly; the values stored in
    # study.best_params are then ints (callers that wrap them in int() still work).
    n_estimators = 32 * trial.suggest_int("n_estimators", 2, 16)
    learning_rate = trial.suggest_float("learning_rate", 0.01, 0.20, step=0.01)
    max_depth = trial.suggest_int("max_depth", 1, 5)
    subsample = trial.suggest_float("subsample", 0.5, 1, step=0.05)

    model = GradientBoostingRegressor(
        n_estimators=n_estimators, learning_rate=learning_rate, max_depth=max_depth, random_state=42,
        loss='squared_error', subsample=subsample).fit(X_train, y_train)

    # Predict once per split and reuse the result for every metric.
    pred_train = model.predict(X_train)
    pred_test = model.predict(X_test)

    mae_gap = abs(mean_absolute_error(y_test, pred_test) - mean_absolute_error(y_train, pred_train))
    score = mae_gap - r2_score(y_test, pred_test)
    return score

def objective_cv(trial):
    """Optuna objective: 5-fold cross-validated score of a GBDT configuration.

    Samples the same search space as ``objective`` (``n_estimators`` multiplier
    in [2, 16] scaled by 32, ``learning_rate``, ``max_depth``, ``subsample``)
    and evaluates it with ``cross_validate`` on the notebook-level globals
    ``X_std`` / ``Y_std``.

    Returns:
        The negated mean of ``cross_validate(...)['test_score']`` over the 5
        folds, so that minimizing this value maximizes the cross-validated
        score (the estimator's default scorer; R2 for scikit-learn regressors).
    """
    n_estimators = int(32*trial.suggest_float("n_estimators", 2, 16, step=1))
    learning_rate = trial.suggest_float("learning_rate", 0.01, 0.20, step=0.01)
    max_depth = int(trial.suggest_float("max_depth", 1, 5, step=1))
    subsample = trial.suggest_float("subsample", 0.5, 1, step=0.05)

    # The estimator is passed unfitted: cross_validate clones and refits it on
    # every fold, so fitting it here first would only waste a full training run
    # (and needlessly require X_train / y_train to exist).
    model = GradientBoostingRegressor(
        n_estimators=n_estimators, learning_rate=learning_rate, max_depth=max_depth, random_state=42,
        loss='squared_error', subsample=subsample)
    score = -1 * (cross_validate(model, X_std, Y_std, cv=5)['test_score'].mean())
    return score

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# train/test
# Train the 'vis' GBDT: load the descriptor table, standardize it, search
# hyper-parameters with Optuna, refit the best configuration and pickle it
# together with the feature names and the standardization statistics.
path = './vis/'
df_name = 'result_clean_des_863_256_23_12.csv'
# Shuffle once with a fixed seed before splitting. The evaluation ("draw") cell
# rebuilds its train/test labels from this same file with
# sample(frac=1, random_state=824), so the training rows must be chosen the
# same way or those labels would not describe the model's real training set.
df = pd.read_csv(path+df_name).sample(frac=1, random_state=824)
# Every column but the last is a feature; the last column is the target.
X = np.array(df.iloc[:,:-1], dtype=float)
Y = np.array(df.iloc[:,-1], dtype=float)

# NOTE(review): the statistics below are computed over all rows (train and
# test), so the test split leaks into the scaling. Confirm whether that is
# acceptable before reporting test metrics.
x_mean = np.nanmean(X, axis=0)
x_std = np.nanstd(X, axis=0)
y_mean = np.nanmean(Y, axis=0)
y_std = np.nanstd(Y, axis=0)

# The 1e-9 term avoids division by zero for constant columns; remaining NaNs
# (missing inputs) are replaced by 0, i.e. the column mean after scaling.
X_std = (X-x_mean)/(1e-9+x_std)
Y_std = (Y-y_mean)/(1e-9+y_std)
X_std[np.isnan(X_std)] = 0
Y_std[np.isnan(Y_std)] = 0

ratio = 0.8
size = int(ratio* len(df))

X_train, X_test = X_std[:size], X_std[size:]
y_train, y_test = Y_std[:size], Y_std[size:]

# Fixed-parameter alternative without a search:
# n_estimators=256, learning_rate=0.1, max_depth=3, subsample=0.9.

# Seed the sampler so that Restart & Run All reproduces the same search.
study = optuna.create_study(sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=64)

est = GradientBoostingRegressor(
        n_estimators=int(study.best_params['n_estimators']*32),
        learning_rate=study.best_params['learning_rate'],
        max_depth=int(study.best_params['max_depth']),
        random_state=42,
        loss='squared_error',
        subsample=study.best_params['subsample']
).fit(X_train, y_train)

# Predict once per split and reuse the predictions for every metric.
pred_train = est.predict(X_train)
pred_test = est.predict(X_test)
mse_train = mean_squared_error(y_train, pred_train)
mse_test = mean_squared_error(y_test, pred_test)
# Printed as "train test" pairs: MAE, MSE, RMSE, R2 (in standardized target units).
print(mean_absolute_error(y_train, pred_train), mean_absolute_error(y_test, pred_test))
print(mse_train, mse_test)
print(np.sqrt(mse_train), np.sqrt(mse_test))
print(r2_score(y_train, pred_train), r2_score(y_test, pred_test))


# Attach what is needed to apply the model later: column order and scaling stats.
est.features_name = list(df.columns)[:-1]
est.standarder = {'x':[x_mean, x_std], 'y':[y_mean, y_std]}
# Make sure the output directory exists so the search result is not lost
# to a FileNotFoundError at the very end.
os.makedirs('models', exist_ok=True)
with open('models/vis_GBDT.pkl', 'wb') as f:
    pickle.dump(est, f)

In [None]:
# draw
# Apply the pickled 'td' model to its training table, store the predictions
# with a train/test label, and print train/test metrics in original units.
path = 'models/'
target = 'td'
train_name = 'td_clean_des_1115_256_61_18'
model_name1 = '%s_GBDT.pkl'%target
# Seed for the row shuffle below. This cell previously used `random_state=i`,
# where `i` was a leftover loop variable that is undefined on a fresh kernel
# (NameError).
# NOTE(review): 824 is the seed used by the 'vis' evaluation cell; confirm it
# is also the shuffle the 'td' model was trained with, otherwise the
# train/test labels below will not match the model's real training rows.
shuffle_seed = 824

# pickle.load can execute arbitrary code: only load model files you created.
with open(path + model_name1, 'rb') as f:
    model = pickle.load(f)

# Standardization statistics stored on the model when it was trained.
[x_mean, x_std] = model.standarder['x']
[y_mean, y_std] = model.standarder['y']

print(model.features_name)

candidates = pd.read_csv('%s/%s.csv'%(target, train_name)).sample(frac=1, random_state=shuffle_seed)
candidates_features = candidates[model.features_name]

# Scale the inputs exactly as in training, predict, then undo the target scaling.
x = candidates_features
x_ = (x-x_mean)/(1e-9+x_std)
x_ = x_.fillna(0)
pre = model.predict(x_)
pre_ = pre*(1e-9+y_std) + y_mean

# `df` is the same object as `candidates`; the new columns appear on both.
df = candidates
df['%s_pre'%target] = pre_
# First 80% of the shuffled rows are labelled 'train', the rest 'test'.
df['split'] = int(len(pre_)*0.8) * ['train'] + (len(pre_) - int(len(pre_)*0.8)) * ['test']

df.to_csv('%s_pre.csv'%target)
num = int(len(df)*0.8)
y_train = df['Td5'][:num]
pre_train = df['%s_pre'%target][:num]
y_test = df['Td5'][num:]
pre_test = df['%s_pre'%target][num:]

# Compute MSE once per split and reuse it for the RMSE line.
mse_train = mean_squared_error(y_train, pre_train)
mse_test = mean_squared_error(y_test, pre_test)
# Printed as "train test" pairs: MAE, MSE, RMSE, R2.
print(mean_absolute_error(y_train, pre_train), mean_absolute_error(y_test, pre_test))
print(mse_train, mse_test)
print(np.sqrt(mse_train), np.sqrt(mse_test))
print(r2_score(y_train, pre_train), r2_score(y_test, pre_test))

In [9]:
# draw
# Score the pickled 'vis' model on its own training table: predict every row,
# write the predictions with a train/test label to CSV, print the metrics.
target = 'vis'
train_name = 'result_clean_des_863_256_23_12'
path = 'models/'
model_name1 = '%s_GBDT.pkl'%target

# pickle.load can execute arbitrary code: only load model files you created.
with open(path + model_name1, 'rb') as f:
    model = pickle.load(f)

# Scaling statistics that were attached to the estimator when it was saved.
x_mean, x_std = model.standarder['x']
y_mean, y_std = model.standarder['y']

print(model.features_name)

# Shuffle the table with a fixed seed, then keep only the model's feature columns.
candidates = pd.read_csv('%s/%s.csv'%(target, train_name)).sample(frac=1, random_state=824)
candidates_features = candidates[model.features_name]

# Standardize inputs (missing values -> 0), predict, and map back to target units.
x = candidates_features
x_ = ((x-x_mean)/(1e-9+x_std)).fillna(0)
pre = model.predict(x_)
pre_ = pre*(1e-9+y_std) + y_mean

# Leading 80% of the shuffled rows are tagged 'train', the remainder 'test'.
num = int(len(pre_)*0.8)
pred_col = '%s_pre'%target

# `df` aliases `candidates`, so the added columns show up on both names.
df = candidates
df[pred_col] = pre_
df['split'] = ['train'] * num + ['test'] * (len(pre_) - num)
df.to_csv('%s_pre.csv'%target)

y_train, y_test = df['log_110'][:num], df['log_110'][num:]
pre_train, pre_test = df[pred_col][:num], df[pred_col][num:]

# One "train test" line per metric, in this order: MAE, MSE, RMSE, R2.
for metric in (
    mean_absolute_error,
    mean_squared_error,
    lambda truth, guess: np.sqrt(mean_squared_error(truth, guess)),
    r2_score,
):
    print(metric(y_train, pre_train), metric(y_test, pre_test))

['VE1_A', 'ATSC2Z', 'ATSC0i', 'AATSC2c', 'MATS2m', 'MATS7are', 'GATS7dv', 'GATS4s', 'ETA_eta_BR', 'EState_VSA5', 'MID_O', 'BCUT2D_CHGHI']
1.675671066146058e-06 0.06288970821413671
4.091082222420633e-12 0.007022584481296651
2.0226423861920407e-06 0.08380086205580854
0.9999999998306028 0.7491116802362363


