In [0]:
! pip install optuna

In [0]:
import numpy as np
import sklearn.metrics
from xgboost import XGBRegressor
import optuna
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

In [0]:
'''
データのロード
'''
import pickle

pickle_file = "./droped_HousePrice_data_kernel.pickle"
with open(pickle_file, 'rb') as f:
    X_train = pickle.load(f)
    Y_train = pickle.load(f)
    X_test = pickle.load(f)
#     train = pickle.load(f)
#     test = pickle.load(f)
#     Id = pickle.load(f)

In [0]:
# (X_train, X_val, Y_train, Y_val) = train_test_split(X_train, Y_train, test_size=0.2, random_state=666)

In [0]:
'''
ハイパーパラメータの最適化
'''
def objective(trial):
    # hypyer param
    max_depth = trial.suggest_int('max_depth', 3, 7) 
    n_estimators = trial.suggest_int('n_estimators', 1800, 3500) 
    max_features = trial.suggest_categorical('max_features', ['sqrt', 'auto', 'log2'])
    learning_rate = trial.suggest_uniform('learning_rate', 0.005, 0.06)
    min_child_weight = trial.suggest_int('min_child_weight', 1, 5)
    colsample_bytree = trial.suggest_uniform('colsample_bytree', 0.1, 1)
    subsample= trial.suggest_uniform('subsample', 0.1, 1)
    eta = trial.suggest_uniform('eta', 0.1, 0.9)
 
    # callback
    pruning_callback = optuna.integration.XGBoostPruningCallback(trial, 'validation-error')
 
    # model
    model = XGBRegressor(
        max_depth=max_depth,
        n_estimators=n_estimators,
        max_features=max_features,
        learning_rate= learning_rate,
        min_child_weight= min_child_weight, 
        colsample_bytree = colsample_bytree, 
        subsample = subsample, 
        eta = eta, 
        n_jobs=-1,
        random_state = 1,
#         callback=[pruning_callback]   # cross_validationをするときは除外
    )
    

    kf = KFold(5, shuffle=True, random_state=1).get_n_splits(X_train)
    rmse = np.sqrt(-cross_val_score(model, X_train,Y_train, scoring="neg_mean_squared_error", cv=kf, verbose=0))
    
    return rmse.mean()

In [0]:
#     # fit
#     model.fit(X_train, 
#               Y_train, 
#               eval_metric="rmse", 
#               eval_set=[
#                   (X_train, Y_train),
#                   (X_val, Y_val)
#               ], 
#               verbose=True, 
#               early_stopping_rounds = 10)
 
#     # eval
#     score = model.evals_result()['validation_1']['rmse'][-1]
#     return score


In [26]:
# 学習の実行
# https://optuna.readthedocs.io/en/stable/reference/study.html#optuna.study.Study.optimize
study = optuna.create_study()
study.optimize(func=objective, # 実行する関数
               n_trials=100, # HPO試行回数
               timeout=None, # 与えられた秒数後に学習を中止します。default=None
               n_jobs=-1 # 並列実行するjob数
              )


Series.base is deprecated and will be removed in a future version


Series.base is deprecated and will be removed in a future version


Series.base is deprecated and will be removed in a future version


Series.base is deprecated and will be removed in a future version


Series.base is deprecated and will be removed in a future version


Series.base is deprecated and will be removed in a future version


Series.base is deprecated and will be removed in a future version


Series.base is deprecated and will be removed in a future version


Series.base is deprecated and will be removed in a future version

[I 2019-05-23 04:09:38,281] Finished trial#1 resulted in value: 0.12689996993408126. Current best value is 0.12689996993408126 with parameters: {'max_depth': 4, 'n_estimators': 2775, 'max_features': 'sqrt', 'learning_rate': 0.05713597666854238, 'min_child_weight': 4, 'colsample_bytree': 0.593349003844969, 'subsample': 0.5057505303815958, 'eta': 0.7221946046606826}.

Series.base is depr

In [0]:
'''
モデルの構築
'''
model = XGBRegressor(
        max_depth = 4,
        n_estimators =2385,
        max_features = 'auto',
        learning_rate = 0.021049677273288197,
        min_child_weight= 1, 
        colsample_bytree=0.17604967495337845, 
        subsample=0.852736097498378, 
        eta=0.2916808727686327, 
        n_jobs=-1,
        random_state = 1
    )

In [20]:
from sklearn.model_selection import cross_val_score
kf = KFold(5, shuffle=True, random_state=1).get_n_splits(X_train)
rmse = np.sqrt(-cross_val_score(model, X_train,Y_train, scoring="neg_mean_squared_error", cv=kf, verbose=0))
print('rmse:',rmse.mean())


Series.base is deprecated and will be removed in a future version


Series.base is deprecated and will be removed in a future version


Series.base is deprecated and will be removed in a future version


Series.base is deprecated and will be removed in a future version


Series.base is deprecated and will be removed in a future version



rmse: 0.11838244070959343


In [21]:
model.fit(X_train, Y_train)


Series.base is deprecated and will be removed in a future version


Series.base is deprecated and will be removed in a future version



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bytree=0.17604967495337845, eta=0.2916808727686327,
             gamma=0, importance_type='gain',
             learning_rate=0.021049677273288197, max_delta_step=0, max_depth=4,
             max_features='auto', min_child_weight=1, missing=None,
             n_estimators=2385, n_jobs=-1, nthread=None, objective='reg:linear',
             random_state=1, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             seed=None, silent=True, subsample=0.852736097498378)

In [22]:
'''
提出
'''
predicted_prices = np.expm1(model.predict(X_test))
print(predicted_prices[:5])

my_submission = pd.DataFrame({'Id': Id, 'SalePrice': predicted_prices})
my_submission.Id = my_submission.Id.astype(int)
my_submission.to_csv('submission.csv', index=False)

[123728.61 160061.14 192825.69 194984.47 187178.61]
