In [1]:
import optuna
from functools import partial
import numpy as np
import pandas as pd
import scipy as sp
from scipy import stats
from scipy.stats import norm, skew
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
import matplotlib
font = {'family': 'Yu Mincho'}
matplotlib.rc('font', **font)

pd.set_option('max_columns',1000)
pd.set_option('max_rows',1000)

import warnings
warnings.filterwarnings('ignore')

import re
import geocoder
from geopy.distance import great_circle, vincenty
from tqdm import tqdm
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
import os
import gc
import lightgbm as lgb
from sklearn.model_selection import KFold, train_test_split
from time import time
import datetime
from script import RegressionPredictor
import japanize_matplotlib
# print(os.listdir("././input"))
# print(os.listdir("././submit"))

SEED=1234
n_splits=10

In [2]:
train=pd.read_csv('./input/prep_train1030.csv')
test=pd.read_csv('./input/prep_train1030.csv')
y_train = train['賃料']

In [3]:
drop_col = ['id','賃料']
## 必要な特徴量に絞る
y_train = train['賃料']
y_train_log = np.log1p(y_train)
X_train = train.drop(drop_col,axis=1)
X_test = test.drop(drop_col,axis=1)
features = ['面積','築年数','sta_min','center_dis','loc_lat','loc_lon','総階数','所在階','2017平均単価_mean','山手線','千代田線','日比谷線','丸ノ内線']
X_train = X_train[features]
X_test = X_test[features]

In [4]:
def get_default_parameter_suggestions(trial):
    """
    Get parameter sample for Boosting (like XGBoost, LightGBM)

    Args:
        trial(trial.Trial):

    Returns:
        dict: parameter sample generated by trial object
    """
    return {
        'num_iterations': 50000,
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': 'mae',
        'random_state': 0,
        'verbose': 5000,
        # L2 正則化
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-3, 1e3),
        # L1 正則化
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-3, 1e3),
        # 弱学習木ごとに使う特徴量の割合
        # 0.5 だと全体のうち半分の特徴量を最初に選んで, その範囲内で木を成長させる
        'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree', 0.5, 1.0, .1),
        # 学習データ全体のうち使用する割合
        # colsample とは反対に row 方向にサンプルする
        'subsample': trial.suggest_discrete_uniform('subsample', .5, 1., .1),
        # 木の最大の深さ
        # たとえば 5 の時各弱学習木の各データに対するルールは、最大でも5に制限される.
        'max_depth': trial.suggest_int('max_depth', 3, 8),
        # 末端ノードに含まれる最小のサンプル数
        # これを下回るような分割は作れなくなるため, 大きく設定するとより全体の傾向でしか分割ができなくなる
        # [NOTE]: 数であるのでデータセットの大きさ依存であることに注意
        'min_child_weight': trial.suggest_int('min_child_weight', 5, 40)
    }

    


In [7]:
def objective(X_train,y_train,X_test,trial):
    params=get_default_parameter_suggestions(trial)
    kf = KFold(n_splits=3, random_state=SEED)

    rmses = list()
    feature_importances = pd.DataFrame()
    feature_importances['feature'] = X_train.columns
    predictions = pd.DataFrame({'id':list(test.id.values)})
    oof = pd.DataFrame({'id':list(train.id.values),'y_train':list(y_train.values)})
    training_start_time = time()

    for fold, (trn_idx, val_idx) in enumerate(kf.split(X_train, y_train_log)):
        start_time = time()
        print('Training on fold {}'.format(fold + 1))

        tr_x, tr_y = X_train.iloc[trn_idx], y_train_log.iloc[trn_idx]
        vl_x, vl_y = X_train.iloc[val_idx], y_train_log.iloc[val_idx]
        print(len(tr_x), len(vl_y))

        tr_data = lgb.Dataset(tr_x, label=tr_y)
        vl_data = lgb.Dataset(vl_x, label=vl_y)
        clf = lgb.train(params, tr_data, 5000, valid_sets = [tr_data, vl_data], verbose_eval=500)
        oof.loc[val_idx,'oof'] = np.expm1(clf.predict(vl_x, num_iteration=clf.best_iteration))

        ## アンサンブル
        print('start predict')
        pred = clf.predict(X_test)
        predictions['pred'+str(fold)] = np.expm1(pred)

        feature_importances['fold_{}'.format(fold + 1)] = clf.feature_importance('gain')
        rmses.append(clf.best_score['valid_1']['l1'])

        del tr_x, tr_y, vl_x, vl_y, tr_data, vl_data
        gc.collect()
    return np.sqrt(mean_squared_error(oof['oof'], oof['y_train']))

In [None]:
f = partial(objective,X_train,y_train,X_test)
study = optuna.create_study()
study.optimize(f, n_trials=100)

Training on fold 1
20978 10489
[500]	training's l1: 0.0699343	valid_1's l1: 0.0764836
[1000]	training's l1: 0.064167	valid_1's l1: 0.0735889
[1500]	training's l1: 0.0603062	valid_1's l1: 0.0719009
[2000]	training's l1: 0.0575433	valid_1's l1: 0.0711245
[2500]	training's l1: 0.0551386	valid_1's l1: 0.0704255
[3000]	training's l1: 0.0531699	valid_1's l1: 0.0699534
[3500]	training's l1: 0.0512298	valid_1's l1: 0.069406
[4000]	training's l1: 0.0495814	valid_1's l1: 0.0690664
[4500]	training's l1: 0.0480442	valid_1's l1: 0.0687303
[5000]	training's l1: 0.0466935	valid_1's l1: 0.0685255
[5500]	training's l1: 0.0454199	valid_1's l1: 0.0682491
[6000]	training's l1: 0.0441229	valid_1's l1: 0.0680006
[6500]	training's l1: 0.0429969	valid_1's l1: 0.0678406
[7000]	training's l1: 0.0417684	valid_1's l1: 0.0676624
[7500]	training's l1: 0.0407798	valid_1's l1: 0.0675261
[8000]	training's l1: 0.0397239	valid_1's l1: 0.0673624
[8500]	training's l1: 0.0387525	valid_1's l1: 0.0672373
[9000]	training's l1

[32m[I 2019-10-31 19:53:34,376][0m Finished trial#0 resulted in value: 19663.97094128272. Current best value is 19663.97094128272 with parameters: {'reg_lambda': 56.36549282779517, 'reg_alpha': 0.008503316431288783, 'colsample_bytree': 0.5, 'subsample': 0.7, 'max_depth': 4, 'min_child_weight': 35}.[0m


Training on fold 1
20978 10489
[500]	training's l1: 0.0728494	valid_1's l1: 0.0784248
[1000]	training's l1: 0.0675624	valid_1's l1: 0.0754231
[1500]	training's l1: 0.0639919	valid_1's l1: 0.073706
[2000]	training's l1: 0.0610669	valid_1's l1: 0.0726108
[2500]	training's l1: 0.0587617	valid_1's l1: 0.0719879
[3000]	training's l1: 0.0567798	valid_1's l1: 0.0714818
[3500]	training's l1: 0.0550176	valid_1's l1: 0.0710483
[4000]	training's l1: 0.0534258	valid_1's l1: 0.0705752
[4500]	training's l1: 0.0519601	valid_1's l1: 0.0702269
[5000]	training's l1: 0.0506423	valid_1's l1: 0.0699239
[5500]	training's l1: 0.0494479	valid_1's l1: 0.0696425
[6000]	training's l1: 0.04832	valid_1's l1: 0.0694133
[6500]	training's l1: 0.0472415	valid_1's l1: 0.0692134
[7000]	training's l1: 0.0461847	valid_1's l1: 0.0690606
[7500]	training's l1: 0.0452503	valid_1's l1: 0.0689634
[8000]	training's l1: 0.0443705	valid_1's l1: 0.0688655
[8500]	training's l1: 0.0435416	valid_1's l1: 0.0687908
[9000]	training's l1:

[32m[I 2019-10-31 19:54:04,901][0m Finished trial#1 resulted in value: 19890.00858728532. Current best value is 19663.97094128272 with parameters: {'reg_lambda': 56.36549282779517, 'reg_alpha': 0.008503316431288783, 'colsample_bytree': 0.5, 'subsample': 0.7, 'max_depth': 4, 'min_child_weight': 35}.[0m


Training on fold 1
20978 10489
[500]	training's l1: 0.0737041	valid_1's l1: 0.0785522
[1000]	training's l1: 0.068783	valid_1's l1: 0.0755926
[1500]	training's l1: 0.0654781	valid_1's l1: 0.0739593
[2000]	training's l1: 0.0628766	valid_1's l1: 0.0728321
[2500]	training's l1: 0.0607867	valid_1's l1: 0.0721161
[3000]	training's l1: 0.0590294	valid_1's l1: 0.0715324
[3500]	training's l1: 0.0573955	valid_1's l1: 0.0710176
[4000]	training's l1: 0.0559498	valid_1's l1: 0.0706496
[4500]	training's l1: 0.0545812	valid_1's l1: 0.0703383
[5000]	training's l1: 0.0533488	valid_1's l1: 0.0699975
[5500]	training's l1: 0.0522003	valid_1's l1: 0.0697574
[6000]	training's l1: 0.0511819	valid_1's l1: 0.0695545
[6500]	training's l1: 0.0502122	valid_1's l1: 0.0693586
[7000]	training's l1: 0.0492188	valid_1's l1: 0.0691179
[7500]	training's l1: 0.0483331	valid_1's l1: 0.0690007
[8000]	training's l1: 0.0474836	valid_1's l1: 0.0688416
[8500]	training's l1: 0.0466776	valid_1's l1: 0.0687179
[9000]	training's l

[32m[I 2019-10-31 19:54:37,228][0m Finished trial#2 resulted in value: 20032.082965696878. Current best value is 19663.97094128272 with parameters: {'reg_lambda': 56.36549282779517, 'reg_alpha': 0.008503316431288783, 'colsample_bytree': 0.5, 'subsample': 0.7, 'max_depth': 4, 'min_child_weight': 35}.[0m


Training on fold 1
20978 10489
[500]	training's l1: 0.0842077	valid_1's l1: 0.0874976
[1000]	training's l1: 0.0842077	valid_1's l1: 0.0874976
[1500]	training's l1: 0.0842077	valid_1's l1: 0.0874976
[2000]	training's l1: 0.0842077	valid_1's l1: 0.0874976
[2500]	training's l1: 0.0842077	valid_1's l1: 0.0874976
[3000]	training's l1: 0.0842077	valid_1's l1: 0.0874976
[3500]	training's l1: 0.0842077	valid_1's l1: 0.0874976
[4000]	training's l1: 0.0842077	valid_1's l1: 0.0874976
[4500]	training's l1: 0.0842077	valid_1's l1: 0.0874976
[5000]	training's l1: 0.0842077	valid_1's l1: 0.0874976
[5500]	training's l1: 0.0842077	valid_1's l1: 0.0874976
[6000]	training's l1: 0.0842077	valid_1's l1: 0.0874976
[6500]	training's l1: 0.0842077	valid_1's l1: 0.0874976
[7000]	training's l1: 0.0842077	valid_1's l1: 0.0874976
[7500]	training's l1: 0.0842077	valid_1's l1: 0.0874976
[8000]	training's l1: 0.0842077	valid_1's l1: 0.0874976
[8500]	training's l1: 0.0842077	valid_1's l1: 0.0874976
[9000]	training's 

[32m[I 2019-10-31 19:54:46,262][0m Finished trial#3 resulted in value: 30854.11192084049. Current best value is 19663.97094128272 with parameters: {'reg_lambda': 56.36549282779517, 'reg_alpha': 0.008503316431288783, 'colsample_bytree': 0.5, 'subsample': 0.7, 'max_depth': 4, 'min_child_weight': 35}.[0m


Training on fold 1
20978 10489
[500]	training's l1: 0.059066	valid_1's l1: 0.071751
[1000]	training's l1: 0.0507775	valid_1's l1: 0.0690089
[1500]	training's l1: 0.0451704	valid_1's l1: 0.067614
[2000]	training's l1: 0.041004	valid_1's l1: 0.0667834
[2500]	training's l1: 0.0375027	valid_1's l1: 0.0662056
[3000]	training's l1: 0.034537	valid_1's l1: 0.0658449
[3500]	training's l1: 0.0319983	valid_1's l1: 0.0655124
[4000]	training's l1: 0.0296168	valid_1's l1: 0.0652427
[4500]	training's l1: 0.027615	valid_1's l1: 0.0650209
[5000]	training's l1: 0.0258328	valid_1's l1: 0.0648589
[5500]	training's l1: 0.0242066	valid_1's l1: 0.0646414
[6000]	training's l1: 0.0227341	valid_1's l1: 0.0644779
[6500]	training's l1: 0.0212956	valid_1's l1: 0.0643838
[7000]	training's l1: 0.0200933	valid_1's l1: 0.0642401
[7500]	training's l1: 0.0189876	valid_1's l1: 0.0641601
[8000]	training's l1: 0.0179142	valid_1's l1: 0.0641387
[8500]	training's l1: 0.016959	valid_1's l1: 0.0640732
[9000]	training's l1: 0.0