In [67]:
import pandas as pd
import pandas_profiling as pdp
import lightgbm as lgb
import numpy as np
import math
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import accuracy_score
import optuna

In [18]:
train = pd.read_csv('../processed_data/train_v10.csv')
test = pd.read_csv('../processed_data/test_v10.csv')

In [26]:
use_col = train.columns

un_use_col = ['id','y','log_y','high_price_label','location', 'access', 'layout', 'age', 'direction', 'area','floor', 'bath_toilet', 'kitchen',
             'broadcast_com', 'facilities','parking', 'enviroment', 'structure', 'contract_period',
             'walk_time','23ku',
            #  'area_num_countall','floor_countall','room_num_countall','facilities_countall','age_countall','area_num_countall',
            ]
use_col = [c for c in use_col if c not in un_use_col]

In [27]:
X, y = train.loc[:,use_col], train.loc[:,'high_price_label']

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [29]:
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

In [52]:
lgbm_params = {
        # 多値分類問題
        'objective': 'binary',
        'metric': 'accuracy',
        'stratified': True
    }

# 上記のパラメータでモデルを学習する
model = lgb.train(lgbm_params, lgb_train, valid_sets=lgb_eval)

# テストデータを予測する
y_pred = model.predict(X_test, num_iteration=model.best_iteration)

In [93]:
pred_label = [1 if i>0.5 else 0 for i in y_pred]

In [94]:
accuracy = accuracy_score(y_test,pred_label)

In [95]:
accuracy

0.9956572396991844

In [66]:
pd.DataFrame({
    'gt': y_test,
    'pred': pred_label
})

Unnamed: 0,gt,pred
27300,0,0
2953,0,0
8326,0,0
2921,0,0
19929,0,0
...,...,...
17358,0,0
4024,0,0
9721,0,0
12102,0,0


In [88]:
def objective(trial):

    learning_rate = trial.suggest_uniform('learning_rate', 0, 1.0)
    num_leaves = trial.suggest_int('num_leaves', 10, 2**8)
    max_depth = trial.suggest_int('max_depth', 3, 8)
    min_data_in_leaf = trial.suggest_int('min_data_in_leaf', 2, 1000)

    lgbm_params = {
        'task': 'train',
        "metric": 'binary',
        'boosting_type': 'gbdt',
        'objective': 'binary',
        "learning_rate": learning_rate,
        "num_leaves": num_leaves,
        "max_depth": max_depth,
        "n_jobs": 1,
        'verbose': -1,
        "seed": 0
    }

    cv_results = lgb.cv(lgbm_params, lgb_train, nfold=3, stratified=True)
    score = np.array(cv_results['binary_logloss-mean']).mean()
    # accuracy = accuracy_score(y_test,pred_label)
    return score

In [90]:
study = optuna.create_study()
study.optimize(objective,n_trials=10)

[32m[I 2019-10-05 16:33:51,857][0m Finished trial#0 resulted in value: 0.41546025744066833. Current best value is 0.41546025744066833 with parameters: {'learning_rate': 0.5723601782298304, 'num_leaves': 207, 'max_depth': 6, 'min_data_in_leaf': 744}.[0m
[32m[I 2019-10-05 16:33:53,001][0m Finished trial#1 resulted in value: 0.21530185384950232. Current best value is 0.21530185384950232 with parameters: {'learning_rate': 0.17172165064234912, 'num_leaves': 250, 'max_depth': 3, 'min_data_in_leaf': 506}.[0m
[32m[I 2019-10-05 16:33:53,970][0m Finished trial#2 resulted in value: 0.665715133243194. Current best value is 0.21530185384950232 with parameters: {'learning_rate': 0.17172165064234912, 'num_leaves': 250, 'max_depth': 3, 'min_data_in_leaf': 506}.[0m
[32m[I 2019-10-05 16:33:55,190][0m Finished trial#3 resulted in value: 0.2540975341144944. Current best value is 0.21530185384950232 with parameters: {'learning_rate': 0.17172165064234912, 'num_leaves': 250, 'max_depth': 3, 'min_d

In [91]:
print('best trial: ', study.best_trial)
print('---------------------------------')
print('best_params: ', study.best_params)

best trial:  FrozenTrial(number=8, state=<TrialState.COMPLETE: 1>, value=0.038809788678887076, datetime_start=datetime.datetime(2019, 10, 5, 16, 33, 59, 267502), datetime_complete=datetime.datetime(2019, 10, 5, 16, 34, 0, 632326), params={'learning_rate': 0.14264489475165743, 'num_leaves': 100, 'max_depth': 4, 'min_data_in_leaf': 337}, distributions={'learning_rate': UniformDistribution(low=0, high=1.0), 'num_leaves': IntUniformDistribution(low=10, high=256), 'max_depth': IntUniformDistribution(low=3, high=8), 'min_data_in_leaf': IntUniformDistribution(low=2, high=1000)}, user_attrs={}, system_attrs={'_number': 8}, intermediate_values={}, params_in_internal_repr={'learning_rate': 0.14264489475165743, 'num_leaves': 100, 'max_depth': 4, 'min_data_in_leaf': 337}, trial_id=8)
---------------------------------
best_params:  {'learning_rate': 0.14264489475165743, 'num_leaves': 100, 'max_depth': 4, 'min_data_in_leaf': 337}


In [None]:
mdl = lgb.train(study.best_params, lgb_train)