In [47]:
import glob
import os
import re
import sys
import warnings
import pickle

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import seaborn as sns
from sklearn.metrics import accuracy_score,confusion_matrix
from sklearn.model_selection import (KFold, StratifiedKFold, cross_val_score,
                                     train_test_split)
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

warnings.filterwarnings('ignore')

In [48]:
%pwd
%cd /home/toshiya/Workspace/learning/signate/mynavi_2019/

/home/toshiya/Workspace/learning/signate/mynavi_2019


### 方針

- high_price_flag==1が30000件中254件のみ
- アンダーサンプリングによって複数の学習データと学習させた分類器を作成
- テストデータに対する外れ値予測をアンサンブルによって行う

In [49]:
df = pd.read_csv('processed_data/train_v11.csv')
df.columns

Index(['id', 'y', 'location', 'access', 'layout', 'age', 'direction', 'area',
       'floor', 'bath_toilet',
       ...
       'area_num_countall', 'age_countall', 'floor_countall',
       'max_floor_countall', 'layout_countall', 'direction_countall',
       'facilities_countall', 'contract_period_countall', 'latitude',
       'longitude'],
      dtype='object', length=125)

### データの分割&学習データ作成

In [50]:
high = df[df['high_price_flag']==1]
other = df[df['high_price_flag']!=1]

In [51]:
ratio = 3
r1 = other.sample(n=len(high)*ratio)
r2 = other.sample(n=len(high)*ratio)
r3 = other.sample(n=len(high)*ratio)
r4 = other.sample(n=len(high)*ratio)

In [52]:
r1 = pd.concat([r1,high]).reset_index(drop=True)
r2 = pd.concat([r2,high]).reset_index(drop=True)
r3 = pd.concat([r3,high]).reset_index(drop=True)
r4 = pd.concat([r4,high]).reset_index(drop=True)

### 実験

In [53]:
from src.utils.high_and_low_clf import High_and_Low_Classifier

In [54]:
use_col = df.columns
un_use_col = ['id','y','log_y','high_price_flag','location', 'access', 'layout', 'age', 'direction', 'area','floor', 'bath_toilet', 'kitchen',
                 'broadcast_com', 'facilities','parking', 'enviroment', 'structure', 'contract_period',
                 'walk_time','23ku',
                #  'area_num_countall','floor_countall','room_num_countall','facilities_countall','age_countall','area_num_countall',
                ]
mdl = lgb.Booster(model_file='mdl/1011_lgbm.txt')
feature_importances = pd.DataFrame()
feature_importances['feature'] = mdl.feature_name()
feature_importances['importance'] = mdl.feature_importance()
feature_importances = feature_importances.sort_values(by='importance', ascending=False)

un_use_col += list(feature_importances[feature_importances['importance']==0]['feature'])

use_col = [c for c in use_col if c not in un_use_col]

In [55]:
len(use_col)

89

In [56]:
print(use_col)

['area_num', 'age_month', 'max_floor', 'room_num', 'L', 'D', 'K', 'R', 'north', 'structure_orderd', 'min_time', 'avg_time', 'bicycle_parking', 'car_parking', 'bike_parking', 'toilet', 'bath', 'sm_doku', 'kanso', 'onsui', 'oidaki', 'b_t_split', 'teiki_syakuya', 'e_num', 'dis_ave', 'dis_min', 'school', 'univ', 'area_par_room', '23ku_mean_std', 'facilities_0', 'facilities_1', 'facilities_2', 'facilities_3', 'facilities_4', 'facilities_5', 'facilities_6', 'facilities_10', 'facilities_11', 'facilities_12', 'facilities_14', 'facilities_15', 'facilities_16', 'facilities_18', 'facilities_19', 'facilities_20', 'facilities_23', 'facilities_24', 'facilities_26', 'facilities_27', 'facilities_28', 'facilities_29', 'facilities_30', 'facilities_32', 'facilities_33', 'facilities_35', 'facilities_36', 'bc_0', 'bc_1', 'bc_2', 'bc_3', 'bc_4', 'bc_5', 'bc_7', 'kitchen_0', 'kitchen_1', 'kitchen_2', 'kitchen_3', 'kitchen_4', 'kitchen_5', 'kitchen_6', 'kitchen_8', 'kitchen_9', 'kitchen_11', 'kitchen_12', 'ki

In [57]:
X_train,X_test,y_train,y_test = train_test_split(r1.loc[:,use_col],r1.loc[:,'high_price_flag'],random_state=0,test_size=0.3)

In [58]:
def objective(trial):

    learning_rate = trial.suggest_uniform('learning_rate', 0, 1.0)
    num_leaves = trial.suggest_int('num_leaves', 10, 2**8)
    max_depth = trial.suggest_int('max_depth', 3, 8)

    lgbm_params = {
        'task': 'train',
        # "metrics": 'xentropy',
        'boosting_type': 'gbdt',
        'objective': 'binary',
        "learning_rate": learning_rate,
        "num_leaves": num_leaves,
        "max_depth": max_depth,
        "n_jobs": 1,
        'verbose': -1,
        "seed": 0
    }

    mdl = lgb.LGBMClassifier(**lgbm_params)
    stratifiedkfold = StratifiedKFold(n_splits=3)
    scores = cross_val_score(mdl,X_train,y_train,cv=stratifiedkfold,scoring='neg_log_loss')
    score = np.mean(scores)

    return score

In [59]:
study = optuna.create_study()
study.optimize(objective,n_trials=10)

[32m[I 2019-10-13 21:19:49,392][0m Finished trial#0 resulted in value: -0.11925770162843767. Current best value is -0.11925770162843767 with parameters: {'learning_rate': 0.8136290549750936, 'num_leaves': 149, 'max_depth': 4}.[0m
[32m[I 2019-10-13 21:19:49,586][0m Finished trial#1 resulted in value: -0.11028312246706688. Current best value is -0.11925770162843767 with parameters: {'learning_rate': 0.8136290549750936, 'num_leaves': 149, 'max_depth': 4}.[0m
[32m[I 2019-10-13 21:19:49,779][0m Finished trial#2 resulted in value: -0.10235377276527878. Current best value is -0.11925770162843767 with parameters: {'learning_rate': 0.8136290549750936, 'num_leaves': 149, 'max_depth': 4}.[0m
[32m[I 2019-10-13 21:19:49,981][0m Finished trial#3 resulted in value: -0.11179689666823045. Current best value is -0.11925770162843767 with parameters: {'learning_rate': 0.8136290549750936, 'num_leaves': 149, 'max_depth': 4}.[0m
[32m[I 2019-10-13 21:19:50,164][0m Finished trial#4 resulted in va

In [60]:
mdl = lgb.LGBMClassifier(**study.best_params)
mdl.fit(X_train,y_train)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.8200823230165528,
               max_depth=4, min_child_samples=20, min_child_weight=0.001,
               min_split_gain=0.0, n_estimators=100, n_jobs=-1, num_leaves=193,
               objective=None, random_state=None, reg_alpha=0.0, reg_lambda=0.0,
               silent=True, subsample=1.0, subsample_for_bin=200000,
               subsample_freq=0)

In [61]:
pred_train = mdl.predict_proba(X_train)[:,1]
pred_train = [1 if i>0.5 else 0 for i in pred_train]
pred_test = mdl.predict_proba(X_test)[:,1]
pred_test = [1 if i>0.5 else 0 for i in pred_test]
train_accuracy = accuracy_score(y_train,pred_train)
test_accuracy = accuracy_score(y_test,pred_test)

In [62]:
confusion_matrix(y_test,pred_test)

array([[225,   1],
       [  2,  77]])

In [63]:
pred_r2 = mdl.predict_proba(df.loc[:,use_col])[:,1]
pred_r2 = [1 if i>0.5 else 0 for i in pred_r2]
confusion_matrix(df.loc[:,'high_price_flag'],pred_r2)

array([[30665,   547],
       [    2,   252]])

### 関数定義

In [64]:
def model_train(r,use_col):
    X_train,X_test,y_train,y_test = train_test_split(r.loc[:,use_col],r.loc[:,'high_price_flag'],random_state=0,test_size=0.3)
    def objective(trial):

        learning_rate = trial.suggest_uniform('learning_rate', 0, 1.0)
        num_leaves = trial.suggest_int('num_leaves', 10, 2**8)
        max_depth = trial.suggest_int('max_depth', 3, 8)

        lgbm_params = {
            'task': 'train',
            'boosting_type': 'gbdt',
            'objective': 'binary',
            "learning_rate": learning_rate,
            "num_leaves": num_leaves,
            "max_depth": max_depth,
            "n_jobs": 1,
            'verbose': -1,
            "seed": 0
        }

        mdl = lgb.LGBMClassifier(**lgbm_params)
        stratifiedkfold = StratifiedKFold(n_splits=3)
        scores = cross_val_score(mdl,X_train,y_train,cv=stratifiedkfold,scoring='neg_log_loss')
        score = np.mean(scores)

        return score
    
    study = optuna.create_study()
    study.optimize(objective,n_trials=10)
    
    mdl = lgb.LGBMClassifier(**study.best_params)
    mdl.fit(X_train,y_train)
    
    pred_test = mdl.predict_proba(X_test)[:,1]
    pred_test = [1 if i>0.5 else 0 for i in pred_test]
    test_accuracy = accuracy_score(y_test,pred_test)
    
    trained_mdl = mdl.fit(r.loc[:,use_col],r.loc[:,'high_price_flag'])
    
    return test_accuracy,confusion_matrix(y_test,pred_test),trained_mdl

In [65]:
for i,r in enumerate([r1,r2,r3,r4]):
    acc,cm,m = model_train(r,use_col)
    print('---------------------')
    print('test acc: ',acc)
    print(cm)
    print('---------------------')
    with open('mdl/1011_hl{}.pkl'.format(str(i)), 'wb') as f:
        pickle.dump(m, f)

[32m[I 2019-10-13 21:19:51,683][0m Finished trial#0 resulted in value: -0.07197853085659534. Current best value is -0.07197853085659534 with parameters: {'learning_rate': 0.08790424544562137, 'num_leaves': 64, 'max_depth': 5}.[0m
[32m[I 2019-10-13 21:19:51,932][0m Finished trial#1 resulted in value: -0.08846749384942047. Current best value is -0.08846749384942047 with parameters: {'learning_rate': 0.11554416253421962, 'num_leaves': 46, 'max_depth': 5}.[0m
[32m[I 2019-10-13 21:19:52,130][0m Finished trial#2 resulted in value: -0.1041742695285159. Current best value is -0.1041742695285159 with parameters: {'learning_rate': 0.622523271536627, 'num_leaves': 180, 'max_depth': 7}.[0m
[32m[I 2019-10-13 21:19:52,315][0m Finished trial#3 resulted in value: -0.10007216552907372. Current best value is -0.1041742695285159 with parameters: {'learning_rate': 0.622523271536627, 'num_leaves': 180, 'max_depth': 7}.[0m
[32m[I 2019-10-13 21:19:52,509][0m Finished trial#4 resulted in value: 

---------------------
test acc:  0.9868852459016394
[[225   1]
 [  3  76]]
---------------------


[32m[I 2019-10-13 21:19:53,888][0m Finished trial#0 resulted in value: -0.14999642668255497. Current best value is -0.14999642668255497 with parameters: {'learning_rate': 0.44450843752212876, 'num_leaves': 114, 'max_depth': 4}.[0m
[32m[I 2019-10-13 21:19:54,137][0m Finished trial#1 resulted in value: -0.10959265068792522. Current best value is -0.14999642668255497 with parameters: {'learning_rate': 0.44450843752212876, 'num_leaves': 114, 'max_depth': 4}.[0m
[32m[I 2019-10-13 21:19:54,346][0m Finished trial#2 resulted in value: -0.16228574275645516. Current best value is -0.16228574275645516 with parameters: {'learning_rate': 0.4591867228677591, 'num_leaves': 55, 'max_depth': 7}.[0m
[32m[I 2019-10-13 21:19:54,529][0m Finished trial#3 resulted in value: -0.1745108543944813. Current best value is -0.1745108543944813 with parameters: {'learning_rate': 0.9657915848539949, 'num_leaves': 80, 'max_depth': 5}.[0m
[32m[I 2019-10-13 21:19:54,721][0m Finished trial#4 resulted in valu

---------------------
test acc:  0.9868852459016394
[[223   3]
 [  1  78]]
---------------------


[32m[I 2019-10-13 21:19:56,054][0m Finished trial#0 resulted in value: -0.144657857780867. Current best value is -0.144657857780867 with parameters: {'learning_rate': 0.5963512594187522, 'num_leaves': 157, 'max_depth': 8}.[0m
[32m[I 2019-10-13 21:19:56,282][0m Finished trial#1 resulted in value: -0.1549507478237798. Current best value is -0.1549507478237798 with parameters: {'learning_rate': 0.34978726916479375, 'num_leaves': 222, 'max_depth': 8}.[0m
[32m[I 2019-10-13 21:19:56,488][0m Finished trial#2 resulted in value: -0.1412583489657198. Current best value is -0.1549507478237798 with parameters: {'learning_rate': 0.34978726916479375, 'num_leaves': 222, 'max_depth': 8}.[0m
[32m[I 2019-10-13 21:19:56,716][0m Finished trial#3 resulted in value: -0.13879406768072544. Current best value is -0.1549507478237798 with parameters: {'learning_rate': 0.34978726916479375, 'num_leaves': 222, 'max_depth': 8}.[0m
[32m[I 2019-10-13 21:19:56,929][0m Finished trial#4 resulted in value: -

---------------------
test acc:  0.9770491803278688
[[221   5]
 [  2  77]]
---------------------


[32m[I 2019-10-13 21:19:58,282][0m Finished trial#0 resulted in value: -0.07891782519862893. Current best value is -0.07891782519862893 with parameters: {'learning_rate': 0.11495524677808655, 'num_leaves': 117, 'max_depth': 3}.[0m
[32m[I 2019-10-13 21:19:58,505][0m Finished trial#1 resulted in value: -0.1114142273486769. Current best value is -0.1114142273486769 with parameters: {'learning_rate': 0.3460955207380677, 'num_leaves': 64, 'max_depth': 8}.[0m
[32m[I 2019-10-13 21:19:58,704][0m Finished trial#2 resulted in value: -0.13187694674861528. Current best value is -0.13187694674861528 with parameters: {'learning_rate': 0.4295581523413391, 'num_leaves': 173, 'max_depth': 3}.[0m
[32m[I 2019-10-13 21:19:58,885][0m Finished trial#3 resulted in value: -0.11124590040599501. Current best value is -0.13187694674861528 with parameters: {'learning_rate': 0.4295581523413391, 'num_leaves': 173, 'max_depth': 3}.[0m
[32m[I 2019-10-13 21:19:59,110][0m Finished trial#4 resulted in valu

---------------------
test acc:  0.9737704918032787
[[219   7]
 [  1  78]]
---------------------
