In [108]:
import glob
import os
import re
import sys
import warnings
import pickle

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import seaborn as sns
from sklearn.metrics import accuracy_score,confusion_matrix
from sklearn.model_selection import (KFold, StratifiedKFold, cross_val_score,
                                     train_test_split)
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

warnings.filterwarnings('ignore')

In [72]:
%pwd
%cd /home/toshiya/Workspace/learning/signate/mynavi_2019/

/home/toshiya/Workspace/learning/signate/mynavi_2019


### 方針

- high_price_flag==1が30000件中254件のみ
- アンダーサンプリングによって複数の学習データと学習させた分類器を作成
- テストデータに対する外れ値予測をアンサンブルによって行う

In [73]:
df = pd.read_csv('processed_data/train_v11.csv')

### データの分割&学習データ作成

In [74]:
high = df[df['high_price_flag']==1]
other = df[df['high_price_flag']!=1]

In [75]:
ratio = 3
r1 = other.sample(n=len(high)*ratio)
r2 = other.sample(n=len(high)*ratio)
r3 = other.sample(n=len(high)*ratio)
r4 = other.sample(n=len(high)*ratio)

In [76]:
r1 = pd.concat([r1,high]).reset_index(drop=True)
r2 = pd.concat([r2,high]).reset_index(drop=True)
r3 = pd.concat([r3,high]).reset_index(drop=True)
r4 = pd.concat([r4,high]).reset_index(drop=True)

### 実験

In [77]:
from src.utils.high_and_low_clf import High_and_Low_Classifier

In [78]:
use_col = df.columns
un_use_col = ['id','y','log_y','location', 'access', 'layout', 'age', 'direction', 'area','floor', 'bath_toilet', 'kitchen',
                 'broadcast_com', 'facilities','parking', 'enviroment', 'structure', 'contract_period',
                 'walk_time','23ku',
                #  'area_num_countall','floor_countall','room_num_countall','facilities_countall','age_countall','area_num_countall',
                ]
mdl = lgb.Booster(model_file='mdl/1011_lgbm.txt')
feature_importances = pd.DataFrame()
feature_importances['feature'] = mdl.feature_name()
feature_importances['importance'] = mdl.feature_importance()
feature_importances = feature_importances.sort_values(by='importance', ascending=False)

un_use_col += list(feature_importances[feature_importances['importance']==0]['feature'])

use_col = [c for c in use_col if c not in un_use_col]

In [79]:
len(use_col)

88

In [80]:
X_train,X_test,y_train,y_test = train_test_split(r1.loc[:,use_col],r1.loc[:,'high_price_flag'],random_state=0,test_size=0.3)

In [84]:
def objective(trial):

    learning_rate = trial.suggest_uniform('learning_rate', 0, 1.0)
    num_leaves = trial.suggest_int('num_leaves', 10, 2**8)
    max_depth = trial.suggest_int('max_depth', 3, 8)

    lgbm_params = {
        'task': 'train',
        # "metrics": 'xentropy',
        'boosting_type': 'gbdt',
        'objective': 'binary',
        "learning_rate": learning_rate,
        "num_leaves": num_leaves,
        "max_depth": max_depth,
        "n_jobs": 1,
        'verbose': -1,
        "seed": 0
    }

    mdl = lgb.LGBMClassifier(**lgbm_params)
    stratifiedkfold = StratifiedKFold(n_splits=3)
    scores = cross_val_score(mdl,X_train,y_train,cv=stratifiedkfold,scoring='neg_log_loss')
    score = np.mean(scores)

    return score

In [85]:
study = optuna.create_study()
study.optimize(objective,n_trials=10)

[32m[I 2019-10-11 23:04:52,255][0m Finished trial#0 resulted in value: -3.814753195251598e-06. Current best value is -3.814753195251598e-06 with parameters: {'learning_rate': 0.1683242508253997, 'num_leaves': 34, 'max_depth': 5}.[0m
[32m[I 2019-10-11 23:04:52,804][0m Finished trial#1 resulted in value: -3.812615324363212e-06. Current best value is -3.814753195251598e-06 with parameters: {'learning_rate': 0.1683242508253997, 'num_leaves': 34, 'max_depth': 5}.[0m
[32m[I 2019-10-11 23:04:53,344][0m Finished trial#2 resulted in value: -3.4258698731307027e-06. Current best value is -3.814753195251598e-06 with parameters: {'learning_rate': 0.1683242508253997, 'num_leaves': 34, 'max_depth': 5}.[0m
[32m[I 2019-10-11 23:04:53,862][0m Finished trial#3 resulted in value: -3.60119730411879e-06. Current best value is -3.814753195251598e-06 with parameters: {'learning_rate': 0.1683242508253997, 'num_leaves': 34, 'max_depth': 5}.[0m
[32m[I 2019-10-11 23:04:54,389][0m Finished trial#4 re

In [90]:
mdl = lgb.LGBMClassifier(**study.best_params)
mdl.fit(X_train,y_train)

[32m[I 2019-10-11 23:07:02,076][0m Finished trial#0 resulted in value: -4.079421569981034e-06. Current best value is -4.079421569981034e-06 with parameters: {'learning_rate': 0.6130678836720552, 'num_leaves': 39, 'max_depth': 4}.[0m
[32m[I 2019-10-11 23:07:02,603][0m Finished trial#1 resulted in value: -3.577493797616305e-06. Current best value is -4.079421569981034e-06 with parameters: {'learning_rate': 0.6130678836720552, 'num_leaves': 39, 'max_depth': 4}.[0m
[32m[I 2019-10-11 23:07:03,162][0m Finished trial#2 resulted in value: -0.03707670679054336. Current best value is -0.03707670679054336 with parameters: {'learning_rate': 0.023191361057977367, 'num_leaves': 160, 'max_depth': 4}.[0m
[32m[I 2019-10-11 23:07:03,700][0m Finished trial#3 resulted in value: -3.19876951652724e-06. Current best value is -0.03707670679054336 with parameters: {'learning_rate': 0.023191361057977367, 'num_leaves': 160, 'max_depth': 4}.[0m
[32m[I 2019-10-11 23:07:04,226][0m Finished trial#4 res

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.023191361057977367,
               max_depth=4, min_child_samples=20, min_child_weight=0.001,
               min_split_gain=0.0, n_estimators=100, n_jobs=-1, num_leaves=160,
               objective=None, random_state=None, reg_alpha=0.0, reg_lambda=0.0,
               silent=True, subsample=1.0, subsample_for_bin=200000,
               subsample_freq=0)

In [97]:
pred_train = mdl.predict_proba(X_train)[:,1]
pred_train = [1 if i>0.5 else 0 for i in pred_train]
pred_test = mdl.predict_proba(X_test)[:,1]
pred_test = [1 if i>0.5 else 0 for i in pred_test]
train_accuracy = accuracy_score(y_train,pred_train)
test_accuracy = accuracy_score(y_test,pred_test)

In [100]:
confusion_matrix(y_test,pred_test)

array([[226,   0],
       [  0,  79]])

In [112]:
pred_r2 = mdl.predict_proba(df.loc[:,use_col])[:,1]
pred_r2 = [1 if i>0.5 else 0 for i in pred_r2]
confusion_matrix(df.loc[:,'high_price_flag'],pred_r2)

array([[31212,     0],
       [    0,   254]])

### 関数定義

In [106]:
def model_train(r,use_col):
    X_train,X_test,y_train,y_test = train_test_split(r.loc[:,use_col],r.loc[:,'high_price_flag'],random_state=0,test_size=0.3)
    def objective(trial):

        learning_rate = trial.suggest_uniform('learning_rate', 0, 1.0)
        num_leaves = trial.suggest_int('num_leaves', 10, 2**8)
        max_depth = trial.suggest_int('max_depth', 3, 8)

        lgbm_params = {
            'task': 'train',
            'boosting_type': 'gbdt',
            'objective': 'binary',
            "learning_rate": learning_rate,
            "num_leaves": num_leaves,
            "max_depth": max_depth,
            "n_jobs": 1,
            'verbose': -1,
            "seed": 0
        }

        mdl = lgb.LGBMClassifier(**lgbm_params)
        stratifiedkfold = StratifiedKFold(n_splits=3)
        scores = cross_val_score(mdl,X_train,y_train,cv=stratifiedkfold,scoring='neg_log_loss')
        score = np.mean(scores)

        return score
    
    study = optuna.create_study()
    study.optimize(objective,n_trials=10)
    
    mdl = lgb.LGBMClassifier(**study.best_params)
    mdl.fit(X_train,y_train)
    
    pred_test = mdl.predict_proba(X_test)[:,1]
    pred_test = [1 if i>0.5 else 0 for i in pred_test]
    test_accuracy = accuracy_score(y_test,pred_test)
    
    trained_mdl = mdl.fit(r.loc[:,use_col],r.loc[:,'high_price_flag'])
    
    return test_accuracy,confusion_matrix(y_test,pred_test),trained_mdl

In [109]:
for i,r in enumerate([r1,r2,r3,r4]):
    acc,cm,m = model_train(r,use_col)
    print('---------------------')
    print('test acc: ',acc)
    print(cm)
    print('---------------------')
    with open('mdl/1011_hl{}.pkl'.format(str(i)), 'wb') as f:
        pickle.dump(m, f)

[32m[I 2019-10-11 23:34:22,372][0m Finished trial#0 resulted in value: -3.43462620870446e-06. Current best value is -3.43462620870446e-06 with parameters: {'learning_rate': 0.42068960160351054, 'num_leaves': 169, 'max_depth': 4}.[0m
[32m[I 2019-10-11 23:34:22,915][0m Finished trial#1 resulted in value: -3.670889718394251e-06. Current best value is -3.670889718394251e-06 with parameters: {'learning_rate': 0.5405102681305786, 'num_leaves': 55, 'max_depth': 7}.[0m
[32m[I 2019-10-11 23:34:23,457][0m Finished trial#2 resulted in value: -3.7827567063415282e-06. Current best value is -3.7827567063415282e-06 with parameters: {'learning_rate': 0.5391100127895041, 'num_leaves': 104, 'max_depth': 6}.[0m
[32m[I 2019-10-11 23:34:23,996][0m Finished trial#3 resulted in value: -3.4126699684336676e-06. Current best value is -3.7827567063415282e-06 with parameters: {'learning_rate': 0.5391100127895041, 'num_leaves': 104, 'max_depth': 6}.[0m
[32m[I 2019-10-11 23:34:24,542][0m Finished tria

---------------------
test acc:  1.0
[[226   0]
 [  0  79]]
---------------------


[32m[I 2019-10-11 23:34:27,938][0m Finished trial#0 resulted in value: -3.68547520273531e-06. Current best value is -3.68547520273531e-06 with parameters: {'learning_rate': 0.5970612949593745, 'num_leaves': 205, 'max_depth': 3}.[0m
[32m[I 2019-10-11 23:34:28,527][0m Finished trial#1 resulted in value: -1.2750646878340078e-05. Current best value is -1.2750646878340078e-05 with parameters: {'learning_rate': 0.10233481880391293, 'num_leaves': 251, 'max_depth': 6}.[0m
[32m[I 2019-10-11 23:34:29,082][0m Finished trial#2 resulted in value: -3.123338308890526e-06. Current best value is -1.2750646878340078e-05 with parameters: {'learning_rate': 0.10233481880391293, 'num_leaves': 251, 'max_depth': 6}.[0m
[32m[I 2019-10-11 23:34:29,630][0m Finished trial#3 resulted in value: -3.8499615653387655e-06. Current best value is -1.2750646878340078e-05 with parameters: {'learning_rate': 0.10233481880391293, 'num_leaves': 251, 'max_depth': 6}.[0m
[32m[I 2019-10-11 23:34:30,182][0m Finished 

---------------------
test acc:  1.0
[[226   0]
 [  0  79]]
---------------------


[32m[I 2019-10-11 23:34:33,682][0m Finished trial#0 resulted in value: -3.706912933186865e-06. Current best value is -3.706912933186865e-06 with parameters: {'learning_rate': 0.18500248072409453, 'num_leaves': 241, 'max_depth': 7}.[0m
[32m[I 2019-10-11 23:34:34,240][0m Finished trial#1 resulted in value: -3.6060335586444725e-06. Current best value is -3.706912933186865e-06 with parameters: {'learning_rate': 0.18500248072409453, 'num_leaves': 241, 'max_depth': 7}.[0m
[32m[I 2019-10-11 23:34:34,797][0m Finished trial#2 resulted in value: -4.1559107671690924e-06. Current best value is -4.1559107671690924e-06 with parameters: {'learning_rate': 0.6120810391129562, 'num_leaves': 61, 'max_depth': 8}.[0m
[32m[I 2019-10-11 23:34:35,395][0m Finished trial#3 resulted in value: -0.0007556004683583283. Current best value is -0.0007556004683583283 with parameters: {'learning_rate': 0.06170551740145147, 'num_leaves': 49, 'max_depth': 8}.[0m
[32m[I 2019-10-11 23:34:35,980][0m Finished tr

---------------------
test acc:  1.0
[[226   0]
 [  0  79]]
---------------------


[32m[I 2019-10-11 23:34:39,464][0m Finished trial#0 resulted in value: -3.512538054618446e-06. Current best value is -3.512538054618446e-06 with parameters: {'learning_rate': 0.8765856655842689, 'num_leaves': 65, 'max_depth': 6}.[0m
[32m[I 2019-10-11 23:34:40,052][0m Finished trial#1 resulted in value: -0.0004016232048930767. Current best value is -0.0004016232048930767 with parameters: {'learning_rate': 0.06799493252810873, 'num_leaves': 55, 'max_depth': 6}.[0m
[32m[I 2019-10-11 23:34:40,606][0m Finished trial#2 resulted in value: -3.279772876627993e-06. Current best value is -0.0004016232048930767 with parameters: {'learning_rate': 0.06799493252810873, 'num_leaves': 55, 'max_depth': 6}.[0m
[32m[I 2019-10-11 23:34:41,153][0m Finished trial#3 resulted in value: -4.1334707550597575e-06. Current best value is -0.0004016232048930767 with parameters: {'learning_rate': 0.06799493252810873, 'num_leaves': 55, 'max_depth': 6}.[0m
[32m[I 2019-10-11 23:34:41,692][0m Finished trial#

---------------------
test acc:  1.0
[[226   0]
 [  0  79]]
---------------------
