In [36]:
#Importing Required Libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import shap
import sklearn
import pickle

from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from xgboost.sklearn import XGBClassifier
from sklearn.metrics import f1_score, confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit
from sklearn.metrics import log_loss
from statistics import mean
import scipy as sp
import scipy.stats

from hyperopt.pyll.base import scope
from hyperopt import hp, fmin, tpe, Trials
from hyperopt import space_eval

import warnings
warnings.filterwarnings("ignore")

pd.set_option('display.max_columns', 100)
sns.set_palette("coolwarm_r", 4)

### データの読み込み

In [37]:
#Reading the data files

train = pd.read_csv(r"C:\Users\NDO00\OneDrive\デスクトップ\MI\mi_book\e-commerce\train.csv",index_col = 'id')
test = pd.read_csv(r"C:\Users\NDO00\OneDrive\デスクトップ\MI\mi_book\e-commerce\test.csv",index_col = 'id')
sample = pd.read_csv(r"C:\Users\NDO00\OneDrive\デスクトップ\MI\mi_book\e-commerce\test.csv")

前処理

In [39]:
Y_train = train['target'].copy()
#feature_8を削除
X_train = train.copy().drop(['target',"feature_8"], axis = 1)

X_test = test.copy().drop(["feature_8"], axis = 1)

In [40]:
lgbm_model = LGBMClassifier(n_estimators = 2000, learning_rate = 0.02,
                            random_state = 42, num_class = 4, metric = 'multi_logloss')

### ハイパーパラメーター探索

In [41]:
#保存したパラメータを読み込むための関数
def pickle_load(path):
    with open(path,mode='rb') as f:
        data = pickle.load(f)
        return data

In [42]:
#part1で得た特徴量49個モデルのハイパーパラメータ読み込み
best_params_full=pickle_load("best_params_full.text.")
print(best_params_full)

{'colsample_bytree': 0.46217573119119065, 'max_depth': 14, 'min_child_samples': 70, 'min_child_weight': 0, 'num_leaves': 7, 'reg_alpha': 0.5268214249542319, 'reg_lambda': 0.7393115077693782, 'subsample': 0.6397130890285847}


In [43]:
#チューニング後のハイパーパラメーターを適用したモデルを出す
lgbm_tuned = lgbm_model
lgbm_tuned = lgbm_tuned.set_params(**best_params_full)
lgbm_tuned

LGBMClassifier(colsample_bytree=0.46217573119119065, learning_rate=0.02,
               max_depth=14, metric='multi_logloss', min_child_samples=70,
               min_child_weight=0, n_estimators=2000, num_class=4, num_leaves=7,
               random_state=42, reg_alpha=0.5268214249542319,
               reg_lambda=0.7393115077693782, subsample=0.6397130890285847)

In [44]:
def prediction (X_train, Y_train, model, X_test):
    
    kfold = StratifiedKFold(n_splits = 10)

    y_pred = np.zeros((50000,4))
    train_oof = np.zeros((100000,4))
    imp=pd.DataFrame()
    n=0
    
    for idx in kfold.split(X=X_train, y=Y_train):
        train_idx, val_idx = idx[0], idx[1]
        xtrain = X_train.iloc[train_idx]
        ytrain = Y_train.iloc[train_idx]
        xval = X_train.iloc[val_idx]
        yval = Y_train.iloc[val_idx]
        
        n+=1
               
        # fit model for current fold
        model.fit(xtrain, ytrain, 
            early_stopping_rounds = 100, eval_set = [(xval,yval)], verbose = False)

        #testデータについて予測値を出す　(n_splits個の各モデルについて推論を行うので平均をとる)
        y_pred += model.predict_proba(X_test)/kfold.n_splits
        print(y_pred)
               
        val_pred = model.predict_proba(xval)
        # getting out-of-fold predictions on training set
        train_oof[val_idx] = val_pred
        
        # imp
        _imp = pd.DataFrame({"col":X_train.columns, "imp":model.feature_importances_, "nfold":n})
        imp = pd.concat([imp, _imp])
        
        # calculate and append logloss
        fold_logloss = metrics.log_loss(yval,val_pred)
        print("Logloss: {0:0.5f}". format(fold_logloss))
    
    #各foldの重要度の平均と標準偏差を算出し、まとめる。
    imp = imp.groupby("col")["imp"].agg(["mean", "std"]).reset_index(drop=False)
    imp.columns = ["col", "imp", "imp_std"]
    
    #テストデータの予測確率、trainデータの予測確率,説明変数の重要度を返す。
    return y_pred, train_oof, imp

part1で得た特徴量49個モデルの特徴量重要度の読み込み

In [45]:
imp = pd.read_csv('imp.csv')

## 特徴量選択

In [46]:
#特徴量がキーと重要度がバリューの辞書を生成
f_imp=dict(zip(imp["col"],imp["imp"]))
#特徴量の大きさで降順にソート
f_imp = sorted(f_imp.items(), key=lambda x:x[1],reverse=True)
f_imp = dict((x, y) for x, y in f_imp)
#キーのみをリストとして取得
feats=list(f_imp.keys())
feats

['feature_14',
 'feature_38',
 'feature_34',
 'feature_31',
 'feature_28',
 'feature_15',
 'feature_9',
 'feature_19',
 'feature_6',
 'feature_17',
 'feature_48',
 'feature_43',
 'feature_7',
 'feature_35',
 'feature_12',
 'feature_18',
 'feature_16',
 'feature_37',
 'feature_5',
 'feature_33',
 'feature_41',
 'feature_25',
 'feature_42',
 'feature_24',
 'feature_11',
 'feature_46',
 'feature_21',
 'feature_39',
 'feature_1',
 'feature_23',
 'feature_45',
 'feature_40',
 'feature_10',
 'feature_26',
 'feature_49',
 'feature_0',
 'feature_32',
 'feature_30',
 'feature_47',
 'feature_4',
 'feature_27',
 'feature_3',
 'feature_20',
 'feature_22',
 'feature_29',
 'feature_2',
 'feature_44',
 'feature_13',
 'feature_36']

### 特徴量を抽出する個数を入れると、使った特徴量の個数と誤差関数が返ってくる関数。

In [47]:
def feat_select(n,model):
    #上からn個目までの特徴量を抽出
    feats_n=feats[:n]
    X_train_n = train[feats_n]
    Y_train = train['target'].copy()
    
    kfold = StratifiedKFold(n_splits = 10)


    
    loss=[]
    
    for idx in kfold.split(X=X_train_n, y=Y_train):
        train_idx, val_idx = idx[0], idx[1]
        xtrain = X_train_n.iloc[train_idx]
        ytrain = Y_train.iloc[train_idx]
        xval = X_train_n.iloc[val_idx]
        yval = Y_train.iloc[val_idx]
        

               
        # fit model for current fold
        model.fit(xtrain, ytrain, 
            early_stopping_rounds = 100, eval_set = [(xval,yval)], verbose = False)

               
        val_pred = model.predict_proba(xval)

        
        # calculate and append logloss
        fold_logloss = metrics.log_loss(yval,val_pred)
        loss.append(fold_logloss)
    
    #特徴量の個数nと、各foldの誤差の平均を返す
    return n, np.mean(loss)

    
    

In [15]:
#特徴量を上から２個使った場合を例示する。
score={}
score[feat_select(2,lgbm_tuned)[0]]=feat_select(2,lgbm_tuned)[1]
#特徴量の個数と、誤差の対応した辞書
score

{2: 1.1167004557532132}

## 何通りか特徴量を選出し、比較する。

In [48]:
#使う特徴量の個数を入れるリスト
featlist=[5,10,15,20,25,30,35,40,45]
#特徴量と誤差の対応した辞書
score={}

In [49]:
for i in featlist:
    score[feat_select(i,lgbm_tuned)[0]]=feat_select(i,lgbm_tuned)[1]
score

{5: 1.1131235893752318,
 10: 1.1093661487758655,
 15: 1.1073977928570933,
 20: 1.1042456268641383,
 25: 1.1018667963486393,
 30: 1.1002914944692863,
 35: 1.098988494285249,
 40: 1.0965836724971503,
 45: 1.0949763832346258}

In [None]:
with open('score.text', 'wb') as g:
    pickle.dump(score,g)

## scoreの中身を見て、誤差が最も小さくなった特徴量の数mを選び、ハイパーパラメーター調整した後にさらに訓練し、予測誤差を下げる。

誤差が最小になる特徴量数mが分かった。

In [66]:
#一行下に誤差が最も小さくなる特徴量数mを入れる
feats_m=feats[:49]
print(feats_m)

with open('feats_m.txt', 'wb') as g:
    pickle.dump(feats_m,g)


['feature_14', 'feature_38', 'feature_34', 'feature_31', 'feature_28', 'feature_15', 'feature_9', 'feature_19', 'feature_6', 'feature_17', 'feature_48', 'feature_43', 'feature_7', 'feature_35', 'feature_12', 'feature_18', 'feature_16', 'feature_37', 'feature_5', 'feature_33', 'feature_41', 'feature_25', 'feature_42', 'feature_24', 'feature_11', 'feature_46', 'feature_21', 'feature_39', 'feature_1', 'feature_23', 'feature_45', 'feature_40', 'feature_10', 'feature_26', 'feature_49', 'feature_0', 'feature_32', 'feature_30', 'feature_47', 'feature_4', 'feature_27', 'feature_3', 'feature_20', 'feature_22', 'feature_29', 'feature_2', 'feature_44', 'feature_13', 'feature_36']


In [51]:
X_train_m = train[feats_m]
X_test_m = test[feats_m]

In [67]:
#特徴量をm個使った場合のハイパーパラメーター探索
def objective_m(params):

    
    clf_search = LGBMClassifier(n_estimators = 2000, learning_rate = 0.02, random_state = 42, num_class = 4, metric = 'multi_logloss', verbosity = -1)
    clf_search.set_params(**params)
   
    search_cvpred = cv_function(X_train_m, Y_train, clf_search, splits = 5)  
    score =metrics.log_loss(Y_train, search_cvpred)
    print("Logloss: {0:0.6f}".format(score)) 
    
    return score

In [68]:
#学習データの分割や、損失などを計算する関数
def cv_function (X_train, Y_train, model, splits = 10):
    
    kfold = StratifiedKFold(n_splits = splits)
    logloss = []
   
    cv_pred = np.zeros((100000,4))
    
    for idx in kfold.split(X=X_train, y=Y_train):
        train_idx, test_idx = idx[0], idx[1]
        xtrain = X_train.iloc[train_idx]
        ytrain = Y_train.iloc[train_idx]
        xtest = X_train.iloc[test_idx]
        ytest = Y_train.iloc[test_idx]
        
        # fit model for current fold
        model.fit(xtrain, ytrain, 
            early_stopping_rounds = 100, eval_set = [(xtest,ytest)], verbose = False)

        #create predictions
        preds = model.predict_proba(xtest)
        cv_pred[test_idx] = preds
                              
        # calculate and append accuracy
        fold_logloss = metrics.log_loss(ytest,preds)
        print("LogLoss: {0:0.5f}". format(fold_logloss))
        logloss.append(fold_logloss)
        
    print (np.mean(logloss))
    #実行すると損失関数を示す
    return cv_pred
    #関数の戻り値は訓練データの予測ラベルをかえす。

In [69]:
#ハイパーパラメーターは基本的に一様分布からサンプリングする。
#uniform(low,high)は[a,b]間の一様分布からサンプリングする。
#quniform(low,high,q)はround(uniform(low, high) / q) * qを返す。
params_lgbm_m = {
    "max_depth": scope.int(hp.quniform("max_depth", 3, 25, 1)),
    "subsample": hp.uniform("subsample",0.4,1),
    "colsample_bytree": hp.uniform("colsample_bytree",0.4,1),
    "min_child_weight": scope.int(hp.quniform("min_child_weight", 0.1, 1.0, 0.1)),    
    "min_child_samples": scope.int(hp.quniform("min_child_samples", 20, 100, 5)),
    "num_leaves": scope.int(hp.quniform("num_leaves", 7, 256, 1)),
    "reg_alpha": hp.uniform('reg_alpha', 0.0, 1),
    "reg_lambda": hp.uniform('reg_lambda', 0.0, 1)  
}

In [70]:
trials_m = Trials()

best_m = fmin(
    fn=objective_m,
    space = params_lgbm_m, 
    algo=tpe.suggest, 
    max_evals=50, 
    trials=trials_m
)
#max_evalsで何回探索ループ回すか設定

print("Best: {}".format(best_m))
trials_m.results

LogLoss: 1.10550                                                                                                       
LogLoss: 1.10721                                                                                                       
LogLoss: 1.10335                                                                                                       
LogLoss: 1.10477                                                                                                       
LogLoss: 1.10244                                                                                                       
1.1046551560895632                                                                                                     
Logloss: 1.104655                                                                                                      
LogLoss: 1.10532                                                                                                       
LogLoss: 1.10671                        

[{'loss': 1.104655156089563, 'status': 'ok'},
 {'loss': 1.1044557670269026, 'status': 'ok'},
 {'loss': 1.103252652448393, 'status': 'ok'},
 {'loss': 1.1058448175524194, 'status': 'ok'},
 {'loss': 1.102793614353717, 'status': 'ok'},
 {'loss': 1.1034389246132104, 'status': 'ok'},
 {'loss': 1.102761227828034, 'status': 'ok'},
 {'loss': 1.104394018520299, 'status': 'ok'},
 {'loss': 1.1051491035707246, 'status': 'ok'},
 {'loss': 1.1039990110788642, 'status': 'ok'},
 {'loss': 1.1037535536763465, 'status': 'ok'},
 {'loss': 1.104893876445772, 'status': 'ok'},
 {'loss': 1.1037121572218147, 'status': 'ok'},
 {'loss': 1.1039052815103023, 'status': 'ok'},
 {'loss': 1.103056911930345, 'status': 'ok'},
 {'loss': 1.1032404799855255, 'status': 'ok'},
 {'loss': 1.1038818872267404, 'status': 'ok'},
 {'loss': 1.1063611322841687, 'status': 'ok'},
 {'loss': 1.104433817267195, 'status': 'ok'},
 {'loss': 1.1026806229570325, 'status': 'ok'},
 {'loss': 1.1032531859980703, 'status': 'ok'},
 {'loss': 1.103580590

In [71]:
best_params_lgbm_m = space_eval(params_lgbm_m, best_m)
print(best_params_lgbm_m)

{'colsample_bytree': 0.647029449963611, 'max_depth': 7, 'min_child_samples': 75, 'min_child_weight': 0, 'num_leaves': 8, 'reg_alpha': 0.7272793239132271, 'reg_lambda': 0.5369919183467708, 'subsample': 0.7853386927113121}


In [72]:
#チューニングしたハイパーパラメーター保存
with open('best_params_lgbm_m.text', 'wb') as g:
    pickle.dump(best_params_lgbm_m,g)

In [73]:
#読み込んで確認
best_params_m_full=pickle_load("best_params_lgbm_m.text.")
print(best_params_m_full)

{'colsample_bytree': 0.647029449963611, 'max_depth': 7, 'min_child_samples': 75, 'min_child_weight': 0, 'num_leaves': 8, 'reg_alpha': 0.7272793239132271, 'reg_lambda': 0.5369919183467708, 'subsample': 0.7853386927113121}


In [74]:
#チューニング後のハイパーパラメーターを適用したモデルを出す
lgbm_tuned_m = lgbm_model
lgbm_tuned_m = lgbm_tuned.set_params(**best_params_m_full)
lgbm_tuned_m

LGBMClassifier(colsample_bytree=0.647029449963611, learning_rate=0.02,
               max_depth=7, metric='multi_logloss', min_child_samples=75,
               min_child_weight=0, n_estimators=2000, num_class=4, num_leaves=8,
               random_state=42, reg_alpha=0.7272793239132271,
               reg_lambda=0.5369919183467708, subsample=0.7853386927113121)

In [75]:
##テストデータの予測確率、trainデータの予測確率,説明変数の重要度を返す。
lgbm_pred_m, train_oof_m, imp_m = prediction (X_train_m, Y_train, lgbm_tuned_m, X_test_m)

[[0.00881174 0.05721949 0.0200128  0.01395596]
 [0.00837647 0.07106138 0.01209532 0.00846682]
 [0.00857529 0.06206013 0.02132353 0.00804105]
 ...
 [0.00835147 0.05317384 0.0225215  0.0159532 ]
 [0.00909024 0.06057075 0.01846704 0.01187197]
 [0.00861682 0.05341541 0.02066354 0.01730422]]
Logloss: 1.10398
[[0.0179375  0.11344551 0.04031666 0.02830033]
 [0.0193706  0.14011185 0.02383516 0.01668239]
 [0.0179355  0.12228147 0.04318968 0.01659335]
 ...
 [0.01692593 0.10415005 0.04575271 0.03317132]
 [0.01784738 0.12076688 0.0368662  0.02451953]
 [0.01705273 0.10761693 0.04124314 0.03408719]]
Logloss: 1.10261
[[0.02651026 0.17035939 0.06070678 0.04242357]
 [0.0296716  0.20912896 0.03637203 0.02482741]
 [0.02537788 0.18584584 0.06368457 0.02509171]
 ...
 [0.02489693 0.1568785  0.06882312 0.04940146]
 [0.02656187 0.18067333 0.05464751 0.03811729]
 [0.0256611  0.160752   0.06188972 0.05169718]]
Logloss: 1.10330
[[0.0355196  0.22635111 0.08213383 0.05599545]
 [0.0387788  0.27801422 0.04912099 0.0

In [76]:
#モデルの保存
with open('lgbm_tuned_m.pickle', 'wb') as g:
    pickle.dump(lgbm_tuned_m,g)

In [77]:
with open('lgbm_tuned_m.pickle', 'rb') as f:
    a = pickle.load(f)

print(a)

LGBMClassifier(colsample_bytree=0.647029449963611, learning_rate=0.02,
               max_depth=7, metric='multi_logloss', min_child_samples=75,
               min_child_weight=0, n_estimators=2000, num_class=4, num_leaves=8,
               random_state=42, reg_alpha=0.7272793239132271,
               reg_lambda=0.5369919183467708, subsample=0.7853386927113121)


In [78]:
#重要度を降順に出す。
imp_m.sort_values("imp", ascending=False)

Unnamed: 0,col,imp,imp_std
16,feature_38,1712.6,287.647238
2,feature_14,1618.7,247.473029
11,feature_31,1366.5,200.409165
13,feature_34,1309.0,180.310843
10,feature_28,1155.6,194.353744
3,feature_15,1073.6,179.038295
7,feature_19,1024.6,199.383048
24,feature_9,986.4,143.078223
20,feature_48,981.7,204.123628
22,feature_6,947.7,116.466543


In [79]:
imp_m.to_csv('imp.csv', index=False)

In [80]:
#kaggleに提出するcsvファイルを作成
pred_test_m = pd.DataFrame(lgbm_pred_m, columns = ['Class_1', 'Class_2', 'Class_3', 'Class_4'])
output_m = pred_test_m
output_m['id'] = X_test.index
output_m.to_csv('submission_m.csv', index=False)

output_m

Unnamed: 0,Class_1,Class_2,Class_3,Class_4,id
0,0.088365,0.569711,0.205979,0.135945,100000
1,0.094465,0.701953,0.121878,0.081704,100001
2,0.083769,0.628677,0.203111,0.084443,100002
3,0.078448,0.535690,0.251393,0.134469,100003
4,0.076079,0.619962,0.193721,0.110238,100004
...,...,...,...,...,...
49995,0.090912,0.672145,0.157144,0.079799,149995
49996,0.076899,0.641277,0.154165,0.127659,149996
49997,0.084762,0.523099,0.230343,0.161797,149997
49998,0.087478,0.605279,0.181358,0.125885,149998
