In [18]:
import pandas as pd
import datetime
import logging
from sklearn.model_selection import KFold
import argparse
import json
import numpy as np
import lightgbm as lgb
from sklearn.metrics import accuracy_score

import sys
sys.path.append('..')
from utils import load_datasets, load_target
from logs.logger import log_best
from models.lgbm import train_and_predict
import os
os.chdir('../')

%matplotlib inline

In [2]:
os.getcwd()

'C:\\Users\\takuy\\work\\work_tokumoto\\splatoon_competition'

In [4]:
parser = argparse.ArgumentParser()
parser.add_argument('--config', default='./configs/default_v04.json')
options = parser.parse_args(args=[])
config = json.load(open(options.config))

feats = config['features']
logging.debug(feats)

target_name = config['target_name']

In [5]:
X_train_all, X_test = load_datasets(feats, target_name)
y_train_all = load_target(target_name)

In [7]:
X_train_all.head()

Unnamed: 0,A1-level,A2-level,A3-level,A4-level,B1-level,B2-level,B3-level,B4-level,enc_period,enc_game-ver,...,poisonmist_B_count,torpedo_A_count,torpedo_B_count,jumpbeacon_A_count,jumpbeacon_B_count,tansanbomb_A_count,tansanbomb_B_count,trap_A_count,trap_B_count,size
0,139,118.0,13.0,10.0,28,26.0,68.0,31.0,0.515152,0.525161,...,0,0,0,0,0,0,0,1,0,2855.0
1,198,77.0,198.0,123.0,83,118.0,168.0,151.0,0.466667,0.524499,...,0,0,1,0,1,0,0,1,0,2391.0
2,114,68.0,225.0,107.0,50,163.0,160.0,126.0,0.6,0.525747,...,0,0,0,0,0,0,0,0,0,2426.0
3,336,131.0,189.0,41.0,273,189.0,194.0,391.0,0.571429,0.523781,...,0,0,0,0,0,0,0,0,1,2237.4
4,299,97.0,96.0,136.0,101,45.0,246.0,160.0,0.58,0.524329,...,1,0,0,0,0,0,0,0,0,2390.0


In [8]:
# 訓練データとテストデータの列を確認
print(X_train_all.columns)
print(X_test.columns)

Index(['A1-level', 'A2-level', 'A3-level', 'A4-level', 'B1-level', 'B2-level',
       'B3-level', 'B4-level', 'enc_period', 'enc_game-ver',
       ...
       'poisonmist_B_count', 'torpedo_A_count', 'torpedo_B_count',
       'jumpbeacon_A_count', 'jumpbeacon_B_count', 'tansanbomb_A_count',
       'tansanbomb_B_count', 'trap_A_count', 'trap_B_count', 'size'],
      dtype='object', length=125)
Index(['A1-level', 'A2-level', 'A3-level', 'A4-level', 'B1-level', 'B2-level',
       'B3-level', 'B4-level', 'enc_period', 'enc_game-ver',
       ...
       'poisonmist_B_count', 'torpedo_A_count', 'torpedo_B_count',
       'jumpbeacon_A_count', 'jumpbeacon_B_count', 'tansanbomb_A_count',
       'tansanbomb_B_count', 'trap_A_count', 'trap_B_count', 'size'],
      dtype='object', length=125)


In [9]:
# 訓練データとテストデータの列を確認
print(X_train_all.shape)
print(X_test.shape)

(66125, 125)
(28340, 125)


In [10]:
#　訓練データに欠損がないことの確認
X_train_all.isnull().sum().sum()

0

In [11]:
#　テストデータに欠損がないことの確認
X_test.isnull().sum().sum()

0

# modeling

In [13]:
# LGBMのパラメータを設定
params = {
    # 二値分類問題
    'objective': 'binary',
    # 損失関数は二値のlogloss
    #'metric': 'auc',
    'metric': 'binary_logloss',
    # 最大イテレーション回数指定
    'num_iterations' : 1000,
    # early_stopping 回数指定
    'early_stopping_rounds' : 100,
}

In [23]:
# k-分割交差検証を使って学習＆予測（K=10）
FOLD_NUM = 10
kf = KFold(n_splits=FOLD_NUM,
              random_state=42)

#検証時のスコアを初期化
scores = []

#テストデータの予測値を初期化
pred_cv = np.zeros(len(X_test.index))

#lgbmのラウンド数を定義
num_round = 10000

for i, (tdx, vdx) in enumerate(kf.split(X_train_all, y_train_all)):
    print(f'Fold : {i}')
    # 訓練用データと検証用データに分割
    X_train, X_valid, y_train, y_valid = X_train_all.iloc[tdx], X_train_all.iloc[vdx], y_train_all.values[tdx], y_train_all.values[vdx]
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_valid = lgb.Dataset(X_valid, y_valid)
    
    # 学習の実行
    model = lgb.train(params, lgb_train, num_boost_round=num_round,
                      valid_names=["train", "valid"], valid_sets=[lgb_train, lgb_valid],
                      verbose_eval=100)

    # 検証データに対する予測値を求めて、勝敗（０　or　１）に変換
    va_pred = np.round(model.predict(X_valid,num_iteration=model.best_iteration))
    
    # accuracyスコアを計算
    score_ = accuracy_score(y_valid, va_pred)
    
    # フォールド毎の検証時のスコアを格納
    scores.append(score_)
    
    #テストデータに対する予測値を求める
    submission = model.predict(X_test,num_iteration=model.best_iteration)
    
    #テストデータに対する予測値をフォールド数で割って蓄積
    #(フォールド毎の予測値の平均値を求めることと同じ)
    pred_cv += submission/FOLD_NUM

# 最終的なテストデータに対する予測値を勝敗（０　or　１）に変換
pred_cv = np.round(pred_cv)

# 最終的なaccuracyスコアを平均値で出力
print('')
print('################################')
print('CV_score:'+ str(np.mean(scores)))

Fold : 0
Training until validation scores don't improve for 100 rounds.
[100]	train's binary_logloss: 0.637461	valid's binary_logloss: 0.681925
[200]	train's binary_logloss: 0.600468	valid's binary_logloss: 0.683622
Early stopping, best iteration is:
[130]	train's binary_logloss: 0.625597	valid's binary_logloss: 0.681381
Fold : 1
Training until validation scores don't improve for 100 rounds.
[100]	train's binary_logloss: 0.637518	valid's binary_logloss: 0.680713
[200]	train's binary_logloss: 0.600576	valid's binary_logloss: 0.681149
Early stopping, best iteration is:
[125]	train's binary_logloss: 0.628072	valid's binary_logloss: 0.680248
Fold : 2
Training until validation scores don't improve for 100 rounds.
[100]	train's binary_logloss: 0.637589	valid's binary_logloss: 0.684309
Early stopping, best iteration is:
[46]	train's binary_logloss: 0.660071	valid's binary_logloss: 0.683127
Fold : 3
Training until validation scores don't improve for 100 rounds.
[100]	train's binary_logloss: 0.

In [24]:
# 提出用ファイルを作成する
pd.DataFrame({"id": range(len(pred_cv)), "y": pred_cv }).to_csv("submission.csv", index=False)

In [25]:
os.getcwd()

'C:\\Users\\takuy\\work'

In [26]:
model.feature_importance()

array([115, 106, 116,  96, 105, 121, 125, 123, 107,  15,  17,   0,  39,
        88, 136,  31, 119,  34, 133,  33, 128,  24, 127,  29, 147,  15,
       133,  27, 133,  17,  54,  32,  19,   9,   0,   0,   9,   9,   3,
         0, 121,  99,  82,  78, 122, 104,  89, 116,  24,  42,  13,  22,
        15,   5,  13,  16,   6,   8,   4,   3,   0,   0,   5,   3,  10,
         7,   1,   3,   9,  13,  14,  17,  17,  10,  13,   9,   0,   3,
        12,   9,  12,  15,   9,  13,  25,  11,   3,   2,  15,   6,   4,
        11,   2,   3,   2,   4,   3,   5,  13,  10,   9,  17,  11,   7,
         4,  12,  10,  10,  20,  15,  10,  14,  18,  17,   5,   5,   1,
         8,   7,   4,   2,   4,   4,   4,  73])

In [27]:
# 特徴量重要度の算出 (データフレームで取得)
cols = list(X_train_all.columns)         
f_importance = np.array(model.feature_importance()) # 特徴量重要度の算出
f_importance = f_importance / np.sum(f_importance)  # 正規化(必要ない場合はコメントアウト)
df_importance = pd.DataFrame({'feature':cols, 'importance':f_importance})
df_importance = df_importance.sort_values('importance', ascending=False) # 降順ソート
display(df_importance)

Unnamed: 0,feature,importance
24,enc_B2-weapon,0.035252
14,enc_A1-weapon,0.032614
28,enc_B4-weapon,0.031894
18,enc_A3-weapon,0.031894
26,enc_B3-weapon,0.031894
20,enc_A4-weapon,0.030695
22,enc_B1-weapon,0.030456
6,B3-level,0.029976
7,B4-level,0.029496
44,B_level_mean,0.029257
