In [1]:
import pandas as pd
import datetime
import logging
from sklearn.model_selection import KFold
import argparse
import json
import numpy as np
import lightgbm as lgb
from sklearn.metrics import accuracy_score

import sys
sys.path.append('..')
from utils import load_datasets, load_target
from logs.logger import log_best
from models.lgbm import train_and_predict
import os
os.chdir('../')

%matplotlib inline

In [2]:
os.getcwd()

'C:\\Users\\takuy\\work\\work_tokumoto\\splatoon_competition'

In [3]:
parser = argparse.ArgumentParser()
parser.add_argument('--config', default='./configs/default_v06.json')
options = parser.parse_args(args=[])
config = json.load(open(options.config))

feats = config['features']
logging.debug(feats)

target_name = config['target_name']

In [4]:
X_train_all, X_test = load_datasets(feats, target_name)
y_train_all = load_target(target_name)

In [5]:
X_train_all.head()

Unnamed: 0,A1-level,A2-level,A3-level,A4-level,B1-level,B2-level,B3-level,B4-level,enc_period,enc_game-ver,...,jetpack_B_count,ultrahanko_A_count,ultrahanko_B_count,splashbomb_pitcher_A_count,splashbomb_pitcher_B_count,robotbomb_pitcher_A_count,robotbomb_pitcher_B_count,quickbomb_pitcher_A_count,quickbomb_pitcher_B_count,size
0,139,118.0,13.0,10.0,28,26.0,68.0,31.0,0.515152,0.525161,...,0,0,2,0,0,0,0,0,0,2855.0
1,198,77.0,198.0,123.0,83,118.0,168.0,151.0,0.466667,0.524499,...,0,1,0,1,0,0,0,0,0,2391.0
2,114,68.0,225.0,107.0,50,163.0,160.0,126.0,0.6,0.525747,...,0,0,1,0,0,0,0,0,0,2426.0
3,336,131.0,189.0,41.0,273,189.0,194.0,391.0,0.571429,0.523781,...,1,0,0,0,0,0,0,0,0,2237.4
4,299,97.0,96.0,136.0,101,45.0,246.0,160.0,0.58,0.524329,...,0,1,0,0,0,0,0,0,1,2390.0


In [6]:
# 訓練データとテストデータの列を確認
print(X_train_all.columns)
print(X_test.columns)

Index(['A1-level', 'A2-level', 'A3-level', 'A4-level', 'B1-level', 'B2-level',
       'B3-level', 'B4-level', 'enc_period', 'enc_game-ver', 'enc_lobby-mode',
       'enc_lobby', 'enc_mode', 'enc_stage', 'enc_A1-weapon', 'enc_A1-rank',
       'enc_A2-weapon', 'enc_A2-rank', 'enc_A3-weapon', 'enc_A3-rank',
       'enc_A4-weapon', 'enc_A4-rank', 'enc_B1-weapon', 'enc_B1-rank',
       'enc_B2-weapon', 'enc_B2-rank', 'enc_B3-weapon', 'enc_B3-rank',
       'enc_B4-weapon', 'enc_B4-rank', 'rank_mean_diff', 'A_rank_std',
       'A_rank_max', 'A_rank_min', 'B_rank_std', 'B_rank_max', 'B_rank_min',
       'level_mean_diff', 'A_level_std', 'A_level_max', 'A_level_min',
       'B_level_std', 'B_level_max', 'B_level_min', 'shooter_A_count',
       'shooter_B_count', 'blaster_A_count', 'blaster_B_count',
       'maneuver_A_count', 'maneuver_B_count', 'reelgun_A_count',
       'reelgun_B_count', 'roller_A_count', 'roller_B_count', 'brush_A_count',
       'brush_B_count', 'charger_A_count', 'charger_B

In [7]:
# 訓練データとテストデータの列を確認
print(X_train_all.shape)
print(X_test.shape)

(66125, 95)
(28340, 95)


In [8]:
#　訓練データに欠損がないことの確認
X_train_all.isnull().sum().sum()

0

In [9]:
#　テストデータに欠損がないことの確認
X_test.isnull().sum().sum()

0

# modeling

In [10]:
# LGBMのパラメータを設定
params = {
    # 二値分類問題
    'objective': 'binary',
    # 損失関数は二値のlogloss
    #'metric': 'auc',
    'metric': 'binary_logloss',
    # 最大イテレーション回数指定
    'num_iterations' : 1000,
    # early_stopping 回数指定
    'early_stopping_rounds' : 100,
}

In [11]:
# k-分割交差検証を使って学習＆予測（K=10）
FOLD_NUM = 10
kf = KFold(n_splits=FOLD_NUM,
              random_state=42)

#検証時のスコアを初期化
scores = []

#テストデータの予測値を初期化
pred_cv = np.zeros(len(X_test.index))

#lgbmのラウンド数を定義
num_round = 10000

for i, (tdx, vdx) in enumerate(kf.split(X_train_all, y_train_all)):
    print(f'Fold : {i}')
    # 訓練用データと検証用データに分割
    X_train, X_valid, y_train, y_valid = X_train_all.iloc[tdx], X_train_all.iloc[vdx], y_train_all.values[tdx], y_train_all.values[vdx]
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_valid = lgb.Dataset(X_valid, y_valid)
    
    # 学習の実行
    model = lgb.train(params, lgb_train, num_boost_round=num_round,
                      valid_names=["train", "valid"], valid_sets=[lgb_train, lgb_valid],
                      verbose_eval=100)

    # 検証データに対する予測値を求めて、勝敗（０　or　１）に変換
    va_pred = np.round(model.predict(X_valid,num_iteration=model.best_iteration))
    
    # accuracyスコアを計算
    score_ = accuracy_score(y_valid, va_pred)
    
    # フォールド毎の検証時のスコアを格納
    scores.append(score_)
    
    #テストデータに対する予測値を求める
    submission = model.predict(X_test,num_iteration=model.best_iteration)
    
    #テストデータに対する予測値をフォールド数で割って蓄積
    #(フォールド毎の予測値の平均値を求めることと同じ)
    pred_cv += submission/FOLD_NUM

# 最終的なテストデータに対する予測値を勝敗（０　or　１）に変換
pred_cv = np.round(pred_cv)

# 最終的なaccuracyスコアを平均値で出力
print('')
print('################################')
print('CV_score:'+ str(np.mean(scores)))

Fold : 0




Training until validation scores don't improve for 100 rounds.
[100]	train's binary_logloss: 0.638208	valid's binary_logloss: 0.681377
Early stopping, best iteration is:
[26]	train's binary_logloss: 0.670037	valid's binary_logloss: 0.681191
Fold : 1
Training until validation scores don't improve for 100 rounds.
[100]	train's binary_logloss: 0.638591	valid's binary_logloss: 0.679332
[200]	train's binary_logloss: 0.601905	valid's binary_logloss: 0.680307
Early stopping, best iteration is:
[110]	train's binary_logloss: 0.634477	valid's binary_logloss: 0.679097
Fold : 2
Training until validation scores don't improve for 100 rounds.
[100]	train's binary_logloss: 0.637372	valid's binary_logloss: 0.682751
Early stopping, best iteration is:
[45]	train's binary_logloss: 0.660708	valid's binary_logloss: 0.681824
Fold : 3
Training until validation scores don't improve for 100 rounds.
[100]	train's binary_logloss: 0.637637	valid's binary_logloss: 0.683026
Early stopping, best iteration is:
[67]	tr

In [12]:
# 提出用ファイルを作成する
pd.DataFrame({"id": range(len(pred_cv)), "y": pred_cv }).to_csv("submission.csv", index=False)

In [14]:
model.feature_importance()

array([ 73,  81,  68,  71,  69,  73,  67,  73,  70,   6,  10,   0,  21,
        51,  85,  18,  80,  36,  87,  24,  64,  20,  75,  17, 110,  12,
       102,  22,  95,  24,  38,   4,   2,   5,   2,   8,   1, 140,  59,
        43,  68,  62,  66,  76,  23,  25,  10,  19,  13,   4,   4,   9,
         4,   6,   3,   4,  52,  33,   1,   2,  10,   3,   3,   2,   5,
         4,  11,  17,  13,   7,   7,   7,   1,   3,   7,   5,  10,  10,
         9,   5,  15,   6,   8,   5,  11,   8,   1,   7,   2,   1,   1,
         1,   1,   2,  42])

In [15]:
# 特徴量重要度の算出 (データフレームで取得)
cols = list(X_train_all.columns)         
f_importance = np.array(model.feature_importance()) # 特徴量重要度の算出
f_importance = f_importance / np.sum(f_importance)  # 正規化(必要ない場合はコメントアウト)
df_importance = pd.DataFrame({'feature':cols, 'importance':f_importance})
df_importance = df_importance.sort_values('importance', ascending=False) # 降順ソート
display(df_importance)

Unnamed: 0,feature,importance
37,level_mean_diff,0.053030
24,enc_B2-weapon,0.041667
26,enc_B3-weapon,0.038636
28,enc_B4-weapon,0.035985
18,enc_A3-weapon,0.032955
14,enc_A1-weapon,0.032197
1,A2-level,0.030682
16,enc_A2-weapon,0.030303
43,B_level_min,0.028788
22,enc_B1-weapon,0.028409


In [17]:
os.getcwd()

'C:\\Users\\takuy\\work\\work_tokumoto\\splatoon_competition'