# メモ
- ルールに則り、1選手単位の特徴量を削除
    - 捕手ID削除
    

# 特徴量
## pitch
- 試合内投球数
- イニング
- イニング内打席数
- 打席内投球数
- 投手投球左右
- 投手役割
- 打者打席左右
- 打者打順
- プレイ前アウト数
- プレイ前ボール数
- プレイ前ストライク数
- プレイ前走者状況

### 追加
- ホームチームID
- アウェイチームID
- 球場ID
- 試合種別詳細
- 表裏
- 打者チームID
- 打者守備位置
- 打者試合内打席数
- プレイ前ホームチーム得点数
- プレイ前アウェイチーム得点数
- 投手登板順
- 投手イニング投球数
- 投手試合内対戦打者数

## player
### 打者&投手
- 育成選手F
- 位置
- 身長
- 体重
- 年俸
- 出身国

### 追加
- ドラフト順位

In [1]:
import pandas as pd
import numpy as np
# Raise the notebook display limits so wide/long frames are shown in full.
for _display_opt in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_display_opt, 500)

# Raw input tables: pitch and player CSVs for the train and test splits.
df_tr_pitch, df_tr_player = (
    pd.read_csv('../data/train_pitch.csv'),
    pd.read_csv('../data/train_player.csv'),
)
df_te_pitch, df_te_player = (
    pd.read_csv('../data/test_pitch.csv'),
    pd.read_csv('../data/test_player.csv'),
)
# Sample submission file; it has no header row, so columns are numbered.
df_smp = pd.read_csv('../data/sample_submit_ball_type.csv', header=None)

In [3]:
import sys
# Add the parent directory to the import path; this has to run before
# `import preprocess` below (presumably preprocess.py lives there — confirm).
sys.path.append("..")

import tqdm
import time
import preprocess
import importlib
import re
# Re-execute the preprocess module so edits made to it since the first import
# take effect without restarting the kernel.
importlib.reload(preprocess)
# Converter / convert_df are defined in preprocess and are not visible here;
# the call receives the raw training pitch and player tables.
# NOTE(review): presumably returns one merged feature frame — confirm.
cnvrt = preprocess.Converter()
df_tr = cnvrt.convert_df(df_tr_pitch, df_tr_player)

In [4]:
# Model inputs are the converted columns whose names start with one of the
# type prefixes "num_", "ord_" or "cat_".
features = [
    col
    for col in df_tr.keys()
    if re.match(r"^num_|^ord_|^cat_", col) is not None
]
features

['num_data',
 'num_game_throw',
 'ord_inning',
 'num_inning_bat',
 'num_pitch_in_bat',
 'ord_pitching',
 'num_pitcher_vs',
 'num_pitching_game',
 'num_pitching_inning',
 'ord_bat',
 'num_batter_batting',
 'num_point_home',
 'num_point_away',
 'num_out',
 'num_ball',
 'num_strike',
 'cat_pitcher_lr',
 'cat_pitcher_role',
 'cat_bat_lr',
 'cat_runner_state',
 'cat_home_id',
 'cat_away_id',
 'cat_dome_id',
 'cat_game_detail',
 'cat_fb',
 'cat_pitcher_team_id',
 'cat_batter_team_id',
 'cat_batter_def_posi',
 'num_height_batter',
 'num_weight_batter',
 'ord_draft_batter',
 'num_annual_salary_batter',
 'cat_training_player_batter',
 'cat_position_batter',
 'cat_country_batter',
 'num_height_pitcher',
 'num_weight_pitcher',
 'ord_draft_pitcher',
 'num_annual_salary_pitcher',
 'cat_training_player_pitcher',
 'cat_position_pitcher',
 'cat_country_pitcher']

In [5]:
# Split the converted training frame into the feature matrix and the target
# column ("ball_type").
tr_X, tr_Y = df_tr[features], df_tr["ball_type"]

In [6]:
from sklearn.model_selection import KFold

cv_num = 5
# Fix the shuffle seed so the fold assignment (and therefore every CV score
# and trained model) is reproducible across kernel restarts and re-runs.
cv_seed = 42
kf = KFold(n_splits=cv_num, shuffle=True, random_state=cv_seed)

# Materialise the splits once as [train_positions, validation_positions]
# pairs so the training cell reuses exactly these folds.
# NOTE(review): rows are shuffled individually; if rows from the same game or
# at-bat are strongly correlated, a grouped split may give a more honest CV
# estimate — confirm against the data.
kf_index = [[tr_i, te_i] for tr_i, te_i in kf.split(tr_X)]

In [7]:
from sklearn.metrics import log_loss  # model evaluation (logloss)
import lightgbm as lgb  # LightGBM

# params
params = {
    "task": "train",
    "boosting_type": "gbdt",
    "objective": "multiclass",
    "num_class": 8,
    "metric": "multi_logloss",
    "verbose": 2
}

# train
loss_list = []  # final validation multi_logloss of each fold
models = []  # one trained Booster per fold
# Unpack each fold into its own names rather than reusing `kf`, which would
# overwrite the KFold splitter object created in the previous cell.
for i, (tr_idx, va_idx) in enumerate(kf_index):
    # kf_index holds positional indices, hence .iloc.
    train_x = tr_X.iloc[tr_idx]
    train_y = tr_Y.iloc[tr_idx]
    test_x = tr_X.iloc[va_idx]
    test_y = tr_Y.iloc[va_idx]

    train_data = lgb.Dataset(train_x, label=train_y)
    eval_data = lgb.Dataset(test_x, label=test_y)

    print(f"train {i}")
    # NOTE(review): `verbose_eval` is believed to have been removed from
    # lgb.train in LightGBM 4.x in favour of the log_evaluation callback —
    # confirm against the installed version before upgrading.
    gbm = lgb.train(
        params,
        train_data,
        valid_sets=eval_data,
        num_boost_round=500,
        verbose_eval=50
    )

    # Keep the fold's last reported validation loss so the CV score can be
    # inspected afterwards (previously loss_list was never filled).
    loss_list.append(gbm.best_score["valid_0"]["multi_logloss"])
    models.append(gbm)

train 0
[50]	valid_0's multi_logloss: 1.26203
[100]	valid_0's multi_logloss: 1.21145
[150]	valid_0's multi_logloss: 1.19139
[200]	valid_0's multi_logloss: 1.17965
[250]	valid_0's multi_logloss: 1.17116
[300]	valid_0's multi_logloss: 1.16461
[350]	valid_0's multi_logloss: 1.16029
[400]	valid_0's multi_logloss: 1.15683
[450]	valid_0's multi_logloss: 1.15454
[500]	valid_0's multi_logloss: 1.15253
train 1
[50]	valid_0's multi_logloss: 1.26223
[100]	valid_0's multi_logloss: 1.20945
[150]	valid_0's multi_logloss: 1.18945
[200]	valid_0's multi_logloss: 1.1782
[250]	valid_0's multi_logloss: 1.16985
[300]	valid_0's multi_logloss: 1.16411
[350]	valid_0's multi_logloss: 1.15967
[400]	valid_0's multi_logloss: 1.15636
[450]	valid_0's multi_logloss: 1.15359
[500]	valid_0's multi_logloss: 1.15138
train 2
[50]	valid_0's multi_logloss: 1.26326
[100]	valid_0's multi_logloss: 1.20965
[150]	valid_0's multi_logloss: 1.18989
[200]	valid_0's multi_logloss: 1.17819
[250]	valid_0's multi_logloss: 1.16974
[300]

# 予測

In [8]:
# Run the test tables through the same converter instance used for training
# (with isTrain=False), then keep the same feature columns the models were
# trained on.
df_te = cnvrt.convert_df(
    df_te_pitch,
    df_te_player,
    isTrain=False,
)
te_X = df_te[features]

In [9]:
# Collect every fold model's prediction for the test features, printing the
# wall-clock seconds spent on each model.
result = []
for i, model in enumerate(models):
    start = time.time()
    print(f"start model{i}")
    fold_pred = model.predict(te_X)
    result.append(fold_pred)
    print(f"end time : {time.time()-start}")

start model0
end time : 89.3405327796936
start model1
end time : 85.62753915786743
start model2
end time : 90.66072297096252
start model3
end time : 87.75399374961853
start model4
end time : 87.06843495368958


In [10]:
from scipy.special import softmax  # no longer used here; kept in case other cells rely on it
# Stack the per-model outputs into a single array (models on axis 0).
result = np.array(result)
# With the multiclass objective, Booster.predict() returns class
# probabilities by default, so each row already sums to 1. The ensemble is
# therefore the plain average over models. Summing and then applying softmax
# would treat probabilities as logits and distort the predicted distribution
# (e.g. classes predicted as 0 by every model would receive noticeable mass).
final_result = np.mean(result, axis=0)

In [11]:
# Submission layout: column 0 of the sample submission followed by the
# per-class predictions, placed side by side.
id_column = df_smp[0]
proba_frame = pd.DataFrame(final_result)
df_submit = pd.concat([id_column, proba_frame], axis=1)
df_submit = df_submit.reset_index(drop=True)
# Renumber the columns consecutively; the file is written with no header and
# no index column.
df_submit.columns = range(df_submit.shape[1])
df_submit.to_csv("submits/v4.csv", index=False, header=False)