# メモ
- player情報を一切考慮しない

# 特徴量
- 試合内投球数
- イニング
- イニング内打席数
- 打席内投球数
- 投手投球左右
- 投手役割
- 打者打席左右
- 打者打順
- プレイ前アウト数
- プレイ前ボール数
- プレイ前ストライク数
- プレイ前走者状況

In [1]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

df_tr_pitch = pd.read_csv('../data/train_pitch.csv')
df_tr_player = pd.read_csv('../data/train_player.csv')
df_te_pitch = pd.read_csv('../data/test_pitch.csv')
df_te_player = pd.read_csv('../data/test_player.csv')
df_smp = pd.read_csv('../data/sample_submit_ball_type.csv', header=None)

In [36]:
import sys
sys.path.append("..")

import tqdm
import time
import preprocess
import importlib
importlib.reload(preprocess)
cnvrt = preprocess.Converter()
df_tr = cnvrt.convert_df(df_tr_pitch, df_tr_player)

In [34]:
df_tr.keys()

Index(['num_data', 'ball_type', 'corse', '', '', '', 'num_game_throw', '', '',
       '', '', '', '', '', 'ord_inning', '', 'num_inning_bat',
       'num_pitch_in_bat', '', '', 'pitcher_lr', 'pitcher_role', '', '', '',
       '', '', '', 'bat_lr', 'ord_bat', '', '', '', '', 'num_out', 'num_ball',
       'num_strike', 'runner_state', '', '', '', '', '', '', '', '', '', '',
       '', '', '', 'cat_pitcher_lr', 'cat_pitcher_role', 'cat_bat_lr',
       'cat_runner_state'],
      dtype='object')

In [22]:
features = [
    "num_game_throw",
    "ord_inning",
    "num_inning_bat",
    "num_pitch_in_bat",
    "cat_pitcher_lr",
    "cat_pitcher_role",
    "cat_bat_lr",
    "ord_bat",
    "num_out",
    "num_ball",
    "num_strike",
    "cat_pitcher_lr",
]

In [87]:
tr_X = df_tr[features]
tr_Y = df_tr["ball_type"]

In [24]:
from sklearn.model_selection import KFold

cv_num = 5
kf = KFold(n_splits=cv_num, shuffle=True)

kf_index = []
for tr_i, te_i in kf.split(tr_X):
    kf_index.append([tr_i, te_i])

In [27]:
from sklearn.metrics import log_loss # モデル評価用(logloss) 
import lightgbm as lgb #LightGBM

# params
params = {
    "task": "train",
    "boosting_type": "gbdt",
    "objective": "multiclass",
    "num_class": 8,
    "metric": "multi_logloss",
    "verbose": 2
}

# train
loss_list = []
models = []
for i, kf in enumerate(kf_index):
    train_x = tr_X.iloc[kf[0]]
    train_y = tr_Y.iloc[kf[0]]
    test_x = tr_X.iloc[kf[1]]
    test_y = tr_Y.iloc[kf[1]]
    
    
    train_data = lgb.Dataset(train_x, label=train_y)
    eval_data = lgb.Dataset(test_x, label=test_y)
    
    print(f"train {i}")
    gbm = lgb.train(
        params,
        train_data, 
        valid_sets=eval_data,
        num_boost_round=100,
        verbose_eval=100
    )
    
    models.append(gbm)

train 0
[100]	valid_0's multi_logloss: 1.53239
train 1
[100]	valid_0's multi_logloss: 1.53636
train 2
[100]	valid_0's multi_logloss: 1.52644
train 3
[100]	valid_0's multi_logloss: 1.53548
train 4
[100]	valid_0's multi_logloss: 1.53135


# 予測

In [37]:
df_te = cnvrt.convert_df(df_te_pitch, df_te_player, isTrain=False)
te_X = df_te[features]

In [76]:
result = []
for i, model in enumerate(models):
    start = time.time()
    print(f"start model{i}")
    result.append(model.predict(te_X))
    print(f"end time : {time.time()-start}")

start model0
end time : 11.879050016403198
start model1
end time : 12.426195859909058
start model2
end time : 12.486909866333008
start model3
end time : 12.815719842910767
start model4
end time : 13.57936692237854


In [84]:
from scipy.special import softmax
result = np.array(result)
final_result = np.sum(result, axis=0)
final_result = softmax(final_result, axis=1)

In [112]:
df_submit = pd.concat([df_smp[0], pd.DataFrame(final_result)], axis=1).reset_index(drop=True)
df_submit.columns = range(df_submit.shape[1])
df_submit.to_csv("submits/v1.csv")