## LightGBM test (Multi-Classification)

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import lightgbm as lgb
import gc
%matplotlib inline
pd.set_option('display.max_Columns', 100)

In [2]:
ALL_PITCH = 'intermediate/all_pitch_2.f'

In [3]:
all_pitch = pd.read_feather(ALL_PITCH)
all_pitch.shape

(778767, 60)

### 不要な列を削除

In [4]:
all_pitch.drop(
    columns=[
        '年度', 
        '試合ID', 
        #'日付', '時刻', 
        'ホームチームID', 'アウェイチームID', 
        #'球場ID', '球場名', 
        #'試合種別詳細', '表裏', 
        '投手ID', '投手チームID', 
        #'投手投球左右', '投手役割', 
        '打者ID', '打者チームID', 
        #'打者打席左右', '打者守備位置',
        'プレイ前走者状況', 
        #'一塁走者ID', '二塁走者ID', '三塁走者ID', 
        '捕手ID', 
        #'一塁手ID', '二塁手ID', '三塁手ID', '遊撃手ID', '左翼手ID', '中堅手ID', '右翼手ID', 
        #'成績対象投手ID', '成績対象打者ID',
        'opening_date', 'game_date',
        'start_time', 'game_time', 'elapsed_time'
    ], inplace=True)

### Rename

In [5]:
all_pitch.rename(columns={
    'データ内連番': 'No',
    '投球位置区域': 'pitch_area',
    '試合内連番': 'No_in_game',
    '試合内投球数': 'pitch_cnt_in_game',
    'イニング': 'inning',
    'イニング内打席数': 'bat_cnt_in_inning',
    '打席内投球数': 'pitch_cnt_in_bat',
    '投手登板順': 'pitch_order',
    '投手試合内対戦打者数': 'player_cnt_in_game',
    '投手試合内投球数': 'pitch_cnt_in_game',
    '投手イニング内投球数': 'pitch_cnt_in_inning',
    '打者打順': 'bat_order',
    '打者試合内打席数': 'bat_cnt_in_game',
    'プレイ前ホームチーム得点数': 'home_point',
    'プレイ前アウェイチーム得点数': 'away_point',
    'プレイ前アウト数': 'out_cnt',
    'プレイ前ボール数': 'ball_cnt',
    'プレイ前ストライク数': 'strike_cnt',
}, inplace=True)

In [6]:
print(all_pitch.shape)
all_pitch.head()

(778767, 45)


Unnamed: 0,No,ball,pitch_area,No_in_game,pitch_cnt_in_game,inning,bat_cnt_in_inning,pitch_cnt_in_bat,pitch_order,player_cnt_in_game,pitch_cnt_in_game.1,pitch_cnt_in_inning,bat_order,bat_cnt_in_game,home_point,away_point,out_cnt,ball_cnt,strike_cnt,first,second,third,base_cnt,pitch_LR,bat_LR,top_bot,role,date_from_opening,elapsed_min,ball_cnt_0-0,ball_cnt_0-1,ball_cnt_0-2,ball_cnt_0-3,ball_cnt_1-0,ball_cnt_1-1,ball_cnt_1-2,ball_cnt_1-3,ball_cnt_2-0,ball_cnt_2-1,ball_cnt_2-2,ball_cnt_2-3,pit_bat_L_L,pit_bat_L_R,pit_bat_R_L,pit_bat_R_R
0,0,0.0,0.0,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
1,1,0.0,8.0,2,2,1,1,2,1,1,2,2,1,1,0,0,0,0,1,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0,0.2,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0
2,2,0.0,5.0,3,3,1,1,3,1,1,3,3,1,1,0,0,0,0,2,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0,0.383333,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0
3,3,0.0,12.0,4,4,1,2,1,1,2,4,4,2,1,0,0,1,0,0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0,1.016667,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4,4,0.0,8.0,5,5,1,2,2,1,2,5,5,2,1,0,0,1,0,1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0,1.2,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1


### train
- 行数: 257117

In [7]:
train = all_pitch.dropna(subset=['pitch_area'])
train.shape

(257117, 45)

### test
- 行数: 521650

In [8]:
test = all_pitch[all_pitch['pitch_area'].isnull()]
test.shape

(521650, 45)

In [9]:
del all_pitch
gc.collect()

40

In [10]:
train_d = train.drop([
    'No', 
    'pitch_area', 
    'ball'
], axis=1)

In [11]:
test_d = test.drop([
    'No', 
    'pitch_area', 
    'ball'
], axis=1)

## Dataset作成

In [12]:
lgb_train = lgb.Dataset(train_d, train['ball'])

### LGBM学習
- multiclass : クラス数=8

In [13]:
%%time
lgb_param = {
        'objective' : 'multiclass',
        'boosting_type': 'gbdt',
        'metric' : 'multi_logloss',
        'num_class' : 8,
        'num_leaves' : 31,
        'seed' : 0,
        'learning_rate' : 0.1,
    }

# 学習
cv_results = lgb.cv(lgb_param, lgb_train,
                    num_boost_round=8000,
                    early_stopping_rounds=100,
                    verbose_eval=100,
                    nfold=4)

num_boost_round = len(cv_results['multi_logloss-mean'])
print('Best num_boost_round:', num_boost_round)
print('Best CV score:', cv_results['multi_logloss-mean'][-1])

[100]	cv_agg's multi_logloss: 1.5013 + 0.00140296
[200]	cv_agg's multi_logloss: 1.48229 + 0.00196637
[300]	cv_agg's multi_logloss: 1.46903 + 0.00173765
[400]	cv_agg's multi_logloss: 1.45899 + 0.00183953
[500]	cv_agg's multi_logloss: 1.45072 + 0.00150839
[600]	cv_agg's multi_logloss: 1.44421 + 0.00137259
[700]	cv_agg's multi_logloss: 1.43838 + 0.00111484
[800]	cv_agg's multi_logloss: 1.43396 + 0.0013297
[900]	cv_agg's multi_logloss: 1.43019 + 0.00153575
[1000]	cv_agg's multi_logloss: 1.42728 + 0.00165771
[1100]	cv_agg's multi_logloss: 1.42486 + 0.00193938
[1200]	cv_agg's multi_logloss: 1.42289 + 0.00203255
[1300]	cv_agg's multi_logloss: 1.42136 + 0.00212734
[1400]	cv_agg's multi_logloss: 1.42026 + 0.00243269
[1500]	cv_agg's multi_logloss: 1.41936 + 0.00257479
[1600]	cv_agg's multi_logloss: 1.41834 + 0.0025918
[1700]	cv_agg's multi_logloss: 1.41791 + 0.00275362
[1800]	cv_agg's multi_logloss: 1.41781 + 0.00291717
[1900]	cv_agg's multi_logloss: 1.41776 + 0.00284867
[2000]	cv_agg's multi_lo

In [14]:
best_iter = int(num_boost_round * 1.1)

### 全体で再学習

In [15]:
%%time
lgb_model = lgb.train(lgb_param, lgb_train, num_boost_round=best_iter)

CPU times: user 20min 26s, sys: 16 s, total: 20min 42s
Wall time: 5min 31s


### Feature Importance

In [16]:
fi = lgb_model.feature_importance()
fn = lgb_model.feature_name()
df_feature_importance = pd.DataFrame({'feat_name':fn, 'feat_imp':fi})
df_feature_importance.sort_values('feat_imp', inplace=True)
df_feature_importance.tail(10)

Unnamed: 0,feat_name,feat_imp
6,player_cnt_in_game,16141
9,bat_order,19763
11,home_point,24022
12,away_point,25209
8,pitch_cnt_in_inning,31741
1,pitch_cnt_in_game,34451
0,No_in_game,38859
7,pitch_cnt_in_game,44061
25,elapsed_min,49125
24,date_from_opening,78664


In [17]:
%%time
predict = lgb_model.predict(test_d, num_iteration = lgb_model.best_iteration)

CPU times: user 27min 19s, sys: 3.48 s, total: 27min 22s
Wall time: 7min 10s


In [19]:
predict

array([[8.39157664e-01, 6.99189498e-04, 5.54594511e-02, ...,
        2.19360509e-03, 1.22529207e-03, 3.30587330e-02],
       [8.11104776e-01, 6.11155382e-03, 7.95245002e-02, ...,
        3.32875217e-03, 7.90176854e-05, 1.43470673e-02],
       [5.34280387e-01, 1.79205921e-02, 1.11526847e-01, ...,
        7.74100019e-02, 9.82353312e-03, 9.14373687e-02],
       ...,
       [6.95485885e-01, 1.81193031e-02, 5.44117717e-02, ...,
        2.14235411e-02, 1.38414845e-05, 1.44908105e-02],
       [7.24056320e-01, 1.57317426e-02, 2.58165053e-02, ...,
        2.50957034e-02, 9.20236649e-06, 2.89901542e-03],
       [6.40745124e-01, 1.54072332e-02, 8.63822225e-02, ...,
        1.71479951e-02, 8.00455898e-06, 2.90388418e-03]])

## 結果出力

In [21]:
submit = pd.DataFrame(predict)
submit.reset_index(inplace=True)
print(submit.shape)
submit.head()

(521650, 9)


Unnamed: 0,index,0,1,2,3,4,5,6,7
0,0,0.839158,0.000699,0.055459,0.056541,0.011665,0.002194,0.001225,0.033059
1,1,0.811105,0.006112,0.079525,0.066355,0.01915,0.003329,7.9e-05,0.014347
2,2,0.53428,0.017921,0.111527,0.061598,0.096003,0.07741,0.009824,0.091437
3,3,0.413577,0.024705,0.069909,0.126504,0.262462,0.004781,0.000985,0.097077
4,4,0.228076,0.066228,0.140437,0.144326,0.290621,0.002473,0.001016,0.126823


### 2017の平均
- ストレート           0.468254
- カーブ             0.074725
- スライダー           0.185806
- シュート            0.070633
- フォーク            0.083013
- チェンジアップ         0.054217
- シンカー            0.011361
- カットボール          0.051992

In [22]:
submit.mean()

index    260824.500000
0             0.493584
1             0.069841
2             0.190759
3             0.065094
4             0.079030
5             0.052075
6             0.005315
7             0.044301
dtype: float64

In [23]:
#submit.to_csv('submit/lgbm_2.csv', header=False, index=False)

### 評価結果
- CV score: 1.4177152872543688
- 評価結果  : 1.6695529