## pitch2017前処理

In [54]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import feather
# Show up to 100 columns when a DataFrame is rendered. The canonical,
# lower-case option key is used so the lookup is an exact match rather than
# relying on pandas' case-insensitive pattern fallback.
pd.set_option('display.max_columns', 100)

In [55]:
# Load the pitch-level training data from a Feather file and print its
# (rows, columns) shape as a quick sanity check.
train_pitch = pd.read_feather('data/train_pitch.f')
print(train_pitch.shape)

(257117, 51)


## 投手
### 球種
0:ストレート 1:カーブ 2:スライダー 3:シュート 4:フォーク 5:チェンジアップ 6:シンカー 7:カットボール

In [56]:
# Translate the numeric pitch-type codes into labelled strings. The numeric
# prefix keeps the labels in code order once they become dummy-column names.
ball_labels = {
    0: '0_straight',
    1: '1_curve',
    2: '2_slider',
    3: '3_shoot',
    4: '4_fork',
    5: '5_changeup',
    6: '6_sinker',
    7: '7_cutball',
}

# Relabel the values first, then give the column an ASCII name ('ball').
train_pitch = (
    train_pitch
    .replace({'球種': ball_labels})
    .rename(columns={'球種': 'ball'})
)

In [57]:
# One row per pitch: pitcher ID plus one indicator column per pitch type
# (ball_0_straight, ball_1_curve, ...).
pitch_ball = pd.get_dummies(
    train_pitch.loc[:, ['投手ID', 'ball']],
    columns=['ball'],
)

In [58]:
# Summing the indicator columns per pitcher yields the number of pitches of
# each type thrown by that pitcher.
groupby_id = pitch_ball.groupby('投手ID').agg('sum')

In [59]:
# Pitch-mix features per pitcher: every non-straight pitch count is expressed
# as a ratio to that pitcher's straight count, plus the total pitch count.
ball_count_cols = [
    'ball_0_straight', 'ball_1_curve', 'ball_2_slider', 'ball_3_shoot',
    'ball_4_fork', 'ball_5_changeup', 'ball_6_sinker', 'ball_7_cutball',
]

# get_dummies only emits a column for pitch types that occur in the data, so
# add any missing count column as all-zero instead of failing with a KeyError.
for count_col in ball_count_cols:
    if count_col not in groupby_id.columns:
        groupby_id[count_col] = 0

# A pitcher with zero straights would otherwise produce +inf (n / 0). Masking
# the zero denominator turns those ratios into NaN, which describe() and
# downstream aggregations skip instead of being poisoned by infinities.
straight_cnt = groupby_id['ball_0_straight'].where(groupby_id['ball_0_straight'] != 0)

for count_col in ball_count_cols[1:]:
    ratio_name = count_col.split('_', 2)[2]  # 'ball_1_curve' -> 'curve'
    groupby_id[ratio_name] = groupby_id[count_col] / straight_cnt

# Total pitches thrown = sum of the eight per-type counts.
groupby_id['total'] = groupby_id[ball_count_cols].sum(axis=1)

In [60]:
# Turn the pitcher-ID index back into a regular column (returns a new frame;
# groupby_id itself is left untouched).
pitch_ball = groupby_id.reset_index()

In [61]:
# The raw per-type counts are no longer needed once the ratios and the total
# have been derived from them.
raw_count_cols = [
    'ball_0_straight', 'ball_1_curve', 'ball_2_slider', 'ball_3_shoot',
    'ball_4_fork', 'ball_5_changeup', 'ball_6_sinker', 'ball_7_cutball',
]
pitch_ball = pitch_ball.drop(columns=raw_count_cols)

In [62]:
# Preview the first rows of the per-pitcher pitch-mix table.
pitch_ball.head()

Unnamed: 0,投手ID,curve,slider,shoot,fork,changeup,sinker,cutball,total
0,11606,0.495751,0.0,0.0,0.407932,0.0,0.0,0.101983,708.0
1,11766,0.0,0.96729,0.509346,0.0,0.051402,0.149533,0.0,573.0
2,11807,0.136157,0.003026,0.016641,0.340393,0.0,0.0,0.003026,991.0
3,12103,0.13388,0.581967,0.060109,0.161202,0.0,0.0,0.0,709.0
4,12113,0.038363,0.948849,0.84399,0.0,0.340153,1.242967,0.913043,2083.0


### 登板試合数

In [63]:
# Games pitched per pitcher: collapse the pitch rows to one entry per
# (pitcher, game) pair, then count those entries for each pitcher.
pit_game = (
    train_pitch
    .groupby(['投手ID', '試合ID'])
    .size()
    .groupby(level='投手ID')
    .size()
    .reset_index(name='pit_game_cnt')
)

### イニング数

In [64]:
# Innings pitched per pitcher: one entry per distinct (pitcher, game, inning)
# combination, counted per pitcher.
pit_inning = (
    train_pitch
    .groupby(['投手ID', '試合ID', 'イニング'])
    .size()
    .groupby(level='投手ID')
    .size()
    .reset_index(name='pit_inning_cnt')
)

### 対戦打者数

In [65]:
# Batters faced per pitcher: one entry per distinct
# (pitcher, game, inning, plate-appearance-in-inning) combination.
pit_batcnt = (
    train_pitch
    .groupby(['投手ID', '試合ID', 'イニング', 'イニング内打席数'])
    .size()
    .groupby(level='投手ID')
    .size()
    .reset_index(name='pit_batter_cnt')
)

### 投手実績まとめ

In [66]:
# Attach the game / inning / batter counts to the pitch-mix table, keeping
# every pitcher already present in pitch_ball (left join on pitcher ID).
for pit_counts in (pit_game, pit_inning, pit_batcnt):
    pitch_ball = pitch_ball.merge(pit_counts, how='left', on='投手ID')

### 1イニング当たり、1試合当たり、1打席当たり
- pit_inning_per_game: イニング数/試合数
- pit_batter_per_game: 打席数/試合数
- pit_ball_per_game: 投球数/試合数
- pit_batter_per_inning: 打席数/イニング数
- pit_ball_per_inning: 投球数/イニング数
- pit_ball_per_batter: 投球数/打席数

In [67]:
# Workload rates per pitcher, written as {new column: (numerator, denominator)}.
# Insertion order fixes the order in which the columns are appended.
pit_rate_specs = {
    'pit_inning_per_game': ('pit_inning_cnt', 'pit_game_cnt'),
    'pit_batter_per_game': ('pit_batter_cnt', 'pit_game_cnt'),
    'pit_ball_per_game': ('total', 'pit_game_cnt'),
    'pit_batter_per_inning': ('pit_batter_cnt', 'pit_inning_cnt'),
    'pit_ball_per_inning': ('total', 'pit_inning_cnt'),
    'pit_ball_per_batter': ('total', 'pit_batter_cnt'),
}
for rate_col, (numer_col, denom_col) in pit_rate_specs.items():
    pitch_ball[rate_col] = pitch_ball[numer_col] / pitch_ball[denom_col]

In [68]:
# Print the (rows, columns) shape of the finished pitcher table, then preview
# its first rows.
print(pitch_ball.shape)
pitch_ball.head()

(326, 18)


Unnamed: 0,投手ID,curve,slider,shoot,fork,changeup,sinker,cutball,total,pit_game_cnt,pit_inning_cnt,pit_batter_cnt,pit_inning_per_game,pit_batter_per_game,pit_ball_per_game,pit_batter_per_inning,pit_ball_per_inning,pit_ball_per_batter
0,11606,0.495751,0.0,0.0,0.407932,0.0,0.0,0.101983,708.0,46,54,171,1.173913,3.717391,15.391304,3.166667,13.111111,4.140351
1,11766,0.0,0.96729,0.509346,0.0,0.051402,0.149533,0.0,573.0,50,50,154,1.0,3.08,11.46,3.08,11.46,3.720779
2,11807,0.136157,0.003026,0.016641,0.340393,0.0,0.0,0.003026,991.0,52,61,235,1.173077,4.519231,19.057692,3.852459,16.245902,4.217021
3,12103,0.13388,0.581967,0.060109,0.161202,0.0,0.0,0.0,709.0,24,51,191,2.125,7.958333,29.541667,3.745098,13.901961,3.712042
4,12113,0.038363,0.948849,0.84399,0.0,0.340153,1.242967,0.913043,2083.0,23,129,540,5.608696,23.478261,90.565217,4.186047,16.147287,3.857407


In [69]:
# Summary statistics (count, mean, std, quartiles, min/max) for every numeric
# pitcher feature.
pitch_ball.describe()

Unnamed: 0,投手ID,curve,slider,shoot,fork,changeup,sinker,cutball,total,pit_game_cnt,pit_inning_cnt,pit_batter_cnt,pit_inning_per_game,pit_batter_per_game,pit_ball_per_game,pit_batter_per_inning,pit_ball_per_inning,pit_ball_per_batter
count,326.0,326.0,326.0,326.0,326.0,326.0,326.0,326.0,326.0,326.0,326.0,326.0,326.0,326.0,326.0,326.0,326.0,326.0
mean,1163288.0,0.257321,0.461623,0.500294,0.186347,0.138092,0.021658,0.146885,788.702454,20.766871,50.245399,199.684049,2.910101,12.114679,47.892106,4.080928,16.130663,3.953392
std,416143.7,2.295388,0.400976,5.221909,0.227734,0.313514,0.104249,0.633629,756.88166,18.831584,47.533255,192.461884,2.088025,8.882978,35.103271,0.693583,3.042027,0.328686
min,11606.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,1.0,1.0,2.0,1.0,2.0,4.0,2.0,4.0,2.0
25%,925322.5,0.0,0.176896,0.0,0.0,0.0,0.0,0.0,184.0,5.25,11.25,46.0,1.152592,4.337121,17.510263,3.772204,14.80578,3.79569
50%,1300036.0,0.077167,0.395109,0.0,0.106137,0.0,0.0,0.0,569.0,15.0,37.0,143.5,1.58435,6.75,26.791667,4.055175,16.0,3.956039
75%,1500108.0,0.192714,0.62896,0.233909,0.330539,0.170576,0.0,0.06647,1031.0,27.0,67.75,263.25,5.041667,21.725,84.741758,4.30119,17.139987,4.124831
max,1700016.0,41.461538,2.264286,94.153846,1.156364,3.923077,1.242967,10.230769,3004.0,72.0,190.0,756.0,7.52,30.0,116.64,8.0,35.0,6.0


### 2017年全投手平均
- total: 788.702454
- pit_game_cnt: 20.766871
- pit_inning_cnt: 50.245399
- pit_batter_cnt: 199.684049

- pit_inning_per_game: 2.910101
- pit_batter_per_game: 12.114679
- pit_ball_per_game: 47.892106
- pit_batter_per_inning: 4.080928
- pit_ball_per_inning: 16.130663
- pit_ball_per_batter: 3.953392

In [70]:
# Write the per-pitcher feature table to a Feather file.
# NOTE(review): assumes the 'intermediate/' directory already exists - confirm.
pitch_ball.to_feather('intermediate/pit_2017_1.f')

## 野手
### 打席数

In [71]:
# Plate appearances per batter: one entry per distinct
# (batter, game, inning, plate-appearance-in-inning) combination.
bat_ball = (
    train_pitch
    .groupby(['打者ID', '試合ID', 'イニング', 'イニング内打席数'])
    .size()
    .groupby(level='打者ID')
    .size()
    .reset_index(name='batter_cnt')
)

### 試合数

In [72]:
# Games played per batter: one entry per distinct (batter, game) pair,
# counted per batter.
bat_game = (
    train_pitch
    .groupby(['打者ID', '試合ID'])
    .size()
    .groupby(level='打者ID')
    .size()
    .reset_index(name='bat_game_cnt')
)

### 打者成績まとめ

In [73]:
# Left-join the game counts onto the plate-appearance counts by batter ID.
bat_ball = pd.merge(bat_ball, bat_game, how='left', on='打者ID')

### 1試合当たり打席数
- batter_cnt_per_game: 打席数/試合数

In [74]:
# Average number of plate appearances per game for each batter.
bat_ball['batter_cnt_per_game'] = bat_ball['batter_cnt'].div(bat_ball['bat_game_cnt'])

In [75]:
# Print the (rows, columns) shape of the batter table, then preview its first
# rows.
print(bat_ball.shape)
bat_ball.head()

(466, 4)


Unnamed: 0,打者ID,batter_cnt,bat_game_cnt,batter_cnt_per_game
0,11270,83,35,2.371429
1,11302,156,69,2.26087
2,11343,41,28,1.464286
3,11436,268,81,3.308642
4,11511,48,21,2.285714


In [76]:
# Summary statistics for every numeric batter feature.
bat_ball.describe()

Unnamed: 0,打者ID,batter_cnt,bat_game_cnt,batter_cnt_per_game
count,466.0,466.0,466.0,466.0
mean,1044992.0,139.688841,41.448498,2.515079
std,468711.7,184.754006,45.398623,1.020181
min,11270.0,1.0,1.0,1.0
25%,800012.5,7.25,4.0,1.755682
50%,1100114.0,44.0,21.0,2.3875
75%,1400182.0,222.5,71.75,3.257923
max,1700019.0,680.0,143.0,4.755245


### 2017年全野手平均
- batter_cnt: 139.688841
- bat_game_cnt: 41.448498
- batter_cnt_per_game: 2.515079

In [77]:
# Write the per-batter feature table to a Feather file.
# NOTE(review): assumes the 'intermediate/' directory already exists - confirm.
bat_ball.to_feather('intermediate/bat_2017_1.f')

## 捕手
### 捕球球種

In [78]:
# One row per pitch: catcher ID plus one indicator column per pitch type.
catch_ball = pd.get_dummies(
    train_pitch.loc[:, ['捕手ID', 'ball']],
    columns=['ball'],
)

# Summing the indicators per catcher gives the number of pitches of each type
# that catcher received.
groupby_cat = catch_ball.groupby('捕手ID').agg('sum')

In [79]:
# Pitch-mix features per catcher: every non-straight pitch count is expressed
# as a ratio to the straight count received by that catcher, plus the total
# number of pitches received. Output columns carry a 'c_' prefix.
cat_count_cols = [
    'ball_0_straight', 'ball_1_curve', 'ball_2_slider', 'ball_3_shoot',
    'ball_4_fork', 'ball_5_changeup', 'ball_6_sinker', 'ball_7_cutball',
]

# get_dummies only emits a column for pitch types that occur in the data, so
# add any missing count column as all-zero instead of failing with a KeyError.
for count_col in cat_count_cols:
    if count_col not in groupby_cat.columns:
        groupby_cat[count_col] = 0

# A catcher with zero straights would otherwise produce +inf (n / 0). Masking
# the zero denominator turns those ratios into NaN instead.
cat_straight_cnt = groupby_cat['ball_0_straight'].where(groupby_cat['ball_0_straight'] != 0)

for count_col in cat_count_cols[1:]:
    ratio_name = 'c_' + count_col.split('_', 2)[2]  # 'ball_1_curve' -> 'c_curve'
    groupby_cat[ratio_name] = groupby_cat[count_col] / cat_straight_cnt

# Total pitches received = sum of the eight per-type counts.
groupby_cat['c_total'] = groupby_cat[cat_count_cols].sum(axis=1)

In [80]:
# Move the catcher ID back into a column and discard the raw per-type counts,
# which are no longer needed once the ratios and the total exist.
cat_raw_count_cols = [
    'ball_0_straight', 'ball_1_curve', 'ball_2_slider', 'ball_3_shoot',
    'ball_4_fork', 'ball_5_changeup', 'ball_6_sinker', 'ball_7_cutball',
]
catch_ball = groupby_cat.reset_index().drop(columns=cat_raw_count_cols)

### 試合数

In [81]:
# Games caught per catcher: one entry per distinct (catcher, game) pair,
# counted per catcher.
cat_game = (
    train_pitch
    .groupby(['捕手ID', '試合ID'])
    .size()
    .groupby(level='捕手ID')
    .size()
    .reset_index(name='cat_game_cnt')
)

### イニング数

In [82]:
# Innings caught per catcher: one entry per distinct (catcher, game, inning)
# combination, counted per catcher.
cat_inning = (
    train_pitch
    .groupby(['捕手ID', '試合ID', 'イニング'])
    .size()
    .groupby(level='捕手ID')
    .size()
    .reset_index(name='cat_inning_cnt')
)

### 対戦打者数

In [83]:
# Batters faced per catcher: one entry per distinct
# (catcher, game, inning, plate-appearance-in-inning) combination.
cat_batcnt = (
    train_pitch
    .groupby(['捕手ID', '試合ID', 'イニング', 'イニング内打席数'])
    .size()
    .groupby(level='捕手ID')
    .size()
    .reset_index(name='cat_batter_cnt')
)

### 捕手実績まとめ

In [84]:
# Attach the game / inning / batter counts to the catcher pitch-mix table,
# keeping every catcher already present in catch_ball (left join on catcher ID).
for cat_counts in (cat_game, cat_inning, cat_batcnt):
    catch_ball = catch_ball.merge(cat_counts, how='left', on='捕手ID')

### 1イニング当たり、1試合当たり、1打席当たり
- cat_inning_per_game: イニング数/試合数
- cat_batter_per_game: 打席数/試合数
- cat_ball_per_game: 投球数/試合数
- cat_batter_per_inning: 打席数/イニング数
- cat_ball_per_inning: 投球数/イニング数
- cat_ball_per_batter: 投球数/打席数

In [85]:
# Workload rates per catcher, written as {new column: (numerator, denominator)}.
# Insertion order fixes the order in which the columns are appended.
cat_rate_specs = {
    'cat_inning_per_game': ('cat_inning_cnt', 'cat_game_cnt'),
    'cat_batter_per_game': ('cat_batter_cnt', 'cat_game_cnt'),
    'cat_ball_per_game': ('c_total', 'cat_game_cnt'),
    'cat_batter_per_inning': ('cat_batter_cnt', 'cat_inning_cnt'),
    'cat_ball_per_inning': ('c_total', 'cat_inning_cnt'),
    'cat_ball_per_batter': ('c_total', 'cat_batter_cnt'),
}
for rate_col, (numer_col, denom_col) in cat_rate_specs.items():
    catch_ball[rate_col] = catch_ball[numer_col] / catch_ball[denom_col]

In [86]:
# Print the (rows, columns) shape of the finished catcher table, then preview
# its first rows.
print(catch_ball.shape)
catch_ball.head()

(55, 18)


Unnamed: 0,捕手ID,c_curve,c_slider,c_shoot,c_fork,c_changeup,c_sinker,c_cutball,c_total,cat_game_cnt,cat_inning_cnt,cat_batter_cnt,cat_inning_per_game,cat_batter_per_game,cat_ball_per_game,cat_batter_per_inning,cat_ball_per_inning,cat_ball_per_batter
0,11343,0.150198,0.286561,0.043478,0.106719,0.134387,0.017787,0.073123,917.0,19,49,220,2.578947,11.578947,48.263158,4.489796,18.714286,4.168182
1,11726,0.204082,0.405248,0.163265,0.201166,0.087464,0.005831,0.201166,778.0,11,46,196,4.181818,17.818182,70.727273,4.26087,16.913043,3.969388
2,12055,0.199313,0.331615,0.061856,0.257732,0.187285,0.0,0.091065,1239.0,16,70,303,4.375,18.9375,77.4375,4.328571,17.7,4.089109
3,12100,0.147866,0.477134,0.053354,0.129573,0.132622,0.0,0.018293,1285.0,20,77,323,3.85,16.15,64.25,4.194805,16.688312,3.978328
4,12186,0.203482,0.454117,0.205658,0.21654,0.140733,0.0,0.262242,6845.0,76,420,1760,5.526316,23.157895,90.065789,4.190476,16.297619,3.889205


In [87]:
# Summary statistics for every numeric catcher feature.
catch_ball.describe()

Unnamed: 0,捕手ID,c_curve,c_slider,c_shoot,c_fork,c_changeup,c_sinker,c_cutball,c_total,cat_game_cnt,cat_inning_cnt,cat_batter_cnt,cat_inning_per_game,cat_batter_per_game,cat_ball_per_game,cat_batter_per_inning,cat_ball_per_inning,cat_ball_per_batter
count,55.0,55.0,55.0,55.0,55.0,55.0,55.0,55.0,55.0,55.0,55.0,55.0,55.0,55.0,55.0,55.0,55.0,55.0
mean,946274.9,0.129609,0.398785,0.126261,0.187395,0.109288,0.022809,0.093492,4674.854545,43.290909,279.381818,1183.472727,4.962933,21.224174,83.984458,4.209168,16.81433,3.98513
std,517015.3,0.131703,0.181599,0.102096,0.105057,0.075308,0.044733,0.10134,5377.539946,42.332315,321.970524,1360.802035,2.267016,10.038274,38.886875,0.46069,2.790733,0.530876
min,11343.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,1.0,1.0,3.0,1.0,3.0,4.0,3.0,4.0,1.333333
25%,650048.0,0.052804,0.302003,0.055594,0.12072,0.059224,0.0,0.014695,447.5,6.5,24.5,113.5,3.714474,13.947368,53.894737,4.14636,16.331577,3.881882
50%,1000176.0,0.112075,0.38719,0.105164,0.181189,0.113092,0.002115,0.072251,2496.0,24.0,145.0,648.0,5.333333,23.157895,90.065789,4.227811,16.844172,3.96236
75%,1300107.0,0.173117,0.493681,0.166829,0.243125,0.148474,0.023969,0.130228,7342.5,78.0,439.0,1881.5,6.684349,29.703434,115.540043,4.334941,17.385417,4.033424
max,1600156.0,0.857143,1.0,0.407609,0.571429,0.287671,0.25,0.494565,18250.0,137.0,1114.0,4652.0,9.0,40.0,159.0,5.5,25.0,6.25


### 2017年全捕手平均
- c_curve: 0.129609
- c_slider: 0.398785
- c_shoot: 0.126261
- c_fork: 0.187395
- c_changeup: 0.109288
- c_sinker: 0.022809
- c_cutball: 0.093492
- c_total: 4674.854545
- cat_game_cnt: 43.290909
- cat_inning_cnt: 279.381818
- cat_batter_cnt: 1183.472727
- cat_inning_per_game: 4.962933
- cat_batter_per_game: 21.224174
- cat_ball_per_game: 83.984458
- cat_batter_per_inning: 4.209168
- cat_ball_per_inning: 16.814330
- cat_ball_per_batter: 3.985130

In [88]:
# Write the per-catcher feature table to a Feather file.
# NOTE(review): assumes the 'intermediate/' directory already exists - confirm.
catch_ball.to_feather('intermediate/cat_2017_1.f')