## pitch2017前処理

#### 6
- 選手IDごとの球種・コースの実績(投手:球種別・コース別の投球数、登板試合数、イニング数、対戦打者数/打者:打席数、試合数)

In [30]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import feather
# Show up to 100 columns when displaying wide frames. Uses the registered
# lower-case option key instead of relying on pandas' case-insensitive
# pattern matching of option names.
pd.set_option('display.max_columns', 100)

In [31]:
# Load data/train_pitch.f (Feather format) into `train_pitch` and print its
# (rows, columns) shape as a quick sanity check.
train_pitch = pd.read_feather('data/train_pitch.f')
print(train_pitch.shape)

(257117, 51)


In [32]:
# Feather output paths: OUT_PIT receives the aggregated pitcher table and
# OUT_BAT the aggregated batter table, each written at the end of its section.
OUT_PIT = 'intermediate/pit_2017_6.f'
OUT_BAT = 'intermediate/bat_2017_6.f'

### 左右

In [33]:
# Translate the Japanese handedness labels (左 -> 'L', 右 -> 'R') in the two
# handedness columns only. Restricting the replacement to these columns avoids
# two passes over every column of the frame and leaves unrelated columns
# untouched; any value other than 左/右 is kept as-is.
hand_labels = {'左': 'L', '右': 'R'}
hand_cols = ['投手投球左右', '打者打席左右']
train_pitch[hand_cols] = train_pitch[hand_cols].replace(hand_labels)
# Matchup key "<pitcher hand>_<batter hand>", e.g. 'R_L'.
train_pitch['pit_bat'] = train_pitch['投手投球左右'] + '_' + train_pitch['打者打席左右']

In [34]:
# Give the two columns aggregated below short ASCII names:
# 球種 (pitch type) -> 'ball', 投球位置区域 (pitch location zone) -> 'course'.
column_aliases = {'球種': 'ball', '投球位置区域': 'course'}
train_pitch = train_pitch.rename(columns=column_aliases)

## 投手
### 球種
0:ストレート 1:カーブ 2:スライダー 3:シュート 4:フォーク 5:チェンジアップ 6:シンカー 7:カットボール

In [35]:
# One row per pitch: pitcher ID, handedness matchup, and one indicator column
# per pitch-type code found in 'ball' (named 'ball_<code>').
pitch_ball = pd.get_dummies(
    train_pitch.loc[:, ['投手ID', 'pit_bat', 'ball']],
    columns=['ball'],
)

In [36]:
# Pitch-type code column -> readable pitch-type name.
ball_code_names = {
    'ball_0': 'straight',
    'ball_1': 'curve',
    'ball_2': 'slider',
    'ball_3': 'shoot',
    'ball_4': 'fork',
    'ball_5': 'changeup',
    'ball_6': 'sinker',
    'ball_7': 'cutball',
}
# Sum the indicators per (pitcher, handedness matchup) to get pitch counts,
# rename the coded columns, then turn the group keys back into columns.
groupby_pit = pitch_ball.groupby(['投手ID', 'pit_bat']).sum()
groupby_pit = groupby_pit.rename(columns=ball_code_names)
pitch_ball = groupby_pit.reset_index()

In [37]:
# Total pitches per row = sum of the eight pitch-type count columns.
ball_count_cols = ['straight', 'curve', 'slider', 'shoot',
                   'fork', 'changeup', 'sinker', 'cutball']
# skipna=False mirrors chained `+`, which propagates missing values.
pitch_ball['total'] = pitch_ball[ball_count_cols].sum(axis=1, skipna=False)

In [38]:
# Preview the first rows of the per-(pitcher, matchup) pitch-type counts.
pitch_ball.head()

Unnamed: 0,投手ID,pit_bat,straight,curve,slider,shoot,fork,changeup,sinker,cutball,total
0,11606,R_L,134.0,54.0,0.0,0.0,61.0,0.0,0.0,13.0,262.0
1,11606,R_R,219.0,121.0,0.0,0.0,83.0,0.0,0.0,23.0,446.0
2,11766,L_L,111.0,0.0,103.0,67.0,0.0,0.0,2.0,0.0,283.0
3,11766,L_R,103.0,0.0,104.0,42.0,0.0,11.0,30.0,0.0,290.0
4,11807,R_L,249.0,38.0,1.0,9.0,115.0,0.0,0.0,2.0,414.0


### コース

In [39]:
# One row per pitch: pitcher ID, handedness matchup, and one indicator column
# per value found in 'course' (named 'course_<value>').
pitch_course = pd.get_dummies(
    train_pitch.loc[:, ['投手ID', 'pit_bat', 'course']],
    columns=['course'],
)

In [40]:
# Sum the course indicators per (pitcher, handedness matchup) to get the number
# of pitches per course value, then restore the group keys as columns.
course_group_keys = ['投手ID', 'pit_bat']
groupby_course = pitch_course.groupby(course_group_keys).sum()
pitch_course = groupby_course.reset_index()

In [41]:
# Preview the first rows of the per-(pitcher, matchup) course counts.
pitch_course.head()

Unnamed: 0,投手ID,pit_bat,course_0,course_1,course_2,course_3,course_4,course_5,course_6,course_7,course_8,course_9,course_10,course_11,course_12
0,11606,R_L,21.0,17.0,10.0,7.0,20.0,14.0,4.0,7.0,10.0,51.0,8.0,45.0,48.0
1,11606,R_R,25.0,20.0,8.0,17.0,26.0,20.0,19.0,40.0,27.0,53.0,51.0,38.0,102.0
2,11766,L_L,14.0,31.0,24.0,20.0,24.0,11.0,16.0,14.0,10.0,20.0,24.0,62.0,13.0
3,11766,L_R,11.0,25.0,31.0,12.0,17.0,9.0,9.0,11.0,10.0,33.0,58.0,39.0,25.0
4,11807,R_L,22.0,13.0,20.0,18.0,22.0,25.0,18.0,25.0,22.0,58.0,53.0,54.0,64.0


### 登板試合数

In [42]:
# Games per pitcher: reduce to the distinct (pitcher, game) pairs, then count
# how many pairs each pitcher has.
game_keys = ['投手ID', '試合ID']
pit_game = (
    train_pitch[game_keys]
    .groupby(game_keys).count()          # one column-less row per distinct pair
    .groupby(level='投手ID').size()
    .to_frame('pit_game_cnt')
    .reset_index()
)

### イニング数

In [43]:
# Innings per pitcher: reduce to the distinct (pitcher, game, inning) triples,
# then count how many triples each pitcher has.
inning_keys = ['投手ID', '試合ID', 'イニング']
pit_inning = (
    train_pitch[inning_keys]
    .groupby(inning_keys).count()        # one column-less row per distinct triple
    .groupby(level='投手ID').size()
    .to_frame('pit_inning_cnt')
    .reset_index()
)

### 対戦打者数

In [44]:
# Batters faced per (pitcher, handedness matchup): reduce to the distinct
# (pitcher, matchup, game, inning, plate-appearance-in-inning) combinations,
# then count them per (pitcher, matchup).
plate_keys = ['投手ID', 'pit_bat', '試合ID', 'イニング', 'イニング内打席数']
pit_batcnt = (
    train_pitch[plate_keys]
    .groupby(plate_keys).count()         # one column-less row per distinct combination
    .groupby(level=['投手ID', 'pit_bat']).size()
    .to_frame('pit_batter_cnt')
    .reset_index()
)

### 投手実績まとめ

In [45]:
# Assemble the pitcher table by left-joining every aggregate onto the
# pitch-type counts. Course counts and batters faced are keyed by
# (pitcher, matchup); game and inning counts are per pitcher, so they repeat
# on each of a pitcher's matchup rows.
matchup_keys = ['投手ID', 'pit_bat']
pitch_ball = (
    pitch_ball
    .merge(pitch_course, how='left', on=matchup_keys)
    .merge(pit_game, how='left', on='投手ID')
    .merge(pit_inning, how='left', on='投手ID')
    .merge(pit_batcnt, how='left', on=matchup_keys)
)

In [46]:
# Report the merged pitcher table's shape and preview its first rows.
print(pitch_ball.shape)
pitch_ball.head()

(651, 27)


Unnamed: 0,投手ID,pit_bat,straight,curve,slider,shoot,fork,changeup,sinker,cutball,total,course_0,course_1,course_2,course_3,course_4,course_5,course_6,course_7,course_8,course_9,course_10,course_11,course_12,pit_game_cnt,pit_inning_cnt,pit_batter_cnt
0,11606,R_L,134.0,54.0,0.0,0.0,61.0,0.0,0.0,13.0,262.0,21.0,17.0,10.0,7.0,20.0,14.0,4.0,7.0,10.0,51.0,8.0,45.0,48.0,46,54,63
1,11606,R_R,219.0,121.0,0.0,0.0,83.0,0.0,0.0,23.0,446.0,25.0,20.0,8.0,17.0,26.0,20.0,19.0,40.0,27.0,53.0,51.0,38.0,102.0,46,54,108
2,11766,L_L,111.0,0.0,103.0,67.0,0.0,0.0,2.0,0.0,283.0,14.0,31.0,24.0,20.0,24.0,11.0,16.0,14.0,10.0,20.0,24.0,62.0,13.0,50,50,73
3,11766,L_R,103.0,0.0,104.0,42.0,0.0,11.0,30.0,0.0,290.0,11.0,25.0,31.0,12.0,17.0,9.0,9.0,11.0,10.0,33.0,58.0,39.0,25.0,50,50,81
4,11807,R_L,249.0,38.0,1.0,9.0,115.0,0.0,0.0,2.0,414.0,22.0,13.0,20.0,18.0,22.0,25.0,18.0,25.0,22.0,58.0,53.0,54.0,64.0,52,61,95


In [47]:
# Summary statistics of the numeric columns of the merged pitcher table.
pitch_ball.describe()

Unnamed: 0,投手ID,straight,curve,slider,shoot,fork,changeup,sinker,cutball,total,course_0,course_1,course_2,course_3,course_4,course_5,course_6,course_7,course_8,course_9,course_10,course_11,course_12,pit_game_cnt,pit_inning_cnt,pit_batter_cnt
count,651.0,651.0,651.0,651.0,651.0,651.0,651.0,651.0,651.0,651.0,651.0,651.0,651.0,651.0,651.0,651.0,651.0,651.0,651.0,651.0,651.0,651.0,651.0,651.0,651.0,651.0
mean,1162924.0,184.940092,29.513057,73.385561,27.897081,32.786482,21.41321,4.486943,20.534562,394.956989,17.990783,26.162826,21.90169,17.895545,23.324117,18.58679,17.09831,25.549923,22.130568,42.941628,34.400922,53.043011,73.930876,20.797235,50.321045,100.001536
std,416039.8,190.605887,58.372104,94.297583,68.452185,58.598626,55.396277,30.041577,57.853821,391.279901,18.989712,28.977252,25.95842,18.602353,23.471574,19.564798,19.93303,29.703797,27.0172,45.536254,39.546552,62.828347,86.056009,18.815592,47.493933,99.171499
min,11606.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
25%,900429.0,40.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,89.0,4.0,5.0,4.0,4.0,5.0,4.0,3.0,4.0,4.0,10.0,7.0,10.5,13.0,5.5,11.5,22.0
50%,1300036.0,126.0,5.0,33.0,0.0,4.0,0.0,0.0,0.0,276.0,12.0,16.0,12.0,12.0,17.0,12.0,10.0,15.0,12.0,27.0,22.0,29.0,41.0,15.0,37.0,71.0
75%,1500109.0,254.0,32.0,113.0,17.0,37.5,11.0,0.0,4.0,520.0,26.0,36.0,30.0,26.0,31.0,26.0,23.0,36.0,30.5,61.0,46.0,69.0,105.0,27.0,68.0,130.5
max,1700016.0,1083.0,458.0,710.0,637.0,331.0,438.0,451.0,475.0,1886.0,109.0,153.0,156.0,102.0,115.0,118.0,101.0,153.0,151.0,298.0,246.0,380.0,429.0,72.0,190.0,471.0


In [48]:
# Write the merged pitcher table to OUT_PIT in Feather format.
pitch_ball.to_feather(OUT_PIT)

## 野手
### 打席数

In [49]:
# Plate appearances per batter: reduce to the distinct
# (batter, game, inning, plate-appearance-in-inning) combinations, then count
# how many each batter has.
plate_appearance_keys = ['打者ID', '試合ID', 'イニング', 'イニング内打席数']
bat_ball = (
    train_pitch[plate_appearance_keys]
    .groupby(plate_appearance_keys).count()   # one column-less row per distinct combination
    .groupby(level='打者ID').size()
    .to_frame('batter_cnt')
    .reset_index()
)

### 試合数

In [50]:
# Games per batter: reduce to the distinct (batter, game) pairs, then count
# how many pairs each batter has.
batter_game_keys = ['打者ID', '試合ID']
bat_game = (
    train_pitch[batter_game_keys]
    .groupby(batter_game_keys).count()        # one column-less row per distinct pair
    .groupby(level='打者ID').size()
    .to_frame('bat_game_cnt')
    .reset_index()
)

### 打者成績まとめ

In [51]:
# Attach each batter's game count to the plate-appearance counts (left join
# on the batter ID).
bat_ball = pd.merge(bat_ball, bat_game, how='left', on='打者ID')

In [52]:
# Report the merged batter table's shape and preview its first rows.
print(bat_ball.shape)
bat_ball.head()

(466, 3)


Unnamed: 0,打者ID,batter_cnt,bat_game_cnt
0,11270,83,35
1,11302,156,69
2,11343,41,28
3,11436,268,81
4,11511,48,21


In [53]:
# Summary statistics of the numeric columns of the merged batter table.
bat_ball.describe()

Unnamed: 0,打者ID,batter_cnt,bat_game_cnt
count,466.0,466.0,466.0
mean,1044992.0,139.688841,41.448498
std,468711.7,184.754006,45.398623
min,11270.0,1.0,1.0
25%,800012.5,7.25,4.0
50%,1100114.0,44.0,21.0
75%,1400182.0,222.5,71.75
max,1700019.0,680.0,143.0


In [54]:
# Write the merged batter table to OUT_BAT in Feather format.
bat_ball.to_feather(OUT_BAT)