## pitch前処理
#### 14
- ball_2017_5.f ... ストレートに対する比率、コースの種類のみ

#### 13
- 1つ前の投球・ファウル数
- ball_2017_4.f ... コースの種類

#### 12
- イニング最初からの時間、打席最初からの時間、平均投球間隔(差)、サヨナラ、延長戦

#### 11
- ball_2017_3.f ... 特徴量名変更

#### 10
- ball_2017_2.f ... 2017年の球種とコースの集計を左右別に

#### 9
- pit_batをダミー変数から外す

#### 8
- 前の投球からの時間差

In [4]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import feather
# Show up to 100 columns when a DataFrame is displayed. The option key is
# spelled in its canonical lowercase form instead of relying on pandas'
# case-insensitive pattern matching of option names.
pd.set_option('display.max_columns', 100)

In [5]:
# Load the train and test pitch tables from Feather files (paths are relative
# to the notebook's working directory).
train_pitch = pd.read_feather('data/train_pitch.f')
test_pitch = pd.read_feather('data/test_pitch.f')

In [6]:
# Feather file that is later read into `train_ball` and left-merged onto the
# pitch table by ['ball_cnt', 'pit_bat'].
# NOTE(review): the changelog at the top of the notebook calls this file
# "ball_2017_5.f" while the path says "pitch_2017_5.f" - confirm the file name.
INPUT_BALL2017 = 'intermediate/pitch/pitch_2017_5.f'
# Destination Feather file for the fully preprocessed train+test pitch table.
OUTPUT = 'intermediate/pitch/all_pitch_14.f'

In [7]:
# Add the two target columns to the test table as placeholders so train and
# test share the same set of columns before they are stacked.
# Float NaN is used instead of None: None creates object-dtype columns, and the
# dtype of the stacked result would then depend on how the installed pandas
# version treats all-NA columns during concatenation. With NaN the stacked
# columns are always numeric with NaN for the test rows.
test_pitch['球種'] = np.nan
test_pitch['投球位置区域'] = np.nan
print(train_pitch.shape)
print(test_pitch.shape)

(257117, 51)
(521650, 51)


In [8]:
# Stack train on top of test with a fresh 0..n-1 index.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0; the resulting frame is the same.
all_pitch = pd.concat([train_pitch, test_pitch], ignore_index=True)
print(all_pitch.shape)

(778767, 51)


### 球種
0:ストレート 1:カーブ 2:スライダー 3:シュート 4:フォーク 5:チェンジアップ 6:シンカー 7:カットボール

In [9]:
# Give the two target columns short ASCII names:
# pitch type -> 'ball', pitch location zone -> 'course'.
target_names = {'球種': 'ball', '投球位置区域': 'course'}
all_pitch = all_pitch.rename(columns=target_names)

### ボールカウント

In [10]:
# Count before the pitch as a "<strikes>-<balls>" string. Strikes come first,
# so '0-3' means 0 strikes and 3 balls.
strike_txt = all_pitch['プレイ前ストライク数'].astype(str)
ball_txt = all_pitch['プレイ前ボール数'].astype(str)
all_pitch['ball_cnt'] = strike_txt.str.cat(ball_txt, sep='-')

### 左右

In [11]:
# Recode the handedness labels to ASCII. The replacement is applied to the
# whole frame, i.e. to every cell whose value is exactly '左' or '右'.
all_pitch.replace({'左': 'L', '右': 'R'}, inplace=True)

pitcher_hand = all_pitch['投手投球左右']
batter_side = all_pitch['打者打席左右']

# Pitcher/batter matchup key such as 'R_L' (pitcher hand first).
all_pitch['pit_bat'] = pitcher_hand + '_' + batter_side

# Numeric flags: left = 1.0, right = 0.0; any other value stays NaN.
lr_code = {'L': 1.0, 'R': 0.0}
all_pitch['pitch_LR'] = pitcher_hand.map(lr_code)
all_pitch['bat_LR'] = batter_side.map(lr_code)

### 2017年のデータをマージ

In [12]:
# Read the 2017 table and attach its columns to every pitch that has the same
# count string and pitcher/batter matchup. The left join keeps all pitch rows.
# NOTE(review): assumes the table has one row per (ball_cnt, pit_bat);
# duplicate keys would multiply pitch rows - confirm against the producer.
train_ball = pd.read_feather(INPUT_BALL2017)
all_pitch = pd.merge(all_pitch, train_ball, how='left', on=['ball_cnt', 'pit_bat'])

### 一塁走者ID, 二塁走者ID, 三塁走者ID

In [13]:
# 1 when the corresponding runner-ID column is filled in (a runner occupies
# that base), otherwise 0.
for base_flag, runner_id_col in (
    ('first', '一塁走者ID'),
    ('second', '二塁走者ID'),
    ('third', '三塁走者ID'),
):
    all_pitch[base_flag] = all_pitch[runner_id_col].notna().astype('int64')

# Number of occupied bases (0-3).
all_pitch['base_cnt'] = all_pitch[['first', 'second', 'third']].sum(axis=1)

### 表裏
表=0, 裏=1

In [14]:
# Half-inning flag: 1 for the bottom half ('裏'), 0 otherwise.
is_bottom_half = all_pitch['表裏'] == '裏'
all_pitch['top_bot'] = is_bottom_half.astype('int64')

### 投手役割
先発=1, 救援=0

In [15]:
# Pitcher role flag: 1 for a starter ('先発'), 0 for anything else.
is_starter = all_pitch['投手役割'] == '先発'
all_pitch['role'] = is_starter.astype('int64')

### 打者守備位置
投手=1, 投手以外=0

In [16]:
# 1 when the batter's fielding position is pitcher ('投手'), else 0.
batter_is_pitcher = all_pitch['打者守備位置'] == '投手'
all_pitch['pos_pit'] = batter_is_pitcher.astype('int64')

### 開幕からの日数

In [17]:
# Earliest game date of each season (年度), used as that season's opening day.
date_min = (
    all_pitch
    .groupby('年度')
    .agg({'日付': 'min'})
    .rename(columns={'日付': 'opening_date'})
    .reset_index()
)
date_min['opening_date'] = pd.to_datetime(date_min['opening_date'])
date_min

Unnamed: 0,年度,opening_date
0,2017,2017-03-31
1,2018,2018-03-30
2,2019,2019-03-29


In [18]:
# Attach each season's opening date, then count whole days between the game
# date and that opening date.
all_pitch = all_pitch.merge(date_min, how='left', on='年度')
all_pitch['game_date'] = pd.to_datetime(all_pitch['日付'])
since_opening = all_pitch['game_date'] - all_pitch['opening_date']
all_pitch['date_from_opening'] = since_opening.dt.days
all_pitch.shape

(778767, 82)

### 試合開始からの経過時間

In [19]:
# Smallest recorded clock time per game, used as the game's start time.
# The values carry no date, so pd.to_datetime attaches the date on which the
# cell is executed (visible in the displayed output).
time_min = (
    all_pitch
    .groupby('試合ID')
    .agg({'時刻': 'min'})
    .rename(columns={'時刻': 'start_time'})
    .reset_index()
)
time_min['start_time'] = pd.to_datetime(time_min['start_time'])
time_min.head()

Unnamed: 0,試合ID,start_time
0,2017033101,2020-05-12 18:03:00
1,2017033102,2020-05-12 18:02:00
2,2017033103,2020-05-12 18:30:00
3,2017033104,2020-05-12 18:30:00
4,2017033105,2020-05-12 18:30:00


In [20]:
# Attach the game start time and measure how long after it each pitch occurred.
all_pitch = all_pitch.merge(time_min, how='left', on='試合ID')
all_pitch['game_time'] = pd.to_datetime(all_pitch['時刻'])
since_start = all_pitch['game_time'] - all_pitch['start_time']
all_pitch['elapsed_time'] = since_start
# .dt.seconds is the within-day seconds component of the timedelta.
all_pitch['elapsed_min'] = since_start.dt.seconds / 60
all_pitch.shape

(778767, 86)

### 前の投球からの時間差
各投手のイニング内最初の投球(投手イニング内投球数が1)はNaNとする

In [21]:
# Minutes since the previous row of the same game (relies on the rows of a
# game being in pitch order).
min_diff = all_pitch.groupby(['試合ID'])['elapsed_min'].diff()
all_pitch['min_diff'] = min_diff
# Blank the gap for a pitcher's first pitch of an inning.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
all_pitch.loc[all_pitch['投手イニング内投球数']==1, 'min_diff'] = np.nan

### イニングの通し番号

In [22]:
# Running half-inning index: top of the 1st = 0, bottom of the 1st = 1,
# top of the 2nd = 2, and so on.
full_innings_done = all_pitch['イニング'] - 1
all_pitch['total_inning'] = full_innings_done * 2 + all_pitch['top_bot']

### イニング最初からの時間

In [23]:
# Per half-inning: the smallest elapsed time and the smallest in-game pitch
# number, i.e. where the half-inning starts.
min_inning = (
    all_pitch
    .groupby(['試合ID', 'total_inning'])
    .agg({'elapsed_min': 'min', '試合内投球数': 'min'})
    .reset_index()
    .rename(columns={'elapsed_min': 'start_inning', '試合内投球数': 'start_ball_inning'})
)
all_pitch = all_pitch.merge(min_inning, how='left', on=['試合ID', 'total_inning'])

# Minutes since the half-inning started, and 1-based pitch number within it.
all_pitch['elapsed_from_inning'] = all_pitch['elapsed_min'] - all_pitch['start_inning']
all_pitch['ballnum_from_inning'] = all_pitch['試合内投球数'] - all_pitch['start_ball_inning'] + 1

# The helper columns are only needed for the two subtractions above.
all_pitch = all_pitch.drop(columns=['start_inning', 'start_ball_inning'])

### 打席最初からの時間

In [24]:
# Smallest elapsed time of each plate appearance, identified by game,
# half-inning and at-bat number within the inning.
batter_key = ['試合ID', 'total_inning', 'イニング内打席数']
min_batter = (
    all_pitch
    .groupby(batter_key)
    .agg({'elapsed_min': 'min'})
    .rename(columns={'elapsed_min': 'start_batter'})
    .reset_index()
)
all_pitch = all_pitch.merge(min_batter, how='left', on=batter_key)

# Minutes since the plate appearance started; the helper column is dropped.
all_pitch['elapsed_batter'] = all_pitch['elapsed_min'] - all_pitch['start_batter']
all_pitch = all_pitch.drop(columns=['start_batter'])

### 平均投球間隔

In [25]:
# Average minutes per pitch interval so far, at game / half-inning / at-bat
# scope: elapsed minutes divided by the number of earlier pitches in that scope.
# For the first pitch of a scope the divisor is 0 (0/0 gives NaN, as seen in
# the displayed rows).
earlier_in_game = all_pitch['試合内投球数'] - 1
earlier_in_inning = all_pitch['ballnum_from_inning'] - 1
earlier_in_at_bat = all_pitch['打席内投球数'] - 1

all_pitch['ave_elapsed_game'] = all_pitch['elapsed_min'].div(earlier_in_game)
all_pitch['ave_elapsed_inning'] = all_pitch['elapsed_from_inning'].div(earlier_in_inning)
all_pitch['ave_elapsed_batter'] = all_pitch['elapsed_batter'].div(earlier_in_at_bat)

### 平均投球間隔の差

In [26]:
# How much longer (+) or shorter (-) the gap before this pitch was than the
# running average gap at each scope.
last_gap = all_pitch['min_diff']
all_pitch['diff_elapsed_batter'] = last_gap.sub(all_pitch['ave_elapsed_batter'])
all_pitch['diff_elapsed_inning'] = last_gap.sub(all_pitch['ave_elapsed_inning'])
all_pitch['diff_elapsed_game'] = last_gap.sub(all_pitch['ave_elapsed_game'])

### サヨナラの危機

In [27]:
# Walk-off risk: bottom half ('裏') of the 9th inning or later.
ninth_or_later = all_pitch['イニング'] >= 9
bottom_half = all_pitch['表裏'] == '裏'
all_pitch['sayonara'] = (ninth_or_later & bottom_half).astype('int64')

### 延長戦

In [28]:
# Extra-innings flag: 1 once the inning number is past 9.
# (The column name keeps its existing spelling.)
all_pitch['extention'] = all_pitch['イニング'].gt(9).astype('int64')

### ナイター

In [29]:
# Night flag: 1 when the pitch itself was thrown at 18:00 or later.
# NOTE(review): this is evaluated per pitch, not per game, so pitches before
# 18:00 in a game that runs past 18:00 get 0 - confirm this is intended.
pitch_hour = all_pitch['game_time'].dt.hour
all_pitch['nighter'] = pitch_hour.ge(18).astype('int64')

### 交流戦

In [30]:
# Interleague flag: 1 when the game type is 'セ・パ交流戦'.
is_interleague = all_pitch['試合種別詳細'] == 'セ・パ交流戦'
all_pitch['ce-pa'] = is_interleague.astype('int64')

### リーグ
- セ・リーグ=1

In [31]:
# League flag: 1 for a Central League regular-season game
# ('セ・リーグ公式戦'), 0 for every other game type.
is_central = all_pitch['試合種別詳細'] == 'セ・リーグ公式戦'
all_pitch['league'] = is_central.astype('int64')

### ホーム・アウェー

In [32]:
# 1 when the pitcher's team is the home team, else 0.
pitcher_is_home = all_pitch['投手チームID'] == all_pitch['ホームチームID']
all_pitch['home'] = pitcher_is_home.astype('int64')

### 得点差

In [33]:
# Score margin before the pitch from the pitching team's point of view:
# home-minus-away when the pitcher is on the home team, sign flipped otherwise.
home_lead = all_pitch['プレイ前ホームチーム得点数'] - all_pitch['プレイ前アウェイチーム得点数']
all_pitch['point_diff'] = home_lead.where(all_pitch['home'] != 0, -home_lead)

### 得点圏にランナーがいる

In [34]:
# Runner in scoring position: second or third base is occupied.
on_second = all_pitch['second'] == 1
on_third = all_pitch['third'] == 1
all_pitch['runner_23'] = (on_second | on_third).astype('int64')

### 送りバントの場面
- ノーアウト一塁、ストライク0or1

In [35]:
# Sacrifice-bunt situation: runner on first, third base empty, nobody out and
# fewer than two strikes.
bunt_situation = (
    (all_pitch['first'] == 1)
    & (all_pitch['third'] == 0)
    & (all_pitch['プレイ前アウト数'] == 0)
    & (all_pitch['プレイ前ストライク数'] < 2)
)
all_pitch['bant'] = bunt_situation.astype('int64')

### スクイズの場面
- 0,1アウト三塁、ストライク0or1

In [36]:
# Squeeze-play situation: runner on third, fewer than two outs and fewer than
# two strikes.
squeeze_situation = (
    (all_pitch['third'] == 1)
    & (all_pitch['プレイ前アウト数'] < 2)
    & (all_pitch['プレイ前ストライク数'] < 2)
)
all_pitch['squize'] = squeeze_situation.astype('int64')

### 上位打線

In [37]:
# Top-of-the-order flag: batting-order slot 1 through 5 (both ends included).
in_top_five = all_pitch['打者打順'].between(1, 5)
all_pitch['cleanup'] = in_top_five.astype('int64')

### 失点ピンチ
- 得点圏にランナーがいる & 上位打線

In [38]:
# Jam for the pitcher: runner in scoring position and a top-five batter up.
scoring_threat = all_pitch['runner_23'] == 1
strong_batter = all_pitch['cleanup'] == 1
all_pitch['pinch'] = (scoring_threat & strong_batter).astype('int64')

### 押出しの危機
- フルベース(満塁)でボール2 or 3

In [39]:
# Bases-loaded walk risk: all three bases occupied and more than one ball
# already on the count.
bases_loaded = all_pitch['base_cnt'] == 3
behind_in_count = all_pitch['プレイ前ボール数'] > 1
all_pitch['fourball'] = (bases_loaded & behind_in_count).astype('int64')

### セーブがつく場面
- 9回以降、3点差以内で勝っている

In [40]:
# Save situation: 9th inning or later with the pitching team AHEAD by 1 to 3
# runs. `point_diff` is the margin from the pitching team's point of view, so
# "ahead" means point_diff > 0.
# Without the lower bound, tied and trailing situations (point_diff <= 0) were
# flagged as well, which contradicts the stated definition of the feature.
pitcher_lead = all_pitch['point_diff']
all_pitch['savepoint'] = 0
all_pitch.loc[(all_pitch['イニング']>=9)&(pitcher_lead>0)&(pitcher_lead<4), 'savepoint']=1

### 1つ前の投球・ファウル数

In [41]:
# Balls + strikes on the count before this pitch.
all_pitch['ball_count_sum'] =  all_pitch['プレイ前ボール数'] + all_pitch['プレイ前ストライク数']

# One group per plate appearance. The half-inning index `total_inning` is used
# (not the bare inning number) so the grouping matches the one used for
# `elapsed_batter`. If at-bat numbers restart in each half-inning, grouping by
# inning alone would merge the top-half and bottom-half at-bats that share a
# number, and diff() would compare the first pitch of one at-bat with the last
# pitch of an unrelated one.
groupby_batter = all_pitch.groupby(['試合ID', 'total_inning', 'イニング内打席数'])

# The count total rises by 1 after a ball or a counted strike. If it did not
# change, the previous pitch is treated as a foul (1). On the first pitch of an
# at-bat diff() is NaN and is filled with 1, giving 0.
all_pitch['pre_ball_foul'] = 1 - groupby_batter['ball_count_sum'].diff().fillna(1)
# 1 when the previous pitch added a ball to the count.
all_pitch['pre_ball_ball'] = groupby_batter['プレイ前ボール数'].diff().fillna(0)
# 1 when the previous pitch added a strike to the count or was a foul.
all_pitch['pre_ball_strike'] = groupby_batter['プレイ前ストライク数'].diff().fillna(0) + all_pitch['pre_ball_foul']
# Pitch number within the at-bat minus the count total; this is 1 (not 0) when
# every earlier pitch of the at-bat changed the count.
all_pitch['pre_foul_sum'] = all_pitch['打席内投球数'] - all_pitch['ball_count_sum']

### ダミー変数

In [42]:
# One-hot encode the count string into ball_cnt_<strikes>-<balls> columns.
# The dtype is pinned to uint8 (the historical default) so the columns stay
# 0/1 integers; pandas >= 2.0 would otherwise produce boolean columns.
all_pitch = pd.get_dummies(all_pitch, columns=['ball_cnt'], dtype='uint8')

### 不要な列を削除

In [43]:
# Raw columns that are dropped from the output table, grouped by kind.
raw_datetime_cols = ['日付', '時刻']
stadium_cols = ['球場ID', '球場名']
raw_category_cols = [
    '試合種別詳細', '表裏',
    '投手投球左右', '投手役割',
    '打者打席左右', '打者守備位置',
]
runner_id_cols = ['一塁走者ID', '二塁走者ID', '三塁走者ID']
fielder_id_cols = ['一塁手ID', '二塁手ID', '三塁手ID', '遊撃手ID', '左翼手ID', '中堅手ID', '右翼手ID']
stat_owner_cols = ['成績対象投手ID', '成績対象打者ID']

unused_cols = (
    raw_datetime_cols
    + stadium_cols
    + raw_category_cols
    + runner_id_cols
    + fielder_id_cols
    + stat_owner_cols
)
all_pitch = all_pitch.drop(columns=unused_cols)

In [44]:
# Preview the first five rows of the processed table (train rows come first).
all_pitch.head()

Unnamed: 0,データ内連番,ball,course,年度,試合ID,試合内連番,試合内投球数,ホームチームID,アウェイチームID,イニング,イニング内打席数,打席内投球数,投手ID,投手チームID,投手登板順,投手試合内対戦打者数,投手試合内投球数,投手イニング内投球数,打者ID,打者チームID,打者打順,打者試合内打席数,プレイ前ホームチーム得点数,プレイ前アウェイチーム得点数,プレイ前アウト数,プレイ前ボール数,プレイ前ストライク数,プレイ前走者状況,捕手ID,pit_bat,pitch_LR,bat_LR,bc_curve,bc_slider,bc_shoot,bc_fork,bc_changeup,bc_sinker,bc_cutball,bc_high_str,bc_high_ball,bc_mid_str,bc_low_str,bc_low_ball,bc_left_str,bc_left_ball,bc_center_str,bc_right_str,bc_right_ball,first,...,pos_pit,opening_date,game_date,date_from_opening,start_time,game_time,elapsed_time,elapsed_min,min_diff,total_inning,elapsed_from_inning,ballnum_from_inning,elapsed_batter,ave_elapsed_game,ave_elapsed_inning,ave_elapsed_batter,diff_elapsed_batter,diff_elapsed_inning,diff_elapsed_game,sayonara,extention,nighter,ce-pa,league,home,point_diff,runner_23,bant,squize,cleanup,pinch,fourball,savepoint,ball_count_sum,pre_ball_foul,pre_ball_ball,pre_ball_strike,pre_foul_sum,ball_cnt_0-0,ball_cnt_0-1,ball_cnt_0-2,ball_cnt_0-3,ball_cnt_1-0,ball_cnt_1-1,ball_cnt_1-2,ball_cnt_1-3,ball_cnt_2-0,ball_cnt_2-1,ball_cnt_2-2,ball_cnt_2-3
0,0,0.0,0.0,2017,2017033101,1,1,1,4,1,1,1,1500001,1,1,1,1,1,900410,4,1,1,0,0,0,0,0,___,1300027,R_L,0.0,1.0,0.237468,0.325101,0.209547,0.145227,0.092978,0.027829,0.11362,0.135883,0.221489,0.204959,0.174254,0.263415,0.242045,0.294634,0.157681,0.11537,0.19027,0,...,0,2017-03-31,2017-03-31,0,2020-05-12 18:03:00,2020-05-12 18:03:00,00:00:00,0.0,,0,0.0,1,0.0,,,,,,,0,0,1,0,1,1,0,0,0,0,1,0,0,0,0,0.0,0.0,0.0,1,1,0,0,0,0,0,0,0,0,0,0,0
1,1,0.0,8.0,2017,2017033101,2,2,1,4,1,1,2,1500001,1,1,1,2,2,900410,4,1,1,0,0,0,0,1,___,1300027,R_L,0.0,1.0,0.210923,0.247198,0.177502,0.320359,0.147952,0.039943,0.116772,0.106283,0.230235,0.157397,0.138195,0.36789,0.184621,0.35572,0.111331,0.105923,0.242405,0,...,0,2017-03-31,2017-03-31,0,2020-05-12 18:03:00,2020-05-12 18:03:12,00:00:12,0.2,0.2,0,0.2,2,0.2,0.2,0.2,0.2,0.0,0.0,0.0,0,0,1,0,1,1,0,0,0,0,1,0,0,0,1,0.0,0.0,1.0,1,0,0,0,0,1,0,0,0,0,0,0,0
2,2,0.0,5.0,2017,2017033101,3,3,1,4,1,1,3,1500001,1,1,1,3,3,900410,4,1,1,0,0,0,0,2,___,1300027,R_L,0.0,1.0,0.138758,0.13004,0.092263,0.419906,0.117327,0.03814,0.046858,0.076007,0.284249,0.102015,0.095055,0.442674,0.118315,0.488095,0.07619,0.078571,0.238828,0,...,0,2017-03-31,2017-03-31,0,2020-05-12 18:03:00,2020-05-12 18:03:23,00:00:23,0.383333,0.183333,0,0.383333,3,0.383333,0.191667,0.191667,0.191667,-0.008333,-0.008333,-0.008333,0,0,1,0,1,1,0,0,0,0,1,0,0,0,2,0.0,0.0,1.0,1,0,0,0,0,0,0,0,0,1,0,0,0
3,3,0.0,12.0,2017,2017033101,4,4,1,4,1,2,1,1500001,1,1,2,4,4,11436,4,2,1,0,0,1,0,0,___,1300027,R_R,0.0,0.0,0.213866,0.474931,0.139964,0.088519,0.038543,0.007023,0.138168,0.151308,0.196937,0.206965,0.157643,0.287147,0.105057,0.153718,0.164756,0.246104,0.330367,0,...,0,2017-03-31,2017-03-31,0,2020-05-12 18:03:00,2020-05-12 18:04:01,00:01:01,1.016667,0.633333,0,1.016667,4,0.0,0.338889,0.338889,,,0.294444,0.294444,0,0,1,0,1,1,0,0,0,0,1,0,0,0,0,0.0,0.0,0.0,1,1,0,0,0,0,0,0,0,0,0,0,0
4,4,0.0,8.0,2017,2017033101,5,5,1,4,1,2,2,1500001,1,1,2,5,5,11436,4,2,1,0,0,1,0,1,___,1300027,R_R,0.0,0.0,0.215241,0.420099,0.145723,0.196762,0.051566,0.00968,0.118444,0.113468,0.206297,0.162901,0.127988,0.389347,0.084346,0.162167,0.120564,0.199445,0.433477,0,...,0,2017-03-31,2017-03-31,0,2020-05-12 18:03:00,2020-05-12 18:04:12,00:01:12,1.2,0.183333,0,1.2,5,0.183333,0.3,0.3,0.183333,0.0,-0.116667,-0.116667,0,0,1,0,1,1,0,0,0,0,1,0,0,0,1,0.0,0.0,1.0,1,0,0,0,0,1,0,0,0,0,0,0,0


In [45]:
# Preview the last five rows (test rows; 'ball' and 'course' are empty there).
all_pitch.tail()

Unnamed: 0,データ内連番,ball,course,年度,試合ID,試合内連番,試合内投球数,ホームチームID,アウェイチームID,イニング,イニング内打席数,打席内投球数,投手ID,投手チームID,投手登板順,投手試合内対戦打者数,投手試合内投球数,投手イニング内投球数,打者ID,打者チームID,打者打順,打者試合内打席数,プレイ前ホームチーム得点数,プレイ前アウェイチーム得点数,プレイ前アウト数,プレイ前ボール数,プレイ前ストライク数,プレイ前走者状況,捕手ID,pit_bat,pitch_LR,bat_LR,bc_curve,bc_slider,bc_shoot,bc_fork,bc_changeup,bc_sinker,bc_cutball,bc_high_str,bc_high_ball,bc_mid_str,bc_low_str,bc_low_ball,bc_left_str,bc_left_ball,bc_center_str,bc_right_str,bc_right_ball,first,...,pos_pit,opening_date,game_date,date_from_opening,start_time,game_time,elapsed_time,elapsed_min,min_diff,total_inning,elapsed_from_inning,ballnum_from_inning,elapsed_batter,ave_elapsed_game,ave_elapsed_inning,ave_elapsed_batter,diff_elapsed_batter,diff_elapsed_inning,diff_elapsed_game,sayonara,extention,nighter,ce-pa,league,home,point_diff,runner_23,bant,squize,cleanup,pinch,fourball,savepoint,ball_count_sum,pre_ball_foul,pre_ball_ball,pre_ball_strike,pre_foul_sum,ball_cnt_0-0,ball_cnt_0-1,ball_cnt_0-2,ball_cnt_0-3,ball_cnt_1-0,ball_cnt_1-1,ball_cnt_1-2,ball_cnt_1-3,ball_cnt_2-0,ball_cnt_2-1,ball_cnt_2-2,ball_cnt_2-3
778762,521645,,,2019,2019093001,289,285,5,4,9,3,8,11807,5,6,3,17,17,700034,4,4,2,3,0,1,1,2,_2_,1300073,R_R,0.0,0.0,0.208482,0.526805,0.11818,0.391708,0.081249,0.018108,0.12914,0.1047,0.174822,0.137257,0.119919,0.463302,0.082739,0.165575,0.107783,0.171354,0.472549,0,...,0,2019-03-29,2019-09-30,185,2020-05-12 18:00:00,2020-05-12 21:10:05,03:10:05,190.083333,0.716667,16,9.866667,17,4.4,0.669308,0.616667,0.628571,0.088095,0.1,0.047359,0,0,1,0,1,1,3,1,0,0,1,1,0,1,3,1.0,0.0,1.0,5,0,0,0,0,0,0,0,0,0,1,0,0
778763,521646,,,2019,2019093001,290,286,5,4,9,4,1,11807,5,6,4,18,18,1100069,4,5,4,3,0,2,0,0,_2_,1300073,R_L,0.0,1.0,0.237468,0.325101,0.209547,0.145227,0.092978,0.027829,0.11362,0.135883,0.221489,0.204959,0.174254,0.263415,0.242045,0.294634,0.157681,0.11537,0.19027,0,...,0,2019-03-29,2019-09-30,185,2020-05-12 18:00:00,2020-05-12 21:10:53,03:10:53,190.883333,0.8,16,10.666667,18,0.0,0.669766,0.627451,,,0.172549,0.130234,0,0,1,0,1,1,3,1,0,0,1,1,0,1,0,0.0,0.0,0.0,1,1,0,0,0,0,0,0,0,0,0,0,0
778764,521647,,,2019,2019093001,291,287,5,4,9,4,2,11807,5,6,4,19,19,1100069,4,5,4,3,0,2,0,1,_2_,1300073,R_L,0.0,1.0,0.210923,0.247198,0.177502,0.320359,0.147952,0.039943,0.116772,0.106283,0.230235,0.157397,0.138195,0.36789,0.184621,0.35572,0.111331,0.105923,0.242405,0,...,0,2019-03-29,2019-09-30,185,2020-05-12 18:00:00,2020-05-12 21:11:21,03:11:21,191.35,0.466667,16,11.133333,19,0.466667,0.669056,0.618519,0.466667,0.0,-0.151852,-0.202389,0,0,1,0,1,1,3,1,0,0,1,1,0,1,1,0.0,0.0,1.0,1,0,0,0,0,1,0,0,0,0,0,0,0
778765,521648,,,2019,2019093001,292,288,5,4,9,4,3,11807,5,6,4,20,20,1100069,4,5,4,3,0,2,0,2,_2_,1300073,R_L,0.0,1.0,0.138758,0.13004,0.092263,0.419906,0.117327,0.03814,0.046858,0.076007,0.284249,0.102015,0.095055,0.442674,0.118315,0.488095,0.07619,0.078571,0.238828,0,...,0,2019-03-29,2019-09-30,185,2020-05-12 18:00:00,2020-05-12 21:12:05,03:12:05,192.083333,0.733333,16,11.866667,20,1.2,0.66928,0.624561,0.6,0.133333,0.108772,0.064053,0,0,1,0,1,1,3,1,0,0,1,1,0,1,2,0.0,0.0,1.0,1,0,0,0,0,0,0,0,0,1,0,0,0
778766,521649,,,2019,2019093001,293,289,5,4,9,4,4,11807,5,6,4,21,21,1100069,4,5,4,3,0,2,1,2,_2_,1300073,R_L,0.0,1.0,0.166898,0.248729,0.11165,0.413315,0.132917,0.039528,0.09154,0.109468,0.213799,0.146902,0.13002,0.399811,0.156443,0.357555,0.106532,0.123414,0.256055,0,...,0,2019-03-29,2019-09-30,185,2020-05-12 18:00:00,2020-05-12 21:12:48,03:12:48,192.8,0.716667,16,12.583333,21,1.916667,0.669444,0.629167,0.638889,0.077778,0.0875,0.047222,0,0,1,0,1,1,3,1,0,0,1,1,0,1,3,0.0,1.0,0.0,1,0,0,0,0,0,0,0,0,0,1,0,0


In [46]:
# Write the combined, preprocessed train+test table to the OUTPUT Feather file.
all_pitch.to_feather(OUTPUT)