## pitch前処理(6)

In [9]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import feather
# Show up to 100 columns when a wide frame is displayed.
# Use the canonical option key instead of relying on pandas' case-insensitive
# pattern matching of option names.
pd.set_option('display.max_columns', 100)

In [10]:
# Load the train and test pitch tables from their feather files.
train_pitch, test_pitch = map(
    pd.read_feather,
    ('data/train_pitch.f', 'data/test_pitch.f'),
)

In [11]:
# Add the two columns that are set here on the test frame as empty placeholders,
# then print both shapes to check that the column counts line up.
for placeholder_col in ('球種', '投球位置区域'):
    test_pitch[placeholder_col] = None
print(train_pitch.shape, test_pitch.shape, sep='\n')

(257117, 51)
(521650, 51)


In [12]:
# Stack the train rows followed by the test rows into one frame with a fresh
# 0..n-1 index. pd.concat is used because DataFrame.append was deprecated in
# pandas 1.4 and removed in pandas 2.0.
all_pitch = pd.concat([train_pitch, test_pitch], ignore_index=True)
print(all_pitch.shape)

(778767, 51)


### 球種
0:ストレート 1:カーブ 2:スライダー 3:シュート 4:フォーク 5:チェンジアップ 6:シンカー 7:カットボール

In [13]:
# Give the pitch-type column (球種) the ASCII name 'ball'.
all_pitch = all_pitch.rename(columns={'球種': 'ball'})

### ボールカウント

In [14]:
# Count label "<strikes>-<balls>" (e.g. "1-2"), built from the pre-play counts.
# Note the order: strikes first, then balls.
strike_txt = all_pitch['プレイ前ストライク数'].astype(str)
ball_txt = all_pitch['プレイ前ボール数'].astype(str)
all_pitch['ball_cnt'] = strike_txt.str.cat(ball_txt, sep='-')

#### 2017年のデータ

In [15]:
# Attach the precomputed 2017 per-count table to every pitch via its count label.
# Left join: pitches whose ball_cnt is missing from the table get NaN in the new columns.
train_ball = pd.read_feather('intermediate/ball_2017.f')
all_pitch = pd.merge(all_pitch, train_ball, how='left', on='ball_cnt')

### 一塁走者ID, 二塁走者ID, 三塁走者ID

In [16]:
# 1 when the corresponding runner-ID column is populated, else 0.
# .notna() is used instead of ~np.isnan(...) so the check also works when an ID
# column is not a plain float column (np.isnan raises on object dtype).
for base_flag, runner_id_col in (
    ('first', '一塁走者ID'),
    ('second', '二塁走者ID'),
    ('third', '三塁走者ID'),
):
    all_pitch[base_flag] = all_pitch[runner_id_col].notna().astype('int64')

# Number of occupied bases (0-3).
all_pitch['base_cnt'] = all_pitch['first'] + all_pitch['second'] + all_pitch['third']

### 左右

In [17]:
# Recode handedness from 左/右 to L/R. The replacement is limited to the two
# handedness columns rather than applied to the whole frame, which avoids
# scanning every column twice and cannot touch unrelated columns by accident.
hand_codes = {'左': 'L', '右': 'R'}
for hand_col in ('投手投球左右', '打者打席左右'):
    all_pitch[hand_col] = all_pitch[hand_col].replace(hand_codes)

# Pitcher/batter matchup label, e.g. "R_L".
all_pitch['pit_bat'] = all_pitch['投手投球左右'] + '_' + all_pitch['打者打席左右']

# Numeric handedness flags: 1.0 = left, 0.0 = right. Any other value maps to NaN,
# so the columns stay float as they were with the .loc-based assignment.
lr_flag = {'L': 1.0, 'R': 0.0}
all_pitch['pitch_LR'] = all_pitch['投手投球左右'].map(lr_flag)
all_pitch['bat_LR'] = all_pitch['打者打席左右'].map(lr_flag)

### 表裏
表=1, 裏=0

In [18]:
# 1 when 表裏 equals '表', otherwise 0.
is_top_half = all_pitch['表裏'] == '表'
all_pitch['top_bot'] = is_top_half.astype('int64')

### 投手役割
先発=1, 救援=0

In [19]:
# 1 when 投手役割 equals '先発', otherwise 0.
is_starter = all_pitch['投手役割'] == '先発'
all_pitch['role'] = is_starter.astype('int64')

### 打者守備位置
投手=1, 投手以外=0

In [20]:
# 1 when the batter's fielding position (打者守備位置) equals '投手', otherwise 0.
batter_is_pitcher = all_pitch['打者守備位置'] == '投手'
all_pitch['pos_pit'] = batter_is_pitcher.astype('int64')

### 開幕からの日数

In [21]:
# Earliest 日付 per 年度, used as that season's opening date.
# The minimum is taken on the raw 日付 values before parsing them to datetimes.
# The named reduction .min() replaces agg({'日付': min}): passing the builtin
# `min` to agg emits a FutureWarning on recent pandas versions.
date_min = (
    all_pitch.groupby('年度')['日付']
    .min()
    .rename('opening_date')
    .reset_index()
)
date_min['opening_date'] = pd.to_datetime(date_min['opening_date'])
date_min

Unnamed: 0,年度,opening_date
0,2017,2017-03-31
1,2018,2018-03-30
2,2019,2019-03-29


In [22]:
# Attach each season's opening date, then count whole days from it to the game date.
# NOTE: re-running this cell without re-running the earlier ones merges
# opening_date a second time and produces suffixed duplicate columns.
all_pitch = all_pitch.merge(date_min, how='left', on='年度')
all_pitch['game_date'] = pd.to_datetime(all_pitch['日付'])
all_pitch['date_from_opening'] = (
    all_pitch['game_date'].sub(all_pitch['opening_date']).dt.days
)
all_pitch.shape

(778767, 73)

### 試合開始からの経過時間

In [23]:
# Earliest 時刻 per 試合ID, used as the game's start time.
# The minimum is taken on the raw 時刻 values before parsing.
# NOTE(review): 時刻 appears to hold a time of day only; pd.to_datetime then
# attaches the date on which the notebook runs, so only the time-of-day part of
# start_time is meaningful.
# The named reduction .min() replaces agg({'時刻': min}): passing the builtin
# `min` to agg emits a FutureWarning on recent pandas versions.
time_min = (
    all_pitch.groupby('試合ID')['時刻']
    .min()
    .rename('start_time')
    .reset_index()
)
time_min['start_time'] = pd.to_datetime(time_min['start_time'])
time_min.head()

Unnamed: 0,試合ID,start_time
0,2017033101,2020-05-06 18:03:00
1,2017033102,2020-05-06 18:02:00
2,2017033103,2020-05-06 18:30:00
3,2017033104,2020-05-06 18:30:00
4,2017033105,2020-05-06 18:30:00


In [24]:
# Attach each game's start time and measure how far into the game each pitch is.
all_pitch = all_pitch.merge(time_min, how='left', on='試合ID')
all_pitch['game_time'] = pd.to_datetime(all_pitch['時刻'])
all_pitch['elapsed_time'] = all_pitch['game_time'].sub(all_pitch['start_time'])
# .dt.seconds is the within-day seconds component of the timedelta (any whole-day
# part is ignored); dividing by 60 gives minutes as a float.
all_pitch['elapsed_min'] = all_pitch['elapsed_time'].dt.seconds.div(60)
all_pitch.shape

(778767, 77)

### ナイター

In [25]:
# 1 for pitches whose clock hour is 18 or later, otherwise 0.
# NOTE(review): this uses each pitch's own timestamp (game_time), not the game's
# start_time, so the flag can flip partway through a game - confirm that is intended.
pitched_at_night = all_pitch['game_time'].dt.hour >= 18
all_pitch['nighter'] = pitched_at_night.astype('int64')

### 交流戦

In [26]:
# 1 when 試合種別詳細 equals 'セ・パ交流戦', otherwise 0.
is_interleague = all_pitch['試合種別詳細'] == 'セ・パ交流戦'
all_pitch['ce-pa'] = is_interleague.astype('int64')

### リーグ
- セ・リーグ公式戦=1, それ以外=0

In [27]:
# 1 when 試合種別詳細 equals 'セ・リーグ公式戦'; every other game type is 0.
is_central_league = all_pitch['試合種別詳細'] == 'セ・リーグ公式戦'
all_pitch['league'] = is_central_league.astype('int64')

### ホーム・アウェー

In [28]:
# 1 when the pitcher's team ID matches the home team ID, otherwise 0.
pitcher_at_home = all_pitch['投手チームID'].eq(all_pitch['ホームチームID'])
all_pitch['home'] = pitcher_at_home.astype('int64')

### 得点差

In [29]:
# Pre-play score margin (home minus away), with the sign flipped on rows where
# home == 0, so the margin is expressed relative to the pitcher's team.
point_diff = all_pitch['プレイ前ホームチーム得点数'] - all_pitch['プレイ前アウェイチーム得点数']
keep_sign = all_pitch['home'] != 0
all_pitch['point_diff'] = point_diff.where(keep_sign, -point_diff)

### 得点圏にランナーがいる

In [30]:
# 1 when there is a runner on second or third, otherwise 0.
in_scoring_position = all_pitch[['second', 'third']].eq(1).any(axis=1)
all_pitch['runner_23'] = in_scoring_position.astype('int64')

### 送りバントの場面
- ノーアウトで一塁に走者あり(三塁走者なし)、ストライク0or1

In [31]:
# Sacrifice-bunt situation: runner on first, nobody on third, no outs,
# and fewer than two strikes.
bunt_setup = (
    all_pitch['first'].eq(1)
    & all_pitch['third'].eq(0)
    & all_pitch['プレイ前アウト数'].eq(0)
    & all_pitch['プレイ前ストライク数'].lt(2)
)
all_pitch['bant'] = bunt_setup.astype('int64')

### スクイズの場面
- 0,1アウト三塁、ストライク0or1

In [32]:
# Squeeze-play situation: runner on third, fewer than two outs,
# and fewer than two strikes.
squeeze_setup = (
    all_pitch['third'].eq(1)
    & all_pitch['プレイ前アウト数'].lt(2)
    & all_pitch['プレイ前ストライク数'].lt(2)
)
all_pitch['squize'] = squeeze_setup.astype('int64')

### 上位打線

In [33]:
# 1 when the batter's lineup slot (打者打順) is 1 through 5 inclusive, otherwise 0.
bats_in_top_five = all_pitch['打者打順'].between(1, 5)
all_pitch['cleanup'] = bats_in_top_five.astype('int64')

### 失点ピンチ
- 得点圏にランナーがいる & 上位打線

In [34]:
# 1 when a runner is in scoring position and the batter is in the top five
# lineup slots, otherwise 0.
in_a_jam = all_pitch['runner_23'].eq(1) & all_pitch['cleanup'].eq(1)
all_pitch['pinch'] = in_a_jam.astype('int64')

### 押出しの危機
- フルベースでボール2or3

In [35]:
# Bases-loaded walk risk: all three bases occupied and more than one ball
# already in the count.
walk_forces_run = all_pitch['base_cnt'].eq(3) & all_pitch['プレイ前ボール数'].gt(1)
all_pitch['fourball'] = walk_forces_run.astype('int64')

### セーブがつく場面
- 9回、3点差以内で勝っている

In [36]:
# Save situation: 9th inning with the pitcher's team ahead by 1 to 3 runs.
# point_diff is the margin from the pitching team's side, so it must be positive.
# The previous condition (point_diff < 4 only) also flagged tied and trailing
# games, which contradicts the stated definition (leading by 3 or fewer).
pitcher_lead = all_pitch['point_diff']
is_save_chance = (
    (all_pitch['イニング'] == 9)
    & (pitcher_lead > 0)
    & (pitcher_lead < 4)
)
all_pitch['savepoint'] = is_save_chance.astype('int64')

### ダミー変数

In [37]:
# One-hot encode the count label and the pitcher/batter handedness matchup.
# dtype is pinned to uint8 (the historical default) so the dummy columns stay
# 0/1 integers; pandas 2.0 changed the default dummy dtype to bool.
all_pitch = pd.get_dummies(
    all_pitch,
    columns=['ball_cnt', 'pit_bat'],
    dtype='uint8',
)

### 不要な列を削除

In [38]:
# Drop the listed columns from the frame before it is saved.
unused_columns = [
    # date / time
    '日付', '時刻',
    # ballpark
    '球場ID', '球場名',
    # categorical source columns
    '試合種別詳細', '表裏',
    '投手投球左右', '投手役割',
    '打者打席左右', '打者守備位置',
    # runner IDs
    '一塁走者ID', '二塁走者ID', '三塁走者ID',
    # fielder IDs
    '一塁手ID', '二塁手ID', '三塁手ID', '遊撃手ID', '左翼手ID', '中堅手ID', '右翼手ID',
    # stat-attribution IDs
    '成績対象投手ID', '成績対象打者ID',
]
all_pitch = all_pitch.drop(columns=unused_columns)

In [39]:
# Preview the first five rows of the processed frame.
all_pitch.head(5)

Unnamed: 0,データ内連番,ball,投球位置区域,年度,試合ID,試合内連番,試合内投球数,ホームチームID,アウェイチームID,イニング,イニング内打席数,打席内投球数,投手ID,投手チームID,投手登板順,投手試合内対戦打者数,投手試合内投球数,投手イニング内投球数,打者ID,打者チームID,打者打順,打者試合内打席数,プレイ前ホームチーム得点数,プレイ前アウェイチーム得点数,プレイ前アウト数,プレイ前ボール数,プレイ前ストライク数,プレイ前走者状況,捕手ID,bc_straight,bc_curve,bc_slider,bc_shoot,bc_fork,bc_changeup,bc_sinker,bc_cutball,first,second,third,base_cnt,pitch_LR,bat_LR,top_bot,role,pos_pit,opening_date,game_date,date_from_opening,start_time,game_time,elapsed_time,elapsed_min,nighter,ce-pa,league,home,point_diff,runner_23,bant,squize,cleanup,pinch,fourball,savepoint,ball_cnt_0-0,ball_cnt_0-1,ball_cnt_0-2,ball_cnt_0-3,ball_cnt_1-0,ball_cnt_1-1,ball_cnt_1-2,ball_cnt_1-3,ball_cnt_2-0,ball_cnt_2-1,ball_cnt_2-2,ball_cnt_2-3,pit_bat_L_L,pit_bat_L_R,pit_bat_R_L,pit_bat_R_R
0,0,0.0,0.0,2017,2017033101,1,1,1,4,1,1,1,1500001,1,1,1,1,1,900410,4,1,1,0,0,0,0,0,___,1300027,0.477023,0.097592,0.20087,0.074454,0.046799,0.041882,0.008619,0.05276,0,0,0,0,0.0,1.0,1,1,0,2017-03-31,2017-03-31,0,2020-05-06 18:03:00,2020-05-06 18:03:00,00:00:00,0.0,1,0,1,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
1,1,0.0,8.0,2017,2017033101,2,2,1,4,1,1,2,1500001,1,1,1,2,2,900410,4,1,1,0,0,0,0,1,___,1300027,0.459832,0.090366,0.161893,0.068313,0.099045,0.061079,0.012505,0.046967,0,0,0,0,0.0,1.0,1,1,0,2017-03-31,2017-03-31,0,2020-05-06 18:03:00,2020-05-06 18:03:12,00:00:12,0.2,1,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0
2,2,0.0,5.0,2017,2017033101,3,3,1,4,1,1,3,1500001,1,1,1,3,3,900410,4,1,1,0,0,0,0,2,___,1300027,0.46973,0.066062,0.159395,0.031361,0.16294,0.062722,0.014112,0.033679,0,0,0,0,0.0,1.0,1,1,0,2017-03-31,2017-03-31,0,2020-05-06 18:03:00,2020-05-06 18:03:23,00:00:23,0.383333,1,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0
3,3,0.0,12.0,2017,2017033101,4,4,1,4,1,2,1,1500001,1,1,2,4,4,11436,4,2,1,0,0,1,0,0,___,1300027,0.477023,0.097592,0.20087,0.074454,0.046799,0.041882,0.008619,0.05276,0,0,0,0,0.0,0.0,1,1,0,2017-03-31,2017-03-31,0,2020-05-06 18:03:00,2020-05-06 18:04:01,00:01:01,1.016667,1,0,1,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4,4,0.0,8.0,2017,2017033101,5,5,1,4,1,2,2,1500001,1,1,2,5,5,11436,4,2,1,0,0,1,0,1,___,1300027,0.459832,0.090366,0.161893,0.068313,0.099045,0.061079,0.012505,0.046967,0,0,0,0,0.0,0.0,1,1,0,2017-03-31,2017-03-31,0,2020-05-06 18:03:00,2020-05-06 18:04:12,00:01:12,1.2,1,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1


In [40]:
# Preview the last five rows of the processed frame.
all_pitch.tail(5)

Unnamed: 0,データ内連番,ball,投球位置区域,年度,試合ID,試合内連番,試合内投球数,ホームチームID,アウェイチームID,イニング,イニング内打席数,打席内投球数,投手ID,投手チームID,投手登板順,投手試合内対戦打者数,投手試合内投球数,投手イニング内投球数,打者ID,打者チームID,打者打順,打者試合内打席数,プレイ前ホームチーム得点数,プレイ前アウェイチーム得点数,プレイ前アウト数,プレイ前ボール数,プレイ前ストライク数,プレイ前走者状況,捕手ID,bc_straight,bc_curve,bc_slider,bc_shoot,bc_fork,bc_changeup,bc_sinker,bc_cutball,first,second,third,base_cnt,pitch_LR,bat_LR,top_bot,role,pos_pit,opening_date,game_date,date_from_opening,start_time,game_time,elapsed_time,elapsed_min,nighter,ce-pa,league,home,point_diff,runner_23,bant,squize,cleanup,pinch,fourball,savepoint,ball_cnt_0-0,ball_cnt_0-1,ball_cnt_0-2,ball_cnt_0-3,ball_cnt_1-0,ball_cnt_1-1,ball_cnt_1-2,ball_cnt_1-3,ball_cnt_2-0,ball_cnt_2-1,ball_cnt_2-2,ball_cnt_2-3,pit_bat_L_L,pit_bat_L_R,pit_bat_R_L,pit_bat_R_R
778762,521645,,,2019,2019093001,289,285,5,4,9,3,8,11807,5,6,3,17,17,700034,4,4,2,3,0,1,1,2,_2_,1300073,0.431787,0.070655,0.177804,0.043295,0.149767,0.072798,0.013116,0.040777,0,1,0,1,0.0,0.0,1,0,0,2019-03-29,2019-09-30,185,2020-05-06 18:00:00,2020-05-06 21:10:05,03:10:05,190.083333,1,0,1,1,3,1,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1
778763,521646,,,2019,2019093001,290,286,5,4,9,4,1,11807,5,6,4,18,18,1100069,4,5,4,3,0,2,0,0,_2_,1300073,0.477023,0.097592,0.20087,0.074454,0.046799,0.041882,0.008619,0.05276,0,1,0,1,0.0,1.0,1,0,0,2019-03-29,2019-09-30,185,2020-05-06 18:00:00,2020-05-06 21:10:53,03:10:53,190.883333,1,0,1,1,3,1,0,0,1,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
778764,521647,,,2019,2019093001,291,287,5,4,9,4,2,11807,5,6,4,19,19,1100069,4,5,4,3,0,2,0,1,_2_,1300073,0.459832,0.090366,0.161893,0.068313,0.099045,0.061079,0.012505,0.046967,0,1,0,1,0.0,1.0,1,0,0,2019-03-29,2019-09-30,185,2020-05-06 18:00:00,2020-05-06 21:11:21,03:11:21,191.35,1,0,1,1,3,1,0,0,1,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0
778765,521648,,,2019,2019093001,292,288,5,4,9,4,3,11807,5,6,4,20,20,1100069,4,5,4,3,0,2,0,2,_2_,1300073,0.46973,0.066062,0.159395,0.031361,0.16294,0.062722,0.014112,0.033679,0,1,0,1,0.0,1.0,1,0,0,2019-03-29,2019-09-30,185,2020-05-06 18:00:00,2020-05-06 21:12:05,03:12:05,192.083333,1,0,1,1,3,1,0,0,1,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0
778766,521649,,,2019,2019093001,293,289,5,4,9,4,4,11807,5,6,4,21,21,1100069,4,5,4,3,0,2,1,2,_2_,1300073,0.431787,0.070655,0.177804,0.043295,0.149767,0.072798,0.013116,0.040777,0,1,0,1,0.0,1.0,1,0,0,2019-03-29,2019-09-30,185,2020-05-06 18:00:00,2020-05-06 21:12:48,03:12:48,192.8,1,0,1,1,3,1,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0


In [41]:
# Write the processed frame to a feather file.
output_path = 'intermediate/all_pitch_6.f'
all_pitch.to_feather(output_path)