# pitch2017前処理

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import feather
# Show up to 100 columns when displaying wide DataFrames.
# Use the exact registered option name: a differently-cased key only works
# through pandas' pattern-matching fallback, which the pandas docs warn may
# break if similarly named options are added in future versions.
pd.set_option('display.max_columns', 100)

In [2]:
# Load the pitch training table from its Feather file and report its
# (rows, columns) shape as a quick sanity check.
train_pitch_path = 'data/train_pitch.f'
train_pitch = pd.read_feather(train_pitch_path)
print(train_pitch.shape)

(257117, 51)


## 投手
### 球種
0:ストレート 1:カーブ 2:スライダー 3:シュート 4:フォーク 5:チェンジアップ 6:シンカー 7:カットボール

In [3]:
# Replace the integer pitch-type codes in the '球種' column with string labels
# (code prefix + English name), then rename that column to 'ball'.
ball_labels = {
    0: '0_straight',
    1: '1_curve',
    2: '2_slider',
    3: '3_shoot',
    4: '4_fork',
    5: '5_changeup',
    6: '6_sinker',
    7: '7_cutball',
}
train_pitch = (
    train_pitch
    .replace({'球種': ball_labels})
    .rename(columns={'球種': 'ball'})
)

In [4]:
# One-hot encode the pitch type per row, keeping the pitcher ID alongside.
# Produces one 'ball_<label>' indicator column per pitch-type label present.
pitch_ball = pd.get_dummies(train_pitch[['投手ID', 'ball']], columns=['ball'])

In [5]:
# Sum the one-hot indicators per pitcher to get pitch-type counts, and strip
# the 'ball_<code>_' prefix so the columns carry plain pitch-type names.
dummy_to_pitch_type = {
    'ball_0_straight': 'straight',
    'ball_1_curve': 'curve',
    'ball_2_slider': 'slider',
    'ball_3_shoot': 'shoot',
    'ball_4_fork': 'fork',
    'ball_5_changeup': 'changeup',
    'ball_6_sinker': 'sinker',
    'ball_7_cutball': 'cutball',
}
groupby_pit = (
    pitch_ball
    .groupby('投手ID')
    .sum()
    .rename(columns=dummy_to_pitch_type)
)
# Turn the pitcher ID back into an ordinary column.
pitch_ball = groupby_pit.reset_index()

In [6]:
# Convert the per-pitcher pitch-type counts into usage ratios.
# NOTE: the count columns are overwritten in place, so re-running this cell
# without re-running the aggregation cell above would recompute `total`
# from ratios instead of counts.
pitch_types = ['straight', 'curve', 'slider', 'shoot',
               'fork', 'changeup', 'sinker', 'cutball']

# `total` is the sum of the eight pitch-type counts for each pitcher.
pitch_total = pitch_ball[pitch_types[0]]
for pitch_type in pitch_types[1:]:
    pitch_total = pitch_total + pitch_ball[pitch_type]
pitch_ball['total'] = pitch_total

# Each pitch-type column becomes its share of that pitcher's total.
for pitch_type in pitch_types:
    pitch_ball[pitch_type] = pitch_ball[pitch_type] / pitch_ball['total']

In [7]:
# Preview the first rows: pitch-type ratios plus the total per pitcher.
pitch_ball.head()

Unnamed: 0,投手ID,straight,curve,slider,shoot,fork,changeup,sinker,cutball,total
0,11606,0.498588,0.247175,0.0,0.0,0.20339,0.0,0.0,0.050847,708.0
1,11766,0.373473,0.0,0.361257,0.190227,0.0,0.019197,0.055846,0.0,573.0
2,11807,0.667003,0.090817,0.002018,0.0111,0.227043,0.0,0.0,0.002018,991.0
3,12103,0.51622,0.069111,0.300423,0.03103,0.083216,0.0,0.0,0.0,709.0
4,12113,0.18771,0.007201,0.178108,0.158425,0.0,0.06385,0.233317,0.171387,2083.0


### 登板試合数

In [8]:
# Games pitched: number of distinct (pitcher, game) pairs per pitcher.
# First collapse the rows to one entry per (pitcher, game), then count how
# many such entries each pitcher has.
pitcher_game_keys = ['投手ID', '試合ID']
pit_game = (
    train_pitch
    .groupby(pitcher_game_keys)
    .size()
    .groupby(level='投手ID')
    .size()
    .reset_index(name='pit_game_cnt')
)

### イニング数

In [9]:
# Innings pitched in: number of distinct (pitcher, game, inning) triples per
# pitcher. An inning counts once however many rows the pitcher has in it.
pitcher_inning_keys = ['投手ID', '試合ID', 'イニング']
pit_inning = (
    train_pitch
    .groupby(pitcher_inning_keys)
    .size()
    .groupby(level='投手ID')
    .size()
    .reset_index(name='pit_inning_cnt')
)

### 対戦打者数

In [10]:
# Batters faced: number of distinct (pitcher, game, inning, plate-appearance
# number within the inning) combinations per pitcher.
pitcher_pa_keys = ['投手ID', '試合ID', 'イニング', 'イニング内打席数']
pit_batcnt = (
    train_pitch
    .groupby(pitcher_pa_keys)
    .size()
    .groupby(level='投手ID')
    .size()
    .reset_index(name='pit_batter_cnt')
)

### 投手実績まとめ

In [11]:
# Attach the game, inning and batter counts to the per-pitcher ratio table.
# Left joins keep every pitcher already present in `pitch_ball`.
# NOTE: re-running this cell on an already-merged `pitch_ball` would add
# suffixed duplicate count columns.
pitch_ball = (
    pitch_ball
    .merge(pit_game, on='投手ID', how='left')
    .merge(pit_inning, on='投手ID', how='left')
    .merge(pit_batcnt, on='投手ID', how='left')
)

### 1イニング当たり、1試合当たり、1打席当たり
- pit_inning_per_game: イニング数/試合数
- pit_batter_per_game: 打席数/試合数
- pit_ball_per_game: 投球数/試合数
- pit_batter_per_inning: 打席数/イニング数
- pit_ball_per_inning: 投球数/イニング数
- pit_ball_per_batter: 投球数/打席数

In [12]:
#pitch_ball['pit_inning_per_game'] = pitch_ball['pit_inning_cnt'] / pitch_ball['pit_game_cnt']
#pitch_ball['pit_batter_per_game'] = pitch_ball['pit_batter_cnt'] / pitch_ball['pit_game_cnt']
#pitch_ball['pit_ball_per_game'] = pitch_ball['total'] / pitch_ball['pit_game_cnt']
#pitch_ball['pit_batter_per_inning'] = pitch_ball['pit_batter_cnt'] / pitch_ball['pit_inning_cnt']
#pitch_ball['pit_ball_per_inning'] = pitch_ball['total'] / pitch_ball['pit_inning_cnt']
#pitch_ball['pit_ball_per_batter'] = pitch_ball['total'] / pitch_ball['pit_batter_cnt']

In [13]:
# Sanity check: shape of the per-pitcher table, then a preview of the first
# rows (ratios, total and the three merged count columns).
print(pitch_ball.shape)
pitch_ball.head()

(326, 13)


Unnamed: 0,投手ID,straight,curve,slider,shoot,fork,changeup,sinker,cutball,total,pit_game_cnt,pit_inning_cnt,pit_batter_cnt
0,11606,0.498588,0.247175,0.0,0.0,0.20339,0.0,0.0,0.050847,708.0,46,54,171
1,11766,0.373473,0.0,0.361257,0.190227,0.0,0.019197,0.055846,0.0,573.0,50,50,154
2,11807,0.667003,0.090817,0.002018,0.0111,0.227043,0.0,0.0,0.002018,991.0,52,61,235
3,12103,0.51622,0.069111,0.300423,0.03103,0.083216,0.0,0.0,0.0,709.0,24,51,191
4,12113,0.18771,0.007201,0.178108,0.158425,0.0,0.06385,0.233317,0.171387,2083.0,23,129,540


In [14]:
# Summary statistics for every numeric column of the per-pitcher table.
pitch_ball.describe()

Unnamed: 0,投手ID,straight,curve,slider,shoot,fork,changeup,sinker,cutball,total,pit_game_cnt,pit_inning_cnt,pit_batter_cnt
count,326.0,326.0,326.0,326.0,326.0,326.0,326.0,326.0,326.0,326.0,326.0,326.0,326.0
mean,1163288.0,0.489293,0.058848,0.196568,0.066474,0.086211,0.051951,0.008841,0.041813,788.702454,20.766871,50.245399,199.684049
std,416143.7,0.13881,0.069276,0.133403,0.112614,0.101202,0.08502,0.037591,0.089068,756.88166,18.831584,47.533255,192.461884
min,11606.0,0.006633,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,1.0,1.0,2.0
25%,925322.5,0.394643,0.0,0.104212,0.0,0.0,0.0,0.0,0.0,184.0,5.25,11.25,46.0
50%,1300036.0,0.498006,0.03541,0.198832,0.0,0.048811,0.0,0.0,0.0,569.0,15.0,37.0,143.5
75%,1500108.0,0.573184,0.091249,0.282345,0.09776,0.154717,0.076431,0.0,0.034477,1031.0,27.0,67.75,263.25
max,1700016.0,0.928571,0.385412,0.663874,0.642166,0.442735,0.439024,0.238281,0.51875,3004.0,72.0,190.0,756.0


In [15]:
# Write the per-pitcher feature table to a Feather file
# (presumably consumed by later processing steps — confirm against downstream code).
pitch_ball.to_feather('intermediate/pit_2017_3.f')

## 野手
### 打席数

In [16]:
# Plate appearances: number of distinct (batter, game, inning,
# plate-appearance number within the inning) combinations per batter.
batter_pa_keys = ['打者ID', '試合ID', 'イニング', 'イニング内打席数']
bat_ball = (
    train_pitch
    .groupby(batter_pa_keys)
    .size()
    .groupby(level='打者ID')
    .size()
    .reset_index(name='batter_cnt')
)

### 試合数

In [17]:
# Games batted in: number of distinct (batter, game) pairs per batter.
batter_game_keys = ['打者ID', '試合ID']
bat_game = (
    train_pitch
    .groupby(batter_game_keys)
    .size()
    .groupby(level='打者ID')
    .size()
    .reset_index(name='bat_game_cnt')
)

### 打者成績まとめ

In [18]:
# Add the per-batter game count next to the plate-appearance count.
# The left join keeps every batter already present in `bat_ball`.
bat_ball = pd.merge(bat_ball, bat_game, on='打者ID', how='left')

### 1試合当たり打席数
- batter_cnt_per_game: 打席数/試合数

In [19]:
#bat_ball['batter_cnt_per_game'] = bat_ball['batter_cnt'] / bat_ball['bat_game_cnt']

In [20]:
# Sanity check: shape of the per-batter table, then a preview of the first rows.
print(bat_ball.shape)
bat_ball.head()

(466, 3)


Unnamed: 0,打者ID,batter_cnt,bat_game_cnt
0,11270,83,35
1,11302,156,69
2,11343,41,28
3,11436,268,81
4,11511,48,21


In [21]:
# Summary statistics for the per-batter count columns.
bat_ball.describe()

Unnamed: 0,打者ID,batter_cnt,bat_game_cnt
count,466.0,466.0,466.0
mean,1044992.0,139.688841,41.448498
std,468711.7,184.754006,45.398623
min,11270.0,1.0,1.0
25%,800012.5,7.25,4.0
50%,1100114.0,44.0,21.0
75%,1400182.0,222.5,71.75
max,1700019.0,680.0,143.0


In [22]:
# Write the per-batter feature table to a Feather file
# (presumably consumed by later processing steps — confirm against downstream code).
bat_ball.to_feather('intermediate/bat_2017_3.f')

## 捕手
### 捕球球種

In [23]:
#catch_ball = train_pitch[['捕手ID','ball']]
#catch_ball = pd.get_dummies(catch_ball, columns=['ball'])

In [24]:
#groupby_cat = catch_ball.groupby('捕手ID').sum()
#groupby_cat.rename(columns={
#    'ball_0_straight': 'c_straight',
#    'ball_1_curve': 'c_curve',
#    'ball_2_slider': 'c_slider',
#    'ball_3_shoot': 'c_shoot',
#    'ball_4_fork': 'c_fork',
#    'ball_5_changeup': 'c_changeup',
#    'ball_6_sinker': 'c_sinker',
#    'ball_7_cutball': 'c_cutball',
#}, inplace=True)
#catch_ball = groupby_cat.reset_index(inplace=False)

In [25]:
#catch_ball['c_total'] = (catch_ball['c_straight'] + catch_ball['c_curve'] + catch_ball['c_slider'] + catch_ball['c_shoot']
#                       + catch_ball['c_fork'] + catch_ball['c_changeup'] + catch_ball['c_sinker'] + catch_ball['c_cutball'])
#catch_ball['c_straight'] = catch_ball['c_straight'] / catch_ball['c_total']
#catch_ball['c_curve'] = catch_ball['c_curve'] / catch_ball['c_total']
#catch_ball['c_slider'] = catch_ball['c_slider'] / catch_ball['c_total']
#catch_ball['c_shoot'] = catch_ball['c_shoot'] / catch_ball['c_total']
#catch_ball['c_fork'] = catch_ball['c_fork'] / catch_ball['c_total']
#catch_ball['c_changeup'] = catch_ball['c_changeup'] / catch_ball['c_total']
#catch_ball['c_sinker'] = catch_ball['c_sinker'] / catch_ball['c_total']
#catch_ball['c_cutball'] = catch_ball['c_cutball'] / catch_ball['c_total']

### 試合数

In [26]:
#cat_game = train_pitch[['捕手ID', '試合ID']].groupby(['捕手ID', '試合ID']).count()
#cat_game = pd.DataFrame(cat_game.groupby(['捕手ID']).size())
#cat_game.reset_index(inplace=True)
#cat_game.rename(columns={0: 'cat_game_cnt'}, inplace=True)

### イニング数

In [27]:
#cat_inning = train_pitch[['捕手ID', '試合ID', 'イニング']].groupby(['捕手ID', '試合ID', 'イニング']).count()
#cat_inning = pd.DataFrame(cat_inning.groupby(['捕手ID']).size())
#cat_inning.reset_index(inplace=True)
#cat_inning.rename(columns={0: 'cat_inning_cnt'}, inplace=True)

### 対戦打者数

In [28]:
#cat_batcnt = train_pitch[['捕手ID', '試合ID', 'イニング', 'イニング内打席数']].groupby(['捕手ID', '試合ID', 'イニング', 'イニング内打席数']).count()
#cat_batcnt = pd.DataFrame(cat_batcnt.groupby(['捕手ID']).size())
#cat_batcnt.reset_index(inplace=True)
#cat_batcnt.rename(columns={0: 'cat_batter_cnt'}, inplace=True)

### 捕手実績まとめ

In [29]:
#catch_ball = catch_ball.merge(cat_game, on='捕手ID', how='left')
#catch_ball = catch_ball.merge(cat_inning, on='捕手ID', how='left')
#catch_ball = catch_ball.merge(cat_batcnt, on='捕手ID', how='left')

### 1イニング当たり、1試合当たり、1打席当たり
- cat_inning_per_game: イニング数/試合数
- cat_batter_per_game: 打席数/試合数
- cat_ball_per_game: 投球数/試合数
- cat_batter_per_inning: 打席数/イニング数
- cat_ball_per_inning: 投球数/イニング数
- cat_ball_per_batter: 投球数/打席数

In [30]:
#catch_ball['cat_inning_per_game'] = catch_ball['cat_inning_cnt'] / catch_ball['cat_game_cnt']
#catch_ball['cat_batter_per_game'] = catch_ball['cat_batter_cnt'] / catch_ball['cat_game_cnt']
#catch_ball['cat_ball_per_game'] = catch_ball['c_total'] / catch_ball['cat_game_cnt']
#catch_ball['cat_batter_per_inning'] = catch_ball['cat_batter_cnt'] / catch_ball['cat_inning_cnt']
#catch_ball['cat_ball_per_inning'] = catch_ball['c_total'] / catch_ball['cat_inning_cnt']
#catch_ball['cat_ball_per_batter'] = catch_ball['c_total'] / catch_ball['cat_batter_cnt']

In [31]:
#print(catch_ball.shape)
#catch_ball.head()

In [32]:
#catch_ball.describe()

In [33]:
#catch_ball.to_feather('intermediate/cat_2017_2.f')