## player前処理
#### 16
- 2017年の情報がない選手は平均で埋めずにNaNのままにする

#### 15
- 2017年のデータを1/4ずつ使う

#### 14
- コースの種類のみ

#### 13
- コースの種類

#### 12
- 球団ごとの年俸ランク
- 選手IDごとの2017年のコース実績

In [105]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import feather
# Show up to 100 columns when a DataFrame is displayed. The canonical option
# key is used so the lookup does not depend on pandas' case-insensitive
# pattern matching of option names.
pd.set_option('display.max_columns', 100)

In [106]:
# Load the player tables of the train and test splits (train first, then test).
train_player, test_player = (
    pd.read_feather(path)
    for path in ('data/train_player.f', 'data/test_player.f')
)
# Print (rows, columns) of each split as a quick sanity check.
for split_frame in (train_player, test_player):
    print(split_frame.shape)

(911, 25)
(1846, 25)


In [107]:
# Stack the train and test rows into one frame with a fresh 0..n-1 index; the
# feature cells that follow all operate on this combined frame.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0.
all_player = pd.concat([train_player, test_player], ignore_index=True)
print(all_player.shape)

(2757, 25)


### 管理番号

In [108]:
# Management (version) number of this preprocessing run; it is embedded in the
# names of the output files.
No = 16

### 2017年のn/4

In [109]:
# Number of the 2017 sample to use: it selects which pit_2017 / bat_2017 input
# files are read and is embedded in the names of the output files.
sample_No = 4

In [110]:
# Both output paths carry the same (version number, sample number) tag.
run_tag = (No, sample_No)
OUT_PITCHER = 'intermediate/player/all_pitcher_{}_{}.f'.format(*run_tag)
OUT_ALLPLAYER = 'intermediate/player/all_player_{}_{}.f'.format(*run_tag)

### 2017年の成績(1/4ずつ)

In [111]:
# Read the 2017 pitcher and batter aggregates for the selected sample.
pit_2017_path = 'intermediate/player/pit_2017_7_{}.f'.format(sample_No)
bat_2017_path = 'intermediate/player/bat_2017_7_{}.f'.format(sample_No)
pit_2017 = pd.read_feather(pit_2017_path)
bat_2017 = pd.read_feather(bat_2017_path)
# Print (rows, columns) of both tables.
for stats_frame in (pit_2017, bat_2017):
    print(stats_frame.shape)

(514, 27)
(342, 3)


### 外国人助っ人

In [112]:
# 1 when the country of origin is anything other than Japan, else 0.
# A missing country also yields 1, because NaN != '日本' evaluates to True.
all_player['foreigner'] = (all_player['出身国'] != '日本').astype('int64')

### 高卒・大卒・社会人

In [113]:
# Came through a company team: the 社会人 field is filled in.
all_player['company'] = all_player['社会人'].notnull().astype('int64')

# University graduate: the university ID is anything other than 0.
all_player['univ'] = (all_player['出身大学ID'] != 0).astype('int64')

# High-school graduate: neither company, university nor foreign player.
other_routes = all_player[['company', 'univ', 'foreigner']]
all_player['highsch'] = other_routes.eq(0).all(axis=1).astype('int64')

### 年齢、現役年数

In [114]:
season = all_player['年度']
draft_year = all_player['ドラフト年']

# Age: season year minus birth year (month and day are not taken into account).
all_player['birth_day'] = pd.to_datetime(all_player['生年月日'])
all_player['age'] = season - all_player['birth_day'].dt.year

# Years since the draft; players without a draft year get the fixed value 6.
all_player['play_year'] = (season - draft_year).mask(draft_year.isnull(), 6)

### 年俸

#### 球団ごとの年俸順位

In [115]:
# Rank salaries within each (season, team) group; the highest salary gets
# rank 1 and ties share the average rank (pandas' default ranking method).
salary_by_team_season = all_player.groupby(['年度', 'チームID'])['年俸']
all_player['salary_rank'] = salary_by_team_season.rank(ascending=False)

In [116]:
# Combine salary rank and salary with the years since the draft, once as a
# ratio and once as a product.
tenure = all_player['play_year']
tenure_features = (
    ('salary_rank', 'rank_year', 'rank_x_year'),
    ('年俸', 'salary_year', 'salary_x_year'),
)
for base_column, ratio_column, product_column in tenure_features:
    all_player[ratio_column] = all_player[base_column] / tenure
    all_player[product_column] = all_player[base_column] * tenure

### 身長・体重
BMI = 体重[kg] / (身長[m])^2(身長はcm単位のため、体重×10000/身長^2 で計算)

In [117]:
# BMI = weight / height^2; the factor 10000 converts a height given in cm to m
# (the displayed sample rows show values such as 175 and 82).
height = all_player['身長']
weight = all_player['体重']
all_player['bmi'] = weight * 10000 / (height * height)

### 不要な列を削除

In [118]:
# Columns removed from the frame at this point: raw identity/background fields
# plus the temporary birth_day helper.
unused_columns = [
    'チームID', 'チーム名', '選手名', '背番号', '打', '生年月日',
    '出身高校ID', '出身高校名', '出身大学ID', '出身大学名', '社会人',
    'ドラフト年', 'ドラフト種別',
    '出身国', '出身地', '血液型', 'birth_day',
]
all_player.drop(columns=unused_columns, inplace=True)

### rename

In [119]:
# Give the remaining Japanese-named feature columns English names.
english_names = {
    '育成選手F': 'firm',
    '身長': 'height',
    '体重': 'weight',
    'ドラフト順位': 'draft_order',
    '年俸': 'salary',
}
all_player.rename(columns=english_names, inplace=True)

### 投手のみ

In [120]:
# Keep only the rows whose position is pitcher.
is_pitcher_row = all_player['位置'] == '投手'
all_pitcher = all_player[is_pitcher_row]

In [121]:
# Lookup table: every throwing hand maps to two pit_bat codes, so the merge
# below turns each pitcher row into two rows.
# NOTE(review): the codes look like "<pitcher hand>_<batter hand>" — confirm
# against how pit_2017 was built.
dummy = pd.DataFrame(
    [
        ('右', 'R_L'),
        ('右', 'R_R'),
        ('左', 'L_L'),
        ('左', 'L_R'),
    ],
    columns=['投', 'pit_bat'],
)
# Outer join: pitchers whose 投 value is not in the table are kept with a
# missing pit_bat instead of being dropped.
all_pitcher = pd.merge(all_pitcher, dummy, how='outer', on='投')

In [122]:
# Attach the 2017 pitching aggregates per (player, pit_bat); pitchers without
# a matching 2017 row keep NaN in every joined column.
all_pitcher = pd.merge(
    all_pitcher,
    pit_2017,
    how='left',
    left_on=['選手ID', 'pit_bat'],
    right_on=['投手ID', 'pit_bat'],
)

In [123]:
# Mark pitchers that found no 2017 row with a sentinel ID:
# -1 for foreign players, 0 for everyone else.
no_2017_record = all_pitcher['投手ID'].isnull()
is_foreign = all_pitcher['foreigner'] == 1
all_pitcher.loc[no_2017_record & is_foreign, '投手ID'] = -1
all_pitcher.loc[no_2017_record & ~is_foreign, '投手ID'] = 0

#### 情報がない選手
- 外国人投手-> 投手ID=-1
- 日本人投手-> 投手ID=0
- 2017の平均での穴埋めは No.16 以降行わない(NaNのまま残す)

In [124]:
def fill_ball(condition, source, target):
    """Overwrite the 2017 pitching columns of selected rows in ``target``.

    For every pitch-type, workload and course column, the rows of ``target``
    selected by ``condition`` are set to ``source[column]``. ``target`` is
    modified in place and nothing is returned.

    Args:
        condition: Row selector accepted by ``target.loc`` (e.g. a boolean mask).
        source: Mapping-like object (e.g. a Series of column means) providing
            one value per filled column.
        target: DataFrame whose selected rows are overwritten.
    """
    pitch_columns = (
        'straight', 'curve', 'slider', 'shoot', 'fork',
        'changeup', 'sinker', 'cutball', 'total',
    )
    workload_columns = ('pit_game_cnt', 'pit_inning_cnt', 'pit_batter_cnt')
    course_columns = (
        'course_0', 'course_1', 'course_2', 'course_3', 'course_4',
        'course_5', 'course_6', 'course_7', 'course_8', 'course_9',
        'course_10', 'course_11', 'course_12',
    )
    for column in pitch_columns + workload_columns + course_columns:
        target.loc[condition, column] = source[column]

#### 日本人平均

In [125]:
RightLeft = ['R_L', 'R_R', 'L_R', 'L_L']

# Since version No.16, Japanese pitchers without 2017 data (投手ID == 0) keep
# NaN statistics instead of receiving the group mean. The means are therefore
# only computed when this switch is turned on, rather than being calculated
# and discarded on every run.
FILL_JAPANESE_WITH_MEAN = False
if FILL_JAPANESE_WITH_MEAN:
    for RL in RightLeft:
        has_record = (
            (all_pitcher['foreigner'] == 0)
            & (all_pitcher['投手ID'] != 0)
            & (all_pitcher['pit_bat'] == RL)
        )
        # numeric_only=True: the frame still holds string columns (e.g.
        # pit_bat), which make a plain .mean() raise on pandas >= 2.0.
        pit_mean = all_pitcher[has_record].mean(numeric_only=True)
        condition = (all_pitcher['投手ID'] == 0) & (all_pitcher['pit_bat'] == RL)
        fill_ball(condition, pit_mean, all_pitcher)

#### 外国人平均

In [126]:
# Since version No.16, foreign pitchers without 2017 data (投手ID == -1) keep
# NaN statistics instead of receiving the group mean. The means are therefore
# only computed when this switch is turned on, rather than being calculated
# and discarded on every run.
FILL_FOREIGN_WITH_MEAN = False
if FILL_FOREIGN_WITH_MEAN:
    for RL in RightLeft:
        has_record = (
            (all_pitcher['foreigner'] == 1)
            & (all_pitcher['投手ID'] != -1)
            & (all_pitcher['pit_bat'] == RL)
        )
        # numeric_only=True: the frame still holds string columns (e.g.
        # pit_bat), which make a plain .mean() raise on pandas >= 2.0.
        pit_mean = all_pitcher[has_record].mean(numeric_only=True)
        condition = (all_pitcher['投手ID'] == -1) & (all_pitcher['pit_bat'] == RL)
        fill_ball(condition, pit_mean, all_pitcher)

### 各球種のストレートに対する比率

In [127]:
# Express every non-straight pitch column as a ratio to the straight column.
ball_not_straight = ['curve', 'slider', 'shoot', 'fork', 'changeup', 'sinker', 'cutball']
straight_count = all_pitcher['straight']
for ball in ball_not_straight:
    all_pitcher[ball] = all_pitcher[ball].div(straight_count)

### コースの比率

In [128]:
# Turn each course column into its share of the 'total' column.
course_kind = [
    'course_0', 'course_1', 'course_2', 'course_3', 'course_4',
    'course_5', 'course_6', 'course_7', 'course_8', 'course_9',
    'course_10', 'course_11', 'course_12',
]
pitch_total = all_pitcher['total']
for course in course_kind:
    all_pitcher[course] = all_pitcher[course].div(pitch_total)

### コースの種類

In [129]:
# Height-wise zone features: each one is the plain sum of the listed course
# shares (a NaN in any of them makes the sum NaN).
# NOTE(review): '_str' / '_ball' presumably mean strike zone / ball zone —
# confirm against the course numbering used upstream.
vertical_zones = (
    ('high_str', ('course_0', 'course_3', 'course_6')),
    ('high_ball', ('course_9', 'course_10')),
    ('mid_str', ('course_1', 'course_4', 'course_7')),
    ('low_str', ('course_2', 'course_5', 'course_8')),
    ('low_ball', ('course_11', 'course_12')),
)
for zone_name, zone_courses in vertical_zones:
    zone_share = all_pitcher[zone_courses[0]]
    for course_name in zone_courses[1:]:
        zone_share = zone_share + all_pitcher[course_name]
    all_pitcher[zone_name] = zone_share

In [130]:
# Side-wise zone features: each one is the plain sum of the listed course
# shares (a NaN in any of them makes the sum NaN).
# NOTE(review): '_str' / '_ball' presumably mean strike zone / ball zone —
# confirm against the course numbering used upstream.
horizontal_zones = (
    ('left_str', ('course_0', 'course_1', 'course_2')),
    ('left_ball', ('course_9', 'course_11')),
    ('center_str', ('course_3', 'course_4', 'course_5')),
    ('right_str', ('course_6', 'course_7', 'course_8')),
    ('right_ball', ('course_10', 'course_12')),
)
for zone_name, zone_courses in horizontal_zones:
    zone_share = all_pitcher[zone_courses[0]]
    for course_name in zone_courses[1:]:
        zone_share = zone_share + all_pitcher[course_name]
    all_pitcher[zone_name] = zone_share

### 不要な列を削除

In [131]:
# Remove the ratio denominator 'straight', the join/filter helper columns and
# the individual course shares, which the zone features have replaced.
helper_columns = ['straight', '投手ID', '位置', '投']
course_columns = [
    'course_0', 'course_1', 'course_2', 'course_3', 'course_4',
    'course_5', 'course_6', 'course_7', 'course_8', 'course_9',
    'course_10', 'course_11', 'course_12',
]
all_pitcher.drop(columns=helper_columns + course_columns, inplace=True)

In [132]:
# Preview the first rows of the finished pitcher feature table.
all_pitcher.head()

Unnamed: 0,年度,選手ID,firm,height,weight,draft_order,salary,foreigner,company,univ,highsch,age,play_year,salary_rank,rank_year,rank_x_year,salary_year,salary_x_year,bmi,pit_bat,curve,slider,shoot,fork,changeup,sinker,cutball,total,pit_game_cnt,pit_inning_cnt,pit_batter_cnt,high_str,high_ball,mid_str,low_str,low_ball,left_str,left_ball,center_str,right_str,right_ball
0,2017,12107,0,175,82,3.0,5000,0,1,0,0,37,16.0,23.5,1.46875,376.0,312.5,80000.0,26.77551,L_L,,,,,,,,,,,,,,,,,,,,,
1,2017,12107,0,175,82,3.0,5000,0,1,0,0,37,16.0,23.5,1.46875,376.0,312.5,80000.0,26.77551,L_R,,,,,,,,,,,,,,,,,,,,,
2,2017,400010,0,186,95,1.0,20000,0,1,0,0,35,14.0,10.0,0.714286,140.0,1428.571429,280000.0,27.459822,L_L,0.0,0.857143,1.071429,0.071429,0.714286,0.0,0.214286,55.0,2.0,7.0,14.0,0.072727,0.090909,0.254545,0.2,0.381818,0.145455,0.236364,0.2,0.181818,0.236364
3,2017,400010,0,186,95,1.0,20000,0,1,0,0,35,14.0,10.0,0.714286,140.0,1428.571429,280000.0,27.459822,L_R,0.0,1.090909,2.0,0.272727,1.818182,0.0,0.0,68.0,2.0,7.0,18.0,0.088235,0.161765,0.132353,0.176471,0.441176,0.044118,0.220588,0.102941,0.25,0.382353
4,2017,600098,0,184,88,1.0,32000,0,0,0,1,34,12.0,2.0,0.166667,24.0,2666.666667,384000.0,25.992439,L_L,0.0,2.75,1.75,0.0,0.0,0.0,0.0,22.0,5.0,5.0,7.0,0.090909,0.0,0.181818,0.272727,0.454545,0.409091,0.363636,0.090909,0.045455,0.090909


### 投手のみ出力

In [133]:
# Write the pitcher feature table to the feather file at OUT_PITCHER and print
# the path that was used.
all_pitcher.to_feather(OUT_PITCHER)
print(OUT_PITCHER)

intermediate/player/all_pitcher_16_4.f


### 打者(全選手)

In [134]:
# Attach the 2017 batting aggregates to every player; players without a
# matching 2017 row keep NaN in the joined columns.
all_player = pd.merge(
    all_player,
    bat_2017,
    how='left',
    left_on='選手ID',
    right_on='打者ID',
)

In [135]:
# Players that found no 2017 batting row get the sentinel batter ID 0.
all_player.loc[all_player['打者ID'].isnull(), '打者ID'] = 0

# Since version No.16, players without 2017 batting data keep NaN instead of
# receiving a group mean. The means are therefore only computed when this
# switch is turned on, rather than being calculated and discarded on every run.
FILL_BATTER_WITH_MEAN = False
if FILL_BATTER_WITH_MEAN:
    bat_columns = ['batter_cnt', 'bat_game_cnt']
    has_record = all_player['打者ID'] != 0
    is_pitcher = all_player['位置'] == '投手'
    # Non-pitchers and pitchers are imputed separately, each from the mean of
    # the players in the same group that do have a 2017 record.
    for in_group in (~is_pitcher, is_pitcher):
        # Averaging only the two filled columns keeps the string columns
        # (位置, 投) out of .mean(), which would raise on pandas >= 2.0.
        bat_mean = all_player.loc[has_record & in_group, bat_columns].mean()
        condition = ~has_record & in_group
        for column in bat_columns:
            all_player.loc[condition, column] = bat_mean[column]

### 不要な列を削除

In [136]:
# Remove the join key and the position / throwing-hand columns from the
# all-player table.
helper_columns = ['打者ID', '位置', '投']
all_player.drop(columns=helper_columns, inplace=True)

In [137]:
# Preview the first rows of the finished all-player feature table.
all_player.head()

Unnamed: 0,年度,選手ID,firm,height,weight,draft_order,salary,foreigner,company,univ,highsch,age,play_year,salary_rank,rank_year,rank_x_year,salary_year,salary_x_year,bmi,batter_cnt,bat_game_cnt
0,2017,11343,0,183,86,5.0,4500,0,0,0,1,41,23.0,26.5,1.152174,609.5,195.652174,103500.0,25.680074,6.0,6.0
1,2017,11726,0,177,85,1.0,3700,0,0,0,1,36,19.0,28.5,1.5,541.5,194.736842,70300.0,27.131412,,
2,2017,12049,0,180,97,1.0,26000,0,0,1,0,38,17.0,4.0,0.235294,68.0,1529.411765,442000.0,29.938272,163.0,39.0
3,2017,12107,0,175,82,3.0,5000,0,1,0,0,37,16.0,23.5,1.46875,376.0,312.5,80000.0,26.77551,,
4,2017,12179,0,184,94,1.0,7000,0,0,0,1,34,16.0,20.0,1.25,320.0,437.5,112000.0,27.76465,,


### 全選手出力

In [138]:
# Write the all-player feature table to the feather file at OUT_ALLPLAYER and
# print the path that was used.
all_player.to_feather(OUT_ALLPLAYER)
print(OUT_ALLPLAYER)

intermediate/player/all_player_16_4.f
