## pitchとplayerの前処理を統合

In [34]:
import pandas as pd
# Show up to 100 columns when a DataFrame is displayed.
# Use the canonical lowercase option key: a mixed-case spelling only resolves
# through pandas' case-insensitive pattern matching of option names.
pd.set_option('display.max_columns', 100)

In [35]:
# Input feather files read by the loading cells below.
ALL_PITCH = 'intermediate/all_pitch_3.f'    # pitch table
ALL_PLAYER = 'intermediate/all_player_3.f'  # player table

In [36]:
# Read the pitch table from its feather file; the trailing expression
# echoes (rows, columns) as the cell output.
all_pitch = pd.read_feather(ALL_PITCH)
all_pitch.shape

(778767, 62)

In [37]:
# Read the player table from its feather file; the trailing expression
# echoes (rows, columns) as the cell output.
all_player = pd.read_feather(ALL_PLAYER)
all_player.shape

(2757, 21)

### Join
- pitch.投手ID/年度 - player.選手ID/年度
- pitch.打者ID/年度 - player.選手ID/年度
- pitch.捕手ID/年度 - player.選手ID/年度

In [38]:
# Attach the pitcher's player attributes: pitch.投手ID/年度 -> player.選手ID/年度.
# The player columns come in unsuffixed here; the batter join that follows
# passes suffixes that rename them to `*_pit`.
merge_all = pd.merge(
    all_pitch,
    all_player,
    left_on=['投手ID', '年度'],
    right_on=['選手ID', '年度'],
    how='left',
)
# A left join never removes rows, so a changed row count means duplicate
# (選手ID, 年度) keys in all_player fanned out the pitch rows.
assert len(merge_all) == len(all_pitch), 'pitcher join duplicated pitch rows'

In [39]:
# Attach the batter's player attributes: pitch.打者ID/年度 -> player.選手ID/年度.
# Player columns already present on the left collide with the incoming ones,
# so the existing copy gets `_pit` and the new copy gets `_bat`.
# NOTE: this reassigns merge_all, so the join cells must run once, in order.
merge_all = pd.merge(
    merge_all,
    all_player,
    left_on=['打者ID', '年度'],
    right_on=['選手ID', '年度'],
    how='left',
    suffixes=('_pit', '_bat'),
)
# A left join never removes rows, so a changed row count means duplicate
# (選手ID, 年度) keys in all_player fanned out the pitch rows.
assert len(merge_all) == len(all_pitch), 'batter join duplicated pitch rows'

In [40]:
# Attach the catcher's player attributes: pitch.捕手ID/年度 -> player.選手ID/年度.
# The left frame only holds `*_pit` / `*_bat` player columns at this point, so
# nothing collides and the catcher's columns keep their plain names
# (`salary`, `age`, `play_year`, ...), which later cells rely on.
merge_all = pd.merge(
    merge_all,
    all_player,
    left_on=['捕手ID', '年度'],
    right_on=['選手ID', '年度'],
    how='left',
)
# A left join never removes rows, so a changed row count means duplicate
# (選手ID, 年度) keys in all_player fanned out the pitch rows.
assert len(merge_all) == len(all_pitch), 'catcher join duplicated pitch rows'

### player同士の組み合わせ
- 年俸の差
- 現役年数の差
- 年齢の差

In [41]:
# Pairwise differences between the players involved in each pitch.
# Each entry is (column label, suffix of the minuend, suffix of the subtrahend);
# the catcher's columns carry no suffix.
role_pairs = [
    ('p-b', '_pit', '_bat'),  # pitcher - batter
    ('p-c', '_pit', ''),      # pitcher - catcher
    ('b-c', '_bat', ''),      # batter  - catcher
]
for pair_label, lhs_suffix, rhs_suffix in role_pairs:
    # Same feature order as the output columns: salary, play_year, age.
    for feature in ('salary', 'play_year', 'age'):
        merge_all[feature + '_dif_' + pair_label] = (
            merge_all[feature + lhs_suffix] - merge_all[feature + rhs_suffix]
        )

In [42]:
# Columns that exist for every joined player; only the pitcher's `*_pit`
# copies are kept (presumably pitch-mix statistics — confirm against the
# player preprocessing notebook).
pitch_mix_cols = ['curve', 'slider', 'shoot', 'fork', 'changeup', 'sinker', 'cutball', 'total']

# Reassign instead of using inplace=True, so the cell yields a new frame
# rather than mutating one that earlier cells produced.
merge_all = merge_all.drop(
    columns=(
        # Join keys from the player table, redundant with 投手ID / 打者ID / 捕手ID.
        ['選手ID_pit', '選手ID_bat', '選手ID']
        # Batter copies of the pitch-mix columns.
        + [col + '_bat' for col in pitch_mix_cols]
        # Catcher copies (unsuffixed) of the pitch-mix columns.
        + pitch_mix_cols
    )
)

In [43]:
# Sanity check: print (rows, columns) of the merged frame, then render its
# first five rows as the cell output.
print(merge_all.shape)
merge_all.head(n=5)

(778767, 112)


Unnamed: 0,データ内連番,ball,投球位置区域,年度,試合ID,試合内連番,試合内投球数,ホームチームID,アウェイチームID,イニング,イニング内打席数,打席内投球数,投手ID,投手チームID,投手登板順,投手試合内対戦打者数,投手試合内投球数,投手イニング内投球数,打者ID,打者チームID,打者打順,打者試合内打席数,プレイ前ホームチーム得点数,プレイ前アウェイチーム得点数,プレイ前アウト数,プレイ前ボール数,プレイ前ストライク数,プレイ前走者状況,捕手ID,first,second,third,base_cnt,pitch_LR,bat_LR,top_bot,role,opening_date,game_date,date_from_opening,start_time,game_time,elapsed_time,elapsed_min,home,point_diff,ball_cnt_0-0,ball_cnt_0-1,ball_cnt_0-2,ball_cnt_0-3,...,firm_pit,height_pit,weight_pit,draft_order_pit,salary_pit,curve_pit,slider_pit,shoot_pit,fork_pit,changeup_pit,sinker_pit,cutball_pit,total_pit,foreigner_pit,company_pit,univ_pit,highsch_pit,age_pit,play_year_pit,firm_bat,height_bat,weight_bat,draft_order_bat,salary_bat,foreigner_bat,company_bat,univ_bat,highsch_bat,age_bat,play_year_bat,firm,height,weight,draft_order,salary,foreigner,company,univ,highsch,age,play_year,salary_dif_p-b,play_year_dif_p-b,age_dif_p-b,salary_dif_p-c,play_year_dif_p-c,age_dif_p-c,salary_dif_b-c,play_year_dif_b-c,age_dif_b-c
0,0,0.0,0.0,2017,2017033101,1,1,1,4,1,1,1,1500001,1,1,1,1,1,900410,4,1,1,0,0,0,0,0,___,1300027,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,2017-03-31,2017-03-31,0,2020-05-02 18:03:00,2020-05-02 18:03:00,00:00:00,0.0,1,0,1,0,0,0,...,0,196,100,,24000,0.459848,0.493939,0.199242,0.023485,0.061364,0.0,0.0,2954.0,1,0,0,0,29,,0,176,74,5.0,15000,0,1,1,0,32,8.0,0,178,83,1.0,5000,0,1,1,0,28,4.0,9000,,-3,19000,,1,10000,4.0,4
1,1,0.0,8.0,2017,2017033101,2,2,1,4,1,1,2,1500001,1,1,1,2,2,900410,4,1,1,0,0,0,0,1,___,1300027,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,2017-03-31,2017-03-31,0,2020-05-02 18:03:00,2020-05-02 18:03:12,00:00:12,0.2,1,0,0,0,0,0,...,0,196,100,,24000,0.459848,0.493939,0.199242,0.023485,0.061364,0.0,0.0,2954.0,1,0,0,0,29,,0,176,74,5.0,15000,0,1,1,0,32,8.0,0,178,83,1.0,5000,0,1,1,0,28,4.0,9000,,-3,19000,,1,10000,4.0,4
2,2,0.0,5.0,2017,2017033101,3,3,1,4,1,1,3,1500001,1,1,1,3,3,900410,4,1,1,0,0,0,0,2,___,1300027,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,2017-03-31,2017-03-31,0,2020-05-02 18:03:00,2020-05-02 18:03:23,00:00:23,0.383333,1,0,0,0,0,0,...,0,196,100,,24000,0.459848,0.493939,0.199242,0.023485,0.061364,0.0,0.0,2954.0,1,0,0,0,29,,0,176,74,5.0,15000,0,1,1,0,32,8.0,0,178,83,1.0,5000,0,1,1,0,28,4.0,9000,,-3,19000,,1,10000,4.0,4
3,3,0.0,12.0,2017,2017033101,4,4,1,4,1,2,1,1500001,1,1,2,4,4,11436,4,2,1,0,0,1,0,0,___,1300027,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,2017-03-31,2017-03-31,0,2020-05-02 18:03:00,2020-05-02 18:04:01,00:01:01,1.016667,1,0,1,0,0,0,...,0,196,100,,24000,0.459848,0.493939,0.199242,0.023485,0.061364,0.0,0.0,2954.0,1,0,0,0,29,,0,180,74,1.0,7040,0,0,0,1,40,22.0,0,178,83,1.0,5000,0,1,1,0,28,4.0,16960,,-11,19000,,1,2040,18.0,12
4,4,0.0,8.0,2017,2017033101,5,5,1,4,1,2,2,1500001,1,1,2,5,5,11436,4,2,1,0,0,1,0,1,___,1300027,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,2017-03-31,2017-03-31,0,2020-05-02 18:03:00,2020-05-02 18:04:12,00:01:12,1.2,1,0,0,0,0,0,...,0,196,100,,24000,0.459848,0.493939,0.199242,0.023485,0.061364,0.0,0.0,2954.0,1,0,0,0,29,,0,180,74,1.0,7040,0,0,0,1,40,22.0,0,178,83,1.0,5000,0,1,1,0,28,4.0,16960,,-11,19000,,1,2040,18.0,12


In [44]:
# NOTE(review): `feather` is not referenced by name in this cell
# (DataFrame.to_feather is a pandas method); the import is kept in case other
# cells depend on it — confirm and remove if unused.
import feather

# Output path as a named constant, matching ALL_PITCH / ALL_PLAYER for the inputs.
ALL_MERGE = 'intermediate/all_merge_3.f'

# Persist the merged pitch/player table for downstream steps.
merge_all.to_feather(ALL_MERGE)