### 5th Process:

- Merge all batter and pitcher stats into the play data
- Goal: use the merged data to predict the event in a match

### Import

In [1]:
import pandas as pd

In [2]:
# Widen the notebook display: show up to 200 characters per cell and never
# truncate the list of columns when rendering a DataFrame.
pd.options.display.max_colwidth = 200
pd.options.display.max_columns = None

In [3]:
# Read the three input tables; paths are relative to the working directory.
all_player_stats = pd.read_csv('all_player_stats.csv')
top_players_plays = pd.read_csv('mlb_top_players_plays.csv')
top_players_id = pd.read_csv('mlb_top_players_id.csv')

In [4]:
# Player ids that appear at least once in the stats table.
valid_ids = set(all_player_stats['player_id'])

# Keep a play only when BOTH its batter and its pitcher are in valid_ids.
top_players_plays = top_players_plays.loc[
    lambda plays: plays['batter_id'].isin(valid_ids)
    & plays['pitcher_id'].isin(valid_ids)
]

In [5]:
# Derive two role-specific versions of the stats table. Every column gets a
# role prefix ("b_" / "p_"); the id and season columns are then renamed back
# to the key names used for merging with the play data.
batter_stats = (
    all_player_stats
    .add_prefix("b_")
    .rename(columns={'b_player_id': 'batter_id', 'b_season': 'season'})
)
pitcher_stats = (
    all_player_stats
    .add_prefix("p_")
    .rename(columns={'p_player_id': 'pitcher_id', 'p_season': 'season'})
)

In [6]:
# Step 1: restrict the play data to the columns of interest.
# `filter(items=...)` silently skips any listed column that is absent.
merged_data = top_players_plays.filter(items=[
    'season', 'batter_id', 'pitcher_id', 'event_type',
    'vs_RHB', 'vs_LHB', 'vs_SHB', 'vs_RHP', 'vs_LHP',
])

# Step 2: attach the batter's stats for that season (left join keeps every play).
merged_data = merged_data.merge(
    batter_stats, how='left', on=['season', 'batter_id']
)

# Step 3: attach the pitcher's stats for that season the same way.
merged_data = merged_data.merge(
    pitcher_stats, how='left', on=['season', 'pitcher_id']
)

In [7]:
# Percentage of missing values in each column of the merged frame.
null_percentage = merged_data.isna().mean().mul(100)

# Keep only the columns with at most 65% missing values.
merged_data_2 = merged_data.loc[:, null_percentage.le(65)]

In [8]:
# Display the names of the columns that passed the null-percentage filter.
list(merged_data_2.columns)

['season',
 'batter_id',
 'pitcher_id',
 'event_type',
 'vs_RHB',
 'vs_LHB',
 'vs_SHB',
 'vs_RHP',
 'vs_LHP',
 'b_airOuts',
 'b_atBats',
 'b_baseOnBalls',
 'b_catchersInterference',
 'b_caughtStealing',
 'b_doubles',
 'b_gamesPlayed',
 'b_groundIntoDoublePlay',
 'b_groundOuts',
 'b_hitByPitch',
 'b_hits',
 'b_homeRuns',
 'b_intentionalWalks',
 'b_leftOnBase',
 'b_numberOfPitches',
 'b_plateAppearances',
 'b_rbi',
 'b_runs',
 'b_sacBunts',
 'b_sacFlies',
 'b_stolenBases',
 'b_strikeOuts',
 'b_totalBases',
 'b_triples',
 'b_singles',
 'b_atBatsPerHomeRun',
 'b_avg',
 'b_babip',
 'b_groundOutsToAirouts',
 'b_obp',
 'b_slg',
 'b_stolenBasePercentage',
 'b_strikeoutWalkRatio',
 'b_ops',
 'p_airOuts',
 'p_atBats',
 'p_balks',
 'p_baseOnBalls',
 'p_battersFaced',
 'p_blownSaves',
 'p_catchersInterference',
 'p_caughtStealing',
 'p_completeGames',
 'p_doubles',
 'p_earnedRuns',
 'p_gamesFinished',
 'p_gamesPitched',
 'p_gamesPlayed',
 'p_gamesStarted',
 'p_groundIntoDoublePlay',
 'p_groundOuts

In [24]:
# Remove the identifier/key columns before exporting.
# NOTE(review): this acts on `merged_data`, not the null-filtered
# `merged_data_2` — confirm that is intended given the cells in between.
merged_data = merged_data.drop(
    ['batter_id', 'pitcher_id', 'season'], axis='columns'
)

In [25]:
# Write the merged frame to CSV without the row index.
merged_data.to_csv(path_or_buf='all_plays_merged_data.csv', index=False)