In [1]:
import pandas as pd
import numpy as np

pd.set_option('display.max_columns',1000)
pd.set_option('display.max_rows',1000)

In [2]:
game_df = pd.read_csv("data/team_data.csv")
df = pd.read_csv("../data fetch/data/pitchers_player_2024.csv")

In [3]:
print(game_df.shape)
print(df.shape)

(4938, 10)
(20950, 28)


In [4]:
# Avoid division by zero
df['inningsPitched'] = df['inningsPitched'].replace(0, np.nan)

# ERA: Earned Run Average
df['pitching_player_ERA'] = (df['earnedRuns'] / df['inningsPitched']) * 9

# WHIP: Walks + Hits per Inning Pitched
df['pitching_player_WHIP'] = (df['baseOnBalls'] + df['hits']) / df['inningsPitched']

# K/9, BB/9, HR/9
df['pitching_player_K_per9'] = (df['strikeOuts'] / df['inningsPitched']) * 9
df['pitching_player_BB_per9'] = (df['baseOnBalls'] / df['inningsPitched']) * 9
df['pitching_player_HR_per9'] = (df['homeRuns'] / df['inningsPitched']) * 9

# K/BB ratio
df['pitching_player_K_BB_ratio'] = df['strikeOuts'] / df['baseOnBalls'].replace(0, np.nan)

# Pitch efficiency
df['pitching_player_pitches_per_inning'] = df['numberOfPitches'] / df['inningsPitched']
df['pitching_player_strike_to_ball_ratio'] = df['strikes'] / df['balls'].replace(0, np.nan)
df['pitching_player_ground_to_fly_ratio'] = df['groundOuts'] / df['flyOuts'].replace(0, np.nan)

### aggregated

In [5]:
df = df.groupby(['gamePk', 'team_id']).agg({
    'pitching_player_ERA': 'mean',
    'pitching_player_WHIP': 'mean',
    'pitching_player_K_per9': 'mean',
    'pitching_player_BB_per9': 'mean',
    'pitching_player_HR_per9': 'mean',
    'pitching_player_K_BB_ratio': 'mean',
    'pitching_player_pitches_per_inning': 'mean',
    'pitching_player_strike_to_ball_ratio': 'mean',
    'pitching_player_ground_to_fly_ratio': 'mean'
}).reset_index()

In [6]:
# Merge batting_team ka team_side into batting_player
df = df.merge(
    game_df[['gamePk', 'team_id', 'team_side']],
    on=['gamePk', 'team_id'],
    how='left'
)

In [7]:
# Home team data
pitching_home = (
    df[df['team_side']=='home']
    .drop(columns=['team_side'])
    .add_prefix('home_')
    .rename(columns={'home_gamePk':'gamePk'})
)

# Away team data
pitching_away = (
    df[df['team_side']=='away']
    .drop(columns=['team_side'])
    .add_prefix('away_')
    .rename(columns={'away_gamePk':'gamePk'})
)

# Merge home and away into wide format
df = pd.merge(pitching_home, pitching_away, on='gamePk', how='inner')

In [8]:
df.shape

(2550, 21)

In [9]:
df.to_csv("data/pitchers_player_clean.csv", index = False)