In [1]:
import pandas as pd
import numpy as np

pd.set_option('display.max_columns',1000)
pd.set_option('display.max_rows',1000)

In [2]:
df = pd.read_csv("../data fetch/data/pitching_team_2024.csv")
df.shape

(4932, 22)

In [3]:
# Example: Pitching rolling averages (last 5 games)

# Innings pitched last 5 games
df['ip_last_5'] = df.groupby('team_id')['inningsPitched'] \
    .transform(lambda x: x.rolling(5, min_periods=1).mean().shift(1))

# Runs allowed last 5 games
df['runs_allowed_last_5'] = df.groupby('team_id')['runs'] \
    .transform(lambda x: x.rolling(5, min_periods=1).mean().shift(1))

# Earned runs allowed last 5 games
df['er_last_5'] = df.groupby('team_id')['earnedRuns'] \
    .transform(lambda x: x.rolling(5, min_periods=1).mean().shift(1))

# Strikeouts last 5 games
df['so_last_5'] = df.groupby('team_id')['strikeOuts'] \
    .transform(lambda x: x.rolling(5, min_periods=1).mean().shift(1))

# Walks last 5 games
df['bb_last_5'] = df.groupby('team_id')['baseOnBalls'] \
    .transform(lambda x: x.rolling(5, min_periods=1).mean().shift(1))

# Home runs allowed last 5 games
df['hr_allowed_last_5'] = df.groupby('team_id')['homeRuns'] \
    .transform(lambda x: x.rolling(5, min_periods=1).mean().shift(1))

# Fill missing values
df = df.fillna({
    'ip_last_5': df['inningsPitched'].mean(),
    'runs_allowed_last_5': 0,
    'er_last_5': 0,
    'so_last_5': 0,
    'bb_last_5': 0,
    'hr_allowed_last_5': 0
})

In [4]:
df.drop(columns = ['inningsPitched', 'outs', 'runs', 'earnedRuns', 'hits', 'homeRuns', 'strikeOuts', 'baseOnBalls', 
                   'hitBatsmen', 'numberOfPitches', 'balls', 'strikes', 'strikePercentage', 'groundOuts', 'flyOuts', 
                   'completeGames', 'shutouts', 'saveOpportunities', 'team_id'], inplace = True)

In [5]:
# Home team
pitching_home = (
    df[df['team_side'] == 'home']
    .drop(columns=['team_side'])
    .add_prefix('home_')
    .rename(columns={'home_gamePk': 'gamePk'})
)

# Away team
pitching_away = (
    df[df['team_side'] == 'away']
    .drop(columns=['team_side'])
    .add_prefix('away_')
    .rename(columns={'away_gamePk': 'gamePk'})
)

# Merge
df = pd.merge(pitching_home, pitching_away, on='gamePk', how='inner')

In [6]:
df.shape

(2542, 15)

In [7]:
df.to_csv("data/pitching_team_clean.csv", index = False)