In [209]:
import numpy as np
import pandas as pd
from functools import cache

# Feature Engineering

In [210]:
batting_data = pd.read_parquet("data/batting_full.parquet.gz")
pitching_data = pd.read_parquet("data/pitching_full.parquet.gz")

In [211]:
# All 2024 Teams
teams = [
    "NYY",
    "KCR",
    "LAD",
    "BAL",
    "NYM",
    "BOS",
    "CLE",
    "CIN",
    "ARI",
    "TOR",
    "SFG",
    "MIL",
    "SEA",
    "HOU",
    "SDP",
    "PHI",
    "OAK",
    "ATL",
    "TEX",
    "MIN",
    "CHC",
    "DET",
    "COL",
    "STL",
    "PIT",
    "LAA",
    "WSN",
    "MIA",
    "TBR",
    "CHW",
]

In [212]:
# Drop teams that don't exist in 2024
batting_data = batting_data[batting_data["Team"].isin(teams)]
pitching_data = pitching_data[pitching_data["Team"].isin(teams)]

## Batting

In [213]:
batting_cols = [
    # Meta
    "IDfg",
    "Season",
    "Team",
    "Age",
    # Normalization
    "AB",
    # Stats
    "OPS",  # TODO: how to get OPS+?
    "WAR",
    "OBP+",
    "2B",
    "3B",
    "HR",
    "CS",
    "SB",
    "SO",
    # Left-handed?
]
batting_cleaned = batting_data[batting_cols].dropna()
batting_cleaned = batting_cleaned[batting_cleaned["AB"] > 0]
# Normalize for at-bats
for col in ["OPS", "2B", "3B", "HR", "SO"]:
    batting_cleaned[col] /= batting_cleaned["AB"]
batting_cleaned["CS:SB"] = batting_cleaned["CS"] / batting_cleaned["SB"]
batting_cleaned.drop(columns=["CS", "SB"], inplace=True)
batting_cleaned.rename(columns={"AB": "weight"}, inplace=True)

In [214]:
batting_cleaned.head()

Unnamed: 0,IDfg,Season,Team,Age,weight,OPS,WAR,OBP+,2B,3B,HR,SO,CS:SB
0,1008559,1998,STL,34,509,0.002401,8.5,138.0,0.041257,0.0,0.137525,0.304519,0.0
1,1109,1998,SFG,33,552,0.001897,8.5,129.0,0.07971,0.012681,0.067029,0.166667,0.428571
2,1093,1998,NYM,29,557,0.001792,8.1,132.0,0.064632,0.007181,0.039497,0.131059,1.0
3,1274,1998,SEA,22,686,0.00134,7.9,106.0,0.05102,0.007289,0.061224,0.176385,0.282609
4,190,1998,BOS,24,604,0.001566,7.3,106.0,0.061258,0.013245,0.057947,0.102649,0.5


In [215]:
# Save
batting_cleaned.to_parquet("data/batting_cleaned.parquet.gz", compression="gzip")

## Pitching

In [216]:
pitching_cols = [
    # Meta
    "IDfg",
    "Season",
    "Team",
    "Age",
    # Normalization
    "TBF",
    # Stats
    "ERA-",
    "H",
    "HR",
    "BB",
    "SO",
]

pitching_cleaned = pitching_data[pitching_cols].dropna()
pitching_cleaned = pitching_cleaned[pitching_cleaned["TBF"] > 0]
# Normalize for batters faced
for col in ["H", "HR", "BB", "SO"]:
    pitching_cleaned[col] /= pitching_cleaned["TBF"]
pitching_cleaned.rename(columns={"TBF": "weight"}, inplace=True)

In [217]:
pitching_cleaned.head()

Unnamed: 0,IDfg,Season,Team,Age,weight,ERA-,H,HR,BB,SO
0,642,1998,SDP,33,1032,60,0.218023,0.007752,0.047481,0.249031
1,73,1998,PHI,31,1089,76,0.216713,0.02112,0.056015,0.275482
2,815,1998,TOR,35,961,57,0.175858,0.011446,0.091571,0.281998
4,104,1998,ATL,32,987,53,0.203647,0.013171,0.045593,0.206687
5,200,1998,BOS,26,951,61,0.197687,0.02734,0.070452,0.263933


In [218]:
#  Save
pitching_cleaned.to_parquet("data/pitching_cleaned.parquet.gz", compression="gzip")

## Schedule/Record

In [219]:
schedules = pd.read_parquet("data/schedules_full.parquet.gz")

In [220]:
schedules.head()

Unnamed: 0,Date,Tm,Home_Away,Opp,W/L,R,RA,Inn,W-L,Rank,GB,Win,Loss,Save,Time,D/N,Attendance,cLI,Streak,Orig. Scheduled,Season
0,"Thursday, Mar 28",NYY,@,HOU,W,5.0,4.0,9.0,1-0,1.0,Tied,Loáisiga,Pressly,Holmes,2:41,D,42642.0,0.93,1.0,,2024
1,"Friday, Mar 29",NYY,@,HOU,W,7.0,1.0,9.0,2-0,1.0,up 0.5,Weaver,Scott,,3:17,N,41583.0,0.99,2.0,,2024
2,"Saturday, Mar 30",NYY,@,HOU,W,5.0,3.0,9.0,3-0,1.0,up 0.5,Stroman,Abreu,Holmes,2:54,N,41247.0,1.05,3.0,,2024
3,"Sunday, Mar 31",NYY,@,HOU,W,4.0,3.0,9.0,4-0,1.0,up 1.5,Burdi,Hader,Holmes,2:56,D,36908.0,1.06,4.0,,2024
4,"Monday, Apr 1",NYY,@,ARI,W,5.0,2.0,9.0,5-0,1.0,up 1.5,Weaver,Nelson,González,2:59,N,38608.0,1.04,5.0,,2024


In [221]:
schedules_cleaned = schedules[["Tm", "Opp", "W/L", "D/N", "Home_Away", "Season"]]
# Drop away games
schedules_cleaned = schedules_cleaned[schedules_cleaned["Home_Away"] == "Home"]
# Rename columns
schedules_cleaned.rename(columns={"Tm": "HomeTeam", "Opp": "AwayTeam"}, inplace=True)
# Add dummies
schedules_cleaned["Day"] = schedules_cleaned["D/N"].apply(
    lambda x: 1 if x == "D" else 0
)
schedules_cleaned["Win"] = schedules_cleaned["W/L"].apply(
    lambda x: 1 if x == "W" else 0
)
# Drop old columns
schedules_cleaned.drop(columns=["W/L", "D/N", "Home_Away"], inplace=True)


In [222]:
schedules_cleaned.head()

Unnamed: 0,HomeTeam,AwayTeam,Season,Day,Win
7,NYY,TOR,2024,1,0
8,NYY,TOR,2024,0,1
9,NYY,TOR,2024,1,1
10,NYY,MIA,2024,0,1
11,NYY,MIA,2024,0,1


In [223]:
# Save
schedules_cleaned.to_parquet("data/schedules_cleaned.parquet.gz", compression="gzip")

## Add Player Features to Schedules

In [224]:
schedules_cleaned.head()

Unnamed: 0,HomeTeam,AwayTeam,Season,Day,Win
7,NYY,TOR,2024,1,0
8,NYY,TOR,2024,0,1
9,NYY,TOR,2024,1,1
10,NYY,MIA,2024,0,1
11,NYY,MIA,2024,0,1


In [228]:
@cache
def get_players(Team, Season):
    batting = batting_cleaned[
        (batting_cleaned["Team"] == Team) & (batting_cleaned["Season"] == Season)
    ]["IDfg"].unique()
    pitching = pitching_cleaned[
        (pitching_cleaned["Team"] == Team) & (pitching_cleaned["Season"] == Season)
    ]["IDfg"].unique()
    return batting, pitching

In [None]:
@cache
def get_team_stats(Team, Season):
    batting_players, pitching_players = get_players(Team, Season)

    prior_season_batting = []
    batting_rookies = 0
    for player in batting_players:
        player_data = batting_cleaned[
            (batting_cleaned["IDfg"] == player)
            & (batting_cleaned["Team"] == Team)
            & (batting_cleaned["Season"] == Season - 1)
        ]
        if len(player_data) == 0:
            batting_rookies += 1
        else:
            prior_season_batting.append(player_data)

    prior_season_pitching = []
    pitching_rookies = 0
    for player in pitching_players:
        player_data = pitching_cleaned[
            (pitching_cleaned["IDfg"] == player)
            & (pitching_cleaned["Team"] == Team)
            & (pitching_cleaned["Season"] == Season - 1)
        ]
        if len(player_data) == 0:
            pitching_rookies += 1
        else:
            prior_season_pitching.append(player_data)
    
    


In [230]:
get_players("NYY", 2009)

(array([ 826, 1281, 1274, 3269,  185, 4599, 1659, 9927, 4022,  841,   25,
        8841, 5275, 7783,  840,  844,  404,  512, 2162, 2692, 5164, 5535,
        1658, 1593, 2074, 3620, 1126]),
 array([ 404,  840,  512, 7450,  844, 2692, 5164, 8241, 5535, 2074, 1929,
        1773, 4599, 1489, 4264,  247, 6986, 9948, 3521, 4830]))