In [114]:
import numpy as np
import pandas as pd
from pybaseball import schedule_and_record


# Feature Engineering

In [77]:
batting_data = pd.read_parquet("data/batting_full.parquet.gz")
pitching_data = pd.read_parquet("data/pitching_full.parquet.gz")

## Batting

In [None]:
batting_cols = [
    # Meta
    "IDfg",
    "Season",
    "Team",
    "Age",
    # Normalization
    "AB",
    # Stats
    "OPS",  # TODO: how to get OPS+?
    "WAR",
    "OBP+",
    "2B",
    "3B",
    "HR",
    "CS",
    "SB",
    "SO",
    # Left-handed?
]
batting_cleaned = batting_data[batting_cols].dropna()
batting_cleaned = batting_cleaned[batting_cleaned["AB"] > 0]
# Normalize for at-bats
for col in ["OPS", "2B", "3B", "HR", "SO"]:
    batting_cleaned[col] /= batting_cleaned["AB"]
batting_cleaned["CS:SB"] = batting_cleaned["CS"] / batting_cleaned["SB"]

batting_cleaned.drop(columns=["CS", "SB", "AB"], inplace=True)

In [94]:
batting_cleaned.head()

Unnamed: 0,IDfg,Season,Team,Age,OPS,WAR,OBP+,2B,3B,HR,SO,CS:SB
0,1008559,1998,STL,34,0.002401,8.5,138.0,0.041257,0.0,0.137525,0.304519,0.0
1,1109,1998,SFG,33,0.001897,8.5,129.0,0.07971,0.012681,0.067029,0.166667,0.428571
2,1093,1998,NYM,29,0.001792,8.1,132.0,0.064632,0.007181,0.039497,0.131059,1.0
3,1274,1998,SEA,22,0.00134,7.9,106.0,0.05102,0.007289,0.061224,0.176385,0.282609
4,190,1998,BOS,24,0.001566,7.3,106.0,0.061258,0.013245,0.057947,0.102649,0.5


In [105]:
# Save
batting_cleaned.to_parquet("data/batting_cleaned.parquet.gz", compression="gzip")

## Pitching

In [102]:
pitching_cols = [
    # Meta
    "IDfg",
    "Season",
    "Team",
    "Age",
    # Normalization
    "TBF",
    # Stats
    "ERA-",
    "H",
    "HR",
    "BB",
    "SO",
]

pitching_cleaned = pitching_data[pitching_cols].dropna()
pitching_cleaned = pitching_cleaned[pitching_cleaned["TBF"] > 0]
# Normalize for batters faced
for col in ["H", "HR", "BB", "SO"]:
    pitching_cleaned[col] /= pitching_cleaned["TBF"]
pitching_cleaned.drop(columns=["TBF"], inplace=True)

In [103]:
pitching_cleaned.head()

Unnamed: 0,IDfg,Season,Team,Age,ERA-,H,HR,BB,SO
0,642,1998,SDP,33,60,0.218023,0.007752,0.047481,0.249031
1,73,1998,PHI,31,76,0.216713,0.02112,0.056015,0.275482
2,815,1998,TOR,35,57,0.175858,0.011446,0.091571,0.281998
3,60,1998,- - -,34,72,0.200197,0.022682,0.084813,0.324458
4,104,1998,ATL,32,53,0.203647,0.013171,0.045593,0.206687


In [104]:
#  Save
pitching_cleaned.to_parquet("data/pitching_cleaned.parquet.gz", compression="gzip")

## Schedule/Record

In [None]:
# All 2024 Teams
teams = [
    "NYY",
    "KCR",
    "LAD",
    "BAL",
    "NYM",
    "BOS",
    "CLE",
    "CIN",
    "ARI",
    "TOR",
    "SFG",
    "MIL",
    "SEA",
    "HOU",
    "SDP",
    "PHI",
    "OAK",
    "ATL",
    "TEX",
    "MIN",
    "CHC",
    "DET",
    "COL",
    "STL",
    "PIT",
    "LAA",
    "WSN",
    "MIA",
    "TBR",
    "CHW",
]

In [115]:
data = schedule_and_record(2008, 'NYY')

http://www.baseball-reference.com/teams/NYY/2008-schedule-scores.shtml


In [119]:
data.head()

Unnamed: 0,Date,Tm,Home_Away,Opp,W/L,R,RA,Inn,W-L,Rank,GB,Win,Loss,Save,Time,D/N,Attendance,cLI,Streak,Orig. Scheduled
1,"Tuesday, Apr 1",NYY,Home,TOR,W,3.0,2.0,9.0,1-0,1.0,Tied,Wang,Halladay,Rivera,2:31,N,55112.0,1.09,1,2008-03-31 (Rain)
2,"Wednesday, Apr 2",NYY,Home,TOR,L,2.0,5.0,9.0,1-1,2.0,1.0,Burnett,Mussina,Accardo,3:10,N,48544.0,1.11,-1,
3,"Thursday, Apr 3",NYY,Home,TOR,W,3.0,2.0,9.0,2-1,2.0,0.5,Chamberlain,Wolfe,Rivera,2:45,N,47785.0,1.04,1,
4,"Friday, Apr 4",NYY,Home,TBR,L,4.0,13.0,9.0,2-2,4.0,0.5,Sonnanstine,Kennedy,,3:07,N,49255.0,1.11,-1,
5,"Saturday, Apr 5",NYY,Home,TBR,L,3.0,6.0,9.0,2-3,5.0,1.5,Jackson,Pettitte,Percival,3:05,D,52247.0,1.05,-2,


In [122]:
batting_cleaned[batting_cleaned.Season == 2024].Team.unique()

array(['NYY', 'KCR', 'LAD', 'BAL', 'NYM', 'BOS', 'CLE', 'CIN', 'ARI',
       'TOR', 'SFG', 'MIL', 'SEA', 'HOU', 'SDP', 'PHI', 'OAK', 'ATL',
       'TEX', 'MIN', 'CHC', 'DET', '- - -', 'COL', 'STL', 'PIT', 'LAA',
       'WSN', 'MIA', 'TBR', 'CHW'], dtype=object)

In [120]:
batting_cleaned.head()

Unnamed: 0,IDfg,Season,Team,Age,OPS,WAR,OBP+,2B,3B,HR,SO,CS:SB
0,1008559,1998,STL,34,0.002401,8.5,138.0,0.041257,0.0,0.137525,0.304519,0.0
1,1109,1998,SFG,33,0.001897,8.5,129.0,0.07971,0.012681,0.067029,0.166667,0.428571
2,1093,1998,NYM,29,0.001792,8.1,132.0,0.064632,0.007181,0.039497,0.131059,1.0
3,1274,1998,SEA,22,0.00134,7.9,106.0,0.05102,0.007289,0.061224,0.176385,0.282609
4,190,1998,BOS,24,0.001566,7.3,106.0,0.061258,0.013245,0.057947,0.102649,0.5
