In [5]:
import polars as pl

In [6]:
pitching = pl.read_csv("./data/lahman_1871-2024_csv/pitching.csv")
pitching.head()

playerID,yearID,stint,teamID,lgID,W,L,G,GS,CG,SHO,SV,IPouts,H,ER,HR,BB,SO,BAOpp,ERA,IBB,WP,HBP,BK,BFP,GF,R,SH,SF,GIDP
str,i64,i64,str,str,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,f64,f64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64
"""aardsda01""",2004,1,"""SFN""","""NL""",1,0,11,0,0,0,0,32,20,8,1,10,5,0.417,6.75,0,0,2,0,61,5,8,0,1,1
"""aardsda01""",2006,1,"""CHN""","""NL""",3,0,45,0,0,0,0,159,41,24,9,28,49,0.214,4.08,0,1,1,0,225,9,25,1,3,2
"""aardsda01""",2007,1,"""CHA""","""AL""",2,1,25,0,0,0,0,97,39,23,4,17,36,0.3,6.4,3,2,1,0,151,7,24,2,1,1
"""aardsda01""",2008,1,"""BOS""","""AL""",4,2,47,0,0,0,0,146,49,30,4,35,49,0.268,5.55,2,3,5,0,228,7,32,3,2,4
"""aardsda01""",2009,1,"""SEA""","""AL""",3,6,73,0,0,0,38,214,49,20,4,34,80,0.19,2.52,3,2,0,0,296,53,23,2,1,2


In [7]:
league_ERA = (
    pitching
    .filter(pl.col("yearID") >= 1946)
    .group_by(["yearID", "lgID"])
    .agg([
        pl.col("ER").sum().alias("total_ER"),
        pl.col("IPouts").sum().alias("total_IPouts")
    ])
    .with_columns(
        ((9 * pl.col("total_ER")) / (pl.col("total_IPouts") * 3)).alias("league_ERA")
    )
)

print("Current league_ERA structure:")
print(league_ERA.head())
print(f"Columns: {league_ERA.columns}")


Current league_ERA structure:
shape: (5, 5)
┌────────┬──────┬──────────┬──────────────┬────────────┐
│ yearID ┆ lgID ┆ total_ER ┆ total_IPouts ┆ league_ERA │
│ ---    ┆ ---  ┆ ---      ┆ ---          ┆ ---        │
│ i64    ┆ str  ┆ i64      ┆ i64          ┆ f64        │
╞════════╪══════╪══════════╪══════════════╪════════════╡
│ 2014   ┆ NL   ┆ 8878     ┆ 65445        ┆ 0.406968   │
│ 1979   ┆ NL   ┆ 7243     ┆ 52280        ┆ 0.415627   │
│ 1967   ┆ NL   ┆ 5470     ┆ 43748        ┆ 0.375103   │
│ 1993   ┆ AL   ┆ 9741     ┆ 60667        ┆ 0.481695   │
│ 2000   ┆ AL   ┆ 11014    ┆ 60423        ┆ 0.546845   │
└────────┴──────┴──────────┴──────────────┴────────────┘
Columns: ['yearID', 'lgID', 'total_ER', 'total_IPouts', 'league_ERA']


In [18]:
def assign_dh_state(year, league):
    if year < 1973:
        return "No DH (Both Leagues)"
    elif year >= 2022:
        return "Universal DH"
    elif league == "AL":
        return "AL with DH"
    elif league == "NL":
        return "NL without DH"
    else:
        return "Unknown"

league_ERA_with_DH = league_ERA.with_columns(
    pl.struct(["yearID", "lgID"]).map_elements(
        lambda row: assign_dh_state(row["yearID"], row["lgID"])
    ).alias("DH_state")
)

print("\nLeague ERA with DH states:")
print(league_ERA_with_DH.head(10))

print("\nUnique DH states:")
print(league_ERA_with_DH.select("DH_state").unique().sort("DH_state"))
league_ERA_with_DH.write_csv("./league_ERA_analysis.csv")
print("\nData saved to ./league_ERA_analysis.csv")
print("\nSample of saved data (sorted by year and league):")
print(league_ERA_with_DH.sort(["yearID", "lgID"]).head(15))

league_ERA.head()


League ERA with DH states:
shape: (10, 6)
┌────────┬──────┬──────────┬──────────────┬────────────┬──────────────────────┐
│ yearID ┆ lgID ┆ total_ER ┆ total_IPouts ┆ league_ERA ┆ DH_state             │
│ ---    ┆ ---  ┆ ---      ┆ ---          ┆ ---        ┆ ---                  │
│ i64    ┆ str  ┆ i64      ┆ i64          ┆ f64        ┆ str                  │
╞════════╪══════╪══════════╪══════════════╪════════════╪══════════════════════╡
│ 2014   ┆ NL   ┆ 8878     ┆ 65445        ┆ 0.406968   ┆ NL without DH        │
│ 1979   ┆ NL   ┆ 7243     ┆ 52280        ┆ 0.415627   ┆ NL without DH        │
│ 1967   ┆ NL   ┆ 5470     ┆ 43748        ┆ 0.375103   ┆ No DH (Both Leagues) │
│ 1993   ┆ AL   ┆ 9741     ┆ 60667        ┆ 0.481695   ┆ AL with DH           │
│ 2000   ┆ AL   ┆ 11014    ┆ 60423        ┆ 0.546845   ┆ AL with DH           │
│ 1948   ┆ NL   ┆ 4814     ┆ 32867        ┆ 0.439407   ┆ No DH (Both Leagues) │
│ 1948   ┆ AL   ┆ 5223     ┆ 32929        ┆ 0.475842   ┆ No DH (Both Leagues)

  league_ERA_with_DH = league_ERA.with_columns(


yearID,lgID,total_ER,total_IPouts,league_ERA
i64,str,i64,i64,f64
2014,"""NL""",8878,65445,0.406968
1979,"""NL""",7243,52280,0.415627
1967,"""NL""",5470,43748,0.375103
1993,"""AL""",9741,60667,0.481695
2000,"""AL""",11014,60423,0.546845
