<a href="https://colab.research.google.com/github/tracyhua2/SYS3034-BaseballCase/blob/main/Code/BaseballDraft.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [22]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Raw-content GitHub URL of the player_data.csv file to load.
player_url = "https://raw.githubusercontent.com/tracyhua2/SYS3034-BaseballCase/refs/heads/main/player_data.csv"

# Download and parse the CSV into a DataFrame (requires network access).
player_data = pd.read_csv(player_url)

# Notebook preview: as the last expression in the cell, this displays the
# first rows of the frame as a quick sanity check of the load.
player_data.head()


Unnamed: 0,Player,Player #,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,BA,OBP,SLG
0,6125000,1,688,113,204,37,19,16,68,56,15,66,0.297,0.358,0.475
1,18000000,2,686,103,213,20,7,6,42,43,4,51,0.31,0.361,0.386
2,4000000,3,661,93,186,33,1,8,57,19,6,56,0.281,0.334,0.371
3,1750000,4,653,118,213,54,2,17,83,20,1,50,0.326,0.376,0.493
4,13054526,5,645,102,183,36,2,12,82,10,0,55,0.284,0.339,0.402


In [None]:
# Selected Target Metrics based on Standardized and Normalized Weights
# data_prep() prepends these column names to its own feature list when it
# selects columns; the list is currently empty.
target_metrics = []

In [None]:
# Data Preparation Function
def data_prep(df):
    """Return a prepared copy of a batter-season frame for modelling.

    The input is never modified. The returned frame contains the columns
    named in the module-level ``target_metrics`` list followed by the
    feature columns listed below, plus these derived columns:

    * ``Date``     - ``"<Season>-03-01"`` as a string.
    * ``COVID``    - 1 for the 2020 season, 0 otherwise.
    * ``SB%``      - ``SB / (SB + CS)`` (NaN when a player has no attempts).
    * ``SB/PA``, ``BsR/PA`` - per-plate-appearance rates.
    * ``<stat>_max`` / ``<stat>_scaled`` for PA, HR, SB, CS, R and RBI -
      the season's highest total and the player's share of it.
    * ``weight``   - square root of ``PA``.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per player-season. Must contain ``Season`` and every
        column in ``target_metrics`` and in the feature list below
        (except ``Date`` and ``COVID``, which are created here).

    Returns
    -------
    pandas.DataFrame
        The prepared frame. The per-season merges give it a fresh
        0..n-1 index; row order follows the input.

    Raises
    ------
    KeyError
        If any required input column is missing.
    """
    feature_columns = [
        # Basic info
        "Player #",
        "Season",
        "Date",
        "COVID",
        "PA",
        # Counting stats (needed for the rates and per-season scaling below)
        "HR",
        "SB",
        "R",
        "RBI",
        # Superficial rates
        "BB%",
        "K%",
        "OBP",
        "SLG",
        "ISO",
        # Expected rates based on batted ball data
        "xBA",
        "Barrel%",
        "maxEV",
        "xSLG",
        "Oppo%",
        "HR/FB",
        "IFFB%",
        "xwOBA",
        # Plate discipline
        "CStr%",
        "SwStr%",
        "O-Swing% (pi)",
        "Z-Swing% (pi)",
        "O-Contact% (pi)",
        "Z-Contact% (pi)",
        "Zone% (pi)",
        # Speed + position adjustment
        "CS",
        "BsR",
        "Pos",
    ]
    # Columns this function creates itself, so they are not required on input.
    derived_columns = {"Date", "COVID"}

    # dict.fromkeys de-duplicates while keeping order: a metric listed in
    # both target_metrics and feature_columns must not yield two columns.
    selected = list(dict.fromkeys(list(target_metrics) + feature_columns))

    # Fail early with one message naming every absent column.
    missing = [
        col
        for col in selected
        if col not in derived_columns and col not in df.columns
    ]
    if missing:
        raise KeyError(
            f"data_prep: input is missing required column(s): {missing}"
        )

    df = df.copy(deep=True)
    df["Date"] = df["Season"].astype(str) + "-03-01"
    # create COVID indicator to help models learn around the shortened season
    df["COVID"] = (df["Season"] == 2020).astype(int)

    # Explicit copy so the assignments below write to an independent frame
    # rather than to a slice of the one above.
    df = df[selected].copy()

    df["SB%"] = df["SB"] / (df["SB"] + df["CS"])
    df["SB/PA"] = df["SB"] / df["PA"]
    df["BsR/PA"] = df["BsR"] / df["PA"]

    # Scale counting stats (e.g., PA, HR, etc.) to the season's highest total
    # to better adjust for COVID-shortened season
    counting_stats = ["PA", "HR", "SB", "CS", "R", "RBI"]
    for stat in counting_stats:
        season_max = (
            df.groupby("Season", as_index=False)[stat]
            .max()
            .rename(columns={stat: f"{stat}_max"})
        )
        df = pd.merge(df, season_max, how="left", on="Season")
        df[f"{stat}_scaled"] = df[stat] / df[f"{stat}_max"]

    df["weight"] = np.sqrt(df["PA"])

    return df


# Replace the raw training frame with its prepared version.
# NOTE(review): batters_training is not defined in the visible part of this
# notebook — confirm it is loaded before this cell runs.
batters_training = data_prep(batters_training)