In [21]:
import pandas as pd
from itertools import combinations

# Raw CSV locations for the player-level and team-level tables.
player_url = "https://raw.githubusercontent.com/tracyhua2/SYS3034-BaseballCase/refs/heads/main/Data/player_data.csv"
team_url = "https://raw.githubusercontent.com/tracyhua2/SYS3034-BaseballCase/refs/heads/main/Data/team_performance.csv"

# Download both files and parse them into DataFrames (player table first).
player_data, team_data = (pd.read_csv(url) for url in (player_url, team_url))

In [22]:
# Show the column labels available in the team performance table.
team_data.columns

Index(['win_percentage', 'team_rank', 'SLG', 'OBP', 'RBI', 'OPS'], dtype='object')

In [23]:
# Preview the first rows of the team performance table.
team_data.head()

Unnamed: 0,win_percentage,team_rank,SLG,OBP,RBI,OPS
0,0.364198,28,0.373,0.323,608,0.696
1,0.376543,26,0.389,0.318,631,0.707
2,0.388889,30,0.39,0.317,615,0.707
3,0.416149,19,0.403,0.32,705,0.723
4,0.417178,11,0.429,0.333,750,0.762


In [24]:
# Derive a working frame from player_data (the raw frame is left unmodified).
# The 'Player' column is read as text containing commas; strip the commas and
# cast the result to float.
# NOTE(review): the converted values look like dollar amounts rather than
# player names - confirm what this column represents.
player_df = player_data.assign(
    Player=lambda frame: frame['Player'].str.replace(',', '').astype(float)
)

player_df.head()

Unnamed: 0,Player,Player #,AB,R,H,1B,2B,3B,HR,RBI,SB,CS,BB,BA,OBP,SLG,OPS,TB,final_score
0,6125000.0,1,688,113,204,132,37,19,16,68,56,15,66,0.297,0.358,0.475,0.833,327,0.676931
1,18000000.0,2,686,103,213,180,20,7,6,42,43,4,51,0.31,0.361,0.386,0.747,265,0.49357
2,4000000.0,3,661,93,186,144,33,1,8,57,19,6,56,0.281,0.334,0.371,0.705,245,0.380853
3,1750000.0,4,653,118,213,140,54,2,17,83,20,1,50,0.326,0.376,0.493,0.869,322,0.688431
4,13054526.0,5,645,102,183,133,36,2,12,82,10,0,55,0.284,0.339,0.402,0.741,259,0.428782


Predicted Win % Based on Scaled Weights (Not Using the Pythagorean Expectation Formula for Win Percentage)


In [25]:
def calculate_win_percentage(player, team_win_pct):
    """Add a player's weighted stat total to a team's base win percentage.

    Args:
        player: A pandas Series holding one player's stats. Only the weighted
            stats whose labels appear in ``player.index`` are used; missing
            ones are skipped.
        team_win_pct: The team's base win percentage.

    Returns:
        ``team_win_pct`` plus the sum of ``player[stat] * weight`` over every
        weighted stat present on the player.

    NOTE(review): the weights below sum to ~1.0, which suggests they were
    derived for normalized stats. They are applied to the stat values as
    given, so raw counting stats (AB, H, TB, ...) produce results far above
    1.0 - confirm whether inputs should be scaled first.
    """
    # Per-stat weights; iteration order is kept fixed so the running sum is
    # accumulated in the same sequence every time.
    stat_weights = {
        'AB': 0.026580,
        'R': 0.100512,
        'H': 0.072376,
        '2B': 0.026418,
        '3B': 0.006667,
        'HR': 0.073431,
        'TB': 0.093896,
        'RBI': 0.064800,
        'BA': 0.064209,
        'OBP': 0.105842,
        'SLG': 0.114382,
        'OPS': 0.128888,
        'BB': 0.037556,
        'CS': 0.042222,
        'SB': 0.042222
    }

    total = 0
    for name in stat_weights:
        # Skip any stat the player record does not carry.
        if name not in player.index:
            continue
        total += player[name] * stat_weights[name]

    return team_win_pct + total

# Rank players by final_score (highest first) and keep the ten best.
# player_df is used rather than player_data so the converted 'Player' column
# is available on each row.
ranked_players = player_df.sort_values('final_score', ascending=False)
top_10 = ranked_players.head(10)

# Base win percentage of the team whose team_rank is 26.
is_rank_26 = team_data['team_rank'] == 26
team_win = team_data.loc[is_rank_26, 'win_percentage'].values[0]

# Print the predicted value obtained by adding each top player to that team.
for _, player in top_10.iterrows():
    predicted_win = calculate_win_percentage(player, team_win)
    print(f"Player {int(player['Player #'])}: Team Predicted Win = {predicted_win:.3f}")

Player 13: Team Predicted Win = 89.812
Player 41: Team Predicted Win = 84.640
Player 49: Team Predicted Win = 83.786
Player 4: Team Predicted Win = 86.305
Player 26: Team Predicted Win = 84.785
Player 1: Team Predicted Win = 87.871
Player 14: Team Predicted Win = 86.437
Player 37: Team Predicted Win = 80.487
Player 28: Team Predicted Win = 84.238
Player 31: Team Predicted Win = 83.106


Predicted Win % Based on Simulated Runs Scored and Runs Allowed (Pythagorean Formula for Calculating Win Percentage): https://www.samford.edu/sports-analytics/fans/2022/MLB-Winning-Percentage-Breakdown-Which-Statistics-Help-Teams-Win-More-Games


In [26]:
def simulate_team_runs(player_stats, base_team_runs):
    """Estimate team runs scored after adding one player's weighted offense.

    Args:
        player_stats: Mapping-like record of a player's stats (a dict or a
            pandas Series). Weighted stats that are absent are ignored.
        base_team_runs: The team's runs scored before adding the player.

    Returns:
        ``base_team_runs`` plus the sum of ``player_stats[stat] * weight``
        over the weighted offensive stats found in ``player_stats``.
    """
    # Weights for the offensive stats treated as run producers.
    run_weights = {
        'R': 0.100512,
        'H': 0.072376,
        'HR': 0.073431,
        'RBI': 0.064800,
        'OBP': 0.105842,
        'SLG': 0.114382
    }

    # Keep only the weighted stats this player actually has, in weight order.
    usable = [(name, coeff) for name, coeff in run_weights.items() if name in player_stats]

    extra_runs = 0
    for name, coeff in usable:
        extra_runs = extra_runs + player_stats[name] * coeff

    return base_team_runs + extra_runs

def predict_win_percentage(simulated_runs, runs_allowed, exponent=2):
    """Estimate win percentage with the Pythagorean expectation formula.

    Computes ``RS**k / (RS**k + RA**k)`` where ``RS`` is runs scored,
    ``RA`` is runs allowed and ``k`` is ``exponent``.

    Args:
        simulated_runs: Runs scored by the team.
        runs_allowed: Runs allowed by the team.
        exponent: Power applied to both run totals. Defaults to 2, the
            classic form of the formula; other values (e.g. 1.83) can be
            passed to use a refined variant.

    Returns:
        The estimated win percentage as a fraction.

    Raises:
        ZeroDivisionError: If both run totals are zero (plain Python numbers),
            since the denominator is then zero.
    """
    # Compute each power once and reuse it in numerator and denominator.
    scored_term = simulated_runs ** exponent
    allowed_term = runs_allowed ** exponent
    return scored_term / (scored_term + allowed_term)


In [27]:
# Get base team stats (Team 26 - Seattle).
# NOTE(review): these totals are hard-coded rather than read from team_data -
# confirm their source.
base_runs_scored = 671
base_runs_allowed = 811

# The team's win percentage is the same for every player, so look it up once
# instead of re-querying team_data on each loop iteration.
team_win_pct = team_data.loc[team_data['team_rank']==26,'win_percentage'].values[0]

# For each highlighted player, simulate runs scored with the player added and
# convert that total into a Pythagorean win percentage.
for _, player in top_10.iterrows():
    # Simulate new runs scored with player
    new_runs = simulate_team_runs(player, base_runs_scored)

    # Calculate new win percentage
    predicted_win_pct = predict_win_percentage(new_runs, base_runs_allowed)

    print(f"Player {int(player['Player #'])}")
    print(f"Simulated Runs: {new_runs:.1f}")
    print(f"Predicted Win%: {predicted_win_pct:.3f}\n")
    # The gain is the difference, in percentage points, between the predicted
    # value and the team's win percentage from team_data.
    # NOTE(review): the baseline is the recorded win percentage, not the
    # Pythagorean estimate from the base run totals (671/811 gives ~0.406),
    # so part of the reported gain comes from that gap - confirm this is intended.
    print(f"Predicted %Gain Per Player: {(predicted_win_pct-team_win_pct)*100:.2f}%\n")


Player 13
Simulated Runs: 706.8
Predicted Win%: 0.432

Predicted %Gain Per Player: 5.51%

Player 41
Simulated Runs: 703.2
Predicted Win%: 0.429

Predicted %Gain Per Player: 5.27%

Player 49
Simulated Runs: 704.4
Predicted Win%: 0.430

Predicted %Gain Per Player: 5.35%

Player 4
Simulated Runs: 705.0
Predicted Win%: 0.430

Predicted %Gain Per Player: 5.39%

Player 26
Simulated Runs: 704.4
Predicted Win%: 0.430

Predicted %Gain Per Player: 5.35%

Player 1
Simulated Runs: 702.8
Predicted Win%: 0.429

Predicted %Gain Per Player: 5.23%

Player 14
Simulated Runs: 705.5
Predicted Win%: 0.431

Predicted %Gain Per Player: 5.42%

Player 37
Simulated Runs: 702.0
Predicted Win%: 0.428

Predicted %Gain Per Player: 5.18%

Player 28
Simulated Runs: 704.4
Predicted Win%: 0.430

Predicted %Gain Per Player: 5.35%

Player 31
Simulated Runs: 703.3
Predicted Win%: 0.429

Predicted %Gain Per Player: 5.27%

