In [1]:
import pandas as pd
from itertools import combinations

# Raw CSV locations in the SYS3034-BaseballCase GitHub repository.
player_url = "https://raw.githubusercontent.com/tracyhua2/SYS3034-BaseballCase/refs/heads/main/Data/player_data.csv"
team_url = "https://raw.githubusercontent.com/tracyhua2/SYS3034-BaseballCase/refs/heads/main/Data/team_performance.csv"

# Read each CSV into its own DataFrame: the per-player stat table first,
# then the team performance table.
player_data = pd.read_csv(player_url)
team_data = pd.read_csv(team_url)

In [3]:
# Build a working copy of player_data in which the 'Player' column has its
# commas stripped and is converted from string to float. player_data itself
# is left unmodified, so this cell can be re-run safely.
player_df = player_data.assign(
    Player=lambda frame: frame['Player'].str.replace(',', '').astype(float)
)

player_df.head()

Unnamed: 0,Player,Player #,AB,R,H,1B,2B,3B,HR,RBI,SB,CS,BB,BA,OBP,SLG,OPS,TB,final_score
0,6125000.0,1,688,113,204,132,37,19,16,68,56,15,66,0.297,0.358,0.475,0.833,327,0.676931
1,18000000.0,2,686,103,213,180,20,7,6,42,43,4,51,0.31,0.361,0.386,0.747,265,0.49357
2,4000000.0,3,661,93,186,144,33,1,8,57,19,6,56,0.281,0.334,0.371,0.705,245,0.380853
3,1750000.0,4,653,118,213,140,54,2,17,83,20,1,50,0.326,0.376,0.493,0.869,322,0.688431
4,13054526.0,5,645,102,183,133,36,2,12,82,10,0,55,0.284,0.339,0.402,0.741,259,0.428782


In [7]:
# Default per-stat weights used by calculate_win_percentage. Defined once at
# module level so the table is not rebuilt on every call.
DEFAULT_STAT_WEIGHTS = {
    'AB': 0.026580,
    'R': 0.100512,
    'H': 0.072376,
    '2B': 0.026418,
    '3B': 0.006667,
    'HR': 0.073431,
    'TB': 0.093896,
    'RBI': 0.064800,
    'BA': 0.064209,
    'OBP': 0.105842,
    'SLG': 0.114382,
    'OPS': 0.128888,
    'BB': 0.037556,
    'CS': 0.042222,
    'SB': 0.042222
}


def calculate_win_percentage(player, team_win_pct, weights=None):
    """Return ``team_win_pct`` plus the weighted sum of a player's stats.

    Parameters
    ----------
    player : pandas.Series or dict-like
        Maps stat names (e.g. 'AB', 'R', 'OPS') to numeric values. Stats that
        are absent from ``player`` are skipped rather than raising.
    team_win_pct : float
        Baseline value the player's contribution is added to.
    weights : dict, optional
        Maps stat name to its multiplier. Defaults to DEFAULT_STAT_WEIGHTS.

    Returns
    -------
    float
        ``team_win_pct + sum(player[stat] * weight)`` over the weighted stats
        that are present in ``player``.

    Notes
    -----
    NOTE(review): the weights multiply raw stat values (counting stats such
    as AB/TB alongside rate stats such as BA), so the result is not
    normalised to the scale of ``team_win_pct`` -- confirm this is intended.
    NOTE(review): 'CS' carries the same positive weight as 'SB' -- confirm.
    """
    if weights is None:
        weights = DEFAULT_STAT_WEIGHTS

    # `stat in player` checks index labels on a Series and keys on a dict,
    # so both row Series and plain mappings are accepted.
    contribution = sum(
        player[stat] * weight
        for stat, weight in weights.items()
        if stat in player
    )

    # Add contribution to team's base win percentage
    return team_win_pct + contribution

# Rank of the team whose win percentage is used as the baseline below
TARGET_TEAM_RANK = 26

# Get top 10 players by final_score
# Use player_df instead of player_data to access the converted 'Player' column
top_10 = player_df.sort_values('final_score', ascending=False).head(10)

# Get team win percentage for the target team. Check for a match explicitly so
# a missing rank produces a clear error instead of an IndexError from [0].
target_team_win = team_data.loc[team_data['team_rank'] == TARGET_TEAM_RANK, 'win_percentage']
if target_team_win.empty:
    raise ValueError(f"No row in team_data with team_rank == {TARGET_TEAM_RANK}")
team_win = target_team_win.iloc[0]

# Calculate predicted win percentage for each player
for _, player in top_10.iterrows():
    predicted_win = calculate_win_percentage(player, team_win)
    # iterrows() yields each row as a single-dtype Series, so 'Player #' comes
    # back as a float here; cast it to int for display
    print(f"Player {int(player['Player #'])}: Predicted Win = {predicted_win:.3f}")

Player 13: Predicted Win = 89.812
Player 41: Predicted Win = 84.640
Player 49: Predicted Win = 83.786
Player 4: Predicted Win = 86.305
Player 26: Predicted Win = 84.785
Player 1: Predicted Win = 87.871
Player 14: Predicted Win = 86.437
Player 37: Predicted Win = 80.487
Player 28: Predicted Win = 84.238
Player 31: Predicted Win = 83.106
