This model generates a projected value for a given statistic by using a weighted mean of the previous three seasons for each player, with weights optimized using SciPy to minimize error. It also distinguishes between cumulative and rate statistics, calculating cumulative stats per plate appearance for improved accuracy. Finally, the model also incorporates a regression to the mean with a regression factor that is also optimized using SciPy. No projections are made for first-year players.

# Import Packages and Data

In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
from scipy.optimize import minimize

In [2]:
# Load the data from a CSV file into a DataFrame
df = pd.read_csv('../Resources/properly_formatted_data.csv')

In [3]:
non_pa_list = ['G', 'AB', 'PA', 'AVG', 'BB%', 'OBP', 'SLG', 'OPS', 'ISO', 'wOBA', 'wRAA', 'wRC+', 'BB%+']

In [4]:
pa_list = ['H', '1B', '2B', '3B', 'HR', 'R', 'RBI', 'BB', 'SH', 'SB', 'L-WAR', 'wRC', 'WAR']

# Automating

In [5]:
# Calculate a weighted mean projection based on three years of data.
# Normalize the weights to ensure they sum to 1 before returning the result.
def projection(weights, regression_factor, year1, year2, year3, league_avg):
    weighted_mean = weights[0] * year1 + weights[1] * year2 + weights[2] * year3
    weighted_mean /= np.sum(weights)  # Normalize the weights to sum to 1
    regression_result = weighted_mean + regression_factor * (league_avg - weighted_mean)
    return regression_result

In [6]:
def objective(params, df):
    weights = params[:3]  # First 3 params are the weights for the years
    regression_factor = params[3]  # Last param is the regression factor
    
    total_rmse = 0  # Sum of RMSE for all players
    
    # Loop through each player and compute the error
    for _, row in df.iterrows():
        # Access columns by their positional index
        year1_stat, year2_stat, year3_stat = row.iloc[1], row.iloc[2], row.iloc[3]
        league_stat_avg = row.iloc[4]
        observed_stat = row.iloc[0]
        
        # Compute projected stats
        projected_stat = projection(weights, regression_factor, year1_stat, year2_stat, year3_stat, league_stat_avg)
        
        # Compute RMSE for stat
        rmse_stat = np.sqrt(np.mean((projected_stat - observed_stat) ** 2))
        total_rmse += rmse_stat
    
    return total_rmse  # Total RMSE for the entire dataset

In [7]:
def calculate_projected_stat(row):
    # Get the previous 3 years' stat values
    previous_stat_values = row.iloc[1:4].values
    
    # Filter out NaN values (which represent missing data for second and third year players) and their corresponding weights
    valid_values = [val for val, weight in zip(previous_stat_values, optimized_weights) if not np.isnan(val)]
    valid_weights = [weight for val, weight in zip(previous_stat_values, optimized_weights) if not np.isnan(val)]
    
    # Check if no valid data exists
    if len(valid_values) == 0:
        return np.nan  # Return NaN if no previous value is available (rookie season)
    
    # Calculate the weighted average of the valid previous stat values
    weighted_sum = sum(val * weight for val, weight in zip(valid_values, valid_weights))
    weighted_avg = weighted_sum / sum(valid_weights)
    
    # Apply the regression formula
    regressed_avg = (1 - optimized_regression_factor) * weighted_avg + optimized_regression_factor * row.iloc[4]
    
    return regressed_avg


In [8]:
# Iterate through each statistic in the 'non_pa_list' for projection.
for stat in non_pa_list:
    # For each stat, create a subset of the DataFrame containing the current season's stat
    # and the stats from the previous three seasons (e.g., '1Prev_stat', '2Prev_stat', '3Prev_stat').
    training_df = df[[stat, f'1Prev_{stat}', f'2Prev_{stat}', f'3Prev_{stat}', f'Prev_League_Avg_{stat}']]

    # Copy the data for future use and drop any rows with missing (NaN) values.
    training_df_copy = training_df.copy()
    training_df = training_df.dropna()

    # Use an initial guess of equal weights (0.33) for the three seasons and a regression factor of 0.1 to start the optimization.
    initial_guess = [0.33, 0.33, 0.33, 0.1]

    # Apply the Nelder-Mead optimization method to minimize the objective function and find the best weights.
    result = minimize(objective, initial_guess, args=(training_df,), method='Nelder-Mead')

    # Extract the optimized weights and regression factor for the three seasons and print them for verification.
    optimized_weights = result.x[:3]
    optimized_regression_factor = result.x[3]
    print("Optimized Weights:", optimized_weights)
    print("Optimized Regression Factor:", optimized_regression_factor)

    # Apply the 'calculate_projected_stat' function to the cleaned data to generate the projected stat
    # and add it as a new column in the DataFrame
    df[f'Projected_{stat}'] = training_df_copy.apply(calculate_projected_stat, axis=1)
    

Optimized Weights: [0.71609882 0.1685434  0.06187706]
Optimized Regression Factor: 0.052708532232576745
Optimized Weights: [0.95898046 0.18723826 0.07225563]
Optimized Regression Factor: 0.03336558589642274
Optimized Weights: [0.91743295 0.17144443 0.06794621]
Optimized Regression Factor: 0.031158167940444098
Optimized Weights: [0.3562843  0.25627431 0.15555734]
Optimized Regression Factor: 0.18221435959967344
Optimized Weights: [0.50942585 0.35316551 0.20414012]
Optimized Regression Factor: 0.14259548050087373
Optimized Weights: [0.42457226 0.29066665 0.18037887]
Optimized Regression Factor: 0.14929217684143112
Optimized Weights: [0.46086043 0.2970658  0.18560532]
Optimized Regression Factor: 0.16127093093891592
Optimized Weights: [0.45892235 0.30128805 0.19063395]
Optimized Regression Factor: 0.1520359804519525
Optimized Weights: [0.53610326 0.30593184 0.18975548]
Optimized Regression Factor: 0.11267502838720828
Optimized Weights: [0.43649616 0.29236325 0.18519496]
Optimized Regressi

# Adding per PA 

In [9]:
def calculate_projected_stat_per_pa(row):
    # Get the previous 3 years' stat values
    previous_stat_values = row.iloc[1:4].values
    
    # Filter out NaN values (which represent missing data for second and third year players) and their corresponding weights
    valid_values = [val for val, weight in zip(previous_stat_values, optimized_weights) if not np.isnan(val)]
    valid_weights = [weight for val, weight in zip(previous_stat_values, optimized_weights) if not np.isnan(val)]
    
    # Check if no valid data exists
    if len(valid_values) == 0:
        return np.nan  # Return NaN if no previous value is available (rookie season)
    
    # Calculate the weighted average of the valid previous stat values
    weighted_sum = sum(val * weight for val, weight in zip(valid_values, valid_weights))
    weighted_avg = weighted_sum / sum(valid_weights)
    
    # Multiply the expecteted per PA stat by projected PA to get an estimate for projected cumlative stat
    regressed_avg = (1 - optimized_regression_factor) * weighted_avg + optimized_regression_factor * row.iloc[4]
    regressed_avg = regressed_avg * row.iloc[5]
    
    return regressed_avg

In [10]:
# Iterate through each statistic in the 'non_pa_list' for projection.
for stat in pa_list:
    # Calculate per plate appearance (PA) values for the current and previous three seasons
    # by dividing the stat for each season by the corresponding projected or actual PA.
    df[f'{stat}/PA'] = df[stat] / df['Projected_PA']
    df[f'1Prev_{stat}/PA'] = df[f'1Prev_{stat}'] / df['1Prev_PA']
    df[f'2Prev_{stat}/PA'] = df[f'2Prev_{stat}'] / df['2Prev_PA']
    df[f'3Prev_{stat}/PA'] = df[f'3Prev_{stat}'] / df['3Prev_PA']
    df[f'Prev_League_Avg_{stat}/PA'] = df[f'Prev_League_Avg_{stat}'] / df['Projected_PA']
    
    # Create a 'training_df' with the calculated per-PA values and the projected PA 
    # to be used in the optimization or further analysis.
    training_df = df[[f'{stat}/PA', f'1Prev_{stat}/PA', f'2Prev_{stat}/PA', f'3Prev_{stat}/PA', f'Prev_League_Avg_{stat}/PA', 'Projected_PA']]

    # Copy the data for future use and drop any rows with missing (NaN) values.
    training_df_original = training_df.copy()
    training_df = training_df.dropna()

    # Use an initial guess of equal weights (0.33) for the three seasons and a regression factor of 0.1 to start the optimization.
    initial_guess = [0.33, 0.33, 0.33, 0.1]

    # Apply the Nelder-Mead optimization method to minimize the objective function and find the best weights.
    result = minimize(objective, initial_guess, args=(training_df,), method='Nelder-Mead')

    # Extract the optimized weights and regression factor for the three seasons and print them for verification.
    optimized_weights = result.x[:3]
    optimized_regression_factor = result.x[3]
    print("Optimized Weights:", optimized_weights)
    print("Optimized Regression Factor:", optimized_regression_factor)

    # Apply the 'calculate_projected_stat_per_pa' function to the cleaned data to generate the projected stat
    # and add it as a new column in the DataFrame
    df[f'Projected_{stat}'] = training_df_original.apply(calculate_projected_stat_per_pa, axis=1)

Optimized Weights: [0.66947808 0.38363504 0.27849741]
Optimized Regression Factor: -0.006908837815925198
Optimized Weights: [0.6231345  0.41888255 0.29240792]
Optimized Regression Factor: -0.006077177592096925
Optimized Weights: [0.72545591 0.45141944 0.3075327 ]
Optimized Regression Factor: -1.2396322402340638e-09
Optimized Weights: [1.30288139 0.7259229  0.3294816 ]
Optimized Regression Factor: 1.2698498241666145e-10
Optimized Weights: [0.74241305 0.31544517 0.26572174]
Optimized Regression Factor: 2.824582804766954e-09
Optimized Weights: [3.4316382  1.85004374 0.80293776]
Optimized Regression Factor: -5.225266288914641e-12
Optimized Weights: [0.85664342 0.47936484 0.29853027]
Optimized Regression Factor: -1.528009848965232e-11
Optimized Weights: [0.60675533 0.40722269 0.25965433]
Optimized Regression Factor: -9.370150449119542e-09
Optimized Weights: [0.42260441 0.32815276 0.24857106]
Optimized Regression Factor: 0.107580646460706
Optimized Weights: [0.53340332 0.24670705 0.10675039]

# Organizing and Exporting Data

In [11]:
columns_to_keep = ["IDfg", "Name", "Season", "Team", "Age"]

In [12]:
df = df.drop(columns=df.filter(like='Prev').columns)

In [13]:
df = df.drop(columns=df.filter(like='/PA').columns)

In [14]:
# Create a list of stats (e.g., 'H', 'HR', etc.)
stats = pa_list + non_pa_list

# Create a new column order
ordered_columns = []
for stat in stats:
    ordered_columns.append(stat)  # Add actual stat
    projected_col = f"Projected_{stat}"
    if projected_col in df.columns:
        ordered_columns.append(projected_col)  # Add projected stat if it exists

In [15]:
columns_to_keep = columns_to_keep + ordered_columns
df = df.loc[:, columns_to_keep]

In [16]:
df.head()

Unnamed: 0,IDfg,Name,Season,Team,Age,H,Projected_H,1B,Projected_1B,2B,...,ISO,Projected_ISO,wOBA,Projected_wOBA,wRAA,Projected_wRAA,wRC+,Projected_wRC+,BB%+,Projected_BB%+
0,1,Alfredo Amezaga,2002,ANA,24.0,7.0,,5.0,,2.0,...,0.154,,0.536,,2.3,,239.0,,0.0,
1,1,Alfredo Amezaga,2003,ANA,25.0,22.0,9.835703,15.0,7.061435,3.0,...,0.124,0.147478,0.273,0.49174,-5.5,1.551732,63.0,211.601594,91.0,10.207542
2,1,Alfredo Amezaga,2004,ANA,26.0,15.0,33.053889,11.0,24.272516,2.0,...,0.086,0.130787,0.208,0.358793,-10.9,-1.963284,19.0,122.154376,34.0,56.310706
3,1,Alfredo Amezaga,2005,- - -,27.0,2.0,24.704201,2.0,17.98755,0.0,...,0.0,0.108771,0.276,0.286467,-0.4,-4.808402,57.0,73.137358,126.0,49.720583
4,1,Alfredo Amezaga,2006,FLA,28.0,87.0,6.413915,72.0,5.585938,9.0,...,0.072,0.054246,0.296,0.252708,-11.5,-2.935005,75.0,46.262237,98.0,86.399767


In [17]:
df.to_csv('../Projection_Results/model_four.csv', index=False)