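This model projects each statistic as a weighted mean of a player's previous three seasons. The three season weights are fitted separately for each statistic with `scipy.optimize.minimize` (Nelder-Mead), minimizing the total absolute projection error over player-seasons that have all three previous seasons available. Players with only one or two previous seasons are projected from the seasons they do have, with the weights renormalized. There is no projection for first-year players.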

# Import Packages and Data

In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
from scipy.optimize import minimize

In [2]:
# Read the prepared season-level data into the working DataFrame.
input_csv_path = '../Resources/properly_formatted_data.csv'
df = pd.read_csv(input_csv_path)

In [3]:
# Two groups of stat column names, concatenated into 'combined_list',
# which is the list of statistics iterated over when building projections.
pa_list = [
    'H', '1B', '2B', '3B', 'HR', 'R', 'RBI',
    'BB', 'SH', 'SB', 'L-WAR', 'wRC', 'WAR',
]
non_pa_list = [
    'G', 'AB', 'PA', 'AVG', 'BB%', 'OBP', 'SLG',
    'OPS', 'ISO', 'wOBA', 'wRAA', 'wRC+', 'BB%+',
]
combined_list = [*pa_list, *non_pa_list]

# Automate

In [4]:
# Weighted mean projection from three seasons of data.
def projection(weights, year1, year2, year3):
    """Return the weighted mean of three seasons' values.

    Parameters
    ----------
    weights : sequence of numbers
        Weights applied to ``year1``, ``year2`` and ``year3`` (in that order).
        They do not need to sum to 1: the weighted sum is divided by
        ``np.sum(weights)``.
    year1, year2, year3 : scalar or array-like
        Values for the three seasons being averaged. Scalars, NumPy arrays
        and pandas Series all work, as long as they broadcast together.

    Returns
    -------
    scalar or array-like
        The normalized weighted mean, with the broadcast shape of the inputs.
    """
    weighted_sum = weights[0] * year1 + weights[1] * year2 + weights[2] * year3
    # Divide out of place: an in-place `/=` on an integer NumPy array raises,
    # because the float quotient cannot be cast back into the integer buffer.
    return weighted_sum / np.sum(weights)

In [5]:
def objective(params, df):
    """Total absolute projection error for a candidate set of season weights.

    This is the function handed to the optimizer. For every row it compares
    the observed value with the weighted-mean projection (via ``projection``)
    and sums the absolute differences. The square root of the mean square of
    a single error is just its absolute value, so this is the same quantity
    as summing a per-row "RMSE".

    Parameters
    ----------
    params : sequence of float
        Candidate parameters; the first three entries are the weights for the
        columns at positions 1, 2 and 3 of ``df``.
    df : pandas.DataFrame
        Column 0 holds the observed value; columns 1-3 hold the three
        previous-season values. A NaN anywhere makes the total NaN.

    Returns
    -------
    float
        Sum over all rows of ``|projected - observed|``.
    """
    weights = params[:3]  # First 3 params are the weights for the years

    # Pull the four columns out once as a float array instead of walking the
    # frame with iterrows(); the optimizer calls this function many times.
    values = df.iloc[:, :4].to_numpy(dtype=float)
    observed = values[:, 0]
    projected = projection(weights, values[:, 1], values[:, 2], values[:, 3])

    return float(np.abs(projected - observed).sum())

In [6]:
def calculate_projected_stat(row, weights=None):
    """Project a stat for one row from whichever previous seasons are available.

    Parameters
    ----------
    row : pandas.Series
        Position 0 is ignored here; positions 1-3 hold the previous three
        seasons' values, with NaN marking a season that is missing.
    weights : sequence of float, optional
        Weights for positions 1-3 of ``row``. When omitted, the module-level
        ``optimized_weights`` is used, so existing calls such as
        ``df.apply(calculate_projected_stat, axis=1)`` behave as before.

    Returns
    -------
    float
        Weighted average of the non-missing previous values, with the weights
        of the missing seasons dropped and the remainder renormalized.
        NaN when no previous value is available.
    """
    if weights is None:
        # Backward-compatible default: weights set by the fitting loop.
        weights = optimized_weights

    # Pair each previous-season value with its weight and drop missing seasons
    # in a single pass, so values and weights cannot get out of step.
    available = [
        (value, weight)
        for value, weight in zip(row.iloc[1:4].values, weights)
        if not np.isnan(value)
    ]

    if not available:
        return np.nan  # No previous value is available (rookie season)

    weighted_sum = sum(value * weight for value, weight in available)
    total_weight = sum(weight for _, weight in available)
    return weighted_sum / total_weight

In [7]:
# Fit season weights and build a projection column for every statistic in 'combined_list'.
for stat in combined_list:
    # Current-season value first, then the three previous seasons
    # ('1Prev_stat', '2Prev_stat', '3Prev_stat'). 'objective' and
    # 'calculate_projected_stat' read these columns by position, so the order matters.
    training_df = df[[stat, f'1Prev_{stat}', f'2Prev_{stat}', f'3Prev_{stat}']]

    # Keep the full subset for projecting every row; fit only on rows with all four values.
    training_df_original = training_df.copy()
    training_df = training_df.dropna()

    # Use an initial guess of equal weights (0.33) for the three seasons to start the optimization.
    initial_guess = [0.33, 0.33, 0.33]

    # Apply the Nelder-Mead optimization method to minimize the objective function and find the best weights.
    result = minimize(objective, initial_guess, args=(training_df,), method='Nelder-Mead')

    # Do not silently accept weights from a run that stopped without converging
    # (for example, by hitting the iteration limit).
    if not result.success:
        print(f"Warning: weight optimization for '{stat}' did not converge: {result.message}")

    # Extract the optimized weights for the three seasons and print them for verification.
    # 'optimized_weights' is read as a global by 'calculate_projected_stat'.
    optimized_weights = result.x[:3]
    print("Optimized Weights:", optimized_weights)

    # Project every row (including those with missing previous seasons) and store the
    # result as a new 'Projected_<stat>' column in the main DataFrame.
    df[f'Projected_{stat}'] = training_df_original.apply(calculate_projected_stat, axis=1)

Optimized Weights: [0.60596365 0.15367101 0.07483735]
Optimized Weights: [0.5496231  0.13759488 0.07965056]
Optimized Weights: [0.58841243 0.21215713 0.12407723]
Optimized Weights: [0.50949539 0.22290407 0.15921795]
Optimized Weights: [0.55829698 0.21711552 0.12406602]
Optimized Weights: [0.55721194 0.14858995 0.07429491]
Optimized Weights: [0.55859253 0.18096627 0.08480745]
Optimized Weights: [0.60660238 0.16984838 0.10767075]
Optimized Weights: [0.51234964 0.20494065 0.15370484]
Optimized Weights: [0.60743883 0.15846715 0.10563983]
Optimized Weights: [0.49503017 0.24751544 0.16500945]
Optimized Weights: [0.611617   0.18908297 0.09068475]
Optimized Weights: [0.49269349 0.24634528 0.16422768]
Optimized Weights: [0.55706882 0.14237771 0.0636827 ]
Optimized Weights: [0.60607341 0.12210693 0.06075946]
Optimized Weights: [0.65158376 0.12981028 0.0617733 ]
Optimized Weights: [0.42961582 0.31509783 0.20345475]
Optimized Weights: [0.43566675 0.30478457 0.1969243 ]
Optimized Weights: [0.430343

# Organize and Export

In [8]:
# Identifier columns placed at the front of the final table.
columns_to_keep = [
    "IDfg",
    "Name",
    "Season",
    "Team",
    "Age",
]

In [9]:
# Remove every column whose name contains 'Prev' (the previous-season inputs).
prev_season_columns = df.filter(like='Prev').columns
df = df.drop(columns=prev_season_columns)

In [10]:
# Build the column order: each actual stat immediately followed by its
# 'Projected_' counterpart when that column exists in the DataFrame.
ordered_columns = []
for stat in combined_list:
    projected_col = f"Projected_{stat}"
    if projected_col in df.columns:
        ordered_columns.extend([stat, projected_col])
    else:
        ordered_columns.extend([stat])

In [11]:
# Final layout: the columns already in 'columns_to_keep', then each stat with its projection.
# dict.fromkeys removes repeats while preserving order, so re-running this cell
# does not append 'ordered_columns' a second time and select duplicate columns.
columns_to_keep = list(dict.fromkeys(columns_to_keep + ordered_columns))
df = df.loc[:, columns_to_keep]

In [12]:
# Preview the first five rows of the final table.
df.head(n=5)

Unnamed: 0,IDfg,Name,Season,Team,Age,H,Projected_H,1B,Projected_1B,2B,...,ISO,Projected_ISO,wOBA,Projected_wOBA,wRAA,Projected_wRAA,wRC+,Projected_wRC+,BB%+,Projected_BB%+
0,1,Alfredo Amezaga,2002,ANA,24.0,7.0,,5.0,,2.0,...,0.154,,0.536,,2.3,,239.0,,0.0,
1,1,Alfredo Amezaga,2003,ANA,25.0,22.0,7.0,15.0,5.0,3.0,...,0.124,0.154,0.273,0.536,-5.5,2.3,63.0,239.0,91.0,0.0
2,1,Alfredo Amezaga,2004,ANA,26.0,15.0,18.965561,11.0,12.997799,2.0,...,0.086,0.135151,0.208,0.381464,-10.9,-2.614627,19.0,135.310567,34.0,53.794181
3,1,Alfredo Amezaga,2005,- - -,27.0,2.0,15.571617,2.0,11.094509,0.0,...,0.0,0.11072,0.276,0.298591,-0.4,-6.334758,57.0,80.976304,126.0,45.345521
4,1,Alfredo Amezaga,2006,FLA,28.0,87.0,6.187642,72.0,4.96506,9.0,...,0.072,0.050067,0.296,0.25326,-11.5,-4.562836,75.0,46.08613,98.0,88.930996


In [13]:
# Write the final table to the projection results folder, without the index column.
output_csv_path = '../Projection_Results/model_two.csv'
df.to_csv(output_csv_path, index=False)