# Import Packages and Data

In [1]:
import numpy as np
import pandas as pd
from scipy.optimize import minimize

In [2]:
df = pd.read_csv('../Resources/properly_formatted_data.csv')

# Automate Weight Fitting and Projection

In [3]:
def projection(weights, year1, year2, year3):
    """Return the weighted mean of three prior-season values.

    Parameters
    ----------
    weights : sequence of numbers
        Entries 0, 1 and 2 weight ``year1``, ``year2`` and ``year3``.  The
        weights do not have to sum to 1: the result is divided by the sum of
        every entry in ``weights``.
    year1, year2, year3 : scalar or array-like
        Values to combine.  Arrays are combined element-wise.

    Returns
    -------
    numpy.float64 or numpy.ndarray
        ``(w0*year1 + w1*year2 + w2*year3) / sum(weights)``.
    """
    weights = np.asarray(weights, dtype=float)
    weighted_sum = weights[0] * year1 + weights[1] * year2 + weights[2] * year3
    # Divide out-of-place: an in-place ``/=`` on an integer array (integer
    # weights combined with integer inputs) raises a NumPy casting error.
    return weighted_sum / weights.sum()

In [4]:
def objective(params, df):
    """Loss minimised by the optimiser: total absolute projection error.

    For every row the three prior-season values are combined with
    ``projection`` and compared with the observed value.  The loss is the sum
    of ``|projected - observed|`` over all rows.  (The per-row "RMSE" of a
    single number is just its absolute error, so this matches the earlier
    row-by-row loop up to floating-point summation order, without running a
    Python loop on every optimiser evaluation.)

    Parameters
    ----------
    params : sequence of float
        Optimiser parameters; only the first three are used, as the weights
        for the three prior seasons.
    df : pandas.DataFrame
        Columns are read by position, so the same objective works for any
        stat: column 0 is the observed value and columns 1-3 are the prior
        seasons passed to ``projection`` as ``year1``, ``year2``, ``year3``.
        Any further columns are ignored.  Rows are expected to be free of
        NaN (the calling cells apply ``dropna()`` first).

    Returns
    -------
    float
        Sum of absolute errors across all rows (``0.0`` for an empty frame).
    """
    weights = params[:3]  # only the first three parameters are year weights

    # One float matrix for the four columns used; transpose to unpack by column.
    values = df.iloc[:, :4].to_numpy(dtype=float)
    observed, year1, year2, year3 = values.T

    projected = projection(weights, year1, year2, year3)

    return float(np.abs(projected - observed).sum())

In [5]:
def calculate_projected_hr(row, weights=None):
    """Project a stat as the weighted average of its available prior seasons.

    Despite the name, nothing here is specific to home runs: the function is
    applied row-wise to whichever stat the calling cell is processing.

    Parameters
    ----------
    row : pandas.Series
        Positions 1-3 hold the three prior-season values (position 0, the
        observed value, is not used).  Missing seasons are NaN.
    weights : sequence of float, optional
        Weights for the three prior seasons.  When omitted, the notebook-level
        ``optimized_weights`` is used, so existing
        ``df.apply(calculate_projected_hr, axis=1)`` calls keep working.

    Returns
    -------
    float
        Weighted average over the non-NaN seasons, with the weights of the
        missing seasons dropped and the remainder re-normalised; NaN when no
        prior season is available.
    """
    if weights is None:
        # Backward-compatible default: read the weights fitted by the calling cell.
        weights = optimized_weights

    previous_values = np.asarray(row.iloc[1:4].values, dtype=float)
    year_weights = np.asarray(weights, dtype=float)

    # Pair values and weights only up to the shorter length, as zip() would.
    n_pairs = min(len(previous_values), len(year_weights))
    previous_values = previous_values[:n_pairs]
    year_weights = year_weights[:n_pairs]

    available = ~np.isnan(previous_values)
    if not available.any():
        return np.nan  # no prior season to project from

    used_weights = year_weights[available]
    return (previous_values[available] * used_weights).sum() / used_weights.sum()

In [6]:
# Stats that the per-PA cell divides by plate appearances before fitting.
pa_list = [
    'H', '1B', '2B', '3B', 'HR', 'R', 'RBI',
    'BB', 'SH', 'SB', 'L-WAR', 'wRC', 'WAR',
]
# Stats that are fitted and projected on their raw values.
non_pa_list = [
    'G', 'AB', 'PA', 'AVG', 'BB%', 'OBP', 'SLG',
    'OPS', 'ISO', 'wOBA', 'wRAA', 'wRC+', 'BB%+',
]

In [7]:
# Fit one set of year weights per raw-value stat, then project that stat.
for stat in non_pa_list:
    # Column 0 is the observed season and columns 1-3 the three prior seasons;
    # `objective` and `calculate_projected_hr` read them by position.
    for_future_use = df[[stat] + [f'{lag}Prev_{stat}' for lag in (1, 2, 3)]].copy()

    # Fit only on rows that have the observed value and all three prior seasons.
    training_df = for_future_use.dropna()
    initial_guess = [0.33, 0.33, 0.33]

    result = minimize(
        objective,
        initial_guess,
        args=(training_df,),
        method='Nelder-Mead',
    )

    # `calculate_projected_hr` reads this notebook-level name, so it has to be
    # assigned before the apply() below.
    optimized_weights = result.x[:3]
    print("Optimized Weights:", optimized_weights)

    # Project every row, including those with missing prior seasons.
    df[f'Projected_{stat}'] = for_future_use.apply(calculate_projected_hr, axis=1)

Optimized Weights: [0.55706882 0.14237771 0.0636827 ]
Optimized Weights: [0.60607341 0.12210693 0.06075946]
Optimized Weights: [0.65158376 0.12981028 0.0617733 ]
Optimized Weights: [0.42961582 0.31509783 0.20345475]
Optimized Weights: [0.43566675 0.30478457 0.1969243 ]
Optimized Weights: [0.430343   0.30751144 0.19723082]
Optimized Weights: [0.47859045 0.30625046 0.20317302]
Optimized Weights: [0.45856742 0.31210143 0.20945251]
Optimized Weights: [0.5024387  0.2972551  0.19577622]
Optimized Weights: [0.43250237 0.3035576  0.19775468]
Optimized Weights: [0.47453558 0.27859936 0.22395609]
Optimized Weights: [0.43523122 0.30351999 0.20522359]
Optimized Weights: [0.44503495 0.30780076 0.19854949]


# Adding Per-PA Projections

In [8]:
def calculate_projected_hr_per_pa(row, weights=None):
    """Project a counting stat from prior per-PA rates and a PA projection.

    Despite the name, nothing here is specific to home runs: the function is
    applied row-wise to whichever stat the calling cell is processing.

    Parameters
    ----------
    row : pandas.Series
        Positions 1-3 hold the three prior-season values (per-PA rates in the
        calling cell) and position 4 holds the multiplier applied to the
        weighted rate (``Projected_PA`` in the calling cell).  Position 0 is
        not used.  Missing seasons are NaN.
    weights : sequence of float, optional
        Weights for the three prior seasons.  When omitted, the notebook-level
        ``optimized_weights`` is used, so existing
        ``df.apply(calculate_projected_hr_per_pa, axis=1)`` calls keep working.

    Returns
    -------
    float
        Weighted average of the non-NaN prior values (weights of missing
        seasons dropped and the remainder re-normalised) multiplied by
        ``row.iloc[4]``; NaN when no prior season is available.
    """
    if weights is None:
        # Backward-compatible default: read the weights fitted by the calling cell.
        weights = optimized_weights

    previous_rates = np.asarray(row.iloc[1:4].values, dtype=float)
    year_weights = np.asarray(weights, dtype=float)

    # Pair values and weights only up to the shorter length, as zip() would.
    n_pairs = min(len(previous_rates), len(year_weights))
    previous_rates = previous_rates[:n_pairs]
    year_weights = year_weights[:n_pairs]

    available = ~np.isnan(previous_rates)
    if not available.any():
        return np.nan  # no prior season to project from

    used_weights = year_weights[available]
    weighted_rate = (previous_rates[available] * used_weights).sum() / used_weights.sum()

    # Scale the rate back up to a season total.
    return weighted_rate * row.iloc[4]

In [9]:
# Fit year weights on per-PA rates, then scale each projection by Projected_PA.
for stat in pa_list:
    # Prior-season rates use that season's own PA.
    df[f'1Prev_{stat}/PA'] = df[f'1Prev_{stat}'] / df['1Prev_PA']
    df[f'2Prev_{stat}/PA'] = df[f'2Prev_{stat}'] / df['2Prev_PA']
    df[f'3Prev_{stat}/PA'] = df[f'3Prev_{stat}'] / df['3Prev_PA']
    # The observed rate is taken against Projected_PA rather than the season's
    # actual PA.  NOTE(review): presumably so that rate * Projected_PA
    # reproduces the observed total - confirm this is intended.
    df[f'{stat}/PA'] = df[stat] / df['Projected_PA']

    # Column 0 is the observed rate, columns 1-3 the prior rates, column 4 the
    # PA projection that `calculate_projected_hr_per_pa` multiplies by.
    for_future_use = df[
        [f'{stat}/PA']
        + [f'{lag}Prev_{stat}/PA' for lag in (1, 2, 3)]
        + ['Projected_PA']
    ].copy()

    # Fit only on rows where every one of those five columns is present.
    training_df = for_future_use.dropna()
    initial_guess = [0.33, 0.33, 0.33]

    result = minimize(
        objective,
        initial_guess,
        args=(training_df,),
        method='Nelder-Mead',
    )

    # `calculate_projected_hr_per_pa` reads this notebook-level name, so it
    # has to be assigned before the apply() below.
    optimized_weights = result.x[:3]
    print("Optimized Weights:", optimized_weights)

    df[f'Projected_{stat}'] = for_future_use.apply(calculate_projected_hr_per_pa, axis=1)

Optimized Weights: [0.51078732 0.25702032 0.17601161]
Optimized Weights: [0.49111432 0.30726938 0.20644621]
Optimized Weights: [0.49227301 0.26364831 0.17673368]
Optimized Weights: [0.55075862 0.29879894 0.13028598]
Optimized Weights: [0.51864579 0.24960802 0.14697175]
Optimized Weights: [0.5726461  0.28837273 0.11705846]
Optimized Weights: [0.50856386 0.28501604 0.16570355]
Optimized Weights: [0.49846368 0.31202633 0.18759177]
Optimized Weights: [0.42385782 0.30626452 0.21442836]
Optimized Weights: [0.53230826 0.23240628 0.12287041]
Optimized Weights: [0.43577103 0.31527484 0.18602095]
Optimized Weights: [0.49173994 0.30143547 0.20096576]
Optimized Weights: [0.43624589 0.31621958 0.18578682]


In [10]:
# Identifier columns placed ahead of the stat columns in the final table.
columns_to_keep = [
    "IDfg",
    "Name",
    "Season",
    "Team",
    "Age",
]

In [11]:
# Remove every column whose name contains 'Prev' (the prior-season inputs).
df = df.drop(columns=[col for col in df.columns if 'Prev' in str(col)])

In [12]:
# Remove every column whose name contains '/PA' (the intermediate per-PA rates).
df = df.drop(columns=[col for col in df.columns if '/PA' in str(col)])

In [13]:
# All stats, per-PA ones first, in the order they should appear in the output.
stats = pa_list + non_pa_list

# Interleave each stat with its projection: stat, Projected_stat, next stat, ...
ordered_columns = []
for stat in stats:
    projected_col = f"Projected_{stat}"
    # The stat itself is always listed; its projection only if that column exists.
    ordered_columns += [stat, projected_col] if projected_col in df.columns else [stat]

In [14]:
# Final layout: identifier columns first, then each stat beside its projection.
# The cell extends `columns_to_keep` with itself, so running it twice used to
# append `ordered_columns` again and duplicate every stat column.  The
# order-preserving de-duplication makes a re-run a no-op.
columns_to_keep = list(dict.fromkeys(columns_to_keep + ordered_columns))
df = df.loc[:, columns_to_keep]

In [15]:
# Preview the first rows of the trimmed table; this must stay the last
# expression in the cell for the notebook to render it.
df.head()

Unnamed: 0,IDfg,Name,Season,Team,Age,H,Projected_H,1B,Projected_1B,2B,...,ISO,Projected_ISO,wOBA,Projected_wOBA,wRAA,Projected_wRAA,wRC+,Projected_wRC+,BB%+,Projected_BB%+
0,1,Alfredo Amezaga,2002,ANA,24.0,7.0,,5.0,,2.0,...,0.154,,0.536,,2.3,,239.0,,0.0,
1,1,Alfredo Amezaga,2003,ANA,25.0,22.0,7.0,15.0,5.0,3.0,...,0.124,0.154,0.273,0.536,-5.5,2.3,63.0,239.0,91.0,0.0
2,1,Alfredo Amezaga,2004,ANA,26.0,15.0,30.893354,11.0,22.991988,2.0,...,0.086,0.135151,0.208,0.381464,-10.9,-2.614627,19.0,135.310567,34.0,53.794181
3,1,Alfredo Amezaga,2005,- - -,27.0,2.0,22.895093,2.0,16.940618,0.0,...,0.0,0.11072,0.276,0.298591,-0.4,-6.334758,57.0,80.976304,126.0,45.345521
4,1,Alfredo Amezaga,2006,FLA,28.0,87.0,6.17041,72.0,5.307888,9.0,...,0.072,0.050067,0.296,0.25326,-11.5,-4.562836,75.0,46.08613,98.0,88.930996


In [16]:
# Write the final table to ../Projection_Results/model_three.csv.
# index=False leaves the DataFrame's row index out of the file.
df.to_csv('../Projection_Results/model_three.csv', index=False)