<a href="https://colab.research.google.com/github/tracyhua2/SYS3034-BaseballCase/blob/main/Code/BaseballDraft_Top10.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression

player_url = "https://raw.githubusercontent.com/tracyhua2/SYS3034-BaseballCase/refs/heads/main/Data/player_data.csv"
player_data = pd.read_csv(player_url)

hitting_url = "https://raw.githubusercontent.com/tracyhua2/SYS3034-BaseballCase/refs/heads/main/Data/hitting_data.csv"
hitting_data = pd.read_csv(hitting_url)

pitching_url = "https://raw.githubusercontent.com/tracyhua2/SYS3034-BaseballCase/refs/heads/main/Data/pitching_data.csv"
pitching_data = pd.read_csv(pitching_url)


In [None]:
player_df = player_data.copy()

# Create performance metrics function
def calculate_metrics(y_true, y_pred):
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    return rmse, mae, r2

# # Calculate team batting metrics
# def analyze_team_performance(df):
#     # Create composite scoring metric
#     player_df['score'] = (player_df['R'] * 0.3 +
#                            player_df['HR'] * 0.4 +
#                            player_df['RBI'] * 0.2 +
#                            player_df['OPS'] * 0.1)
#     # insert weights and metrics here

#     return player_df.sort_values('score', ascending=False)

# Analyze player performance
def analyze_player_stats(player_df):
    # # Calculate OPS+ (normalized OPS)
    # league_ops = hitting_data['OPS'].mean()
    # df['OPS_plus'] = (hitting_data['OPS'] / league_ops) * 100

    # Create player value metric
    player_df['value'] = (player_df['RBI'] * 0.3 +
                         player_df['HR'] * 0.35 +
                         player_df['OPS_plus'] * 0.35)
    # insert weights and metrics here

    return player_df.sort_values('value', ascending=False)

# Test the functions
player_analysis = analyze_player_stats(player_data)

# Create prediction model
def predict_batting_average(df):

    # Prepare input features and target
    X = df[['H', 'AB']].values
    y = df['BA'].values

    # Normalize features (optional)
    X[:, 0] = X[:, 0] / X[:, 1]  # Hits divided by At-bats

    # Split into training and test sets
    train_size = int(0.8 * len(df))
    X_train, X_test = X[:train_size], X[train_size:]
    y_train, y_test = y[:train_size], y[train_size:]

    # Fit model
    model = LinearRegression()
    model.fit(X_train[:, [0]], y_train)  # Use only normalized feature

    # Make predictions
    y_pred = model.predict(X_test[:, [0]])

    # Calculate metrics
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    return rmse, mae, r2

# Print results
print("\nTop 10 Players by Value Metric:")
print(player_analysis[['Player', 'player_value', 'HR', 'RBI', 'OPS_plus']].head(10))

# Test prediction model
rmse, mae, r2 = predict_batting_average(hitting_data)
print(f"\nPrediction Metrics:")
print(f"RMSE: {rmse:.3f}")
print(f"MAE: {mae:.3f}")
print(f"R2 Score: {r2:.3f}")


Top 10 Players by Value Metric:
        Player  player_value  HR  RBI    OPS_plus
17   4,000,000     86.138773  37  127  100.253638
13     555,000     85.662555  32  130  101.321586
12   7,750,000     84.352723  33  124  101.722066
18   3,450,000     82.080537  36  119   96.515819
15  11,600,000     81.932219  23  129  100.520625
23   1,032,500     78.951041  37  106   97.717261
6    7,166,666     76.699446  23  110  101.855560
25  11,285,714     75.782973  33  104   94.379923
27  19,243,683     75.569023  27  112   92.911494
8    4,766,666     74.526505  33   90  102.790015

Prediction Metrics:
RMSE: 0.000
MAE: 0.000
R2 Score: 0.998
