Install Dependencies

In [None]:
!pip install pybaseball pandas numpy

Import Libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pybaseball import batting_stats

Load Data / Select Features

In [None]:
# Fetch one batting table per season (2015 through 2023) with pybaseball's
# batting_stats and stack them into a single frame, keeping every column.
years = list(range(2015, 2024))
season_frames = [batting_stats(season) for season in years]
batting_data = pd.concat(season_frames, ignore_index=True)

# Give the columns referenced later snake_case names; every other column
# keeps the label it arrived with.
column_aliases = {
    "IDfg": "player_id",
    "Season": "year",
    "HardHit%": "hard_hit_rate",
    "Barrel%": "barrel_rate",
    "HR/FB": "hr_fb_rate",
    "LD%": "line_drive_rate",
    "BB%": "bb_rate",
    "K%": "k_rate",
}
batting_data = batting_data.rename(columns=column_aliases)

# Seasons must be plain integers so they can be compared and sorted.
batting_data = batting_data.astype({"year": int})

# Report what was loaded: a banner, then the frame's dtype/non-null overview.
print("✅ Dataset Loaded with All Features", "Dataset Info:", sep="\n")
batting_data.info()

# List every column name so the renames above can be checked by eye.
print("\n📌 Available Columns in Dataset:")
print(list(batting_data.columns))


Describe

In [None]:
# Print the describe() summary table under a heading.
summary_stats = batting_data.describe()
print("\nSummary Statistics:", summary_stats, sep="\n")

Create a Feature for Future wOBA

In [None]:
# Build the prediction target: each player's wOBA in the FOLLOWING season.
#
# Rows are first ordered by player and season so that "the next row within a
# player" is always that player's next recorded season, regardless of the
# order in which the data was loaded.
batting_data = batting_data.sort_values(["player_id", "year"])

seasons_by_player = batting_data.groupby("player_id")
following_woba = seasons_by_player["wOBA"].shift(-1)
following_year = seasons_by_player["year"].shift(-1)

# A player can be missing a season from the data, in which case the next row
# is two or more years away. Keep the shifted value only when the next row
# really is year + 1; otherwise the target is left as NaN.
is_consecutive = following_year == batting_data["year"] + 1
batting_data["next_year_woba"] = following_woba.where(is_consecutive)

# Drop rows with no valid target (a player's last recorded season, or a
# season followed by a gap). copy() detaches the result from the original
# frame so later column assignments do not raise SettingWithCopyWarning.
batting_data = batting_data.dropna(subset=["next_year_woba"]).copy()

print("✅ Shifted wOBA to predict next season's performance.")


Search for Missing Values

In [None]:
# Count NaNs per column, then show only the columns that actually have some.
missing_values = batting_data.isna().sum()
print("\nMissing Values:")
print(missing_values.loc[missing_values.gt(0)])

Plot Feature Distributions

In [None]:
# Draw a grid of 30-bin histograms on a 12x10 figure, then title the figure
# as a whole.
hist_options = {"bins": 30, "figsize": (12, 10)}
batting_data.hist(**hist_options)
plt.gcf().suptitle("Feature Distributions", fontsize=16)
plt.show()

Compute Correlation Matrix

In [None]:
# Correlation is only computed over the numeric columns.
numeric_data = batting_data.select_dtypes(include="number")
correlation_matrix = numeric_data.corr()

# Unannotated overview of the whole matrix. The canvas is deliberately large
# because there is one row and one column per numeric feature.
plt.figure(figsize=(20, 16))
heatmap_style = dict(cmap="coolwarm", fmt=".2f", linewidths=0.5)
sns.heatmap(correlation_matrix, **heatmap_style)
plt.title("Full Feature Correlation Matrix")
plt.show()

In [None]:
# Annotated heatmap of the features most strongly related to the target.
#
# Annotating the full matrix writes one text label per cell; with many
# numeric columns that is unreadable on a 10x8 figure and slow to render.
# Only the top_n strongest correlates (by absolute value) are drawn here;
# the unannotated plot of the full matrix remains the complete overview.
top_n = 15
target_col = "next_year_woba"

if target_col in correlation_matrix.columns:
    # nlargest ignores NaN correlations (e.g. from constant columns).
    strongest = correlation_matrix[target_col].abs().nlargest(top_n).index
else:
    # Target column not created yet: fall back to the leading columns so the
    # cell still produces a readable plot.
    strongest = correlation_matrix.columns[:top_n]

focused_corr = correlation_matrix.loc[strongest, strongest]

plt.figure(figsize=(10, 8))
sns.heatmap(focused_corr, annot=True, cmap="coolwarm", fmt=".2f")
plt.title("Feature Correlation Heatmap")
plt.show()

In [None]:
# Rank features by the absolute value of their correlation with the target.
# The target's own entry (correlation 1.0 with itself) is dropped first so it
# does not take the top slot of the ranking.
target_col = "next_year_woba"
corr_wOBA = (
    correlation_matrix[target_col]
    .drop(labels=target_col)
    .abs()
    .sort_values(ascending=False)
)

# Show the 20 strongest correlates; the label names the real target, which is
# next-season wOBA rather than same-season wOBA.
print("🔍 Top 20 Features Most Correlated with Next-Season wOBA:")
print(corr_wOBA.head(20))


Box Plot for Outlier Detection

In [None]:
# One boxplot per selected feature, laid out on a 2x3 grid, so that extreme
# values stand out.
features_to_check = ["ISO", "OBP", "SLG", "hr_fb_rate", "line_drive_rate", "wOBA"]

box_fig = plt.figure(figsize=(12, 6))
for position, column in enumerate(features_to_check, start=1):
    # Subplot positions are 1-based, hence start=1 above.
    box_ax = box_fig.add_subplot(2, 3, position)
    sns.boxplot(y=batting_data[column], ax=box_ax)
    box_ax.set_title(column)

box_fig.tight_layout()
plt.show()

print("✅ EDA Complete: Review Plots & Summary Stats for Insights")