In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Read the Statcast swing export (adjust the path for your environment).
file_path = "/content/statcast_pitch_swing_data_20240402_20241030_with_arm_angle.csv"
data = pd.read_csv(file_path)

# Keep only the fields used below and discard rows missing any of them.
columns = ['bat_speed', 'swing_length', 'events', 'pitch_type']
data = data.loc[:, columns].dropna()

# Binary response: 1 when the event is a single, double, triple or home run, otherwise 0.
data['on_base'] = data['events'].isin(['single', 'double', 'triple', 'home_run']).astype('int64')

# The six pitch types with the most remaining rows.
pitch_counts = data['pitch_type'].value_counts()
top_6_pitches = pitch_counts.nlargest(6).index

# Restrict the frame to those six pitch types.
is_top_pitch = data['pitch_type'].isin(top_6_pitches)
data_top_6 = data[is_top_pitch]

# One facet per pitch type (three columns wide); points and fitted lines are
# colored by the on_base flag.
g = sns.FacetGrid(data_top_6, col='pitch_type', col_wrap=3, height=4)

# Color for each on_base value (0 = not on base, 1 = on base).
colors = {0: 'blue', 1: 'red'}

# (pitch_type, on_base) -> (slope, intercept) of the least-squares line of
# bat_speed on swing_length. Filled while the lines are drawn and printed at
# the end of the cell.
regression_equations = {}

# Look each facet's axes up by pitch type rather than parsing the axes title,
# so the loop does not depend on the current title format.
for pitch_type, ax in g.axes_dict.items():
    pitch_rows = data_top_6[data_top_6['pitch_type'] == pitch_type]
    for on_base in [0, 1]:
        subset = pitch_rows[pitch_rows['on_base'] == on_base]

        # A line is only defined with at least two distinct x values; this
        # also covers empty and single-row groups.
        if subset['swing_length'].nunique() < 2:
            continue

        sns.regplot(x='swing_length', y='bat_speed', data=subset, ax=ax, scatter=False, color=colors[on_base])

        # Ordinary least squares in closed form: slope = cov(x, y) / var(x).
        x_values = subset['swing_length']
        y_values = subset['bat_speed']
        slope = x_values.cov(y_values) / x_values.var()
        intercept = y_values.mean() - slope * x_values.mean()
        regression_equations[(pitch_type, on_base)] = (slope, intercept)

g.map(sns.scatterplot, 'swing_length', 'bat_speed', 'on_base', palette=colors, alpha=0.7)

g.set_titles("{col_name}")
g.set_axis_labels("Swing Length", "Bat Speed")
g.add_legend(title='On Base')
plt.tight_layout()
plt.show()

# Print the regression line equations
print("Regression Line Equations (y = mx + b):")
for (pitch_type, on_base_status), (slope, intercept) in regression_equations.items():
    status_label = "On Base" if on_base_status == 1 else "Not On Base"
    print(f"Pitch Type: {pitch_type}, {status_label}: y = {slope:.4f}x + {intercept:.4f}")

In [None]:
# prompt: Create scatterplots with the 6 most popular pitch types, hue is on base/not on base
# Swing length/bat speed are the axes
# Add linear regression lines for each graph


import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import numpy as np
from scipy.stats import pearsonr

# Read the Statcast swing export (adjust the path for your environment).
file_path = "/content/statcast_pitch_swing_data_20240402_20241030_with_arm_angle.csv"
data = pd.read_csv(file_path)

# Drop rows whose bb_type is 'bunt', then keep batters that appear in at
# least 20 of the remaining rows.
# NOTE(review): value_counts() counts rows per batter, not plate appearances;
# confirm that a row-count threshold is what is intended here.
data = data.loc[data['bb_type'].ne('bunt')]
batters_with_20_pa = data['batter'].value_counts()
valid_batters = batters_with_20_pa.loc[batters_with_20_pa.ge(20)].index
data = data.loc[data['batter'].isin(valid_batters)]

# Keep only the fields used below and discard rows missing any of them.
columns = ['bat_speed', 'swing_length', 'events', 'pitch_type']
data = data.loc[:, columns].dropna()

# Binary response: 1 when the event is a single, double, triple or home run, otherwise 0.
data['on_base'] = data['events'].isin(['single', 'double', 'triple', 'home_run']).astype('int64')

# The six pitch types with the most remaining rows.
pitch_counts = data['pitch_type'].value_counts()
top_6_pitches = pitch_counts.nlargest(6).index

# Restrict the frame to those six pitch types.
is_top_pitch = data['pitch_type'].isin(top_6_pitches)
data_top_6 = data[is_top_pitch]

# Color for each on_base value (0 = not on base, 1 = on base). The same
# mapping is used as the grid palette for the scatter points and by
# add_regression_line for the fitted lines, so points and lines of one group
# share a color.
colors = {0: 'blue', 1: 'red'}

# One facet per pitch type (three columns wide), points split by on_base.
g = sns.FacetGrid(data_top_6, col='pitch_type', col_wrap=3, height=4, hue='on_base', palette=colors)

def add_regression_line(data, x_col, y_col, ax, hue_value):
    """Draw a least-squares line of ``y_col`` on ``x_col`` onto ``ax``.

    Rows with a missing value in either column are ignored, and nothing is
    drawn when fewer than two rows remain. The line is evaluated across the
    observed x range and colored with ``colors[hue_value]``, where ``colors``
    is the module-level mapping.
    """
    usable = data.dropna(subset=[x_col, y_col])

    # A single point (or none) cannot define a line.
    if len(usable) <= 1:
        return

    features = usable[x_col].values.reshape(-1, 1)
    target = usable[y_col].values

    fitted = LinearRegression().fit(features, target)

    # Evaluate the fit on 100 evenly spaced x values between the observed min and max.
    grid = np.linspace(features.min(), features.max(), 100).reshape(-1, 1)
    ax.plot(grid, fitted.predict(grid), color=colors[hue_value])

# Overlay one fitted line per on_base group on each pitch type's facet.
for pitch_type, ax in g.axes_dict.items():
    in_facet = data_top_6['pitch_type'] == pitch_type
    for on_base in (0, 1):
        subset = data_top_6[in_facet & (data_top_6['on_base'] == on_base)]
        add_regression_line(subset, 'swing_length', 'bat_speed', ax, on_base)

# Scatter the swings on every facet; the grid's hue setting splits them by on_base.
g.map(sns.scatterplot, 'swing_length', 'bat_speed', alpha=0.7)

# Titles show just the pitch type; then label the axes, add the shared legend and render.
g.set_titles("{col_name}")
g.set_axis_labels("Swing Length", "Bat Speed")
g.add_legend(title='On Base')
plt.tight_layout()
plt.show()

# For each of the six most popular pitch types, calculate mean and standard deviation for on-base swing length and on-base bat speed and their correlation. Do the same for not on base.
# The statistics are computed from a freshly read frame, prepared with the
# same steps as the plotting section earlier in this cell.
file_path = "/content/statcast_pitch_swing_data_20240402_20241030_with_arm_angle.csv"
data = pd.read_csv(file_path)

# Drop rows whose bb_type is 'bunt', then keep batters that appear in at
# least 20 of the remaining rows.
# NOTE(review): value_counts() counts rows per batter, not plate appearances;
# confirm that a row-count threshold is what is intended here.
data = data.loc[data['bb_type'].ne('bunt')]
batters_with_20_pa = data['batter'].value_counts()
valid_batters = batters_with_20_pa.loc[batters_with_20_pa.ge(20)].index
data = data.loc[data['batter'].isin(valid_batters)]

# Keep only the fields used below and discard rows missing any of them.
columns = ['bat_speed', 'swing_length', 'events', 'pitch_type']
data = data.loc[:, columns].dropna()

# Binary response: 1 when the event is a single, double, triple or home run, otherwise 0.
data['on_base'] = data['events'].isin(['single', 'double', 'triple', 'home_run']).astype('int64')

# The six pitch types with the most remaining rows.
pitch_counts = data['pitch_type'].value_counts()
top_6_pitches = pitch_counts.nlargest(6).index

# results[pitch][on_base] -> dict of summary statistics for that group:
# mean and standard deviation of swing_length and bat_speed, plus their
# Pearson correlation.
results = {}

for pitch in top_6_pitches:
    results[pitch] = {}
    pitch_data = data[data['pitch_type'] == pitch]

    for on_base in [0, 1]:  # 0 = not on base, 1 = on base
        subset = pitch_data[pitch_data['on_base'] == on_base]

        if subset.empty:
            print(f"No data for pitch type '{pitch}' and on_base status '{on_base}'. Skipping.")
            continue

        # pearsonr raises ValueError for fewer than two observations, so a
        # single-row group reports NaN instead of aborting the whole cell.
        if len(subset) >= 2:
            correlation, _ = pearsonr(subset['swing_length'], subset['bat_speed'])
        else:
            correlation = np.nan

        results[pitch][on_base] = {
            'swing_length_mean': subset['swing_length'].mean(),
            'swing_length_std': subset['swing_length'].std(),
            'bat_speed_mean': subset['bat_speed'].mean(),
            'bat_speed_std': subset['bat_speed'].std(),
            'correlation': correlation,
        }


# Print the collected statistics, grouped by pitch type and on-base status.
for pitch, pitch_results in results.items():
    print(f"\nPitch Type: {pitch}")
    for on_base, stats in pitch_results.items():
        on_base_label = "On Base" if on_base == 1 else "Not On Base"
        print(f"  {on_base_label}:")
        print(f"    Swing Length Mean: {stats['swing_length_mean']}")
        print(f"    Swing Length Std: {stats['swing_length_std']}")
        print(f"    Bat Speed Mean: {stats['bat_speed_mean']}")
        print(f"    Bat Speed Std: {stats['bat_speed_std']}")
        print(f"    Correlation: {stats['correlation']}")