Data source: the pybaseball library (https://github.com/jldbc/pybaseball)

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import altair as alt
import os #to handle file paths
# Raise pandas' display limits so long/wide frames are shown in full
# (up to 1000 rows and 1000 columns) instead of being truncated
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)

# Caching stores and reuses previously fetched data, reducing the need to
# repeatedly download the same data
from pybaseball import statcast, cache
cache.enable()

Getting the data: the function below fetches Statcast data one year at a time and saves each year to its own CSV file. Handling a single year at a time avoids large memory usage.
Each file name includes the year for easy identification.

In [3]:
def fetch_and_save_yearly_data(start_year, end_year, output_dir=None, overwrite=True, fetch=None):
    '''Fetch data one calendar year at a time and save each year to its own CSV file.

    Working year by year keeps only a single year's data in memory at once.
    Files are named ``statcast_data_<year>.csv``.

    Parameters
    ----------
    start_year, end_year : int
        First and last year to fetch (both inclusive).
    output_dir : str, optional
        Directory to write the CSV files to. It is created if missing.
        When None (default) files are written to the current working
        directory, exactly as before.
    overwrite : bool, optional
        When True (default) every year is fetched and its file rewritten.
        When False, a year whose CSV already exists is skipped, so the cell
        can be re-run without repeating a multi-minute download.
    fetch : callable, optional
        Function called as ``fetch(start_dt=..., end_dt=...)`` that returns a
        DataFrame. Defaults to the ``statcast`` function imported by the notebook.

    Returns
    -------
    None
    '''
    if fetch is None:
        # Resolved lazily so the default stays the notebook-level pybaseball import
        fetch = statcast
    if output_dir is not None:
        os.makedirs(output_dir, exist_ok=True)

    for year in range(start_year, end_year + 1):
        csv_filename = f"statcast_data_{year}.csv"
        if output_dir is not None:
            csv_filename = os.path.join(output_dir, csv_filename)

        if not overwrite and os.path.exists(csv_filename):
            print(f"Skipping {year}: {csv_filename} already exists")
            continue

        start_date = f"{year}-01-01"
        end_date = f"{year}-12-31"
        print(f"Fetching data for: {start_date} to {end_date}")
        yearly_data = fetch(start_dt=start_date, end_dt=end_date)

        # An empty frame would be written as a CSV with no header, which
        # pd.read_csv cannot parse later; skip it and say so instead.
        if yearly_data is None or yearly_data.empty:
            print(f"No data returned for {year}; nothing saved")
            continue

        # Save the data to a CSV file
        yearly_data.to_csv(csv_filename, index=False)
        print(f"Data for {year} saved to {csv_filename}")

# Fetch the 2023 and 2024 seasons; each year is saved to its own CSV file.
# NOTE(review): the combine step later in this notebook also reads
# statcast_data_2018.csv through statcast_data_2022.csv. Presumably those were
# produced by earlier runs of this cell with other year ranges - confirm.
fetch_and_save_yearly_data(2023, 2024)

Fetching data for: 2023-01-01 to 2023-12-31
This is a large query, it may take a moment to complete
Skipping offseason dates
Skipping offseason dates


100%|██████████| 246/246 [04:11<00:00,  1.02s/it]


Data for 2023 saved to statcast_data_2023.csv
Fetching data for: 2024-01-01 to 2024-12-31
This is a large query, it may take a moment to complete
Skipping offseason dates
Skipping offseason dates


100%|██████████| 246/246 [04:25<00:00,  1.08s/it]


Data for 2024 saved to statcast_data_2024.csv


Combine all the yearly CSV files (statcast_data_2018.csv to statcast_data_2024.csv) into a single DataFrame.
The combined DataFrame is then saved to a new CSV file so that the complete dataset can be used in Tableau.

In [12]:
# Yearly CSV files to combine: statcast_data_2018.csv ... statcast_data_2024.csv
# (the end of range() is exclusive)
csv_files = [f"statcast_data_{year}.csv" for year in range(2018, 2025)]

# Path where the CSV files are stored
# NOTE(review): machine-specific absolute path; the fetch step writes to the
# current working directory, so this presumably is that directory - confirm,
# and prefer a configurable data directory when sharing the notebook.
path = "/home/vanel/Data_Viz/py_analysis"

# Read every yearly file first and concatenate once at the end. Concatenating
# inside the loop re-copies all previously accumulated rows on every iteration.
yearly_frames = []
for file in csv_files:
    full_path = os.path.join(path, file)
    print(f"Reading {full_path}")
    yearly_frames.append(pd.read_csv(full_path))

combined_data = pd.concat(yearly_frames, ignore_index=True)

# The per-year frames are no longer needed; drop the references so the kernel
# does not keep a second copy of every row alive.
del yearly_frames

# Verify the combined data
print(f"Combined data shape: {combined_data.shape}")

Reading /home/vanel/Data_Viz/py_analysis/statcast_data_2018.csv
Reading /home/vanel/Data_Viz/py_analysis/statcast_data_2019.csv
Reading /home/vanel/Data_Viz/py_analysis/statcast_data_2020.csv
Reading /home/vanel/Data_Viz/py_analysis/statcast_data_2021.csv
Reading /home/vanel/Data_Viz/py_analysis/statcast_data_2022.csv
Reading /home/vanel/Data_Viz/py_analysis/statcast_data_2023.csv
Reading /home/vanel/Data_Viz/py_analysis/statcast_data_2024.csv
Combined data shape: (4441922, 94)


In [15]:
# Preview the first five rows to sanity-check the combined columns and values
combined_data.head()

Unnamed: 0,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,events,description,spin_dir,spin_rate_deprecated,break_angle_deprecated,break_length_deprecated,zone,des,game_type,stand,p_throws,home_team,away_team,type,hit_location,bb_type,balls,strikes,game_year,pfx_x,pfx_z,plate_x,plate_z,on_3b,on_2b,on_1b,outs_when_up,inning,inning_topbot,hc_x,hc_y,tfs_deprecated,tfs_zulu_deprecated,fielder_2,umpire,sv_id,vx0,vy0,vz0,ax,ay,az,sz_top,sz_bot,hit_distance_sc,launch_speed,launch_angle,effective_speed,release_spin_rate,release_extension,game_pk,pitcher.1,fielder_2.1,fielder_3,fielder_4,fielder_5,fielder_6,fielder_7,fielder_8,fielder_9,release_pos_y,estimated_ba_using_speedangle,estimated_woba_using_speedangle,woba_value,woba_denom,babip_value,iso_value,launch_speed_angle,at_bat_number,pitch_number,pitch_name,home_score,away_score,bat_score,fld_score,post_away_score,post_home_score,post_bat_score,post_fld_score,if_fielding_alignment,of_fielding_alignment,spin_axis,delta_home_win_exp,delta_run_exp,bat_speed,swing_length
0,SL,2018-10-28,84.0,3.05,5.26,"Sale, Chris",592518,519242,strikeout,swinging_strike,,,,,13.0,Manny Machado strikes out swinging.,W,R,L,LAD,BOS,S,2.0,,1,2,2018,-1.35,-0.26,-1.69,0.9,,,,2,9,Bot,,,,,543877,,,-8.287278,-121.937872,-3.051222,-11.346649,26.649434,-34.287809,3.49,1.6,,,,82.9,2681.0,5.9,563411,519242,543877,456665,571788,646240,593428,643217,598265,605141,54.56,,0.0,0.0,1.0,0.0,0.0,,65,4,Slider,1,5,1,5,5,1,1,5,Standard,Standard,281.0,-0.001,-0.067,,
1,FF,2018-10-28,95.3,3.17,5.5,"Sale, Chris",592518,519242,,ball,,,,,12.0,Manny Machado strikes out swinging.,W,R,L,LAD,BOS,B,,,0,2,2018,0.83,1.26,0.16,4.48,,,,2,9,Bot,,,,,543877,,,-9.701178,-138.390708,0.212266,12.883696,33.319856,-16.484654,3.74,1.72,,,,94.6,2431.0,6.2,563411,519242,543877,456665,571788,646240,593428,643217,598265,605141,54.35,,,,,,,,65,3,4-Seam Fastball,1,5,1,5,5,1,1,5,Standard,Standard,147.0,0.0,0.008,,
2,FF,2018-10-28,96.4,3.07,5.54,"Sale, Chris",592518,519242,,foul,,,,,3.0,Manny Machado strikes out swinging.,W,R,L,LAD,BOS,S,,,0,1,2018,1.05,1.04,0.62,3.59,,,,2,9,Bot,,,,,543877,,,-8.852806,-140.120015,-1.9552,15.725661,29.903912,-18.3468,3.49,1.6,,,,96.3,2416.0,6.2,563411,519242,543877,456665,571788,646240,593428,643217,598265,605141,54.31,,,,,,,,65,2,4-Seam Fastball,1,5,1,5,5,1,1,5,Standard,Standard,135.0,0.0,-0.025,,
3,CH,2018-10-28,86.6,3.1,4.98,"Sale, Chris",592518,519242,,swinging_strike,,,,,13.0,Manny Machado strikes out swinging.,W,R,L,LAD,BOS,S,,,0,0,2018,1.52,-0.02,-0.1,1.39,,,,2,9,Bot,,,,,543877,,,-10.697003,-125.691745,-2.299318,18.251366,25.266016,-32.130579,3.49,1.6,,,,86.3,2177.0,6.2,563411,519242,543877,456665,571788,646240,593428,643217,598265,605141,54.27,,,,,,,,65,1,Changeup,1,5,1,5,5,1,1,5,Standard,Standard,89.0,0.0,-0.016,,
4,SL,2018-10-28,78.5,3.31,5.16,"Sale, Chris",571771,519242,strikeout,swinging_strike,,,,,4.0,Enrique Hernandez strikes out swinging.,W,R,L,LAD,BOS,S,2.0,,3,2,2018,-1.28,-0.25,-0.41,2.74,,,,1,9,Bot,,,,,543877,,,-5.650976,-114.004641,2.039603,-9.806001,21.021107,-34.9534,3.32,1.51,,,,77.2,2508.0,5.5,563411,519242,543877,456665,571788,646240,593428,643217,598265,605141,55.04,,0.0,0.0,1.0,0.0,0.0,,64,7,Slider,1,5,1,5,5,1,1,5,Standard,Standard,281.0,-0.005,-0.212,,


Save the combined dataset to a single CSV file so it can be loaded into Tableau for a more extensive dashboard.

In [None]:
# Write the combined DataFrame to one CSV file in the current working directory;
# index=False keeps the DataFrame's row index out of the file
combined_data.to_csv("statcast_data_2018_2024.csv", index=False)

### Exploration: Data Visualization

How do different teams' run expectancy change based on the pitch types they face?

In [16]:
import altair as alt
import pandas as pd

# The question is about the pitch types a team FACES, i.e. the batting team.
# The data has no batting-team column, and `home_team` is only the batting team
# in the bottom half of an inning, so derive it from `inning_topbot`:
# top of the inning -> away team bats, otherwise -> home team bats.
# Only the columns needed here are selected to avoid copying the full wide frame.
pitch_columns = ['home_team', 'away_team', 'inning_topbot', 'pitch_type', 'delta_run_exp']
pitches = combined_data[pitch_columns]
batting_team = pitches['away_team'].where(
    pitches['inning_topbot'] == 'Top', pitches['home_team']
)

# Mean change in run expectancy per (batting team, pitch type)
aggregated_data = (
    pitches.assign(batting_team=batting_team)
    .groupby(['batting_team', 'pitch_type'])
    .agg({'delta_run_exp': 'mean'})
    .reset_index()
)

# Create an Altair chart
# NOTE(review): a bar mark coloured by a nominal field is stacked by default,
# so each bar's total height is the sum of the per-team means - read individual
# segments (or the tooltip), not the total. Consider a heatmap or facets.
chart = alt.Chart(aggregated_data).mark_bar().encode(
    x=alt.X('pitch_type:N', title='Pitch Type'),
    y=alt.Y('delta_run_exp:Q', title='Change in Run Expectancy'),
    color=alt.Color('batting_team:N', title='Batting Team'),
    tooltip=['batting_team', 'pitch_type', 'delta_run_exp']
).properties(
    width=800,
    height=400,
    title='Change in Run Expectancy by Pitch Type and Team'
).interactive()

chart


## from pybaseball import pitching_stats

Questions targeted:

- How do different pitching stats correspond to runs allowed?
- How has average fastball velocity changed over time (data since 2015)?
- How have strikeouts and run scoring trended over time?

In [2]:
from pybaseball import pitching_stats

# Fetch one DataFrame of pitching stats per season, 2015 through 2024
# (the end of range() is exclusive)
all_years_data = [pitching_stats(year) for year in range(2015, 2025)]

# Concatenate all the yearly DataFrames into a single DataFrame
full_data = pd.concat(all_years_data, ignore_index=True)

# Save the combined DataFrame to a CSV file; the analysis cells below reload it
# from this file rather than re-fetching
csv_file_path = 'pitching_stats_2015_2024.csv'
full_data.to_csv(csv_file_path, index=False)

print(f"Combined pitching stats have been saved to {csv_file_path}")

Combined pitching stats have been saved to pitching_stats_2015_2024.csv


1. How do different pitching stats correspond to runs allowed? (Correlation heatmap)

In [3]:
import altair as alt
import pandas as pd

# Load the combined data from CSV
combined_data2 = pd.read_csv('pitching_stats_2015_2024.csv')

# Selecting relevant columns for correlation analysis
relevant_columns = ['ERA', 'WHIP', 'SO', 'BB', 'HR/9', 'K/9', 'BB/9', 'K/BB', 'H', 'R', 'ER']

# Keep only the relevant columns and drop rows with a missing value in any of them
data_filtered = combined_data2[relevant_columns].dropna()

# Pairwise correlations, reshaped from a square matrix to long form
# (one row per variable pair), which is the layout Altair expects
correlation_matrix = data_filtered.corr().reset_index().melt('index')

# Renaming columns for better clarity in visualization
correlation_matrix.columns = ['Variable1', 'Variable2', 'Correlation']

# Correlations are signed values in [-1, 1], so use a diverging colour scheme
# pinned to that full range: zero is the neutral midpoint, and colours mean the
# same thing regardless of the minimum/maximum present in the data.
heatmap = alt.Chart(correlation_matrix).mark_rect().encode(
    x='Variable1:O',
    y='Variable2:O',
    color=alt.Color(
        'Correlation:Q',
        scale=alt.Scale(scheme='redblue', domain=[-1, 1])
    ),
    tooltip=['Variable1', 'Variable2', 'Correlation']
).properties(
    width=600,
    height=600,
    title='Correlation between Pitching Stats and Runs Allowed'
)

# Adding text labels for the correlation values. With a diverging scheme the
# strongly positive AND strongly negative cells are dark, so contrast is chosen
# from the magnitude of the correlation rather than its sign.
text = heatmap.mark_text(baseline='middle').encode(
    text=alt.Text('Correlation:Q', format='.2f'),
    color=alt.condition(
        (alt.datum.Correlation > 0.5) | (alt.datum.Correlation < -0.5),
        alt.value('white'),
        alt.value('black')
    )
)

# Combining the heatmap and text
heatmap_with_text = heatmap + text
heatmap_with_text.show()


2. How has average fastball velocity changed over time?

In [4]:
import altair as alt
import pandas as pd

# Load the combined data from CSV
combined_data2 = pd.read_csv('pitching_stats_2015_2024.csv')

# Mean of the 'FBv' column per season. This is an unweighted mean over the rows
# in the file, so every pitcher-season counts equally.
velocity_trends = combined_data2.groupby('Season')['FBv'].mean().reset_index()

# Creating a line chart for average fastball velocity over time.
# zero=False: a quantitative y scale includes 0 by default, which squeezes a
# year-to-year change of a mph or two into a flat line at the top of the chart.
line_chart = alt.Chart(velocity_trends).mark_line(point=True).encode(
    x='Season:O',
    y=alt.Y(
        'FBv:Q',
        title='Average Fastball Velocity (mph)',
        scale=alt.Scale(zero=False)
    ),
    tooltip=['Season', 'FBv']
).properties(
    width=600,
    height=400,
    title='Average Fastball Velocity Over Time'
)

line_chart.show()


3. How have strikeouts and run scoring trended over time?
Visualization Example: Dual-Axis Line Chart using Altair

In [5]:
# Reload the saved pitching stats so this cell can run on its own
combined_data2 = pd.read_csv('pitching_stats_2015_2024.csv')

# Season totals of strikeouts (SO) and runs (R)
# NOTE(review): these are sums over whichever pitcher rows the file contains,
# so they also move with the number of rows per season - presumably not full
# league totals; confirm before interpreting the trend.
trends = combined_data2.groupby('Season')[['SO', 'R']].sum().reset_index()

# Shared x encoding: one ordinal position per season
base = alt.Chart(trends).encode(x='Season:O')


def _season_line(field, axis_title, colour):
    '''Build one line-with-points layer for `field`, with its axis title in `colour`.'''
    return base.mark_line(point=True, color=colour).encode(
        y=alt.Y(f'{field}:Q', title=axis_title, axis=alt.Axis(titleColor=colour)),
        tooltip=['Season', field]
    )


# One layer per metric, each colour-matched to its own axis title
strikeouts_line = _season_line('SO', 'Strikeouts', 'blue')
runs_line = _season_line('R', 'Runs', 'red')

# Overlay the two layers and give each its own y scale (dual axes)
combined_chart = (
    (strikeouts_line + runs_line)
    .resolve_scale(y='independent')
    .properties(width=600, height=400, title='Strikeouts and Runs Over Time')
)

combined_chart.show()