# MLB Statcast + Bat Tracking Dataset (2024-2025)

Fetch pitch-by-pitch Statcast data, including Bat Tracking metrics (`bat_speed`, `swing_length`, `swing_path_tilt`), using `pybaseball`.

## Test Version (1 week)

In [2]:
# Install required packages
!pip install pybaseball -q

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/426.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━[0m [32m358.4/426.1 kB[0m [31m10.7 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m426.1/426.1 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/432.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m432.7/432.7 kB[0m [31m25.8 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.4 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m45.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
import pandas as pd
import numpy as np
from pybaseball import statcast
from datetime import date, timedelta

In [4]:
# Smoke test: pull a single week of the 2024 season before committing to
# the full multi-season download.
start_date, end_date = '2024-09-01', '2024-09-07'

print(f'Fetching data: {start_date} to {end_date}')
df = statcast(start_dt=start_date, end_dt=end_date)

# df.shape is (row count, column count).
print(f'Total rows: {df.shape[0]:,}')
print(f'Total columns: {df.shape[1]}')

Fetching data: 2024-09-01 to 2024-09-07
This is a large query, it may take a moment to complete


  data_copy[column] = data_copy[column].apply(pd.to_datetime, errors='ignore', format=date_format)
  data_copy[column] = data_copy[column].apply(pd.to_datetime, errors='ignore', format=date_format)
  data_copy[column] = data_copy[column].apply(pd.to_datetime, errors='ignore', format=date_format)
  data_copy[column] = data_copy[column].apply(pd.to_datetime, errors='ignore', format=date_format)
  data_copy[column] = data_copy[column].apply(pd.to_datetime, errors='ignore', format=date_format)
  data_copy[column] = data_copy[column].apply(pd.to_datetime, errors='ignore', format=date_format)
  data_copy[column] = data_copy[column].apply(pd.to_datetime, errors='ignore', format=date_format)
  data_copy[column] = data_copy[column].apply(pd.to_datetime, errors='ignore', format=date_format)
  data_copy[column] = data_copy[column].apply(pd.to_datetime, errors='ignore', format=date_format)
  data_copy[column] = data_copy[column].apply(pd.to_datetime, errors='ignore', format=date_format)
  data_cop

Total rows: 27,610
Total columns: 118


In [5]:
# Check Bat Tracking coverage: for each bat-tracking column, report how many
# rows carry a non-null value and what share of all fetched rows that is.
bat_tracking_cols = ['bat_speed', 'swing_length', 'swing_path_tilt']
print('\n=== Bat Tracking Coverage ===')
for col in bat_tracking_cols:
    # Report a missing column instead of aborting the whole check with a KeyError.
    if col not in df.columns:
        print(f'{col}: column not present in data')
        continue
    non_null = df[col].notna().sum()
    # Guard the empty-frame case so the percentage is 0.0 rather than nan.
    pct = non_null / len(df) * 100 if len(df) else 0.0
    print(f'{col}: {non_null:,} ({pct:.1f}%)')


=== Bat Tracking Coverage ===
bat_speed: 12,853 (46.6%)
swing_length: 12,853 (46.6%)
swing_path_tilt: 12,853 (46.6%)


In [6]:
# Check memory usage of the fetched frame (total and normalised per 1000 rows).
# deep=True also measures the contents of object columns, which makes the scan
# comparatively slow, so run it once and reuse the result.
total_bytes = df.memory_usage(deep=True).sum()
# Avoid dividing by zero when the query returned no rows.
bytes_per_1000_rows = total_bytes / len(df) * 1000 if len(df) else 0.0

print('\n=== Memory Usage ===')
print(f'Total memory: {total_bytes / 1024**2:.1f} MB')
print(f'\nPer 1000 rows: {bytes_per_1000_rows / 1024**2:.2f} MB')


=== Memory Usage ===
Total memory: 46.7 MB

Per 1000 rows: 1.69 MB


In [7]:
# Sample data with bat tracking: first 10 rows that have a bat_speed value.
print('\n=== Sample Data ===')
sample_cols = ['game_date', 'batter', 'player_name', 'events',
               'bat_speed', 'swing_length', 'launch_speed', 'launch_angle']
# Single .loc selection (row mask + column list) instead of chained indexing.
sample = df.loc[df['bat_speed'].notna(), sample_cols].head(10)
# NOTE(review): `player_name` does not identify the batter; the same name shows
# up next to different `batter` ids in this sample. In pybaseball's statcast()
# output it appears to be the pitcher's name (confirm against the Statcast CSV
# docs). Relabel it in the printed view only, so `sample` keeps the original
# column names for any later cells.
print(sample.rename(columns={'player_name': 'pitcher_name'}))


=== Sample Data ===
      game_date  batter      player_name     events  bat_speed  swing_length  \
4034 2024-09-07  518595  Little, Brendon  field_out       66.5           6.7   
4252 2024-09-07  518595  Little, Brendon        NaN       78.0           8.1   
4389 2024-09-07  518595  Little, Brendon        NaN       70.1           7.2   
4468 2024-09-07  671739  Little, Brendon  field_out       73.0           7.8   
2316 2024-09-07  663897      Frías, Luis     double       73.9           7.7   
2416 2024-09-07  663897      Frías, Luis        NaN       68.7           7.7   
2905 2024-09-07  570482      Frías, Luis  field_out       69.8           7.7   
3010 2024-09-07  672284      Frías, Luis     single       77.4           7.7   
3236 2024-09-07  657656      Frías, Luis     single       64.2           7.3   
3420 2024-09-07  657656      Frías, Luis        NaN       74.2           8.1   

      launch_speed  launch_angle  
4034          92.1           -24  
4252          <NA>          

In [8]:
# Estimate full dataset size by scaling the test sample linearly by calendar days.
# 2024 season: ~183 days, 2025 season: ~183 days (total ~366 days)

# Derive the sample length from the fetched date window (treated as inclusive
# on both ends, matching the original 7-day count for 09-01..09-07) instead of
# hard-coding it, so the estimate stays right if the window is changed.
days_test = (date.fromisoformat(end_date) - date.fromisoformat(start_date)).days + 1
if days_test <= 0:
    raise ValueError(f'end_date {end_date} is before start_date {start_date}')
days_full = 366
scale = days_full / days_test

# deep=True is comparatively slow; measure the sample once.
sample_mb = df.memory_usage(deep=True).sum() / 1024**2
rows_full = len(df) * scale
size_full_mb = sample_mb * scale
bat_tracking_rows_full = df["bat_speed"].notna().sum() * scale

print('\n=== Full Dataset Estimate (2024-2025) ===')
print(f'Estimated rows: {rows_full:,.0f}')
print(f'Estimated size: {size_full_mb:.0f} MB')
print(f'Bat tracking rows: {bat_tracking_rows_full:,.0f}')


=== Full Dataset Estimate (2024-2025) ===
Estimated rows: 1,443,609
Estimated size: 2441 MB
Bat tracking rows: 672,028
