In [2]:
pip install MLB-StatsAPI

Note: you may need to restart the kernel to use updated packages.


In [3]:
import statsapi

# Date window for the schedule lookup, kept in one place so it is easy to change.
SCHEDULE_START_DATE = '2025-03-28'
SCHEDULE_END_DATE = '2025-07-31'

# Get every MLB (sportId=1) game scheduled inside the window above.
# NOTE(review): the end date is fixed, not "today", and no game-type filter is
# applied here -- confirm whether non-regular-season games need to be excluded.
game_list = statsapi.schedule(
    start_date=SCHEDULE_START_DATE,
    end_date=SCHEDULE_END_DATE,
    sportId=1,
)

# Extract game IDs. dict.fromkeys drops repeated IDs while keeping schedule order,
# so a game that is listed more than once is not downloaded twice later on.
game_pks = list(dict.fromkeys(game['game_id'] for game in game_list))
print(f"Found {len(game_pks)} games.")

Found 1652 games.


In [4]:
pip install tqdm


Note: you may need to restart the kernel to use updated packages.


In [5]:
pip install tqdm notebook

Note: you may need to restart the kernel to use updated packages.


In [6]:
from tqdm.notebook import tqdm

In [6]:
# Report how many game IDs are currently held in game_pks.
print(f"Number of games found: {len(game_pks)}")

Number of games found: 1652


In [None]:
# Download the play-by-play feed of every game in game_pks and pool all plays
# into one flat list.
all_plays = []
failed_game_pks = []  # IDs whose download raised, kept so they can be retried

# tqdm shows progress while the requests run, one request per game.
for gamePk in tqdm(game_pks, desc="Fetching play-by-play"):
    try:
        data = statsapi.get('game_playByPlay', {'gamePk': gamePk})
        plays = data.get('allPlays', [])
        all_plays.extend(plays)
    except Exception as e:
        # Best effort: report the failure, remember the game, and keep going.
        print(f"Error fetching gamePk {gamePk}: {e}")
        failed_game_pks.append(gamePk)

In [78]:
#result table
def extract_result(play, event):
    """Build one row of the result table for a single pitch event.

    Args:
        play: play dict; its 'result' mapping supplies the outcome fields.
        event: event dict supplying 'playId', 'startTime' and 'endTime'.

    Returns:
        dict with the three identifying keys followed by 'type', 'event',
        'eventType', 'description', 'awayScore', 'homeScore' and 'isOut'.
        Outcome fields absent from play['result'] come back as None.

    Raises:
        KeyError: if an identifying key is missing from ``event`` or
            ``play`` has no 'result' entry.
    """
    # Identifying keys are mandatory, so they are indexed directly.
    row = {key: event[key] for key in ('playId', 'startTime', 'endTime')}

    outcome = play['result']
    for field in ('type', 'event', 'eventType', 'description',
                  'awayScore', 'homeScore', 'isOut'):
        row[field] = outcome.get(field)
    return row

In [79]:
#about table
def extract_about(play, event):
    """Build one row of the about table for a single pitch event.

    Args:
        play: play dict; its 'about' mapping supplies the context fields.
        event: event dict supplying 'playId', 'startTime' and 'endTime'.

    Returns:
        dict with the three identifying keys followed by 'atBatIndex',
        'halfInning', 'isTopInning', 'inning', 'isComplete', 'isScoringPlay'
        and 'hasOut'. Fields absent from play['about'] come back as None.

    Raises:
        KeyError: if ``play`` has no 'about' entry or an identifying key is
            missing from ``event``.
    """
    context = play['about']

    row = {}
    for key in ('playId', 'startTime', 'endTime'):
        row[key] = event[key]
    for field in ('atBatIndex', 'halfInning', 'isTopInning', 'inning',
                  'isComplete', 'isScoringPlay', 'hasOut'):
        row[field] = context.get(field)
    return row

In [80]:
#count table
def extract_count(play, event):
    """Build one row of the count table for a single pitch event.

    Args:
        play: play dict; its 'count' mapping supplies balls, strikes and outs.
        event: event dict supplying 'playId', 'startTime' and 'endTime'.

    Returns:
        dict with the three identifying keys followed by 'balls', 'strikes'
        and 'outs'. Fields absent from play['count'] come back as None.

    Raises:
        KeyError: if ``play`` has no 'count' entry or an identifying key is
            missing from ``event``.
    """
    tally = play['count']
    identifiers = {key: event[key] for key in ('playId', 'startTime', 'endTime')}
    counts = {field: tally.get(field) for field in ('balls', 'strikes', 'outs')}
    return {**identifiers, **counts}

In [81]:
#matchup table
def extract_matchup(play, event):
    """Build one row of the matchup table for a single pitch event.

    Args:
        play: play dict. Must contain 'matchup' (with 'batter', 'batSide',
            'pitcher', 'pitchHand' and 'splits' mappings) and 'result'.
            'about' is read when present.
        event: event dict supplying 'playId', 'startTime' and 'endTime'.

    Returns:
        dict with the identifying keys, batter and pitcher identity and
        handedness, 'menOnBase', 'isScoringEvent', 'rbi', and the
        placeholders 'responsiblePitcher' and 'earned' (always None).

    Raises:
        KeyError: if a required mapping or identifying key is missing.
    """
    matchup = play['matchup']
    return {
        'playId': event['playId'],
        'startTime': event['startTime'],
        'endTime': event['endTime'],
        'batterId': matchup['batter'].get('id'),
        'batterName': matchup['batter'].get('fullName'),
        'batSide': matchup['batSide'].get('code'),
        'pitcherId': matchup['pitcher'].get('id'),
        'pitcherName': matchup['pitcher'].get('fullName'),
        'pitchHand': matchup['pitchHand'].get('code'),
        'menOnBase': matchup['splits'].get('menOnBase'),
        'responsiblePitcher': None,  # not in matchup, might be in runner details
        # Play-level scoring flag. This used to copy result['isOut'], which
        # records an out, not a run; None when the play has no 'about' entry.
        'isScoringEvent': play.get('about', {}).get('isScoringPlay'),
        'rbi': play['result'].get('rbi'),
        'earned': None  # not in matchup
    }

In [82]:
#pitchdetail table
def extract_pitch_detail(event):
    """Build one row of the pitch-detail table from a single pitch event.

    Args:
        event: event dict. 'playId', 'startTime', 'endTime' and 'details'
            are required; 'pitchData' (with optional 'coordinates' and
            'breaks' sub-mappings), 'pitchNumber', 'isPitch' and 'type'
            are optional.

    Returns:
        dict holding the identifying keys, the call flags from 'details',
        speed and strike-zone values, the coordinate fields, the break and
        spin fields, the zone, and the event-level 'pitchNumber', 'isPitch'
        and 'type'. Any optional value that is absent comes back as None.

    Raises:
        KeyError: if an identifying key or 'details' is missing.
    """
    pitch_data = event.get('pitchData', {})
    coordinates = pitch_data.get('coordinates', {})
    movement = pitch_data.get('breaks', {})

    # Identifying keys are mandatory, so they are indexed directly.
    row = {key: event[key] for key in ('playId', 'startTime', 'endTime')}

    details = event['details']
    for key in ('isInPlay', 'isStrike', 'isBall', 'code', 'description', 'isOut'):
        row[key] = details.get(key)

    for key in ('startSpeed', 'strikeZoneTop', 'strikeZoneBottom'):
        row[key] = pitch_data.get(key)

    for key in ('aY', 'aZ', 'pfxX', 'pfxZ', 'pX', 'pZ', 'vX0', 'vY0', 'vZ0',
                'x', 'y', 'x0', 'y0', 'z0', 'aX'):
        row[key] = coordinates.get(key)

    for key in ('breakAngle', 'breakLength', 'breakY', 'breakVertical',
                'breakHorizontal', 'spinRate', 'spinDirection'):
        row[key] = movement.get(key)

    # 'zone' lives next to the speed values but is emitted after the breaks
    # so the column order of the table stays the same.
    row['zone'] = pitch_data.get('zone')

    for key in ('pitchNumber', 'isPitch', 'type'):
        row[key] = event.get(key)
    return row

In [83]:
#DetailsCall table
def extract_details_call(event):
    """Build one row of the DetailsCall table from a single pitch event.

    Args:
        event: event dict supplying 'playId', 'startTime', 'endTime' and a
            'details' mapping.

    Returns:
        dict with the three identifying keys plus 'description', taken from
        event['details'] (None when the description is absent).

    Raises:
        KeyError: if an identifying key or 'details' is missing.
    """
    row = {key: event[key] for key in ('playId', 'startTime', 'endTime')}
    row['description'] = event['details'].get('description')
    return row

In [84]:
#hit data table
def extract_hit_data(event):
    """Build one row of the hit-data table from a single pitch event.

    Args:
        event: event dict supplying 'playId', 'startTime' and 'endTime';
            'hitData' (with an optional 'coordinates' sub-mapping) and
            'index' are optional.

    Returns:
        dict with the identifying keys, 'launchSpeed', 'launchAngle',
        'totalDistance', 'trajectory', 'hardness', 'location', 'coordX',
        'coordY' and the event's 'index'. Absent values come back as None.

    Raises:
        KeyError: if an identifying key is missing from ``event``.
    """
    batted_ball = event.get('hitData', {})
    landing = batted_ball.get('coordinates', {})

    row = {}
    for key in ('playId', 'startTime', 'endTime'):
        row[key] = event[key]
    for key in ('launchSpeed', 'launchAngle', 'totalDistance',
                'trajectory', 'hardness', 'location'):
        row[key] = batted_ball.get(key)
    for key in ('coordX', 'coordY'):
        row[key] = landing.get(key)
    row['index'] = event.get('index')
    return row

In [85]:
# One list of row dicts per output table. They are re-created here, so running
# this cell again rebuilds the rows instead of appending to old ones.
result_rows, about_rows, count_rows = [], [], []
matchup_rows, pitch_rows, call_rows, hit_rows = [], [], [], []

for play in all_plays:
    # .get(): a play without a 'playEvents' list contributes no rows instead of
    # stopping the whole run with a KeyError.
    for event in play.get('playEvents', []):
        # Only pitch events become rows; every other event is skipped.
        if not event.get('isPitch'):
            continue
        result_rows.append(extract_result(play, event))
        about_rows.append(extract_about(play, event))
        count_rows.append(extract_count(play, event))
        matchup_rows.append(extract_matchup(play, event))
        pitch_rows.append(extract_pitch_detail(event))
        call_rows.append(extract_details_call(event))
        # The hit table only gets a row for pitches that carry 'hitData'.
        if 'hitData' in event:
            hit_rows.append(extract_hit_data(event))

In [49]:
import pandas as pd

# Turn each list of row dicts into its own DataFrame.
df_result = pd.DataFrame(result_rows)
df_about = pd.DataFrame(about_rows)
df_count = pd.DataFrame(count_rows)
df_matchup = pd.DataFrame(matchup_rows)
df_pitchv = pd.DataFrame(pitch_rows)
df_call = pd.DataFrame(call_rows)
df_hit = pd.DataFrame(hit_rows)

# Save each DataFrame to a CSV file
df_result.to_csv('df_result.csv', index=False)
df_about.to_csv('df_about.csv', index=False)
df_count.to_csv('df_count.csv', index=False)
df_matchup.to_csv('df_matchup.csv', index=False)
# The pitch frame is named df_pitchv; the undefined name df_pitch used here
# before raised NameError and left the remaining files unwritten.
df_pitchv.to_csv('df_pitch.csv', index=False)
df_call.to_csv('df_call.csv', index=False)
df_hit.to_csv('df_hit.csv', index=False)

In [50]:
# Write every table to its own CSV file, without the index column.
csv_targets = (
    (df_result, 'df_result.csv'),
    (df_about, 'df_about.csv'),
    (df_count, 'df_count.csv'),
    (df_matchup, 'df_matchup.csv'),
    (df_pitchv, 'df_pitch.csv'),
    (df_call, 'df_call.csv'),
    (df_hit, 'df_hit.csv'),
)
for frame, filename in csv_targets:
    frame.to_csv(filename, index=False)

In [59]:
def extract_pitch_detail_2(event):
    """Build one pitch-detail row that also carries the pitch type.

    Same layout as the pitch-detail row, with one extra 'pitchType' column
    placed right after 'isOut'.

    Args:
        event: event dict. 'playId', 'startTime', 'endTime' and 'details'
            are required; 'pitchData' (with optional 'coordinates' and
            'breaks' sub-mappings), 'pitchNumber', 'isPitch' and 'type'
            are optional.

    Returns:
        dict of identifying keys, call flags, 'pitchType', speed and
        strike-zone values, coordinate fields, break and spin fields, zone,
        and the event-level 'pitchNumber', 'isPitch' and 'type'. Optional
        values that are absent come back as None.

    Raises:
        KeyError: if an identifying key or 'details' is missing.
    """
    pitch_data = event.get('pitchData', {})
    coordinates = pitch_data.get('coordinates', {})
    movement = pitch_data.get('breaks', {})
    # The pitch type sits two levels down: details -> type -> description.
    pitch_type = event.get('details', {}).get('type', {}).get('description')

    row = {}
    for key in ('playId', 'startTime', 'endTime'):
        row[key] = event[key]

    details = event['details']
    for key in ('isInPlay', 'isStrike', 'isBall', 'code', 'description', 'isOut'):
        row[key] = details.get(key)

    row['pitchType'] = pitch_type

    for key in ('startSpeed', 'strikeZoneTop', 'strikeZoneBottom'):
        row[key] = pitch_data.get(key)

    for key in ('aY', 'aZ', 'pfxX', 'pfxZ', 'pX', 'pZ', 'vX0', 'vY0', 'vZ0',
                'x', 'y', 'x0', 'y0', 'z0', 'aX'):
        row[key] = coordinates.get(key)

    for key in ('breakAngle', 'breakLength', 'breakY', 'breakVertical',
                'breakHorizontal', 'spinRate', 'spinDirection'):
        row[key] = movement.get(key)

    # Emitted after the breaks so the column order of the table is unchanged.
    row['zone'] = pitch_data.get('zone')

    for key in ('pitchNumber', 'isPitch', 'type'):
        row[key] = event.get(key)
    return row

In [62]:
# Collect one pitch-detail row (with pitch type) for every pitch event of every
# play; plays without a 'playEvents' list simply add nothing.
pitch_rows2 = [
    extract_pitch_detail_2(pitch_event)
    for current_play in all_plays
    for pitch_event in current_play.get('playEvents', [])
    if pitch_event.get('isPitch')
]

df_pitch2 = pd.DataFrame(pitch_rows2)

In [65]:
# Write the pitch table that includes the pitchType column to its own CSV file,
# leaving out the index column.
df_pitch2.to_csv('df_pitch2.csv', index=False)

In [71]:
# Base a runner started from -> flag column that marks the base as occupied.
ORIGIN_BASE_FLAGS = {'1B': 'on_first', '2B': 'on_second', '3B': 'on_third'}
BASERUNNER_COLUMNS = ['playId', 'startTime', 'endTime', *ORIGIN_BASE_FLAGS.values()]

baserunners_data = []

for play in all_plays:
    # A base counts as occupied when any runner of this play has a movement
    # that starts from it; other origins (including None) are ignored.
    occupied = dict.fromkeys(ORIGIN_BASE_FLAGS.values(), False)
    for runner in play.get('runners', []):
        origin = runner.get('movement', {}).get('originBase')
        if origin in ORIGIN_BASE_FLAGS:
            occupied[ORIGIN_BASE_FLAGS[origin]] = True

    # NOTE(review): the play dicts do not appear to carry a 'playId' (every
    # other table in this notebook reads it from the pitch event), so reading
    # it from the play left the column empty. Rows are therefore emitted per
    # pitch event and keyed by that event's playId/startTime/endTime, which
    # lets this table be joined to the others. The flags are computed once per
    # play and repeated on each of its pitches -- confirm this granularity.
    for event in play.get('playEvents', []):
        if not event.get('isPitch'):
            continue
        baserunners_data.append({
            'playId': event.get('playId'),
            'startTime': event.get('startTime'),
            'endTime': event.get('endTime'),
            **occupied,
        })

# Convert to DataFrame; explicit columns keep the layout even when there are no rows.
baserunners = pd.DataFrame(baserunners_data, columns=BASERUNNER_COLUMNS)