In [5]:
import requests
import pandas as pd
import json
from datetime import datetime, timedelta
import time
import os
import logging

In [None]:
# Setup logging
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)

# MLB Stats API base URL
BASE_URL = "https://statsapi.mlb.com/api"


class MLBStatsAPI:
    def __init__(self, rate_limit_pause=1):
        self.rate_limit_pause = rate_limit_pause  # Time to wait between API calls

    def make_request(self, endpoint, params=None, max_retries=3):
        """Make API request with retries"""
        url = f"{BASE_URL}/{endpoint}"
        for attempt in range(max_retries):
            try:
                response = requests.get(url, params=params)
                response.raise_for_status()
                time.sleep(self.rate_limit_pause)  # Prevent rate limiting
                return response.json()
            except Exception as e:
                logger.warning(
                    f"API request failed: {e}. Attempt {attempt + 1}/{max_retries}"
                )
                if attempt == max_retries - 1:
                    raise
                time.sleep(2)  # Wait before retry

    def get_schedule(self, start_date, end_date):
        """Get MLB schedule for a date range"""
        endpoint = "v1/schedule"
        params = {
            "startDate": start_date,
            "endDate": end_date,
            "sportId": 1,  # MLB
            "gameType": "R",  # Regular season games
            "hydrate": "venue,team",
        }
        return self.make_request(endpoint, params)

    def get_game_data(self, game_pk):
        """Get detailed game data"""
        endpoint = f"v1.1/game/{game_pk}/feed/live"
        params = {"hydrate": "boxscore,weather"}
        return self.make_request(endpoint, params)

    def get_player_stats(self, player_id, season, end_date=None):
        """Get player stats for a season optionally up to a date"""
        endpoint = f"v1/people/{player_id}/stats"

        # Basic stats
        params = {
            "stats": "season",
            "season": season,
            "group": "hitting,pitching",
            "gameType": "R",
        }

        # Add date range if provided
        if end_date:
            params["stats"] = "byDateRange"
            params["startDate"] = f"{season}-03-01"
            params["endDate"] = end_date

        regular_stats = self.make_request(endpoint, params)

        # Advanced stats
        params["stats"] = "sabermetrics"
        advanced_stats = self.make_request(endpoint, params)

        return {"regular": regular_stats, "advanced": advanced_stats}

    def get_matchup_stats(self, batter_id, pitcher_id, season):
        """Get batter vs pitcher matchup stats"""
        endpoint = f"v1/people/{batter_id}/stats/vsPlayer"
        params = {"stats": "vsPlayer", "season": season, "opposingPlayerId": pitcher_id}
        return self.make_request(endpoint, params)


def collect_mlb_data(start_date, end_date, output_dir="mlb_data"):
    """Main function to collect MLB data"""
    # Create output directory
    os.makedirs(output_dir, exist_ok=True)

    # Initialize API client
    api = MLBStatsAPI()
    season = start_date.split("-")[0]  # Extract season year

    # Get schedule
    logger.info(f"Getting MLB schedule from {start_date} to {end_date}")
    schedule = api.get_schedule(start_date, end_date)

    # Data storage
    games_data = []
    batter_stats = []
    pitcher_stats = []
    weather_data = []

    # Process each game date
    for date_info in schedule.get("dates", []):
        game_date = date_info["date"]
        logger.info(f"Processing games for {game_date}")

        # Process each game
        for game in date_info.get("games", []):
            game_pk = game["gamePk"]

            # Skip games that haven't been played yet
            if game["status"]["abstractGameState"] != "Final":
                logger.info(f"Skipping game {game_pk} - not completed")
                continue

            logger.info(f"Processing game {game_pk}")

            try:
                # Get detailed game data
                game_detail = api.get_game_data(game_pk)

                # Extract weather information
                weather = game_detail.get("gameData", {}).get("weather", {})
                weather_data.append(
                    {
                        "game_pk": game_pk,
                        "game_date": game_date,
                        "temperature": weather.get("temp"),
                        "condition": weather.get("condition"),
                        "wind": weather.get("wind"),
                    }
                )

                # Get box score data
                boxscore = game_detail.get("liveData", {}).get("boxscore", {})

                # Team information
                away_team = game["teams"]["away"]["team"]
                home_team = game["teams"]["home"]["team"]

                # Find starting pitchers
                away_starter = None
                home_starter = None

                for player_id, player in (
                    boxscore.get("teams", {}).get("away", {}).get("players", {}).items()
                ):
                    if (
                        player.get("stats", {})
                        .get("pitching", {})
                        .get("gamesStarted", 0)
                        > 0
                    ):
                        away_starter = player
                        break

                for player_id, player in (
                    boxscore.get("teams", {}).get("home", {}).get("players", {}).items()
                ):
                    if (
                        player.get("stats", {})
                        .get("pitching", {})
                        .get("gamesStarted", 0)
                        > 0
                    ):
                        home_starter = player
                        break

                # Process batters for both teams
                for (
                    team_side,
                    opponent_side,
                    team_info,
                    opponent_info,
                    opponent_starter,
                ) in [
                    ("away", "home", away_team, home_team, home_starter),
                    ("home", "away", home_team, away_team, away_starter),
                ]:
                    # Process batters
                    for player_id, player in (
                        boxscore.get("teams", {})
                        .get(team_side, {})
                        .get("players", {})
                        .items()
                    ):
                        # Skip players without batting stats
                        if not player.get("stats", {}).get("batting"):
                            continue

                        actual_player_id = player_id.replace("ID", "")
                        player_name = player.get("person", {}).get("fullName")

                        # Get season stats up to this game
                        season_stats = api.get_player_stats(
                            actual_player_id, season, game_date
                        )

                        # Get matchup stats if opponent starter is available
                        matchup_stats = None
                        if opponent_starter:
                            opponent_pitcher_id = opponent_starter.get(
                                "person", {}
                            ).get("id")
                            try:
                                matchup_stats = api.get_matchup_stats(
                                    actual_player_id, opponent_pitcher_id, season
                                )
                            except Exception as e:
                                logger.error(f"Error getting matchup stats: {e}")

                        # Add to batter stats
                        batter_stats.append(
                            {
                                "game_pk": game_pk,
                                "game_date": game_date,
                                "player_id": actual_player_id,
                                "player_name": player_name,
                                "team_id": team_info["id"],
                                "team_name": team_info["name"],
                                "position": player.get("position", {}).get(
                                    "abbreviation"
                                ),
                                "opposing_team_id": opponent_info["id"],
                                "opposing_team_name": opponent_info["name"],
                                "opposing_pitcher_id": opponent_starter.get(
                                    "person", {}
                                ).get("id")
                                if opponent_starter
                                else None,
                                "opposing_pitcher_name": opponent_starter.get(
                                    "person", {}
                                ).get("fullName")
                                if opponent_starter
                                else None,
                                "game_stats": player.get("stats", {}).get(
                                    "batting", {}
                                ),
                                "season_stats": season_stats,
                                "matchup_stats": matchup_stats,
                            }
                        )

                # Process pitchers for both teams
                for team_side, team_info, opponent_info in [
                    ("away", away_team, home_team),
                    ("home", home_team, away_team),
                ]:
                    for player_id, player in (
                        boxscore.get("teams", {})
                        .get(team_side, {})
                        .get("players", {})
                        .items()
                    ):
                        # Skip players without pitching stats
                        if not player.get("stats", {}).get("pitching"):
                            continue

                        actual_player_id = player_id.replace("ID", "")
                        player_name = player.get("person", {}).get("fullName")

                        # Get season stats up to this game
                        season_stats = api.get_player_stats(
                            actual_player_id, season, game_date
                        )

                        # Add to pitcher stats
                        pitcher_stats.append(
                            {
                                "game_pk": game_pk,
                                "game_date": game_date,
                                "player_id": actual_player_id,
                                "player_name": player_name,
                                "team_id": team_info["id"],
                                "team_name": team_info["name"],
                                "opposing_team_id": opponent_info["id"],
                                "opposing_team_name": opponent_info["name"],
                                "game_stats": player.get("stats", {}).get(
                                    "pitching", {}
                                ),
                                "season_stats": season_stats,
                            }
                        )

                # Add game information
                games_data.append(
                    {
                        "game_pk": game_pk,
                        "game_date": game_date,
                        "away_team_id": away_team["id"],
                        "away_team_name": away_team["name"],
                        "home_team_id": home_team["id"],
                        "home_team_name": home_team["name"],
                        "venue": game.get("venue", {}).get("name"),
                        "status": game["status"]["detailedState"],
                        "weather": weather,
                    }
                )

            except Exception as e:
                logger.error(f"Error processing game {game_pk}: {e}")

    # Convert to DataFrames
    games_df = pd.DataFrame(games_data)
    batter_df = pd.DataFrame(batter_stats)
    pitcher_df = pd.DataFrame(pitcher_stats)
    weather_df = pd.DataFrame(weather_data)

    # Save to CSV
    games_df.to_csv(f"{output_dir}/games.csv", index=False)
    batter_df.to_csv(f"{output_dir}/batters.csv", index=False)
    pitcher_df.to_csv(f"{output_dir}/pitchers.csv", index=False)
    weather_df.to_csv(f"{output_dir}/weather.csv", index=False)

    logger.info(f"Saved data to {output_dir}")
    logger.info(f"Collected {len(games_df)} games")
    logger.info(f"Collected {len(batter_df)} batter performances")
    logger.info(f"Collected {len(pitcher_df)} pitcher performances")

    return {
        "games": games_df,
        "batters": batter_df,
        "pitchers": pitcher_df,
        "weather": weather_df,
    }


# Example usage
if __name__ == "__main__":
    start_date = "2024-03-28"  # Opening Day 2024
    end_date = datetime.now().strftime("%Y-%m-%d")  # Today's date
    output_dir = f"mlb_data_{datetime.now().strftime('%Y%m%d')}"

    data = collect_mlb_data(start_date, end_date, output_dir)


In [6]:
team_list = [
    "diamondbacks",
    "braves",
    "orioles",
    "red sox",
    "cubs",
    "white sox",
    "reds",
    "guardians",
    "rockies",
    "tigers",
    "astros",
    "royals",
    "angels",
    "dodgers",
    "marlins",
    "brewers",
    "twins",
    "mets",
    "yankees",
    "athletics",
    "phillies",
    "pirates",
    "padres",
    "giants",
    "mariners",
    "cardinals",
    "rays",
    "rangers",
    "blue jays",
    "nationals",
]

main_season = [
    yr for yr in range(2023, datetime.datetime.now().year + 1)
] 

In [12]:
BASE_URL = "https://statsapi.mlb.com/api"

endpoint = "v1/schedule"
params = {
    "startDate": "2025-05-04",
    "endDate": "2025-05-04",
    "sportId": 1,  # MLB
    "gameType": "R",  # Regular season games
    "hydrate": "venue,team",
}

url = f"{BASE_URL}/{endpoint}"

response = requests.get(url, params=params)

response.json().get("dates", [])[0].get("games", [])[0].get("gamePk")

778053

In [None]:
endpoint = f"v1.1/game/778053/feed/live"
params = {"hydrate": "boxscore,weather"}

url = f"{BASE_URL}/{endpoint}"

response = requests.get(url, params=params)

# response.json().get("liveData", {}).get("boxscore", {}).get("teams", {}).get("away", {}).get("players", {})["ID640492"].get("stats", {}).get("batting", {})
for player_id, player in (
    response.json().get("liveData", {}).get("boxscore", {}).get("teams", {}).get(
        "away", {}
    ).get("players", {}).items()):
    

{'ID640492': {'person': {'id': 640492,
   'fullName': 'José Azocar',
   'link': '/api/v1/people/640492'},
  'jerseyNumber': '28',
  'position': {'code': '8',
   'name': 'Outfielder',
   'type': 'Outfielder',
   'abbreviation': 'CF'},
  'status': {'code': 'A', 'description': 'Active'},
  'parentTeamId': 121,
  'battingOrder': '900',
  'stats': {'batting': {'summary': '0-3 | K',
    'gamesPlayed': 1,
    'flyOuts': 0,
    'groundOuts': 1,
    'airOuts': 1,
    'runs': 0,
    'doubles': 0,
    'triples': 0,
    'homeRuns': 0,
    'strikeOuts': 1,
    'baseOnBalls': 0,
    'intentionalWalks': 0,
    'hits': 0,
    'hitByPitch': 0,
    'atBats': 3,
    'caughtStealing': 0,
    'stolenBases': 0,
    'stolenBasePercentage': '.---',
    'groundIntoDoublePlay': 0,
    'groundIntoTriplePlay': 0,
    'plateAppearances': 3,
    'totalBases': 0,
    'rbi': 0,
    'leftOnBase': 4,
    'sacBunts': 0,
    'sacFlies': 0,
    'catchersInterference': 0,
    'pickoffs': 0,
    'atBatsPerHomeRun': '-.--',
