### 2nd Process: Complete ID

- Fetch all players from seasons 1850 through 2024 (the code's `range(1850, 2025)` excludes 2025).
- Match the top-players data against the fetched players by name to identify each player's ID.
- Limit top players from 50 to the best 20 of each list.
- 20 batters all time, 20 batters 2024, 20 pitchers all time, 20 pitchers 2024

### Imports

In [2]:
import json
import requests
import pandas as pd
from unidecode import unidecode
from concurrent.futures import ThreadPoolExecutor

In [2]:
# Notebook display settings: allow up to 200 characters per column value and
# place no limit on the number of columns shown.
pd.set_option('display.max_colwidth', 200)
pd.set_option('display.max_columns', None)

### Utils

In [3]:
# Function to Process Results from Various MLB Stats API Endpoints
# Provided by the Google Colab from MLB Hackathon

def process_endpoint_url(endpoint_url, pop_key=None, timeout=30):
    """
    Fetch JSON from a URL and flatten it into a pandas DataFrame.

    Args:
        endpoint_url: The URL to fetch data from.
        pop_key: Top-level key whose value is normalized (optional, defaults
            to None). When given, nested field names are joined with '_';
            when omitted, the entire payload is normalized with pandas'
            default separator.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the caller indefinitely.

    Returns:
        A pandas DataFrame containing the processed data.

    Raises:
        requests.HTTPError: If the server responds with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within `timeout`.
        KeyError: If `pop_key` is given but missing from the response.
    """
    response = requests.get(endpoint_url, timeout=timeout)
    # Fail loudly on HTTP errors instead of parsing an error payload as data.
    response.raise_for_status()

    data = json.loads(response.content)

    # if pop_key is provided, pop key and normalize nested fields
    if pop_key:
        return pd.json_normalize(data.pop(pop_key), sep='_')

    # if pop_key is not provided, normalize entire json
    return pd.json_normalize(data)

### Limit to Top 20 players with ID

In [3]:
# Load the top-players table; the cells below read its 'player', 'top_idx'
# and 'top_source' columns.
top_players = pd.read_csv('mlb_top_players.csv')

In [5]:
# Seasons to query. range() excludes its end value, so this covers 1850 through 2024.
seasons = range(1850, 2025)

def fetch_players_from_seasons(season: int) -> pd.DataFrame:
    """
    Fetch the players listed for one season from the MLB Stats API.

    Args:
        season: The season year to query; it is interpolated into the
            request URL as-is.

    Returns:
        A DataFrame built from the response's "people" entry, or a new empty
        DataFrame when that entry yields no rows.
    """
    seasons_url = f'https://statsapi.mlb.com/api/v1/sports/1/players?season={season}'
    player_seasons = process_endpoint_url(seasons_url, "people")

    # Return a plain empty frame for seasons without players so the caller
    # can concatenate every result without special-casing.
    if player_seasons.empty:
        return pd.DataFrame()

    return player_seasons

# Fetch every season concurrently; executor.map returns the results in the
# same order as the seasons it was given.
with ThreadPoolExecutor() as executor:
    season_results = list(executor.map(fetch_players_from_seasons, seasons))

# Stack the per-season frames and keep the first row seen for each player id.
all_players = pd.concat(season_results, ignore_index=True).drop_duplicates(subset=['id'])

# Add an ASCII-transliterated copy of the name (via unidecode) to match on later.
all_players = all_players.assign(fullName_std=all_players['fullName'].apply(unidecode))

# Expose the id as 'player_id' and keep only the columns needed downstream.
all_players = (
    all_players
    .rename(columns={'id': 'player_id'})
    .filter(items=['player_id', 'fullName', 'fullName_std'])
)

In [6]:
# Attach player IDs to the top-player lists by matching 'player' against the
# transliterated 'fullName_std'. Names that match no fetched player are dropped.
# When several fetched players share a name, the row with the smallest
# player_id is kept for each (top_source, player) pair.
top_players = (
    top_players
    .merge(all_players, left_on='player', right_on='fullName_std', how='left')
    # Remove unmatched rows before grouping so idxmin() never sees an all-NaN
    # group; pandas deprecates that case and plans to raise ValueError for it.
    .dropna(subset=['player_id'])
    .pipe(lambda df: df.loc[df.groupby(['top_source', 'player'])['player_id'].idxmin()])
    .sort_values(by=['top_idx'])
    # After ordering by top_idx, keep the first 20 rows of each list.
    .groupby('top_source')
    .head(20)
    # The left merge leaves player_id as float; convert to nullable integer.
    .astype({'player_id': 'Int32'})
    .filter(items=['player_id', 'top_idx', 'top_source', 'fullName'])
)

In [10]:
# Save the ID-matched top players; index=False keeps the DataFrame index out of the CSV.
top_players.to_csv('mlb_top_players_id.csv', index=False)