### 4th Process:

- With all the plays, get all the ids (tops and rivals) and make a feature store.
- Retrieve data for stats=yearByYear and acumulate in years.
- The accumulation is because the more the player plays, the better he could be.

### Import

In [19]:
import json
import requests
import pandas as pd
import numpy as np
from concurrent.futures import ThreadPoolExecutor

In [65]:
pd.set_option('display.max_colwidth', 200)
pd.set_option('display.max_columns', None)
pd.set_option('future.no_silent_downcasting', True)

### Utils

In [4]:
# Function to Process Results from Various MLB Stats API Endpoints
# Provided by the Google Colab from MLB Hackaton

def process_endpoint_url(endpoint_url, pop_key=None):
  """
  Fetches data from a URL, parses JSON, and optionally pops a key.

  Args:
    endpoint_url: The URL to fetch data from.
    pop_key: The key to pop from the JSON data (optional, defaults to None).

  Returns:
    A pandas DataFrame containing the processed data
  """
  json_result = requests.get(endpoint_url).content

  data = json.loads(json_result)

   # if pop_key is provided, pop key and normalize nested fields
  if pop_key:
    df_result = pd.json_normalize(data.pop(pop_key), sep = '_')
  # if pop_key is not provided, normalize entire json
  else:
    df_result = pd.json_normalize(data)

  return df_result

### Get Full stats

In [20]:
top_players_plays = pd.read_csv('mlb_top_players_plays.csv')

In [10]:
batter_id = top_players_plays['batter_id'].unique().tolist()
pitcher_id = top_players_plays['pitcher_id'].unique().tolist()
total_id = list(set(batter_id + pitcher_id))

In [None]:
# Full stats list

# Normal data
# ['airOuts', 'atBats', 'balks', 'baseOnBalls', 'battersFaced', 'blownSaves', 'catchersInterference', 'caughtStealing',
# 'completeGames', 'doubles', 'earnedRuns', 'gamesFinished', 'gamesPitched', 'gamesPlayed', 'gamesStarted', 'groundIntoDoublePlay',
# 'groundOuts', 'hitBatsmen', 'hitByPitch', 'hits', 'holds', 'homeRuns', 'inheritedRunners', 'inheritedRunnersScored', 'inningsPitched',
# 'intentionalWalks', 'leftOnBase', 'losses', 'numberOfPitches', 'outs', 'pickoffs', 'plateAppearances', 'rbi', 'runs', 'sacBunts',
# 'sacFlies', 'saveOpportunities', 'saves', 'shutouts', 'stolenBases', 'strikeOuts', 'strikes', 'totalBases', 'triples', 'wildPitches', 'wins']

# Calculated data
# 'atBatsPerHomeRun' = 'atBats' / 'homeRuns'
# 'avg' = 'hits' / 'atBats'
# 'babip' = ('hits' - 'homeRuns') / ('atBats' - 'strikeOuts' - 'homeRuns')
# 'era' = 'earnedRuns' * 9 / 'inningsPitched'
# 'groundOutsToAirouts' = 'groundOuts' / 'airOuts'
# 'hitsPer9Inn' = 'hits' * 9 / 'inningsPitched'
# 'homeRunsPer9' = 'homeRuns' * 9 / 'inningsPitched'
# 'obp' = ('hits' + 'baseOnBalls' + 'hitByPitch') / ('atBats' + 'baseOnBalls' + 'hitByPitch')
# 'ops' = 'obp' + 'slg'
# 'pitchesPerInning' = 'numberOfPitches' / 'inningsPitched'
# 'runsScoredPer9' = 'runs' * 9 / 'inningsPitched'
# 'singles' = 'hits' - 'doubles' - 'triples' - 'homeRuns'
# 'slg' = ('singles' + 2 * 'doubles' + 3 * 'triples' + 4 * 'homeRuns') / 'atBats'
# 'stolenBasePercentage' = 'stolenBases' / ('stolenBases' + 'caughtStealing')
# 'strikePercentage' = 'strikes' / 'numberOfPitches'
# 'strikeoutWalkRatio' = 'strikeOuts' / 'baseOnBalls'
# 'strikeoutsPer9Inn' = 'strikeOuts' * 9 / 'inningsPitched'
# 'walksPer9Inn' = 'baseOnBalls' * 9 / 'inningsPitched'
# 'whip' = ('baseOnBalls' + 'hits') / 'inningsPitched'
# 'winPercentage' = 'wins' / ('wins' + 'losses')

In [None]:
base_stats = [
    'airOuts', 'atBats', 'balks', 'baseOnBalls', 'battersFaced',
    'blownSaves', 'catchersInterference', 'caughtStealing',
    'completeGames', 'doubles', 'earnedRuns', 'gamesFinished',
    'gamesPitched', 'gamesPlayed', 'gamesStarted', 'groundIntoDoublePlay',
    'groundOuts', 'hitBatsmen', 'hitByPitch', 'hits', 'holds', 'homeRuns',
    'inheritedRunners', 'inheritedRunnersScored', 'inningsPitched',
    'intentionalWalks', 'leftOnBase', 'losses', 'numberOfPitches', 'outs',
    'pickoffs', 'plateAppearances', 'rbi', 'runs', 'sacBunts', 'sacFlies',
    'saveOpportunities', 'saves', 'shutouts', 'stolenBases', 'strikeOuts',
    'strikes', 'totalBases', 'triples', 'wildPitches', 'wins'
    ]

base_season_stats = ['season', 'player_id'] + base_stats

def fetch_player_stats(player_id: int):

    player_url = f'https://statsapi.mlb.com/api/v1/people/{player_id}/stats?stats=yearByYear'
    player_data = process_endpoint_url(player_url, "stats")
    player_stats_df = pd.DataFrame(columns=base_season_stats)

    if not player_data.empty:
        player_data = player_data.loc[0, 'splits']
        for p in player_data:
            p_dict = {
                **{'season': p['season'], 'player_id': p['player']['id']},
                **p['stat']
            }
            p_dict_filtered = {
                k: v
                for k, v
                in p_dict.items()
                if k in base_season_stats
            }
            p_data = pd.DataFrame(p_dict_filtered, index=[0])
            player_stats_df = pd.concat([player_stats_df, p_data], ignore_index=True)

        player_stats_df = (
            player_stats_df
            .astype({
                'season': int,
                'inningsPitched': float,
            })
            .sort_values('season')
        )

        player_stats_df.iloc[:, 2:] = player_stats_df.iloc[:, 2:].cumsum()

        player_stats_df = (
            player_stats_df
            .assign(
                singles = player_stats_df['hits'] - player_stats_df['doubles'] - player_stats_df['triples'] - player_stats_df['homeRuns'],
            )
        )

        player_stats_df = (
            player_stats_df
            .assign(
                atBatsPerHomeRun = player_stats_df['atBats'] / player_stats_df['homeRuns'].replace(0, np.nan),
                avg = player_stats_df['hits'] / player_stats_df['atBats'].replace(0, np.nan),
                babip = (player_stats_df['hits'] - player_stats_df['homeRuns']) / (player_stats_df['atBats'] - player_stats_df['strikeOuts'] - player_stats_df['homeRuns']).replace(0, np.nan),
                era = player_stats_df['earnedRuns'] * 9 / player_stats_df['inningsPitched'].replace(0, np.nan),
                groundOutsToAirouts = player_stats_df['groundOuts'] / player_stats_df['airOuts'].replace(0, np.nan),
                hitsPer9Inn = player_stats_df['hits'] * 9 / player_stats_df['inningsPitched'].replace(0, np.nan),
                homeRunsPer9 = player_stats_df['homeRuns'] * 9 / player_stats_df['inningsPitched'].replace(0, np.nan),
                obp = (player_stats_df['hits'] + player_stats_df['baseOnBalls'] + player_stats_df['hitByPitch']) / (player_stats_df['atBats'] + player_stats_df['baseOnBalls'] + player_stats_df['hitByPitch']).replace(0, np.nan),
                pitchesPerInning = player_stats_df['numberOfPitches'] / player_stats_df['inningsPitched'].replace(0, np.nan),
                runsScoredPer9 = player_stats_df['runs'] * 9 / player_stats_df['inningsPitched'].replace(0, np.nan),
                slg = (player_stats_df['singles'] + 2 * player_stats_df['doubles'] + 3 * player_stats_df['triples'] + 4 * player_stats_df['homeRuns']) / player_stats_df['atBats'].replace(0, np.nan),
                stolenBasePercentage = player_stats_df['stolenBases'] / (player_stats_df['stolenBases'] + player_stats_df['caughtStealing']).replace(0, np.nan),
                strikePercentage = player_stats_df['strikes'] / player_stats_df['numberOfPitches'].replace(0, np.nan),
                strikeoutWalkRatio = player_stats_df['strikeOuts'] / player_stats_df['baseOnBalls'].replace(0, np.nan),
                strikeoutsPer9Inn = player_stats_df['strikeOuts'] * 9 / player_stats_df['inningsPitched'].replace(0, np.nan),
                walksPer9Inn = player_stats_df['baseOnBalls'] * 9 / player_stats_df['inningsPitched'].replace(0, np.nan),
                whip = (player_stats_df['baseOnBalls'] + player_stats_df['hits']) / player_stats_df['inningsPitched'].replace(0, np.nan),
                winPercentage = player_stats_df['wins'] / (player_stats_df['wins'] + player_stats_df['losses']).replace(0, np.nan)
            )
        )

        player_stats_df = (
            player_stats_df
            .assign(
                ops = player_stats_df['obp'] + player_stats_df['slg'],
            )
        )

    return player_stats_df

with ThreadPoolExecutor() as executor:
    player_stats_results = list(executor.map(fetch_player_stats, total_id))

all_player_stats = pd.concat(player_stats_results, ignore_index=True)

  all_player_stats = pd.concat(player_results, ignore_index=True)


In [23]:
all_player_stats.to_csv('all_player_stats.csv', index=False)

### Merge data

In [26]:
all_player_stats.head(5)

Unnamed: 0,season,player_id,airOuts,atBats,balks,baseOnBalls,battersFaced,blownSaves,catchersInterference,caughtStealing,...,runsScoredPer9,slg,stolenBasePercentage,strikePercentage,strikeoutWalkRatio,strikeoutsPer9Inn,walksPer9Inn,whip,winPercentage,ops
0,2017,622608,112,504,2,47,564,1,0,1,...,4.828614,0.436508,0.8,0.622914,2.170213,6.840537,3.152012,1.304024,0.666667,0.75903
1,2018,622608,199,857,2,77,954,1,0,1,...,4.694605,0.436406,0.888889,0.622264,2.220779,6.861346,3.089612,1.333036,0.592593,0.761592
2,2019,622608,300,1371,5,134,1536,1,0,1,...,5.578192,0.462436,0.928571,0.615577,1.843284,6.378766,3.460545,1.483501,0.55102,0.810721
3,2020,622608,372,1650,5,152,1839,1,0,1,...,5.230076,0.453333,0.944444,0.618345,1.894737,6.148008,3.244782,1.437381,0.561404,0.795105
4,2021,622608,519,2270,7,184,2509,1,0,2,...,5.124611,0.444493,0.909091,0.625052,2.13587,6.121495,2.866044,1.412253,0.507042,0.783476


In [25]:
top_players_plays = pd.read_csv('mlb_top_players_plays.csv')
top_players_id =pd.read_csv('mlb_top_players_id.csv')

In [40]:
(
    top_players_plays
    .head(5)
    .filter([
        'season', 'batter_id', 'pitcher_id',
        'event_type', 'vs_RHB', 'vs_LHB',
        'vs_SHB', 'vs_RHP', 'vs_LHP'
    ])
).to_dict()

{'season': {0: 1950, 1: 1950, 2: 1950, 3: 1950, 4: 1950},
 'batter_id': {0: 124341, 1: 124341, 2: 124341, 3: 124341, 4: 124341},
 'pitcher_id': {0: 117671, 1: 117671, 2: 117671, 3: 117671, 4: 117671},
 'event_type': {0: 'field_out',
  1: 'field_out',
  2: 'field_out',
  3: 'field_out',
  4: 'single'},
 'vs_RHB': {0: False, 1: False, 2: False, 3: False, 4: False},
 'vs_LHB': {0: True, 1: True, 2: True, 3: True, 4: True},
 'vs_SHB': {0: False, 1: False, 2: False, 3: False, 4: False},
 'vs_RHP': {0: True, 1: True, 2: True, 3: True, 4: True},
 'vs_LHP': {0: False, 1: False, 2: False, 3: False, 4: False}}

If models fail, possibility to reduce the field of action with these stats

In [None]:
# urL = https://raw.githubusercontent.com/MajorLeagueBaseball/google-cloud-mlb-hackathon/main/datasets/mlb-statsapi-docs/MLB-StatsAPI-Spec.json

# Batting Data (64 datos)
# requests.get(mlb_stats_url).json()['components']['schemas']['BattingData']['properties'].keys()

# Selection
# Efficiency: 'average', 'onBasePercentage', 'slugging', 'onBasePlusSlugging'
# Stats: 'hits', 'doubles', triples', 'homeRuns', 'runs', 'runsBattedIn', 'extraBaseHits', 'totalBases'
# Opportunity: 'plateAppearances', 'atBats', 'babip', 'walks', 'hitByPitch', 'intentionalWalks'

# Pitching Data (97 datos)
# requests.get(mlb_stats_url).json()['components']['schemas']['PitchingData']['properties'].keys()

# Selection
# Performance: 'earnedRunAverage', 'walksHitsPerInningPitched', 'inningsPitched', 'outsPitched', 'earnedRuns'
# Efficiency: 'strikeouts', 'walks', 'pitchesThrown', 'balls', 'strikes'
# Results: 'pitchesPerInning', 'strikePercentage', 'strikeoutWalkRatio', 'hitBatsmen', 'passedBall'

# Context
# 'wins', 'losses', 'saves', 'holds'