### 1st Process: Get the TOP 50 batters/pitchers

Search scopes:
- All time
- Season 2024

### Imports

In [2]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [3]:
# Widen notebook output: allow up to 200 characters per cell value and
# never hide columns when a DataFrame is displayed.
pd.options.display.max_colwidth = 200
pd.options.display.max_columns = None

### Utils

In [4]:
def _build_player_frame(player_list, tag):
    """Build the ranking frame (top_idx, top_source, player) for *player_list*."""
    return pd.DataFrame({
        'top_idx': range(1, len(player_list) + 1),
        'top_source': tag,
        'player': player_list,
    })


def get_player_data(url, tag):
    """Scrape player names from the first HTML table found at *url*.

    The page is fetched with a browser-like User-Agent and a 10 second
    timeout. For every table row that has at least two ``<td>`` cells, the
    text of the first ``<a>`` inside the second cell is taken as the player
    name; rows without such a link, or with an empty link text, are skipped.

    Args:
        url: Address of the page to scrape.
        tag: Label stored in the ``top_source`` column of every row.

    Returns:
        A DataFrame with the columns ``top_idx`` (1-based position in page
        order), ``top_source`` (the given *tag*) and ``player``. If the
        request fails, no table is present, or parsing raises, a message is
        printed and an EMPTY DataFrame with the same three columns is
        returned, so the result can always be handed to ``pd.concat``.
    """
    headers = {
       'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 '
                     '(KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
    }

    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error al hacer la solicitud: {e}")
        # Return a frame rather than a list: callers concatenate the results.
        return _build_player_frame([], tag)

    try:
        soup = BeautifulSoup(response.text, "html.parser")
        table = soup.find("table")
        if table is None:
            print("Table not found.")
            return _build_player_frame([], tag)

        player_list = []
        for row in table.find_all("tr"):
            cells = row.find_all("td")
            # Header rows carry no <td> cells; the name link sits in the
            # second cell of a data row.
            if len(cells) <= 1:
                continue
            a_tag = cells[1].find("a")
            if a_tag is None:
                continue
            name = a_tag.text.strip()
            if name:
                player_list.append(name)

        return _build_player_frame(player_list, tag)

    except Exception as e:
        # Best-effort scrape: report the problem and keep the notebook going.
        print(f"Error processing page: {e}")
        return _build_player_frame([], tag)

### Get Top Players: Season 2024 and All Time

In [33]:
# ESPN leaderboard pages and the ranking scraped from each one.
# Requests are issued in this order: batting 2024, batting all-time,
# pitching 2024, pitching all-time.

# Batting, season 2024
batting_leaders_2024 = 'https://www.espn.com/mlb/history/leaders/_/breakdown/season/year/2024'
batting_players_2024 = get_player_data(batting_leaders_2024, 'b2024')

# Batting, all time
batting_leaders_history = 'https://www.espn.com/mlb/history/leaders/_/sort/avg'
batting_players_hist = get_player_data(batting_leaders_history, 'bhist')

# Pitching, season 2024
pitching_leaders_2024 = 'https://www.espn.com/mlb/history/leaders/_/type/pitching/breakdown/season/year/2024/sort/ERA'
pitching_players_2024 = get_player_data(pitching_leaders_2024, 'p2024')

# Pitching, all time
pitching_leaders_history = 'https://www.espn.com/mlb/history/leaders/_/type/pitching'
pitching_players_hist = get_player_data(pitching_leaders_history, 'phist')

In [37]:
# Stack the four rankings into one frame; ignore_index renumbers the rows
# 0..n-1 instead of repeating each source frame's own index.
top_players = pd.concat(
    objs=[
        batting_players_2024,
        batting_players_hist,
        pitching_players_2024,
        pitching_players_hist,
    ],
    ignore_index=True,
)

In [38]:
# Persist the combined ranking without writing the row index as a column.
top_players.to_csv(path_or_buf='mlb_top_players.csv', index=False)

['top_players.pkl']