# Contains functions to scrape data from various baseball data sites

# In [1]:
# Import statements
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd

### Function to find player IDs

# In [3]:
def id_lookup(player_name:str, id_type:str = "MLB") -> str:
    '''
    Return the ID for the given player_name as a string of digits.

    NOTE: Only supports MLBID for now. id_type is accepted but currently
    ignored; the MLBID column of the ID map is always used.

    Raises ValueError if no row with a usable MLBID matches player_name, or if
    the name matches more than one distinct MLBID (ambiguous lookup).

    >>> id_lookup("Clayton Kershaw","MLB")
    '477132'
    >>> id_lookup("Chris Sale") # Defaults to MLBID
    '519242'
    '''
    # Load in player ID data. Forward slashes are used because they are valid
    # path separators on both Windows and POSIX, and they avoid the invalid
    # "\D" / "\S" escape sequences of a backslash-separated string literal.
    id_df = pd.read_excel("../Data/SFBB-Player-ID-Map.xlsx")

    # Collect the distinct, non-missing MLBIDs recorded for this exact name
    matches = id_df.loc[id_df.PLAYERNAME == player_name, "MLBID"]
    candidate_ids = matches.dropna().unique()

    # Fail with a clear message instead of an opaque Series-to-int conversion error
    if len(candidate_ids) == 0:
        raise ValueError(f"No MLBID found for player {player_name!r}")
    if len(candidate_ids) > 1:
        raise ValueError(
            f"Player name {player_name!r} is ambiguous; "
            f"matching MLBIDs: {[str(int(i)) for i in candidate_ids]}"
        )

    # Go through int first so an ID read from Excel as a float (477132.0)
    # is rendered without the trailing ".0"
    return str(int(candidate_ids[0]))

### Function to scrape Brooks Baseball data

# In [4]:
def scrape_brooksbb(pitcher:str, season:int, table_type:str, params_dict:dict=None) -> pd.DataFrame:
    '''
    Scrapes pitcher info from Brooks Baseball. Takes pitcher name, desired season, and desired table.
    table_type input currently can be either 'po' or 'ra'.
    Returns pandas dataframe of desired table, indexed by 'Pitch Type'.

    Balls, strikes, and baserunners can also be specified as follows in params_dict input:
    params_dict={'balls':2,'strikes':0,'1b':1,'2b':0,'3b':1}
    Baserunners are boolean 0 or 1, while balls can be 0,1,2,3 or -1 for any, and strikes 0,1,2 or -1 for any.
    Balls and strikes default to -1 (any) when not given. params_dict itself is never modified.

    The 'player', 'b_hand', 'rType', 'var', 'startDate' and 'endDate' query parameters are always
    set by this function ('b_hand' is fixed to 'R' and 'rType' to 'perc'); any values supplied
    for them in params_dict are overwritten.

    Raises requests.HTTPError on a non-success HTTP status and requests.Timeout if the site
    does not respond within 30 seconds.

    NOTE: Currently only supports 'Pitch Outcomes' and 'Results and Averages' tables.
          TODO: Add support for 'Pitch Usage' tables to get good test data.
    '''
    # Imported locally so the function does not depend on a module-level import
    from io import StringIO

    # Get ID of given pitcher for use in query params
    pitcher_id = id_lookup(pitcher)

    # Query window runs from Jan 1 of the season to Jan 1 of the following year
    start_date = '01/01/' + str(season)
    end_date = '01/01/' + str(season + 1)

    # Build the query on a fresh dict so the caller's params_dict is left untouched.
    # Count filters default to "any"; caller values override them; the fixed
    # parameters are applied last so they always win.
    query = {'balls': -1, 'strikes': -1}
    query.update(params_dict or {})
    query.update({'player': pitcher_id, 'b_hand': 'R', 'rType': 'perc', 'var': table_type,
                  'startDate': start_date, 'endDate': end_date})

    # Send request; bound the wait and surface HTTP errors instead of parsing an error page
    brooksbb_url = "http://www.brooksbaseball.net/tabs.php"
    response = requests.get(brooksbb_url, params=query, timeout=30)
    response.raise_for_status()

    # Parse the first HTML table. The text is wrapped in StringIO because passing
    # literal HTML to read_html is deprecated in recent pandas versions.
    tables = pd.read_html(StringIO(response.text), header=0)
    cleaned = tables[0].drop_duplicates().dropna().set_index('Pitch Type')

    # Drop the final row (presumably a footer/summary row -- TODO confirm against the site)
    return cleaned[:-1]