# Scrape Functions (first draft)
## Oct. 19, 2021

In [3]:
import pandas as pd
import requests
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

Using the code developed in the Oct. 18, 2021 notebook to build scraping functions for minor league data.

In [51]:
def yearly_links(year, teams_excl=None):
    '''
    Takes as input a year and teams, and returns a collection of links to minor league teams for that year
    
    Input:
    year, the season to look up (an integer such as 2021); it is interpolated into the URLs
        and compared against 2005 for the Expos/Nationals adjustment below
    teams_excl, an optional list containing the indices of team levels to EXCLUDE, where:
    0 = MLB
    1 = Triple-A
    2 = Double-A
    3 = High-A
    4 = Low-A
    5 = Short-Season A
    6 = Rookie
    7 = Foreign Rookie
    Defaults to None, which excludes nothing. Repeated indices are only applied once.
    
    Output:
    links, a list with one entry per organization row of the affiliates page. Each entry is a
    list with one sub-list of URL extensions per remaining level (index 0 = MLB team, unless
    excluded); a level without an affiliate is represented by ['']. With no exclusions this is
    expected to be 30 organizations x 8 levels.
    '''
    # Normalising the exclusions once: de-duplicated so a repeated index cannot delete
    # two different levels, and descending so earlier deletions don't shift later ones
    
    excluded = sorted(set(teams_excl), reverse=True) if teams_excl else []
    
    # Creating BeautifulSoup object from URL
    # The response is parsed inside the with-block so the connection is always closed
    
    url = f'https://www.baseball-reference.com/register/affiliate.cgi?year={year}' # URL to use
    with urlopen(url) as html: # collecting HTML data
        soup = BeautifulSoup(html, features="lxml") # creating object from HTML
    
    # The franchise abbreviation sits between the first '=' and the following '&' of the link
    
    abbrev_pattern = re.compile(r'=(.*?)&')
    links = []
    
    # Iterates over non-header rows, gets links to teams in that row (if they exist)
    # Includes MLB team as well, assuming they're in the list of teams to scrape
    
    for row in soup.findAll('tr')[1:]: # excluding the header row
        
        # Gets link to franchise page, and extracts abbreviation
        
        franchise_link = row.findAll('th')[0].a['href']
        franchise = abbrev_pattern.search(franchise_link).group(1)
        
        # Baseball Reference hasn't adjusted Expos/Nationals abbreviations
        # Changing this manually when necessary
        
        if franchise == 'WSN' and year < 2005:
            franchise = 'MON'
        
        row_data = [[f'/teams/{franchise}/{year}.shtml']] # MLB link, as its own list
        
        for td in row.findAll('td'): # for each affiliate level
            
            affil_list = [affil['href'] for affil in td.findAll('a')]
            row_data.append(affil_list or ['']) # [''] marks a level without an affiliate
        
        # The second column just counts the number of teams for each organization
        # It doesn't have an HTML link, so we just exclude this element from the row
        
        row_data.pop(1)
        
        # This also excludes whatever levels were input into the function
        
        for i in excluded:
            del row_data[i]
        
        links.append(row_data)
    
    return links

In [5]:
def team_data(url_ext):
    '''
    Downloads a Baseball Reference team page and parses its batting and pitching tables
    
    Work in progress: the header lists and table rows are collected for both tables, but
    nothing is assembled or returned yet (the function currently returns None).
    
    Input:
    url_ext, a string that will be appended to the Baseball Reference page and scraped
    
    Output (intended, not implemented yet):
    team_df_bat, a dataframe containing batting information for each player on that team
    team_df_pitch, a dataframe containing pitching information for each player on that team
    '''
    # Downloading the raw page text
    
    response = requests.get(f'https://www.baseball-reference.com{url_ext}')
    page_text = response.text
    
    # Cutting out the text of the two tables of interest, from the opening table tag
    # up to the table's footer div
    # Working on the raw text because some info is contained in HTML comments
    
    bat_start = page_text.find('<table class="sortable stats_table" id="team_batting"')
    bat_end = page_text.find('<div class="footer no_hide_long" id="tfooter_team_batting"')
    bat_markup = page_text[bat_start:bat_end]
    
    pitch_start = page_text.find('<table class="sortable stats_table" id="team_pitching"')
    pitch_end = page_text.find('<div class="footer no_hide_long" id="tfooter_team_pitching"')
    pitch_markup = page_text[pitch_start:pitch_end]
    
    # One BeautifulSoup object per table
    
    bat_soup = BeautifulSoup(bat_markup, features="lxml")
    pitch_soup = BeautifulSoup(pitch_markup, features="lxml")
    
    # Column names come from the first row of each table
    # An extra 'MLB' column is added for whether the player played in the majors
    
    bat_header_row = bat_soup.findAll('tr', limit=2)[0]
    headers_bat = [cell.getText() for cell in bat_header_row.findAll('th')] + ['MLB']
    
    pitch_header_row = pitch_soup.findAll('tr', limit=2)[0]
    headers_pitch = [cell.getText() for cell in pitch_header_row.findAll('th')] + ['MLB']
    
    # Everything after the header row (the summary row is kept)
    
    bat_rows = bat_soup.findAll('tr')[1:]
    pitch_rows = pitch_soup.findAll('tr')[1:]
    
    # Putting them into their own tables
    
    # NEED TO FIGURE HOW TO ADD WHETHER THE PLAYER PLAYED IN THE MAJORS
    
#     team_bat = [[td.getText() for td in bat_rows[i].findAll('td')] for i in range(len(bat_rows))]
#     team_pitch = [[td.getText() for td in pitch_rows[i].findAll('td')] for i in range(len(pitch_rows))]

In [9]:
# Scratch run of the same steps as team_data, on one fixed team page
# (plain string: the URL has no placeholders, so no f-prefix is needed)

page = requests.get('https://www.baseball-reference.com/register/team.cgi?id=7f5ab515').text # URL to use

# Gets subset of text corresponding to the two tables of interest
# Doing it this way because some info is contained in HTML comments
    
batting = page[page.find('<table class="sortable stats_table" id="team_batting"'):
               page.find('<div class="footer no_hide_long" id="tfooter_team_batting"')]
    
pitching = page[page.find('<table class="sortable stats_table" id="team_pitching"'):
                page.find('<div class="footer no_hide_long" id="tfooter_team_pitching"')]
    
# Creating BeautifulSoup objects
    
soup_bat = BeautifulSoup(batting, features="lxml")
soup_pitch = BeautifulSoup(pitching, features="lxml")

# Getting rows for each table

bat_rows = soup_bat.findAll('tr')[1:] # excluding the header row, but including the summary row
pitch_rows = soup_pitch.findAll('tr')[1:]

# Putting them into their own tables: one list of cell texts per row
# Only the td cells are read, so any th cell in a row is not included

team_bat = [[td.getText() for td in row.findAll('td')] for row in bat_rows]
team_pitch = [[td.getText() for td in row.findAll('td')] for row in pitch_rows]

In [14]:
# Re-collecting the rows of each parsed table
# (header row dropped, summary row kept)

bat_rows = soup_bat.findAll('tr')[1:]
pitch_rows = soup_pitch.findAll('tr')[1:]

# Turning every row into the list of its td cell texts

team_bat = [[cell.getText() for cell in row.findAll('td')] for row in bat_rows]
team_pitch = [[cell.getText() for cell in row.findAll('td')] for row in pitch_rows]

In [23]:
# Probe: does the markup of the fifth collected batting row contain the text 'strong'?
# Presumably a check for bold player names as a possible majors indicator -- TODO confirm
'strong' in str(bat_rows[4])

False

In [10]:
# Trial call of team_data on the same team page id used in the scratch cells
# NOTE(review): the recorded output below is table HTML, but team_data as defined in this
# file has no return statement -- presumably the output came from an earlier version; re-run to confirm
team_data('/register/team.cgi?id=7f5ab515')

<html><body><table class="sortable stats_table" data-cols-to-freeze=",2" id="team_batting">
<caption>Team Batting</caption>
<colgroup><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/></colgroup>
<thead>
<tr>
<th aria-label=" Rank This is a count of the rows from top to bottom. It is recalculated following the sorting of a column." class="tooltip ranker poptip sort_default_asc show_partial_when_sorting center" data-stat="ranker" data-tip="&lt;strong&gt;Rank&lt;/strong&gt;&lt;br&gt;This is a count of the rows from top to bottom.&lt;br&gt;It is recalculated following the sorting of a column." scope="col">Rk</th>
<th aria-label=" Player Name Bold can mean player is active for this team or player has appeared in the majors * means LHP or LHB, # means switch hitter, + can mean HOFer." class="poptip sort_default_asc show_partial_when_sorting left" data-stat="player" data-tip="&lt;s

In [46]:
# Header labels from the first row of the batting table, plus an extra 'MLB' label
hbat = [th.getText() for th in soup_bat.findAll('tr', limit=2)[0].findAll('th')]
# list.append works in place and returns None, so its result must not be assigned back to hbat
hbat.append('MLB')

In [47]:
# Display hbat to inspect the result of the previous cell
hbat