# Scrape Functions
## Oct. 21, 2021

In [4]:
import pandas as pd
import requests
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import time

Cleaning up the functions created in the Oct. 21 file to scrape Baseball Reference for minor and major league team data.

In [2]:
def yearly_links(year, teams_excl=None):
    '''
    Collects, for one season, the links to each organization's MLB page and affiliate team pages
    
    Input:
    year, an integer season (it is placed in the URL and compared against 2005)
    teams_excl, an optional iterable of level indices to EXCLUDE from every organization, where:
    0 = MLB
    1 = Triple-A
    2 = Double-A
    3 = High-A
    4 = Low-A
    5 = Short-Season A
    6 = Rookie
    7 = Foreign Rookie
    Duplicate indices are ignored; None (the default) excludes nothing
    
    Output:
    links, a list with one entry per organization row of the affiliates table; each entry is a
    list with one element per remaining level (index 0 = MLB team unless excluded), and each
    element is a list of URL extensions for the teams at that level ([''] when there are none)
    '''
    # Creating BeautifulSoup object from URL
    # The response is used as a context manager so the connection is closed after parsing
    
    url = f'https://www.baseball-reference.com/register/affiliate.cgi?year={year}' # URL to use
    with urlopen(url) as html: # collecting HTML data
        soup = BeautifulSoup(html, features="lxml") # creating object from HTML

    # Gets non-header rows
    
    rows = soup.findAll('tr')[1:] # excluding the header row
    
    # Captures the text between the first '=' and the following '&' of the franchise link
    # Raw string: '=' and '&' need no escaping, and '\=' / '\&' are invalid string escapes
    
    franchise_pattern = re.compile(r'=(.*?)&')
    
    # Levels to drop, highest index first so earlier deletions don't shift later ones
    # De-duplicated so a repeated index cannot remove an extra level
    
    if teams_excl is None:
        excluded = []
    else:
        excluded = sorted(set(teams_excl), reverse=True)
    
    links = []

    # Iterates over rows, gets cell in row, gets links to teams in that row (if they exist)
    
    for row in rows:
        
        # Gets link to franchise page, and extracts abbreviation
        
        franchise_link = row.findAll('th')[0].a['href']
        franchise = franchise_pattern.search(franchise_link).group(1)
        
        # Baseball Reference hasn't adjusted Expos/Nationals abbreviations
        # Changing this manually when necessary
        
        if franchise == 'WSN' and year < 2005:
            franchise = 'MON'
            
        row_data = [[f'/teams/{franchise}/{year}.shtml']] # MLB link as its own list
        
        # The first data cell just counts the number of teams for each organization
        # It doesn't have an HTML link, so it is skipped
        
        for td in row.findAll('td')[1:]: # for each affiliate level
            
            hrefs = [affil['href'] for affil in td.findAll('a')]
            row_data.append(hrefs or ['']) # [''] marks a level without an affiliate
            
        # Removes whatever levels were passed in to exclude
        
        for idx in excluded:
            del row_data[idx]
            
        links.append(row_data)
        
    return links

In [3]:
def _parse_stats_table(page, table_id, all_mlb):
    '''
    Cuts one stats table out of the raw page text and turns it into a dataframe
    
    Input:
    page, a string holding the full text of the team page
    table_id, the id attribute of the table ('team_batting' or 'team_pitching')
    all_mlb, a boolean; when True every row gets MLB = '1'
    
    Output:
    a dataframe of strings with one row per table row after the header row, whose columns are
    the header cells (minus the first, ordering column) plus a final 'MLB' column
    
    Raises:
    IndexError if no table rows are found for table_id
    '''
    # Slices the raw text rather than parsing the whole page,
    # because some info is contained in HTML comments
    
    start = page.find(f'<table class="sortable stats_table" id="{table_id}"')
    end = page.find(f'<div class="footer no_hide_long" id="tfooter_{table_id}"')
    soup = BeautifulSoup(page[start:end], features="lxml")
    
    table_rows = soup.findAll('tr')
    
    if not table_rows:
        raise IndexError(f"no rows found for table '{table_id}'")
    
    # Headers come from the first row, excluding the ordering column
    # Adds in var. for whether player played in majors
    
    headers = [th.getText() for th in table_rows[0].findAll('th')][1:]
    headers.append('MLB')
    
    records = []
    
    for row in table_rows[1:]: # excluding the header row, but including the summary row
        
        record = [td.getText() for td in row.findAll('td')]
        
        # For minor league teams: the row's HTML contains 'strong' (name is bolded)
        # For major league teams: everyone
        
        record.append('1' if all_mlb or 'strong' in str(row) else '0')
        records.append(record)
        
    return pd.DataFrame(records, columns=headers)


def team_data(url_ext):
    '''
    Scrapes one Baseball Reference team page for its level, year, organization, and player stats
    
    Input:
    url_ext, a string that will be appended to the Baseball Reference page and scraped
    
    Output:
    level, a string: 'MLB' when url_ext contains 'teams', otherwise the text following
           'Classification: ' on the page
    year, a string with the season
    org, a string with the organization abbreviation
    bat, a dataframe containing batting information for each player on that team
    pitch, a dataframe containing pitching information for each player on that team
    '''
    # Fetching the raw page text
    # NOTE(review): no timeout is passed to requests.get, so a stalled connection will block
    
    page = requests.get(f'https://www.baseball-reference.com{url_ext}').text # URL to use

    # Getting level, organization, and year
    
    is_mlb = 'teams' in url_ext # i.e., the URL is for the major league team
    
    if is_mlb:
        
        level = 'MLB'
        year = url_ext[-10:-6] # isolates year from URL, e.g. '1998' in '/teams/ANA/1998.shtml'
        org = re.search(r'teams/(.*?)/', url_ext).group(1) # gets organization
        
    else: # if it's for the minor league team
        
        # Subsets the page data and does string manipulation to extract level, year, and org
        
        header = page[page.find('<strong>Classification'):page.find('<strong>Manager:')]
        
        level = header[header.find(': '):header.find('\n')][2:]
        
        # First whitespace-separated token that holds the link to the organization's affiliates page
        
        org_link = [s for s in header.split() if 'href="/register/affiliate.cgi?' in s][0]
        
        # Drops the first 30 characters (the length of 'href="/register/affiliate.cgi?')
        # presumably leaving the query string, org first and then year -- verify against the page
        
        query = org_link[30:]
        
        year = query[query.find('year='):query.find('"')][5:]
        org = query[query.find('='):query.find('&')][1:]
        
    # Parsing the two tables of interest into dataframes
    
    bat = _parse_stats_table(page, 'team_batting', is_mlb)
    pitch = _parse_stats_table(page, 'team_pitching', is_mlb)
        
    return level, year, org, bat, pitch

In [18]:
# for i in range(1998, 2020):
    
#     links = yearly_links(i)
    
# For every organization in tst, prints one boolean per level: True when the first
# link at that level is a non-empty string, False when it is ''
# (tst is presumably a result of yearly_links, which stores [''] for a level with no affiliate)
for org_levels in tst:
    for level_links in org_levels:
        first_link = level_links[0]
        print(first_link != '')
        
        


True
True
True
True
True
True
True
True
True
True
False
True
True
False
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
False
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
False
True
True
True
True
True
False
True
False
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
False
True
True
True
True
True
True
True
True
False
True
True
True
True
True
True
True
False
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
False
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
False
True
True
True
True
True
True
False
Tru

In [13]:
# Displays tst: per the output below, a list of organizations, each a list of per-level
# lists of URL extensions, with the MLB team page first
tst

[[['/teams/ANA/1998.shtml'],
  ['/register/team.cgi?id=eb62c2d6'],
  ['/register/team.cgi?id=02e86fc6'],
  ['/register/team.cgi?id=c7128ac0'],
  ['/register/team.cgi?id=4de46db5'],
  ['/register/team.cgi?id=e55645b4'],
  ['/register/team.cgi?id=2e61b921'],
  ['/register/team.cgi?id=77ca8df8']],
 [['/teams/ARI/1998.shtml'],
  ['/register/team.cgi?id=ca985421'],
  [''],
  ['/register/team.cgi?id=3d93b47f'],
  ['/register/team.cgi?id=75235438'],
  [''],
  ['/register/team.cgi?id=aee1e45c', '/register/team.cgi?id=42b26f6b'],
  ['/register/team.cgi?id=8429e1f1']],
 [['/teams/ATL/1998.shtml'],
  ['/register/team.cgi?id=3f2c4e04'],
  ['/register/team.cgi?id=b6314403'],
  ['/register/team.cgi?id=5c689693'],
  ['/register/team.cgi?id=bd7af2f7'],
  ['/register/team.cgi?id=94fc1885'],
  ['/register/team.cgi?id=4f4b00c5', '/register/team.cgi?id=08702ef5'],
  ['/register/team.cgi?id=7ef15500']],
 [['/teams/BAL/1998.shtml'],
  ['/register/team.cgi?id=47fb0847'],
  ['/register/team.cgi?id=999fa0d0'],