# Scrape Functions (first draft)
## Oct. 19, 2021

In [11]:
import pandas as pd
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

This notebook uses the code developed in the Oct. 18, 2021 file to build scraping functions for minor league data.

In [51]:
def yearly_links(year, teams_excl=None):
    '''
    Takes as input a year and levels to exclude, and returns a collection of
    links to each organization's teams for that year.

    The links are scraped from the Baseball Reference affiliate register page
    for the given year (requires network access).

    Input:
    year, an int corresponding to the year (it is compared against 2005 below)
    teams_excl, an optional list containing the indices of levels to EXCLUDE
    (default None = exclude nothing; duplicate indices are ignored), where:
    0 = MLB
    1 = Triple-A
    2 = Double-A
    3 = High-A
    4 = Low-A
    5 = Short-Season A
    6 = Rookie
    7 = Foreign Rookie

    Output:
    links, a list with one entry per organization row of the page; each entry
    is a list of per-level lists of link strings (index 0 = MLB team), with
    the excluded levels removed. A level without an affiliate is [''].

    Raises:
    IndexError, if an index in teams_excl is past the last level of a row
    '''
    # Deduplicate and delete from the highest index down, so that removing one
    # level never shifts the position of another level still to be removed
    excluded = sorted(set(teams_excl or ()), reverse=True)

    # Creating BeautifulSoup object from URL; the response is parsed inside
    # the with-block so the connection is always closed afterwards

    url = f'https://www.baseball-reference.com/register/affiliate.cgi?year={year}' # URL to use
    with urlopen(url) as response:
        soup = BeautifulSoup(response, features="lxml")

    rows = soup.find_all('tr')[1:] # excluding the header row

    # The franchise abbreviation is the text between the first '=' and the
    # following '&' in the franchise link
    franchise_pattern = re.compile(r'=(.*?)&')

    links = []

    for row in rows:

        # Gets link to franchise page and extracts the abbreviation

        franchise_link = row.find_all('th')[0].a['href']
        franchise = franchise_pattern.search(franchise_link).group(1)

        # Baseball Reference hasn't adjusted Expos/Nationals abbreviations
        # Changing this manually when necessary

        if franchise == 'WSN' and year < 2005:
            franchise = 'MON'

        row_data = [[f'/teams/{franchise}/{year}.shtml']] # MLB link as its own list

        # The first td just counts the number of teams for each organization
        # It doesn't have an HTML link, so it is skipped; every remaining td
        # is one affiliate level

        for td in row.find_all('td')[1:]:
            hrefs = [affil['href'] for affil in td.find_all('a')]
            # [''] is the placeholder for levels without an affiliate
            row_data.append(hrefs or [''])

        for idx in excluded:
            del row_data[idx]

        links.append(row_data)

    return links

In [54]:
# Example: links for the 1998 season, excluding Short-Season A (5) and Foreign Rookie (7)
yearly_links(1998, [5,7])

[[['/teams/ANA/1998.shtml'],
  ['/register/team.cgi?id=eb62c2d6'],
  ['/register/team.cgi?id=02e86fc6'],
  ['/register/team.cgi?id=c7128ac0'],
  ['/register/team.cgi?id=4de46db5'],
  ['/register/team.cgi?id=2e61b921']],
 [['/teams/ARI/1998.shtml'],
  ['/register/team.cgi?id=ca985421'],
  [''],
  ['/register/team.cgi?id=3d93b47f'],
  ['/register/team.cgi?id=75235438'],
  ['/register/team.cgi?id=aee1e45c', '/register/team.cgi?id=42b26f6b']],
 [['/teams/ATL/1998.shtml'],
  ['/register/team.cgi?id=3f2c4e04'],
  ['/register/team.cgi?id=b6314403'],
  ['/register/team.cgi?id=5c689693'],
  ['/register/team.cgi?id=bd7af2f7'],
  ['/register/team.cgi?id=4f4b00c5', '/register/team.cgi?id=08702ef5']],
 [['/teams/BAL/1998.shtml'],
  ['/register/team.cgi?id=47fb0847'],
  ['/register/team.cgi?id=999fa0d0'],
  ['/register/team.cgi?id=d43539ce'],
  ['/register/team.cgi?id=8f2ffb13'],
  ['/register/team.cgi?id=059aef36', '/register/team.cgi?id=37a60811']],
 [['/teams/BOS/1998.shtml'],
  ['/register/team.