# Test Scrape
## Oct. 18, 2021

In [2]:
import pandas as pd
from urllib.request import urlopen
from bs4 import BeautifulSoup

Getting more familiar with scraping Baseball Reference.

What code structure might eventually look like (high-level): 
- iterate over years from 1998-2019 (first year w/ 30 teams)
- two different functions:
    - one gets table of minor league affiliates for each year [here](https://www.baseball-reference.com/register/affiliate.cgi?year=2019)
        - depending on the level of detail in the tables, this info can be used to get player performance data from each of these teams' individual season pages. TBD
            - exclude players based on playing time?
        - limit to Rookie and above (no short-season A) --> Rookie, Low-A, High-A, Double-A, and Triple-A
    - one gets league stats from [here](https://www.baseball-reference.com/register/league.cgi?year=1998), adjusting the year at the end of the URL
        - can be used to help calculate league FIP and wOBA in order to normalize player performance to league

In [12]:
# Affiliates page for a single season; the season is the `year` query parameter
url = "https://www.baseball-reference.com/register/affiliate.cgi?year=2005"

# Fetch the page and parse it while the response is still open. The `with`
# block closes the HTTP response afterwards instead of leaving it dangling.
with urlopen(url) as html:
    soup = BeautifulSoup(html, features="lxml")  # parsed document used by the cells below

In [13]:
# Column names: the text of every <th> cell in the first <tr> of the page

first_row = soup.find_all('tr', limit=2)[0]
headers = [cell.get_text() for cell in first_row.find_all('th')]

In [14]:
soup.find_all('tr')  # display every <tr> element to inspect the table's row markup

[<tr>
 <th aria-label="Franchise" class="poptip center" data-stat="franch_name" scope="col">Franchise</th>
 <th aria-label="Tms" class="poptip center" data-stat="teams" scope="col">Tms</th>
 <th aria-label="AAA" class="poptip center" data-stat="AAA" scope="col">AAA</th>
 <th aria-label="AA" class="poptip center" data-stat="AA" scope="col">AA</th>
 <th aria-label="Adv A" class="poptip center" data-stat="Adv A" scope="col">Adv A</th>
 <th aria-label="Assists" class="poptip center" data-stat="A" scope="col">A</th>
 <th aria-label="Short-Season A" class="poptip center" data-stat="Short-Season A" scope="col">Short-Season A</th>
 <th aria-label="Rookie" class="poptip center" data-stat="Rookie" scope="col">Rookie</th>
 <th aria-label="Foreign Rookie" class="poptip center" data-stat="Foreign Rookie" scope="col">Foreign Rookie</th>
 </tr>,
 <tr><th class="left" data-stat="franch_name" scope="row"><a href="/register/affiliate.cgi?id=ARI&amp;year=2005">Arizona Diamondbacks</a></th><td class="left

In [6]:
rows = soup.find_all('tr')[1:]  # excluding the header row

# rows_data collects, for each table row, one list per minor league level;
# each of those lists holds the links to the team pages found in that cell.
rows_data = []

for row in rows:
    cells = row.find_all('td')
    if not cells:
        # A row made up only of <th> cells (like the header row) has no level
        # cells to collect; skipping it avoids an IndexError on an empty row.
        continue

    row_data = []
    # The first <td> is the number of teams ("Tms") and never carries a link,
    # so it is skipped here rather than popped off in a second pass.
    for td in cells[1:]:
        links = [a['href'] for a in td.find_all('a')]
        # Levels without an affiliate keep the [''] placeholder so every row
        # has one entry per level column.
        row_data.append(links if links else [''])
    rows_data.append(row_data)

In [7]:
rows_data
# This is a list of lists of lists:
# - the outer list has one entry per franchise row (30 expected, one per MLB team);
# - each franchise entry holds 7 lists, one per minor league level column;
# - each level list holds the links to that franchise's affiliates at that level,
#   or [''] when no affiliate link was found in that cell.

# Open question: would a dictionary make more sense for this structure?

[[['/register/team.cgi?id=eb62c2d6'],
  ['/register/team.cgi?id=02e86fc6'],
  ['/register/team.cgi?id=c7128ac0'],
  ['/register/team.cgi?id=4de46db5'],
  ['/register/team.cgi?id=e55645b4'],
  ['/register/team.cgi?id=2e61b921'],
  ['/register/team.cgi?id=77ca8df8']],
 [['/register/team.cgi?id=ca985421'],
  [''],
  ['/register/team.cgi?id=3d93b47f'],
  ['/register/team.cgi?id=75235438'],
  [''],
  ['/register/team.cgi?id=aee1e45c', '/register/team.cgi?id=42b26f6b'],
  ['/register/team.cgi?id=8429e1f1']],
 [['/register/team.cgi?id=3f2c4e04'],
  ['/register/team.cgi?id=b6314403'],
  ['/register/team.cgi?id=5c689693'],
  ['/register/team.cgi?id=bd7af2f7'],
  ['/register/team.cgi?id=94fc1885'],
  ['/register/team.cgi?id=4f4b00c5', '/register/team.cgi?id=08702ef5'],
  ['/register/team.cgi?id=7ef15500']],
 [['/register/team.cgi?id=47fb0847'],
  ['/register/team.cgi?id=999fa0d0'],
  ['/register/team.cgi?id=d43539ce'],
  ['/register/team.cgi?id=8f2ffb13'],
  [''],
  ['/register/team.cgi?id=059ae

In [8]:
# Inspect the raw <tr> elements that rows_data was built from (header row excluded)
rows

[<tr><th class="left" data-stat="franch_name" scope="row"><a href="/register/affiliate.cgi?id=ANA&amp;year=1998">Anaheim Angels</a></th><td class="left" data-stat="teams">7</td><td class="left" data-stat="AAA"><a class="poptip" data-tip="Vancouver Canadians, PCL" href="/register/team.cgi?id=eb62c2d6"> Vancouver</a></td><td class="left" data-stat="AA"><a class="poptip" data-tip="Midland Angels, TL" href="/register/team.cgi?id=02e86fc6"> Midland</a></td><td class="left" data-stat="Adv A"><a class="poptip" data-tip="Lake Elsinore Storm, CALL" href="/register/team.cgi?id=c7128ac0"> Lake Elsinore</a></td><td class="left" data-stat="A"><a class="poptip" data-tip="Cedar Rapids Kernels, MIDW" href="/register/team.cgi?id=4de46db5"> Cedar Rapids</a></td><td class="left" data-stat="Short-Season A"><a class="poptip" data-tip="Boise Hawks, NORW" href="/register/team.cgi?id=e55645b4"> Boise</a></td><td class="left" data-stat="Rookie"><a class="poptip" data-tip="Butte Copper Kings, PION" href="/regis