## WSA Web Scraping with BeautifulSoup Demo Code

### Imports
Before running these imports, open your terminal and run `pip install requests`, `pip install bs4`, and `pip install mysql-connector-python`.

In [1]:
import requests
from bs4 import BeautifulSoup

### Basic Example: Michigan Football All-Time Passing Leaders (FBReference)
This example shows the basics of web scraping using FBReference. FBReference is part of the very large Sports Reference family of websites, which contains interesting statistics for a wide variety of sports and will be used by many of you in your projects.

Important steps include accessing the website using requests.get, creating a soup object, and using find and find_all functions to get the specific data from the website.

In [2]:
# Scrape Michigan's all-time passing leaders table and print one list of values per player row.

# step 1: make a url request to the specific url
# note: despite its name, `url` holds the Response object returned by requests.get
url = requests.get('https://www.sports-reference.com/cfb/schools/michigan/passing.html')
# fail fast with a clear HTTP error instead of a confusing AttributeError further
# down when the expected table is missing from an error page
url.raise_for_status()

# step 2: create the soup (which essentially is getting all the HTML content from the url request)
soup = BeautifulSoup(url.text, 'html.parser')
# uncomment to inspect the HTML that was downloaded
#print(soup.prettify())

# step 3: use .find and .find_all to get the table, then the rows, then the columns from the HTML content
# note: the following line of code will find the first div tag with the id of div_passing and then the nested table and tbody tag
table = soup.find('div', attrs = {'id' : 'div_passing'}).find('table').find('tbody')

# note: the following line of code will find all tr tags that are nested within the tbody tag
rows = table.find_all('tr')
for row in rows:
    columns = row.find_all('td')

    # rows that contain no td tags (e.g. rows made up only of th cells) carry no player
    # data; skip them so we never print stale values left over from the previous row
    if len(columns) == 0:
        continue

    # step 4: use indexing to find the specific data you are interested in gathering. This may require other functions as well
    # note: for each value, we use .text to parse out separators such as new line characters
    name = columns[0].find('a').text

    start_year = columns[1].text
    end_year = columns[2].text
    years_played = start_year + '-' + end_year

    pass_pct = float(columns[5].text)
    pass_yrds = int(columns[6].text)
    # an empty cell cannot be converted with int()/float(), so fall back to zero
    if columns[9].text == '':
        pass_td = 0
    else:
        pass_td = int(columns[9].text)
    ints = int(columns[10].text)
    if columns[11].text == '':
        qbr = 0.0
    else:
        qbr = float(columns[11].text)

    # step 5: put all your variables into a list and print them
    # (done inside the loop body for data rows only, so every printed list belongs to the current row)
    values = [name, years_played, pass_pct, pass_yrds, pass_td, ints, qbr]
    print(values)

['Chad Henne', '2004-2007', 59.7, 9715, 87, 37, 133.9]
['John Navarre', '2000-2003', 56.1, 9014, 70, 30, 126.0]
['Devin Gardner', '2010-2014', 60.4, 6336, 44, 32, 138.3]
['Denard Robinson', '2009-2012', 57.2, 6250, 49, 39, 138.6]
['J.J. McCarthy', '2021-2023', 67.6, 6226, 49, 11, 160.5]
['Elvis Grbac', '1989-1992', 63.1, 5859, 64, 29, 148.7]
['Shea Patterson', '2018-2019', 60.1, 5661, 45, 15, 144.2]
['Todd Collins', '1991-1994', 65.0, 5504, 34, 17, 146.5]
['Jim Harbaugh', '1983-1986', 63.2, 5214, 31, 19, 149.5]
['Tom Brady', '1996-1999', 61.9, 4773, 30, 17, 134.9]
['Steve Smith', '1980-1983', 50.1, 4529, 41, 30, 126.2]
['Rick Leach', '1975-1978', 47.6, 3799, 45, 29, 136.3]
['Brian Griese', '1995-1997', 59.5, 3663, 27, 15, 130.6]
['Wilton Speight', '2015-2017', 58.8, 3192, 22, 10, 132.2]
['Cade McNamara', '2020-2022', 63.1, 3181, 21, 7, 139.4]
['Jake Rudock', '2015-2015', 64.0, 3017, 20, 9, 141.5]
['Scott Dreisbach', '1995-1998', 54.7, 2920, 15, 12, 126.0]
['Tate Forcier', '2009-2010', 

### Complex Example: 2011-2023 Michigan Football Game Logs
This example also scrapes the FBReference page but instead of scraping 1 page, it scrapes 13 pages using a for loop.

It also shows how to access multiple different pieces of information from the same td tag.

In [3]:
# Scrape the offensive game log for each season in `years` and print one list of values per game.

# this for loop with years gives us access to all sports ref pages from each year
years = ['2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022', '2023']
# the loop variable is called `season` so it is not overwritten by the per-game `year` computed below
for season in years:
    # note: despite its name, `url` holds the Response object returned by requests.get
    url = requests.get(f"https://www.sports-reference.com/cfb/schools/michigan/{season}/gamelog/")
    # stop with a clear HTTP error rather than failing later on a missing table
    url.raise_for_status()
    soup = BeautifulSoup(url.text, 'html.parser')
    #print(soup.prettify())

    table = soup.find("div", attrs = {'id': 'div_offense'}).find("table")
    #print(table)

    tableRows = table.find('tbody').find_all("tr")
    #print(tableRows)

    for row in tableRows:
        columns = row.find_all("td")

        # rows with no td tags hold no game data; indexing into them would raise IndexError
        if len(columns) == 0:
            continue

        date = columns[0].find('a').text
        # date is formatted YYYY-MM-DD; `year` is taken from the game date, not from the season in the url
        year = int(date.split('-')[0])

        # an empty cell means a home game, '@' means an away game, anything else is treated as a neutral site
        home_away_section = columns[1].text
        if home_away_section == '':
            home_away = 'Home'
        elif home_away_section == '@':
            home_away = 'Away'
        else:
            home_away = 'Neutral'

        opponent = columns[2].find('a').text

        # we are getting multiple data points of interest from this single column so we must split it
        # the first piece is the result; the second is the score wrapped in one leading and one trailing character
        result_list = columns[3].text.split(' ')
        result = result_list[0]
        score_parts = result_list[1].split('-')
        points_scored = int(score_parts[0][1:])     # drop the leading character
        points_against = int(score_parts[1][:-1])   # drop the trailing character

        pass_cmp = float(columns[4].text)
        pass_att = int(columns[5].text)
        pass_pct = float(columns[6].text)
        pass_yrds = int(columns[7].text)
        pass_td = int(columns[8].text)
        pass_1st_down = int(columns[16].text)

        rush_att = int(columns[9].text)
        rush_yrds = int(columns[10].text)
        rush_td = int(columns[12].text)
        rush_1st_down = int(columns[17].text)

        total_offense = int(columns[14].text)
        fumbles = int(columns[22].text)
        ints = int(columns[23].text)

        values = [date, year, opponent, home_away, result, points_scored, points_against, pass_cmp, pass_att, pass_pct,
                 pass_yrds, pass_td, pass_1st_down, rush_att, rush_yrds, rush_td, rush_1st_down, total_offense, fumbles, ints]
        print(values)

['2011-09-03', 2011, 'Western Michigan', 'Home', 'W', 34, 10, 9.0, 13, 69.2, 98, 0, 5, 26, 190, 3, 9, 288, 0, 0]
['2011-09-10', 2011, 'Notre Dame', 'Home', 'W', 35, 31, 11.0, 24, 45.8, 338, 4, 10, 26, 114, 1, 5, 452, 0, 3]
['2011-09-17', 2011, 'Eastern Michigan', 'Home', 'W', 31, 3, 7.0, 18, 38.9, 95, 2, 5, 50, 376, 2, 19, 471, 0, 1]
['2011-09-24', 2011, 'San Diego State', 'Home', 'W', 28, 7, 8.0, 17, 47.1, 93, 0, 3, 45, 320, 4, 14, 413, 2, 2]
['2011-10-01', 2011, 'Minnesota', 'Home', 'W', 58, 0, 18.0, 25, 72.0, 217, 3, 10, 48, 363, 3, 19, 580, 0, 0]
['2011-10-08', 2011, 'Northwestern', 'Away', 'W', 42, 24, 19.0, 28, 67.9, 362, 2, 14, 50, 179, 4, 8, 541, 0, 3]
['2011-10-15', 2011, 'Michigan State', 'Away', 'L', 14, 28, 12.0, 31, 38.7, 168, 1, 6, 36, 82, 1, 8, 250, 0, 1]
['2011-10-29', 2011, 'Purdue', 'Home', 'W', 36, 14, 10.0, 17, 58.8, 196, 0, 7, 53, 339, 4, 17, 535, 0, 2]
['2011-11-05', 2011, 'Iowa', 'Away', 'L', 16, 24, 18.0, 38, 47.4, 196, 2, 10, 37, 127, 0, 10, 323, 1, 1]
['2011-1

['2017-09-02', 2017, 'Florida', 'Home', 'W', 33, 17, 12.0, 26, 46.2, 218, 1, 8, 49, 215, 1, 10, 433, 0, 2]
['2017-09-09', 2017, 'Cincinnati', 'Home', 'W', 36, 14, 17.0, 29, 58.6, 221, 2, 7, 37, 193, 0, 9, 414, 2, 0]
['2017-09-16', 2017, 'Air Force', 'Home', 'W', 29, 13, 14.0, 23, 60.9, 169, 0, 6, 42, 190, 1, 11, 359, 1, 0]
['2017-09-23', 2017, 'Purdue', 'Away', 'W', 28, 10, 21.0, 31, 67.7, 284, 1, 14, 44, 139, 3, 6, 423, 1, 1]
['2017-10-07', 2017, 'Michigan State', 'Home', 'L', 10, 14, 16.0, 35, 45.7, 198, 0, 7, 39, 102, 1, 8, 300, 2, 3]
['2017-10-14', 2017, 'Indiana', 'Away', 'W', 27, 20, 10.0, 20, 50.0, 58, 0, 3, 44, 271, 3, 12, 329, 0, 0]
['2017-10-21', 2017, 'Penn State', 'Away', 'L', 13, 42, 16.0, 28, 57.1, 166, 0, 8, 42, 103, 2, 8, 269, 1, 0]
['2017-10-28', 2017, 'Rutgers', 'Home', 'W', 35, 14, 13.0, 20, 65.0, 137, 1, 9, 51, 334, 4, 16, 471, 0, 1]
['2017-11-04', 2017, 'Minnesota', 'Home', 'W', 33, 10, 8.0, 13, 61.5, 56, 1, 2, 37, 371, 4, 12, 427, 0, 0]
['2017-11-11', 2017, 'Maryl

### Complex Example: Michigan Football Roster on ESPN
This example scrapes the ESPN website, which sometimes needs a workaround as shown below. This involves creating a variable called headers and using it as a parameter in the requests.get() function. 

This example also shows how to access multiple tables from within the same webpage.

In [None]:
# Scrape the offense, defense, and special teams roster tables from ESPN and print one list per player.


def _to_int_or_none(text):
    """Return int(text), or None when text cannot be read as an integer (e.g. a blank or placeholder cell)."""
    try:
        return int(text)
    except ValueError:
        return None


def _height_to_inches(height_text):
    """Convert a height written as feet and inches separated by a space (e.g. 6' 2") to total inches.

    Returns None when the text is not in that form.
    """
    pieces = height_text.split(" ")
    if len(pieces) < 2:
        return None
    # feet is the first character of the first piece; inches is everything before the '"' in the second piece
    feet = _to_int_or_none(pieces[0][:1])
    inches = _to_int_or_none(pieces[1].split('"')[0])
    if feet is None or inches is None:
        return None
    return (12 * feet) + inches


# this headers variable is a workaround for ESPN (bc they are a little finicky with their data)
headers = {'User-Agent': '...'}
# note: despite its name, `url` holds the Response object returned by requests.get
url = requests.get("https://www.espn.com/college-football/team/roster/_/id/130/michigan-wolverines", headers = headers)
# stop with a clear HTTP error if the request was rejected, rather than failing later on a missing table
url.raise_for_status()
soup = BeautifulSoup(url.text, 'html.parser')

# each roster table sits in its own div; grab the tr tags of each table body
offense_table = soup.find("div", attrs = {'class': 'ResponsiveTable Offense'}).find("tbody").find_all("tr")
#print(offense_table)
defense_table = soup.find("div", attrs = {'class': 'ResponsiveTable Defense'}).find("tbody").find_all("tr")
#print(defense_table)
st_table = soup.find("div", attrs = {'class': 'ResponsiveTable Special Teams'}).find("tbody").find_all("tr")
#print(st_table)

tables = [offense_table, defense_table, st_table]
for table in tables:
    for row in table:
        columns = row.find_all("td")

        # the code below reads columns[1] through columns[6]; skip any row that is too short
        if len(columns) < 7:
            continue

        # the name (a tag) and jersey number (span tag) live in the same td
        name = columns[1].find('a').text
        number_tag = columns[1].find('span')
        # a row without a number span, or with a non-numeric one, gets None instead of crashing the cell
        number = _to_int_or_none(number_tag.text) if number_tag is not None else None

        position = columns[2].text

        height_total = columns[3].text
        height = _height_to_inches(height_total)

        # the weight cell is split on a space and the first piece is the number
        weight = _to_int_or_none(columns[4].text.split(" ")[0])

        year = columns[5].text

        birthplace = columns[6].text

        values = [name, number, position, height, weight, year, birthplace]
        print(values)