## Web Scraping Part III

This notebook is associated with the lesson titled **Introduction to Web Scraping Part III** in the Web Scraping and Data Storage Module. In this notebook we perform the following tasks:

    - Dive deeper into BeautifulSoup to extract data from different table types

In [None]:
# Imports
import requests
from bs4 import BeautifulSoup
import pandas as pd

### Parsing the Team Stats Table

In [None]:
# Download the box score page and parse it with the built-in HTML parser.
url = "https://www.pro-football-reference.com/boxscores/202009130buf.htm"
# The timeout keeps the cell from hanging forever on a stalled connection.
resp = requests.get(url, timeout=30)
# Stop here on an HTTP error status instead of parsing an error page below.
resp.raise_for_status()
soup = BeautifulSoup(resp.text, "html.parser")

In [None]:
# Locate the element whose id is "all_team_stats"; soup.find returns None if
# no such element exists.
table_div = soup.find(id="all_team_stats")

In [None]:
# Rebuild the div's inner markup by joining str() of each child node.
# str() of a tag is its HTML and str() of a comment node is the text inside
# the comment, so a table stored in an HTML comment becomes parseable.
# Calling str() on the whole .contents list would instead embed the repr of
# each string child (quoted, with \n and \' escapes) and corrupt the text.
div_html = "".join(str(node) for node in table_div.contents)
div_soup = BeautifulSoup(div_html, "html.parser")
table_soup = div_soup.find('table')

In [None]:
# Display the extracted table markup to inspect its structure.
table_soup

In [None]:
# Build a nested dict: row header text -> {data-stat attribute: cell text}.
output = {}
for tr in table_soup.find('tbody').find_all('tr'):
    # The row's <th> text is used as the key for the row.
    header = tr.find('th').text
    # Each <td> is keyed by its data-stat attribute.
    cell_contents = {td['data-stat']: td.text for td in tr.find_all("td")}
    # A later row with the same header text replaces the earlier one.
    output[header] = cell_contents
        

### Generalize the Logic into a Function

In [None]:
def parse_table(soup, table_div_id):
    """Parse one stats table from the game page into a DataFrame.

    Finds the ``div`` whose id is ``table_div_id``, re-parses the markup held
    in that div (including markup stored inside an HTML comment) and reads the
    first ``table`` found there. Each ``tbody`` row becomes one DataFrame row:
    the row's ``th`` text is the index label and every ``td`` that carries a
    ``data-stat`` attribute contributes a column named after that attribute.

    Rows with no ``th``, or whose ``th`` text is empty or ``"Player"``, are
    skipped. If two rows share the same ``th`` text, the later row replaces
    the earlier one.

    Args:
        soup: BeautifulSoup object for the game stat webpage.
        table_div_id: id of the div wrapping the table to be scraped.

    Returns:
        pandas.DataFrame with one row per kept table row and one column per
        ``data-stat`` value; all cell values are the raw cell text.

    Raises:
        ValueError: if the div, or a table body inside it, cannot be found.
    """
    table_div = soup.find('div', id=table_div_id)
    if table_div is None:
        raise ValueError(
            "No div with id {!r} found in the page".format(table_div_id))

    # Join str() of each child instead of calling str() on the .contents list:
    # the list form embeds the repr of every string child (quoted, with \n and
    # \' escapes), which corrupts cell text such as names with apostrophes.
    # str() of a comment node is the text inside the comment, so a table that
    # is stored in an HTML comment is exposed to the parser here.
    div_html = "".join(str(node) for node in table_div.contents)
    table_soup = BeautifulSoup(div_html, "html.parser").find('table')
    table_body = table_soup.find('tbody') if table_soup is not None else None
    if table_body is None:
        raise ValueError(
            "No table body found inside div {!r}".format(table_div_id))

    output = {}
    for tr in table_body.find_all('tr'):
        th = tr.find('th')
        if th is None:
            continue
        header = th.text
        if header in ("", "Player"):
            continue
        # Cells without a data-stat attribute have no column name to go under.
        output[header] = {td['data-stat']: td.text
                          for td in tr.find_all("td")
                          if td.has_attr('data-stat')}

    # The dict is keyed by row header, so transpose to get one row per header.
    return pd.DataFrame(output).T

In [None]:
# Run the generalized function on the "all_team_stats" div parsed by hand above.
parse_table(soup=soup, table_div_id="all_team_stats")

In [None]:
# Check that parse_table also handles the table in the "all_receiving_advanced" div.
parse_table(soup, "all_receiving_advanced")

In [None]:
# Check that parse_table also handles the table in the "all_vis_snap_counts" div.
parse_table(soup, "all_vis_snap_counts")

In [None]:
# Check that parse_table also handles the table in the "all_home_starters" div.
parse_table(soup, "all_home_starters")

### Parsing the Scorebox Table

In [None]:
# Download the box score page again and pull out the scorebox div.
url = "https://www.pro-football-reference.com/boxscores/202009130buf.htm"
# The timeout keeps the cell from hanging forever on a stalled connection.
resp = requests.get(url, timeout=30)
# Stop here on an HTTP error status instead of parsing an error page below.
resp.raise_for_status()
soup = BeautifulSoup(resp.text, "html.parser")
# soup.find returns None if the page has no div with class "scorebox".
scorebox = soup.find('div', class_="scorebox")

In [None]:
# Display the scorebox markup to inspect its structure.
scorebox

In [None]:
# Team names are the text of the scorebox links whose href starts with "/teams".
# a.get() is used so an anchor without an href is skipped instead of raising
# KeyError.
teams = [a.text for a in scorebox.find_all("a") if a.get('href', '').startswith("/teams")]
teams

In [None]:
# Take the text of the first div inside the "scorebox_meta" div; it is used
# here as the game date.
date = soup.find("div", class_="scorebox_meta").find_all('div')[0].text
date

In [None]:
# Convert the text of every div with class "score" inside the scorebox to a float.
scores = [float(d.text) for d in scorebox.find_all('div', class_="score")]
scores

In [None]:
def parse_scorebox(soup):
    """Extract the team names, final scores and game date from the scorebox.

    Args:
        soup: BeautifulSoup object for the game stat webpage.

    Returns:
        dict with the keys ``home_team``, ``away_team``, ``home_team_score``,
        ``away_team_score`` and ``date``. Team names and scores are taken in
        the order they appear in the scorebox, scores are floats, and the date
        is the text of the first div inside the ``scorebox_meta`` div.

    Raises:
        ValueError: if the scorebox or the scorebox meta div is missing, or if
            fewer than two team links or two scores are found.
    """
    scorebox = soup.find('div', class_="scorebox")
    if scorebox is None:
        raise ValueError("No div with class 'scorebox' found in the page")

    # a.get() rather than a['href']: an anchor without an href attribute would
    # otherwise raise KeyError.
    teams = [a.text for a in scorebox.find_all("a")
             if a.get('href', '').startswith("/teams")]
    scores = [float(d.text) for d in scorebox.find_all('div', class_="score")]
    if len(teams) < 2 or len(scores) < 2:
        raise ValueError(
            "Expected two teams and two scores in the scorebox, found "
            "{} team(s) and {} score(s)".format(len(teams), len(scores)))

    meta = soup.find("div", class_="scorebox_meta")
    meta_divs = meta.find_all('div') if meta is not None else []
    if not meta_divs:
        raise ValueError("No date div found inside 'scorebox_meta'")
    date = meta_divs[0].text

    # NOTE(review): the labels below assume the scorebox lists the home team
    # first. Confirm against the page; if the visiting team comes first, the
    # home_* and away_* values are swapped.
    output = {"home_team": teams[0],
              "away_team": teams[1],
              "home_team_score": scores[0],
              "away_team_score": scores[1],
              "date": date}
    return output

In [None]:
 parse_scorebox(soup=soup)