In [27]:
!pip install ipywidgets

Collecting ipywidgets
  Using cached ipywidgets-7.6.5-py2.py3-none-any.whl (121 kB)
Collecting widgetsnbextension~=3.5.0
  Using cached widgetsnbextension-3.5.1-py2.py3-none-any.whl (2.2 MB)
Collecting jupyterlab-widgets>=1.0.0; python_version >= "3.6"
  Using cached jupyterlab_widgets-1.0.2-py3-none-any.whl (243 kB)
Installing collected packages: widgetsnbextension, jupyterlab-widgets, ipywidgets
Successfully installed ipywidgets-7.6.5 jupyterlab-widgets-1.0.2 widgetsnbextension-3.5.1


In [24]:
!pip install nb-black
!pip install tqdm

Collecting tqdm
  Downloading tqdm-4.62.2-py2.py3-none-any.whl (76 kB)
[K     |████████████████████████████████| 76 kB 5.4 MB/s  eta 0:00:01
[?25hInstalling collected packages: tqdm
Successfully installed tqdm-4.62.2


In [1]:
%load_ext lab_black

In [None]:
# Core libraries
from typing import List, Dict, Union
import os
import numpy as np
import pandas as pd
import requests
import re
import string
from tqdm.notebook import tqdm
import json
from bs4 import BeautifulSoup
from bs4.element import Tag
from datetime import datetime

In [54]:
# Script configuration vars
BASE_SITE_URL = "https://www.pro-football-reference.com"
BASE_PLAYERS_URL = "https://www.pro-football-reference.com/players"
HEADERS = {
    "user-agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36"
    )
}
ALPHABETS_LIST = [letter for letter in string.ascii_uppercase]
TODAY_STR = datetime.today().strftime("%Y%m%d")

In [163]:
# Draft code block to try to scrape a player letter link
sample_letter_url = "https://www.pro-football-reference.com/players/E/"
sample_letter_response = requests.get(sample_letter_url, headers=HEADERS)
sample_letter_soup = BeautifulSoup(sample_letter_response.content, "html.parser")
print(type(sample_letter_soup))

<class 'bs4.BeautifulSoup'>


In [164]:
print(sample_letter_soup.prettify())

<!DOCTYPE html>
<html class="no-js" data-root="/home/pfr/build" data-version="klecko-" itemscope="" itemtype="https://schema.org/WebSite" lang="en">
 <head>
  <meta charset="utf-8"/>
  <meta content="ie=edge" http-equiv="x-ua-compatible"/>
  <meta content="width=device-width, initial-scale=1.0, maximum-scale=2.0" name="viewport">
   <link href="https://d2p3bygnnzw9w3.cloudfront.net/req/202109021" rel="dns-prefetch"/>
   <!-- Quantcast Choice. Consent Manager Tag v2.0 (for TCF 2.0) -->
   <script async="true" type="text/javascript">
    (function() {
	var host = window.location.hostname;
	var element = document.createElement('script');
	var firstScript = document.getElementsByTagName('script')[0];
	var url = 'https://quantcast.mgr.consensu.org'
	    .concat('/choice/', 'XwNYEpNeFfhfr', '/', host, '/choice.js')
	var uspTries = 0;
	var uspTriesLimit = 3;
	element.async = true;
	element.type = 'text/javascript';
	element.src = url;
	
	firstScript.parentNode.insertBefore(element, firstScrip

In [7]:
type(sample_player_soup.find("div", {"class": "stats_pullout"}))

bs4.element.Tag

In [66]:
def gather_player_links(
    base_player_url: str = BASE_PLAYERS_URL,
    headers: dict = HEADERS,
    letter: str = None,
) -> List[str]:
    """
    Function to scrape the links to the profiles of players whose last name starts with a letter (input)
    Args:
        base_url (str): base_url to attach index letter to
        headers (dict): GET requests headers
        letter (str): index letter, signififying the starting letter of players' last name
    Returns:
        player_links_list (List[str])
    """
    letter_url = f"{base_player_url}/{letter}/"
    letter_response = requests.get(letter_url, headers=headers)
    letter_soup = BeautifulSoup(letter_response.content, "html.parser")
    player_links_list = [
        tag["href"]
        for tag in letter_soup.find_all(
            "a", {"href": re.compile(f"(\/players\/{letter}\/)(.*)(\.htm)")}
        )
    ]
    return player_links_list

In [67]:
def get_player_soup(
    base_site_url: str = BASE_SITE_URL, player_link: str = None, headers: dict = HEADERS
) -> BeautifulSoup:
    """
    Function to get a BeautifulSoup HTML Content of a player info, to be used as an input for a downstream function
    Args:
        base_url (str): base_url to attach index letter to
        headers (dict): GET requests headers
        player_link (str): a suffix of the player url, to be attached onto the base URL
    Return:
        player_soup (bs4.BeautifulSoup): a player's BeautifulSoup HTML content
    """
    player_url = f"{base_site_url}{player_link}"
    player_response = requests.get(player_url, headers=headers)
    player_soup = BeautifulSoup(player_response.content, "html.parser")
    return player_soup

In [68]:
def get_career_stat_from_datatip(element_tag: Tag, datatip: str) -> Union[str, None]:
    """
    Function to get a career stat as string from an element tag and a datatip
    Args:
        element_tag (bs4.element.Tag): a player's specific stat element HTML Tag
        datatip (str): the datatip that shows up when hovering above the stat on the website
    Returns:
        stat_str (str): the string indicating the career statistic
            or
        None: if that stat is not found on the page
    """
    try:
        stat_str = (
            element_tag.find("span", {"data-tip": datatip})
            .find_next_siblings("p")[-1]
            .contents[0]
        )
        return stat_str
    except AttributeError:  # This means the stat is not relevant for that position e.g. Sacks for QB
        return None

In [73]:
def gather_player_info(player_soup: BeautifulSoup) -> dict:
    """
    Function to intake a player's BeautifulSoup HTML content and extract player info
    Args:
        player_soup (bs4.BeautifulSoup): a player's BeautifulSoup HTML content
    Return:
        player_info (dict): a dictionary containing player's info
    """

    # Initialising an empty dictionary to store player's info
    player_info = {}

    # Profile and metadata
    player_info_tag = player_soup.find("div", {"id": "info", "class": "players"})
    player_info["name"] = (
        player_info_tag.find("h1", itemprop="name").find("span").contents[0]
    )
    player_info["team"] = (
        player_info_tag.find("span", itemprop="affiliation").find("a").contents[0]
    )
    player_info["position"] = re.match(
        r"[A-Z]",
        player_info_tag.find("strong", text="Position").next_sibling.split(": ")[1],
    )[0]
    player_info["height"] = player_info_tag.find("span", itemprop="height").contents[0]
    player_info["weight"] = player_info_tag.find("span", itemprop="weight").contents[0]
    player_info["birthDate"] = player_info_tag.find("span", itemprop="birthDate")[
        "data-birth"
    ]
    player_info["awards"] = [
        award_tag.get_text()
        for award_tag in player_info_tag.find_all("a", href="/awards/")
    ]

    # Careers stats datatips - These are our only hints to get to the data
    gp_datatip = "Games played"
    av_datatip = "Approximate Value is our attempt to attach a single number to every player-season since 1960.<br>See the glossary for more information."
    qbrec_datatip = "Team record in games started by this QB (regular season)"
    cmp_pct_datatip = "Percentage of Passes Completed<br>Minimum 14 attempts per scheduled game to qualify as leader.<br />Minimum 1500 pass attempts to qualify as career leader."
    yds_pass_datatip = (
        "Yards Gained by Passing<br>For teams, sack yardage is deducted from this total"
    )
    ya_datatip = "Yards gained per pass attempt <br>Minimum 14 attempts per scheduled game to qualify as leader.<br>Minimum 1500 pass attempts to qualify as career leader."
    passing_td_datatip = "Passing Touchdowns"
    int_thrown_datatip = "Interceptions thrown"
    sacks_datatip = "Sacks (official since 1982,<br />based on play-by-play, game film<br />and other research since 1960)"
    solo_datatip = "Tackles<br>Before 1994:  unofficial and inconsistently recorded from team to team.  For amusement only.<br>1994-now:  unofficial but consistently recorded.<br>"
    ff_datatip = (
        "Number of times forced a fumble by the opposition recovered by either team"
    )
    fantpt_datatip = """<b>Fantasy points:</b><br />
								1 point per 25 yards passing<br />
								4 points per passing touchdown<br />
								-2 points per interception thrown<br />
								1 point per 10 yards rushing/receiving<br />
								6 points per TD<br />
								2 points per two-point conversion<br />
								-2 points per fumble lost (est. prior to 1994)"""
    # WR stat
    rec_datatip = "Receptions"
    yds_receive_datatip = "Receiving Yards"
    yr_datatip = "Receiving Yards per Reception<br>Minimum 1.875 catches per game scheduled to qualify as leader.<br />Minimum 200 receptions to qualify as career leader."
    receiving_td_datatip = "Receiving Touchdowns"

    # Scraping player performance info
    player_career_stats_tag = player_soup.find("div", {"class": "stats_pullout"})
    player_info["career_stats"] = {
        "games_played": get_career_stat_from_datatip(
            player_career_stats_tag, gp_datatip
        ),
        "approx_val": get_career_stat_from_datatip(player_career_stats_tag, av_datatip),
        "qbrec": get_career_stat_from_datatip(player_career_stats_tag, qbrec_datatip),
        "cmp_pct": get_career_stat_from_datatip(
            player_career_stats_tag, cmp_pct_datatip
        ),
        "yds_pass": get_career_stat_from_datatip(
            player_career_stats_tag, yds_pass_datatip
        ),
        "ya": get_career_stat_from_datatip(player_career_stats_tag, ya_datatip),
        "passing_td": get_career_stat_from_datatip(
            player_career_stats_tag, passing_td_datatip
        ),
        "int_thrown": get_career_stat_from_datatip(
            player_career_stats_tag, int_thrown_datatip
        ),
        "sacks": get_career_stat_from_datatip(player_career_stats_tag, sacks_datatip),
        "solo": get_career_stat_from_datatip(player_career_stats_tag, solo_datatip),
        "ff": get_career_stat_from_datatip(player_career_stats_tag, ff_datatip),
        "rec": get_career_stat_from_datatip(player_career_stats_tag, rec_datatip),
        "yds_receive": get_career_stat_from_datatip(
            player_career_stats_tag, yds_receive_datatip
        ),
        "yr": get_career_stat_from_datatip(player_career_stats_tag, yr_datatip),
        "receiving_td": get_career_stat_from_datatip(
            player_career_stats_tag, receiving_td_datatip
        ),
        "fantpt": get_career_stat_from_datatip(player_career_stats_tag, fantpt_datatip),
    }

    return player_info

In [74]:
# Draft code block to try to scrape a single player's info - Tom Brady in this case
sample_player_url = "https://www.pro-football-reference.com/players/A/AdamDa01.htm"
sample_player_response = requests.get(sample_player_url, headers=HEADERS)
sample_player_soup = BeautifulSoup(sample_player_response.content, "html.parser")
print(type(sample_player_soup))

<class 'bs4.BeautifulSoup'>


In [75]:
gather_player_info(sample_player_soup)

{'name': 'Davante Adams',
 'team': 'Green Bay Packers',
 'position': 'W',
 'height': '6-1',
 'weight': '215lb',
 'birthDate': '1992-12-24',
 'awards': ['1x All-Pro'],
 'career_stats': {'games_played': '101',
  'approx_val': '61',
  'qbrec': None,
  'cmp_pct': None,
  'yds_pass': None,
  'ya': None,
  'passing_td': None,
  'int_thrown': None,
  'sacks': None,
  'solo': None,
  'ff': None,
  'rec': '551',
  'yds_receive': '6624',
  'yr': '12.0',
  'receiving_td': '62',
  'fantpt': '1036.4'}}

In [81]:
# Main scraping execution code block
letter_list = ["A", "X", "Y", "Z"]
player_info_list = []
for letter in letter_list:
    player_links_list = gather_player_links(letter=letter)
    for player_link in tqdm(player_links_list):
        try:
            player_soup = get_player_soup(player_link=player_link)
            player_info_dict = gather_player_info(player_soup=player_soup)
            player_info_list.append(player_info_dict)
        except AttributeError:
            continue
        except IndexError:
            continue

  0%|          | 0/848 [00:00<?, ?it/s]



  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/103 [00:00<?, ?it/s]

In [86]:
# Sample scraped DataFrame
sample_scraped_players_info_df = pd.json_normalize(player_info_list)
sample_scraped_players_info_df.drop_duplicates(
    subset=["name", "team", "position", "height", "weight", "birthDate"], inplace=True
)
print(sample_scraped_players_info_df.shape)
sample_scraped_players_info_df.to_csv(
    f"../output_data/{TODAY_STR}_SampleScrapedNflPlayersInfo.csv", index=False
)
sample_scraped_players_info_df.head()

(81, 23)


Unnamed: 0,name,team,position,height,weight,birthDate,awards,career_stats.games_played,career_stats.approx_val,career_stats.qbrec,...,career_stats.passing_td,career_stats.int_thrown,career_stats.sacks,career_stats.solo,career_stats.ff,career_stats.rec,career_stats.yds_receive,career_stats.yr,career_stats.receiving_td,career_stats.fantpt
0,Ameer Abdullah,Minnesota Vikings,R,5-9,203lb,1993-06-13,[],75,14,,...,,,,,,,,,,262.3
1,Oday Aboushi,Los Angeles Chargers,O,6-5,315lb,1991-06-05,[],66,19,,...,,,,,,,,,,
2,Johnathan Abram,Las Vegas Raiders,S,6-0,205lb,1996-10-25,[],15,8,,...,,,,,,,,,,
3,Andrew Adams,Tampa Bay Buccaneers,S,5-11,200lb,1992-10-28,[],74,13,,...,,,,,,,,,,
4,Davante Adams,Green Bay Packers,W,6-1,215lb,1992-12-24,[1x All-Pro],101,61,,...,,,,,,551.0,6624.0,12.0,62.0,1036.4


In [83]:
with open(f"../output_data/{TODAY_STR}_SampleScrapedNflPlayersInfo.json", "w") as fw:
    json.dump(player_info_list, fw)