In [1]:
%load_ext lab_black

In [2]:
# Core libraries
from typing import List, Dict, Union
import os
import numpy as np
import pandas as pd
import requests
import re
import string
from tqdm.notebook import tqdm
import json
from bs4 import BeautifulSoup
from bs4.element import Tag
from datetime import datetime

In [3]:
# Script configuration vars
BASE_SITE_URL = "https://www.pro-football-reference.com"
BASE_PLAYERS_URL = "https://www.pro-football-reference.com/players"
HEADERS = {
    "user-agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36"
    )
}
ALPHABETS_LIST = [letter for letter in string.ascii_uppercase]
TODAY_STR = datetime.today().strftime("%Y%m%d")

In [4]:
# Draft code block to try to scrape a player letter link
sample_letter_url = "https://www.pro-football-reference.com/players/E/"
sample_letter_response = requests.get(sample_letter_url, headers=HEADERS)
sample_letter_soup = BeautifulSoup(sample_letter_response.content, "html.parser")
print(type(sample_letter_soup))

<class 'bs4.BeautifulSoup'>


In [5]:
def gather_player_links(
    base_player_url: str = BASE_PLAYERS_URL,
    headers: dict = HEADERS,
    letter: str = None,
    start_threshold: int = 2016,
) -> List[str]:
    """
    Function to scrape the links to the profiles of players whose last name starts with a letter (input)
    Args:
        base_url (str): base_url to attach index letter to
        headers (dict): GET requests headers
        letter (str): index letter, signififying the starting letter of players' last name
        start_threshold (int): integer specifying the year the player start their career in the NFL. This is to limit the number of requests and get more relevant data.
    Returns:
        player_links_list (List[str])
    """
    letter_url = f"{base_player_url}/{letter}/"
    letter_response = requests.get(letter_url, headers=headers)
    letter_soup = BeautifulSoup(letter_response.content, "html.parser")
    player_links_list = []
    for tag in letter_soup.find_all(
        "a", {"href": re.compile(f"(\/players\/{letter}\/)(.*)(\.htm)")}
    ):
        try:
            # If the parent tag is <b> i.e. bold -> signifies current player
            if tag.parent.name == "b":
                # Specifying the year the player starts in the NFL
                if (
                    int(tag.parent.parent.text.split()[-1].split("-")[0])
                    >= start_threshold
                ):
                    player_links_list.append(tag["href"])
        except AttributeError:
            continue
    return player_links_list

In [6]:
def ft_in_to_cm(ft_in: str, delimiter: str = "-") -> int:
    """
    Function to convert feet and inches measurement to cm
    """
    ft_in_split_str = ft_in.split(delimiter)
    fts = ft_in_split_str[0]
    inches = ft_in_split_str[1]
    cm_output = int(fts) * 30.48 + int(inches) * 2.48
    return int(cm_output)

In [7]:
def get_player_soup(
    base_site_url: str = BASE_SITE_URL, player_link: str = None, headers: dict = HEADERS
) -> BeautifulSoup:
    """
    Function to get a BeautifulSoup HTML Content of a player info, to be used as an input for a downstream function
    Args:
        base_url (str): base_url to attach index letter to
        headers (dict): GET requests headers
        player_link (str): a suffix of the player url, to be attached onto the base URL
    Return:
        player_soup (bs4.BeautifulSoup): a player's BeautifulSoup HTML content
    """
    player_url = f"{base_site_url}{player_link}"
    player_response = requests.get(player_url, headers=headers)
    player_soup = BeautifulSoup(player_response.content, "html.parser")
    return player_soup

In [8]:
def get_career_stat_from_datatip(element_tag: Tag, datatip: str) -> Union[str, None]:
    """
    Function to get a career stat as string from an element tag and a datatip
    Args:
        element_tag (bs4.element.Tag): a player's specific stat element HTML Tag
        datatip (str): the datatip that shows up when hovering above the stat on the website
    Returns:
        stat_str (str): the string indicating the career statistic
            or
        None: if that stat is not found on the page
    """
    try:
        stat_str = (
            element_tag.find("span", {"data-tip": datatip})
            .find_next_siblings("p")[-1]
            .contents[0]
        )
        return stat_str
    except AttributeError:  # This means the stat is not relevant for that position e.g. Sacks for QB
        return None
    except IndexError:
        return None

In [16]:
def gather_player_info(player_soup: BeautifulSoup) -> dict:
    """
    Function to intake a player's BeautifulSoup HTML content and extract player info
    Args:
        player_soup (bs4.BeautifulSoup): a player's BeautifulSoup HTML content
    Return:
        player_info (dict): a dictionary containing player's info
    """

    # Initialising an empty dictionary to store player's info
    player_info = {}

    # Profile and metadata
    player_info_tag = player_soup.find("div", {"id": "info", "class": "players"})
    player_info["name"] = (
        player_info_tag.find("h1", itemprop="name").find("span").contents[0]
    )
    player_info["team"] = (
        player_info_tag.find("span", itemprop="affiliation").find("a").contents[0]
    )
    player_info["position"] = re.match(
        r"[A-Z]",
        player_info_tag.find("strong", text="Position").next_sibling.split(": ")[1],
    )[0]
    player_info["height"] = ft_in_to_cm(
        player_info_tag.find("span", itemprop="height").contents[0]
    )
    player_info["weight"] = player_info_tag.find("span", itemprop="weight").contents[0]
    player_info["birth_date"] = player_info_tag.find("span", itemprop="birthDate")[
        "data-birth"
    ]
    player_info["awards"] = [
        award_tag.get_text()
        for award_tag in player_info_tag.find_all("a", href="/awards/")
    ]

    # Careers stats datatips - These are our only hints to get to the data
    gp_datatip = "Games played"
    av_datatip = "Approximate Value is our attempt to attach a single number to every player-season since 1960.<br>See the glossary for more information."
    qbrec_datatip = "Team record in games started by this QB (regular season)"
    cmp_pct_datatip = "Percentage of Passes Completed<br>Minimum 14 attempts per scheduled game to qualify as leader.<br />Minimum 1500 pass attempts to qualify as career leader."
    yds_pass_datatip = (
        "Yards Gained by Passing<br>For teams, sack yardage is deducted from this total"
    )
    ya_pass_datatip = "Yards gained per pass attempt <br>Minimum 14 attempts per scheduled game to qualify as leader.<br>Minimum 1500 pass attempts to qualify as career leader."
    passing_td_datatip = "Passing Touchdowns"
    int_thrown_datatip = "Interceptions thrown"
    sacks_datatip = "Sacks (official since 1982,<br />based on play-by-play, game film<br />and other research since 1960)"
    solo_datatip = "Tackles<br>Before 1994:  unofficial and inconsistently recorded from team to team.  For amusement only.<br>1994-now:  unofficial but consistently recorded.<br>"
    ff_datatip = (
        "Number of times forced a fumble by the opposition recovered by either team"
    )
    fantpt_datatip = """<b>Fantasy points:</b><br />
								1 point per 25 yards passing<br />
								4 points per passing touchdown<br />
								-2 points per interception thrown<br />
								1 point per 10 yards rushing/receiving<br />
								6 points per TD<br />
								2 points per two-point conversion<br />
								-2 points per fumble lost (est. prior to 1994)"""
    ## WR stat
    rec_datatip = "Receptions"
    yds_receive_datatip = "Receiving Yards"
    yr_datatip = "Receiving Yards per Reception<br>Minimum 1.875 catches per game scheduled to qualify as leader.<br />Minimum 200 receptions to qualify as career leader."
    receiving_td_datatip = "Receiving Touchdowns"

    ## RB stats
    rush_datatip = "Rushing Attempts (sacks not included in NFL)"
    yds_rush_datatip = "Rushing Yards Gained (sack yardage is not included by NFL)"
    ya_rush_datatip = "Rushing Yards per Attempt<br>Minimum 6.25 rushes per game scheduled to qualify as leader.<br />Minimum 750 rushes to qualify as career leader."
    rushing_td_datatip = "Rushing Touchdowns"

    ## Scraping player performance info
    player_career_stats_tag = player_soup.find("div", {"class": "stats_pullout"})
    player_info["career_stats"] = {
        "games_played": get_career_stat_from_datatip(
            player_career_stats_tag, gp_datatip
        ),
        "approx_val": get_career_stat_from_datatip(player_career_stats_tag, av_datatip),
        "qbrec": get_career_stat_from_datatip(player_career_stats_tag, qbrec_datatip),
        "cmp_pct": get_career_stat_from_datatip(
            player_career_stats_tag, cmp_pct_datatip
        ),
        "yds_pass": get_career_stat_from_datatip(
            player_career_stats_tag, yds_pass_datatip
        ),
        "ya_pass": get_career_stat_from_datatip(
            player_career_stats_tag, ya_pass_datatip
        ),
        "passing_td": get_career_stat_from_datatip(
            player_career_stats_tag, passing_td_datatip
        ),
        "int_thrown": get_career_stat_from_datatip(
            player_career_stats_tag, int_thrown_datatip
        ),
        "sacks": get_career_stat_from_datatip(player_career_stats_tag, sacks_datatip),
        "solo": get_career_stat_from_datatip(player_career_stats_tag, solo_datatip),
        "ff": get_career_stat_from_datatip(player_career_stats_tag, ff_datatip),
        "rec": get_career_stat_from_datatip(player_career_stats_tag, rec_datatip),
        "yds_receive": get_career_stat_from_datatip(
            player_career_stats_tag, yds_receive_datatip
        ),
        "yr": get_career_stat_from_datatip(player_career_stats_tag, yr_datatip),
        "receiving_td": get_career_stat_from_datatip(
            player_career_stats_tag, receiving_td_datatip
        ),
        "rush": get_career_stat_from_datatip(player_career_stats_tag, rush_datatip),
        "yds_rush": get_career_stat_from_datatip(
            player_career_stats_tag, yds_rush_datatip
        ),
        "ya_rush": get_career_stat_from_datatip(
            player_career_stats_tag, ya_rush_datatip
        ),
        "rushing_td": get_career_stat_from_datatip(
            player_career_stats_tag, rushing_td_datatip
        ),
        "fantpt": get_career_stat_from_datatip(player_career_stats_tag, fantpt_datatip),
    }

    return player_info

In [17]:
# Draft code block to try to scrape a single player's info - Tom Brady in this case
sample_player_url = "https://www.pro-football-reference.com/players/A/AbduAm00.htm"
sample_player_response = requests.get(sample_player_url, headers=HEADERS)
sample_player_soup = BeautifulSoup(sample_player_response.content, "html.parser")
print(type(sample_player_soup))

<class 'bs4.BeautifulSoup'>


In [18]:
# An example of a collected player data object
gather_player_info(sample_player_soup)

{'name': 'Ameer Abdullah',
 'team': 'Minnesota Vikings',
 'position': 'R',
 'height': 174,
 'weight': '196lb',
 'birth_date': '1993-06-13',
 'awards': [],
 'career_stats': {'games_played': '77',
  'approx_val': '14',
  'qbrec': None,
  'cmp_pct': None,
  'yds_pass': None,
  'ya_pass': None,
  'passing_td': None,
  'int_thrown': None,
  'sacks': None,
  'solo': None,
  'ff': None,
  'rec': None,
  'yds_receive': None,
  'yr': None,
  'receiving_td': None,
  'rush': '364',
  'yds_rush': '1436',
  'ya_rush': '3.9',
  'rushing_td': '6',
  'fantpt': '264.7'}}

## Main scraping code block below

In [21]:
# UNCOMMENT THIS CODE BLOCK TO RUN
# Main scraping execution code block
player_info_list = []
# for letter in ALPHABETS_LIST:
for letter in ["A", "C"]:
    player_links_list = gather_player_links(letter=letter, start_threshold=2016)
    for player_link in tqdm(player_links_list):
        try:
            player_soup = get_player_soup(player_link=player_link)
            player_info_dict = gather_player_info(player_soup=player_soup)
            player_info_list.append(player_info_dict)
        except AttributeError:  # This error occurs with players who no longer plays in the league
            #             print(f"Attribute error occured for: {player_link}")
            continue
        #         except IndexError:
        #             print(f"Index error occured for: {player_link}")
        #             raise IndexError
        except ConnectionResetError:
            #             print("ConnectionResetError occurred")
            time.sleep(5)
            continue

  0%|          | 0/75 [00:00<?, ?it/s]

  0%|          | 0/143 [00:00<?, ?it/s]

In [22]:
# Sample scraped DataFrame
print(f"Collected info on {len(player_info_list)} players!")
sample_scraped_players_info_df = pd.json_normalize(player_info_list)
sample_scraped_players_info_df.drop_duplicates(
    subset=["name", "team", "position", "height", "weight", "birth_date"], inplace=True
)
print(sample_scraped_players_info_df.shape)
sample_scraped_players_info_df

Collected info on 142 players!
(142, 27)


Unnamed: 0,name,team,position,height,weight,birth_date,awards,career_stats.games_played,career_stats.approx_val,career_stats.qbrec,...,career_stats.ff,career_stats.rec,career_stats.yds_receive,career_stats.yr,career_stats.receiving_td,career_stats.rush,career_stats.yds_rush,career_stats.ya_rush,career_stats.rushing_td,career_stats.fantpt
0,Johnathan Abram,Las Vegas Raiders,S,182,205lb,1996-10-25,[],17,8,,...,,,,,,,,,,
1,Andrew Adams,Tampa Bay Buccaneers,S,179,205lb,1992-10-28,[],76,13,,...,,,,,,,,,,
2,Jamal Adams,Seattle Seahawks,S,185,213lb,1995-10-17,[1x All-Pro],61,42,,...,7,,,,,,,,,
3,Matthew Adams,Indianapolis Colts,L,182,229lb,1995-12-12,[],44,8,,...,,,,,,,,,,
4,Montravius Adams,New Orleans Saints,D,190,304lb,1995-07-24,[],47,4,,...,1,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
137,Dane Cruikshank,Tennessee Titans,C,185,209lb,1995-04-27,[],33,2,,...,,,,,,,,,,
138,Zach Cunningham,Houston Texans,O,190,238lb,1994-12-02,[],65,31,,...,4,,,,,,,,,
139,Jake Curhan,Seattle Seahawks,T,197,316lb,1998-02-09,[],3,,,...,,,,,,,,,,
140,Kamren Curl,Washington Football Team,S,187,198lb,1999-03-03,[],19,8,,...,,,,,,,,,,


In [23]:
# Outputting file
sample_scraped_players_info_df.to_csv(
    f"../output_data/{TODAY_STR}_SampleScrapedNflPlayersInfo.csv", index=False
)
sample_scraped_players_info_df.head()

Unnamed: 0,name,team,position,height,weight,birth_date,awards,career_stats.games_played,career_stats.approx_val,career_stats.qbrec,...,career_stats.ff,career_stats.rec,career_stats.yds_receive,career_stats.yr,career_stats.receiving_td,career_stats.rush,career_stats.yds_rush,career_stats.ya_rush,career_stats.rushing_td,career_stats.fantpt
0,Johnathan Abram,Las Vegas Raiders,S,182,205lb,1996-10-25,[],17,8,,...,,,,,,,,,,
1,Andrew Adams,Tampa Bay Buccaneers,S,179,205lb,1992-10-28,[],76,13,,...,,,,,,,,,,
2,Jamal Adams,Seattle Seahawks,S,185,213lb,1995-10-17,[1x All-Pro],61,42,,...,7.0,,,,,,,,,
3,Matthew Adams,Indianapolis Colts,L,182,229lb,1995-12-12,[],44,8,,...,,,,,,,,,,
4,Montravius Adams,New Orleans Saints,D,190,304lb,1995-07-24,[],47,4,,...,1.0,,,,,,,,,


In [24]:
with open(f"../output_data/{TODAY_STR}_SampleScrapedNflPlayersInfo_v1.json", "w") as fw:
    json.dump(player_info_list, fw)

In [25]:
sample_scraped_players_info_df.columns

Index(['name', 'team', 'position', 'height', 'weight', 'birth_date', 'awards',
       'career_stats.games_played', 'career_stats.approx_val',
       'career_stats.qbrec', 'career_stats.cmp_pct', 'career_stats.yds_pass',
       'career_stats.ya_pass', 'career_stats.passing_td',
       'career_stats.int_thrown', 'career_stats.sacks', 'career_stats.solo',
       'career_stats.ff', 'career_stats.rec', 'career_stats.yds_receive',
       'career_stats.yr', 'career_stats.receiving_td', 'career_stats.rush',
       'career_stats.yds_rush', 'career_stats.ya_rush',
       'career_stats.rushing_td', 'career_stats.fantpt'],
      dtype='object')

In [None]:
    gp_datatip = "Games played"
    av_datatip = "Approximate Value is our attempt to attach a single number to every player-season since 1960.<br>See the glossary for more information."
    qbrec_datatip = "Team record in games started by this QB (regular season)"
    cmp_pct_datatip = "Percentage of Passes Completed<br>Minimum 14 attempts per scheduled game to qualify as leader.<br />Minimum 1500 pass attempts to qualify as career leader."
    yds_pass_datatip = (
        "Yards Gained by Passing<br>For teams, sack yardage is deducted from this total"
    )
    ya_pass_datatip = "Yards gained per pass attempt <br>Minimum 14 attempts per scheduled game to qualify as leader.<br>Minimum 1500 pass attempts to qualify as career leader."
    passing_td_datatip = "Passing Touchdowns"
    int_thrown_datatip = "Interceptions thrown"
    sacks_datatip = "Sacks (official since 1982,<br />based on play-by-play, game film<br />and other research since 1960)"
    solo_datatip = "Tackles<br>Before 1994:  unofficial and inconsistently recorded from team to team.  For amusement only.<br>1994-now:  unofficial but consistently recorded.<br>"
    ff_datatip = (
        "Number of times forced a fumble by the opposition recovered by either team"
    )
    fantpt_datatip = """<b>Fantasy points:</b><br />
								1 point per 25 yards passing<br />
								4 points per passing touchdown<br />
								-2 points per interception thrown<br />
								1 point per 10 yards rushing/receiving<br />
								6 points per TD<br />
								2 points per two-point conversion<br />
								-2 points per fumble lost (est. prior to 1994)"""
    ## WR stat
    rec_datatip = "Receptions"
    yds_receive_datatip = "Receiving Yards"
    yr_datatip = "Receiving Yards per Reception<br>Minimum 1.875 catches per game scheduled to qualify as leader.<br />Minimum 200 receptions to qualify as career leader."
    receiving_td_datatip = "Receiving Touchdowns"

    ## RB stats
    rush_datatip = "Rushing Attempts (sacks not included in NFL)"
    yds_rush_datatip = "Rushing Yards Gained (sack yardage is not included by NFL)"
    ya_rush_datatip = "Rushing Yards per Attempt<br>Minimum 6.25 rushes per game scheduled to qualify as leader.<br />Minimum 750 rushes to qualify as career leader."
    rushing_td_datatip = "Rushing Touchdowns"

In [42]:
# Data dictionary
nfl_player_info_data_dict = {
    "name": "Player name",
    "team": "NFL team",
    "position": "Football position abbreviated",
    "height": "Height in cm",
    "weight": "Weight in lb",
    "birth_date": "Date of birth in DD-MM-YYYY format",
    "awards": "List of awards (List[str])",
    "career_stats.games_played": "Number of games played",
    "career_stats.approx_val": "Approximate Value is our attempt to attach a single number to every player-season since 1960.<br>See the glossary for more information.",
    "career_stats.qbrec": "Team record in games started by this QB (regular season)",
    "career_stats.cmp_pct": "Percentage of Passes Completed. Minimum 14 attempts per scheduled game to qualify as leader.<br />Minimum 1500 pass attempts to qualify as career leader.",
    "career_stats.yds_pass": "Yards Gained by Passing. For teams, sack yardage is deducted from this total",
    "career_stats.ya_pass": "Yards gained per pass attempt. Minimum 14 attempts per scheduled game to qualify as leader.<br>Minimum 1500 pass attempts to qualify as career leader.",
    "career_stats.passing_td": "Passing Touchdowns",
    "career_stats.int_thrown": "Interceptions thrown",
    "career_stats.sacks": "Sacks (official since 1982, based on play-by-play, game film and other research since 1960)",
    "career_stats.solo": "Tackles. Before 1994: unofficial and inconsistently recorded from team to team. For amusement only. 1994-now: unofficial but consistently recorded.",
    "career_stats.ff": "Number of times forced a fumble by the opposition recovered by either team",
    "career_stats.rec": "Receptions",
    "career_stats.yds_receive": "Receiving Yards",
    "career_stats.yr": "Receiving Yards per Reception<br>Minimum 1.875 catches per game scheduled to qualify as leader.<br />Minimum 200 receptions to qualify as career leader.",
    "career_stats.receiving_td": "Receiving Touchdowns",
    "career_stats.rush": "Rushing Attempts (sacks not included in NFL)",
    "career_stats.yds_rush": "Rushing Yards Gained (sack yardage is not included by NFL)",
    "career_stats.ya_rush": "Rushing Yards per Attempt<br>Minimum 6.25 rushes per game scheduled to qualify as leader.<br />Minimum 750 rushes to qualify as career leader.",
    "career_stats.rushing_td": "Rushing Touchdowns",
    "career_stats.fantpt": """Fantasy points:
                            1 point per 25 yards passing
                            4 points per passing touchdown
                            -2 points per interception thrown
                            1 point per 10 yards rushing/receiving
                            6 points per TD
                            2 points per two-point conversion
                            -2 points per fumble lost (est. prior to 1994)""",
}

nfl_player_info_data_dict_df = (
    pd.DataFrame.from_dict(
        nfl_player_info_data_dict,
        orient="index",
    )
    .reset_index()
    .rename(columns={"index": "Column name", 0: "Definition"})
)

nfl_player_info_data_dict_df.head()

Unnamed: 0,Column name,Definition
0,name,Player name
1,team,NFL team
2,position,Football position abbreviated
3,height,Height in cm
4,weight,Weight in lb


In [43]:
# Output data definition
nfl_player_info_data_dict_df.to_csv(
    "../data_definition/NflPlayerDataDefinition.csv", index=False
)