## Attempting to get NFL injury data

In [1]:
%load_ext lab_black

In [2]:
# Base packages
import os
import re
from datetime import datetime
from io import StringIO

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [3]:
# Injury report site
# Root URL; a report page is addressed as <base>/<year>/<period code>.
INJURY_REPORT_BASE_URL = "https://www.nfl.com/injuries/league"
# Browser-like user agent sent with every request
# (presumably the site rejects the default client string -- TODO confirm).
HEADERS = {
    "user-agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36"
    )
}

# Time
# Date stamp (YYYYMMDD) captured once when this cell runs; used in output file names.
TODAY_STR = datetime.today().strftime("%Y%m%d")

# Validation list for NFL periods
# Regular-season weeks REG1..REG18 (the regular season has 18 weeks from 2021 on;
# requesting REG18 for an older season simply yields a page without injury tables),
# followed by the postseason codes (PRO1 is presumably the Pro Bowl -- TODO confirm).
NFL_SEASON_PERIOD_LIST = [f"REG{week}" for week in range(1, 19)] + [
    "POST1",
    "POST2",
    "POST3",
    "PRO1",
    "POST4",
]

In [4]:
# Sample workflow
sample_injury_report_url = "https://www.nfl.com/injuries/league/2021/REG4"
# A timeout keeps the cell from hanging forever if the site never answers.
sample_injury_report_response = requests.get(
    sample_injury_report_url, headers=HEADERS, timeout=30
)
# Fail fast on HTTP errors instead of trying to parse an error page.
sample_injury_report_response.raise_for_status()
sample_injury_report_soup = BeautifulSoup(
    sample_injury_report_response.content, "html.parser"
)

# Parsing the HTML table
sample_injury_table_tags = sample_injury_report_soup.find_all(
    "table",
    {"class": "d3-o-table d3-o-table--detailed d3-o-reports--detailed"},
)
# read_html wants a file-like object: passing literal HTML as a plain string is
# deprecated in recent pandas, so the serialized <table> tags go through StringIO.
sample_injury_df = pd.concat(
    pd.read_html(StringIO("\n".join(str(tag) for tag in sample_injury_table_tags)))
)

sample_injury_df.head()

Unnamed: 0,Player,Position,Injuries,Practice Status,Game Status
0,Josh Lambo,K,Not injury related - personal matter,Did Not Participate In Practice,Out
1,Roy Robertson-Harris,DT,Ankle,Limited Participation in Practice,Out
2,Tre Herndon,CB,,Full Participation in Practice,
3,Cam Robinson,T,,Full Participation in Practice,
4,Andrew Norwell,G,,Full Participation in Practice,


In [5]:
def get_injury_report_df(
    base_site_url: str = INJURY_REPORT_BASE_URL, year: int = None, period: str = None
) -> pd.DataFrame:
    """Download the injury report for one season period as a DataFrame.

    The page at ``{base_site_url}/{year}/{period}`` is fetched, every detailed
    report table on it is parsed, and the tables are stacked into one frame.
    Rows without a value in the ``Injuries`` column are dropped.

    Parameters
    ----------
    base_site_url : str
        Root URL of the injury report site.
    year : int
        Season year, between 1965 and the current calendar year (inclusive).
        Required, despite the ``None`` default.
    period : str
        Season period code; must be one of ``NFL_SEASON_PERIOD_LIST``.
        Required, despite the ``None`` default.

    Returns
    -------
    pd.DataFrame
        One row per listed injury, with a fresh 0..n-1 index.

    Raises
    ------
    AssertionError
        If ``year`` is missing or out of range, or ``period`` is not a known code.
    ValueError
        If the fetched page contains no injury report tables.
    """
    current_year = datetime.today().year
    # `year is not None` makes a missing year fail with the intended message
    # rather than a TypeError from comparing None with an int.
    assert (
        year is not None and 1965 <= year <= current_year
    ), "The year must be between 1965 and this year!"
    assert (
        period in NFL_SEASON_PERIOD_LIST
    ), "Please enter a valid NFL season period code! Refer to the Documentation."

    injury_report_url = f"{base_site_url}/{year}/{period}"
    # A timeout keeps the call from blocking forever on an unresponsive server.
    # The status code is deliberately not checked here: an error page has no
    # report tables and therefore ends in the ValueError below.
    injury_report_response = requests.get(
        injury_report_url, headers=HEADERS, timeout=30
    )
    injury_report_soup = BeautifulSoup(injury_report_response.content, "html.parser")
    injury_report_table_tags_list = injury_report_soup.find_all(
        "table", {"class": "d3-o-table d3-o-table--detailed d3-o-reports--detailed"}
    )
    if len(injury_report_table_tags_list) == 0:
        raise ValueError(
            "I could not find any injury details here, please check if the year or the period exists (e.g. not in the future)."
        )

    # read_html wants a file-like object: passing literal HTML as a plain string
    # is deprecated in recent pandas, so the serialized tags go through StringIO.
    injury_report_tables_html = "\n".join(
        str(table_tag) for table_tag in injury_report_table_tags_list
    )
    injury_df = pd.concat(pd.read_html(StringIO(injury_report_tables_html)))
    # Keep only rows that actually list an injury, then renumber the rows.
    return injury_df.dropna(subset=["Injuries"]).reset_index(drop=True)

In [6]:
# Smoke test: fetch one known week (2021, regular-season week 4) through the helper
sample_year, sample_period = 2021, "REG4"
sample_output_injury_df = get_injury_report_df(period=sample_period, year=sample_year)

In [9]:
# Create the output folder first so the export also works on a fresh checkout,
# where ../output_data may not exist yet.
sample_output_dir = os.path.join("..", "output_data")
os.makedirs(sample_output_dir, exist_ok=True)
# File name: <run date>_SampleInjuryReport-<year>-<period>.csv
sample_output_injury_df.to_csv(
    os.path.join(
        sample_output_dir,
        f"{TODAY_STR}_SampleInjuryReport-{sample_year}-{sample_period}.csv",
    ),
    index=False,
)