# Data Collection

Let's use the ProFootballReferenceScraper to collect NFL player and team statistics.

In [1]:
import sys
sys.path.append('../')
from src.scraper import ProFootballReferenceScraper

pfr = ProFootballReferenceScraper(data_dir="../data")
pfr.scrape_years(start_year=2023, end_year=2024, include_game_logs=False)

Scraping years:   0%|          | 0/2 [00:00<?, ?it/s]2025-06-09 21:45:35,034 - src.scraper - INFO - scrape_years - Scraping data from 2023
2025-06-09 21:45:39,025 - src.scraper - INFO - _get_soup - Requesting https://www.pro-football-reference.com/years/2023/#team_stats
2025-06-09 21:45:39,234 - src.scraper - INFO - scrape_years - Completed scraping data from 2023
Scraping years:  50%|█████     | 1/2 [00:04<00:04,  4.20s/it]2025-06-09 21:45:39,235 - src.scraper - INFO - scrape_years - Scraping data from 2024
2025-06-09 21:45:43,268 - src.scraper - INFO - _get_soup - Requesting https://www.pro-football-reference.com/years/2024/#team_stats
2025-06-09 21:45:43,374 - src.scraper - INFO - scrape_years - Completed scraping data from 2024
Scraping years: 100%|██████████| 2/2 [00:08<00:00,  4.17s/it]


# Data Exploration

Let's explore the collected data to make sure it looks good.

In [4]:
# List the files in the raw data directory
raw_files = os.listdir('../data/raw')
print(f"Found {len(raw_files)} files in the raw data directory:")
for file in raw_files:
    print(f" - {file}")

Found 30 files in the raw data directory:
 - years_2017_opp.htm.html
 - years_2020_fantasy.htm.html
 - years_2021_opp.htm.html
 - years_2015_fantasy.htm.html
 - years_2016_opp.htm.html
 - years_2024_fantasy.htm.html
 - years_2020_opp.htm.html
 - years_2021_fantasy.htm.html
 - years_2017.html
 - years_2021.html
 - years_2020.html
 - years_2016.html
 - years_2015.html
 - years_2017_fantasy.htm.html
 - years_2019.html
 - years_2022_fantasy.htm.html
 - years_2023.html
 - years_2018_opp.htm.html
 - years_2022.html
 - years_2018_fantasy.htm.html
 - years_2018.html
 - years_2019_opp.htm.html
 - years_2016_fantasy.htm.html
 - years_2022_opp.htm.html
 - years_2023_fantasy.htm.html
 - years_2019_fantasy.htm.html
 - years_2015_opp.htm.html
 - years_2024.html
 - years_2023_opp.htm.html
 - years_2024_opp.htm.html


In [5]:
# Load and preview a player stats file
player_files = [f for f in raw_files if 'player_stats' in f]
if player_files:
    latest_player_file = sorted(player_files)[-1]
    print(f"Loading {latest_player_file}...")
    
    player_data = pd.read_csv(f'../data/raw/{latest_player_file}')
    print(f"Shape: {player_data.shape}")
    
    # Display the first few rows
    player_data.head()

In [6]:
# Load and preview a team stats file
team_files = [f for f in raw_files if 'team_stats' in f]
if team_files:
    latest_team_file = sorted(team_files)[-1]
    print(f"Loading {latest_team_file}...")
    
    team_data = pd.read_csv(f'../data/raw/{latest_team_file}')
    print(f"Shape: {team_data.shape}")
    
    # Display the first few rows
    team_data.head()

# Data Quality Check

Let's check the quality of the collected data.

In [7]:
# Check for missing values in player data
if 'player_data' in locals():
    print("Missing values in player data:")
    print(player_data.isnull().sum())
    
    # Check data types
    print("\nData types in player data:")
    print(player_data.dtypes)
    
    # Check for duplicates
    print(f"\nDuplicate rows in player data: {player_data.duplicated().sum()}")

In [8]:
# Check for missing values in team data
if 'team_data' in locals():
    print("Missing values in team data:")
    print(team_data.isnull().sum())
    
    # Check data types
    print("\nData types in team data:")
    print(team_data.dtypes)
    
    # Check for duplicates
    print(f"\nDuplicate rows in team data: {team_data.duplicated().sum()}")