# Data Collection

Let's use the ProFootballReferenceScraper to collect NFL player and team statistics.

In [1]:
import sys

# Put the parent directory on the import path so the `src` package resolves.
# The membership guard keeps this cell idempotent: re-running it no longer
# appends another duplicate '../' entry to sys.path each time.
if '../' not in sys.path:
    sys.path.append('../')
from src.scraper import ProFootballReferenceScraper

# Create the scraper rooted at ../data and scrape start_year=2023 to end_year=2024.
pfr = ProFootballReferenceScraper(data_dir="../data")
pfr.scrape_years(start_year=2023, end_year=2024)

Scraping years:   0%|          | 0/2 [00:00<?, ?it/s]2025-06-09 23:09:28,290 - src.scraper - INFO - scrape_years - Scraping data from 2023
2025-06-09 23:09:28,290 - src.scraper - INFO - _get_soup - Using existing html file ../data/html/2023_player_fantasy_stats.html
2025-06-09 23:09:28,510 - src.scraper - INFO - scrape_html_table - Adjusted headers from 43 to 33
2025-06-09 23:09:28,516 - src.scraper - INFO - scrape_player_fantasy_stats - Saved fantasy stats for 2023 to ../data/bronze/2023_player_fantasy_stats.csv
2025-06-09 23:09:32,258 - src.scraper - INFO - _get_soup - Requesting https://www.pro-football-reference.com/years/2023/passing_advanced.htm
2025-06-09 23:09:32,527 - src.scraper - INFO - scrape_html_table - Adjusted headers from 50 to 42
2025-06-09 23:09:32,530 - src.scraper - INFO - scrape_player_offensive_stats - Saved passing stats for 2023 to ../data/bronze/2023_passing_stats.csv
2025-06-09 23:09:36,385 - src.scraper - INFO - _get_soup - Requesting https://www.pro-footbal

# Data Processing

Build the silver layer from the scraped (bronze) data.

In [1]:
import sys

# Put the parent directory on the import path so the `src` package resolves.
# The membership guard keeps this cell idempotent: re-running it no longer
# appends another duplicate '../' entry to sys.path each time.
if '../' not in sys.path:
    sys.path.append('../')
from src.processor import FantasyDataProcessor

# Create the processor rooted at ../data and build the player fantasy stats table.
processor = FantasyDataProcessor(data_dir="../data")
processor.build_player_fantasy_stats()

# The remaining builders are currently disabled; uncomment a line to run it.
# NOTE(review): confirm whether these should be re-enabled or removed.
#processor.build_player_receiving_stats()
#processor.build_player_rushing_stats()
#processor.build_player_passing_stats()
#processor.build_team_stats()

Processing files matching: ../data/bronze/*_player_fantasy_stats.csv: 100%|██████████| 2/2 [00:00<00:00, 233.43it/s]
2025-06-18 23:23:40,200 - src.processor - INFO - Saved player_fantasy_stats.csv to ../data/silver/player_fantasy_stats.csv


In [5]:
# Load and preview the most recent player stats file.
# NOTE(review): `raw_files` and `pd` are not defined in this cell; they must be
# provided by an earlier cell - confirm this still runs on a fresh kernel.
# NOTE(review): this reads from ../data/raw/, while the scraper log in this
# document shows files being saved under ../data/bronze/ - confirm the directory.
player_files = [f for f in raw_files if 'player_stats' in f]
if player_files:
    # Lexicographically greatest name; same pick as sorted(player_files)[-1].
    latest_player_file = max(player_files)
    print(f"Loading {latest_player_file}...")

    player_data = pd.read_csv(f'../data/raw/{latest_player_file}')
    print(f"Shape: {player_data.shape}")

    # Jupyter only auto-renders the last *top-level* expression of a cell, so a
    # bare `player_data.head()` nested inside this `if` would show nothing.
    # `display` is provided by the IPython kernel without an import.
    display(player_data.head())
else:
    # Make the empty case visible instead of finishing silently.
    print("No files containing 'player_stats' found in raw_files.")

In [6]:
# Load and preview the most recent team stats file.
# NOTE(review): `raw_files` and `pd` are not defined in this cell; they must be
# provided by an earlier cell - confirm this still runs on a fresh kernel.
# NOTE(review): this reads from ../data/raw/, while the scraper log in this
# document shows files being saved under ../data/bronze/ - confirm the directory.
team_files = [f for f in raw_files if 'team_stats' in f]
if team_files:
    # Lexicographically greatest name; same pick as sorted(team_files)[-1].
    latest_team_file = max(team_files)
    print(f"Loading {latest_team_file}...")

    team_data = pd.read_csv(f'../data/raw/{latest_team_file}')
    print(f"Shape: {team_data.shape}")

    # Jupyter only auto-renders the last *top-level* expression of a cell, so a
    # bare `team_data.head()` nested inside this `if` would show nothing.
    # `display` is provided by the IPython kernel without an import.
    display(team_data.head())
else:
    # Make the empty case visible instead of finishing silently.
    print("No files containing 'team_stats' found in raw_files.")

# Data Quality Check

Let's check the quality of the collected data.

In [7]:
# Quality report for the player table: per-column null counts, column dtypes,
# and the number of fully duplicated rows. Skipped when `player_data` was never
# created by an earlier cell.
if 'player_data' in locals():
    player_null_counts = player_data.isnull().sum()
    player_column_types = player_data.dtypes
    player_duplicate_count = player_data.duplicated().sum()

    print("Missing values in player data:")
    print(player_null_counts)
    print("\nData types in player data:")
    print(player_column_types)
    print(f"\nDuplicate rows in player data: {player_duplicate_count}")

In [8]:
# Quality report for the team table: per-column null counts, column dtypes,
# and the number of fully duplicated rows. Skipped when `team_data` was never
# created by an earlier cell.
if 'team_data' in locals():
    # Each section is a heading followed by the matching pandas summary.
    for team_heading, team_summary in (
        ("Missing values in team data:", team_data.isnull().sum()),
        ("\nData types in team data:", team_data.dtypes),
    ):
        print(team_heading)
        print(team_summary)

    team_duplicate_count = team_data.duplicated().sum()
    print(f"\nDuplicate rows in team data: {team_duplicate_count}")