In [1]:
import sys
from pathlib import Path

import pandas as pd

sys.path.append('../')
from src.processor import FantasyDataProcessor
from src.scraper import ProFootballReferenceScraper

# Data Collection

Let's use the ProFootballReferenceScraper to collect NFL player and team statistics.

In [None]:
# Create the scraper with ../data as its data directory, then scrape from
# start_year=2023 through end_year=2024 (the recorded progress bar shows
# 2 iterations). Per the recorded log, cached HTML under ../data/html is
# reused and per-year CSVs are written to ../data/bronze.
pfr = ProFootballReferenceScraper(data_dir="../data")
pfr.scrape_years(start_year=2023, end_year=2024)

Scraping years:   0%|          | 0/2 [00:00<?, ?it/s]2025-07-01 14:29:59,519 - src.scraper - INFO - scrape_years - Scraping data from 2023
2025-07-01 14:29:59,520 - src.scraper - INFO - _get_soup - Using existing html file ../data/html/2023_player_fantasy_stats.html
2025-07-01 14:29:59,816 - src.scraper - INFO - scrape_html_table - Adjusted headers from 43 to 33
2025-07-01 14:29:59,828 - src.scraper - INFO - scrape_player_fantasy_stats - Saved fantasy stats for 2023 to ../data/bronze/2023_player_fantasy_stats.csv
2025-07-01 14:29:59,830 - src.scraper - INFO - _get_soup - Using existing html file ../data/html/2023_passing_stats.html
2025-07-01 14:29:59,959 - src.scraper - INFO - scrape_html_table - Adjusted headers from 50 to 42
2025-07-01 14:29:59,986 - src.scraper - INFO - scrape_player_offensive_stats - Saved passing stats for 2023 to ../data/bronze/2023_player_passing_stats.csv
2025-07-01 14:29:59,995 - src.scraper - INFO - _get_soup - Using existing html file ../data/html/2023_rush

# Data Processing

Build the silver layer from the scraped bronze-layer CSV files.

In [2]:
# Create the processor, pointed at the same ../data directory the scraper uses.
processor = FantasyDataProcessor(data_dir="../data")

In [3]:
# Build each silver table in turn. Per the recorded output, each player build
# processes the files matching `*_<name>.csv` and saves a combined
# ../data/silver/<name>.csv; the team build processes `*_team_offense.csv`.
# NOTE(review): whether these calls depend on running in this order is not
# visible from the notebook — confirm in src.processor before reordering.
processor.build_player_fantasy_stats()
processor.build_player_receiving_stats()
processor.build_player_rushing_stats()
processor.build_player_passing_stats()
processor.build_team_stats()

Processing files matching: *_player_fantasy_stats.csv: 100%|██████████| 2/2 [00:00<00:00, 152.90it/s]
2025-07-02 15:13:57,042 - src.processor - INFO - Saved player_fantasy_stats.csv to ../data/silver/player_fantasy_stats.csv
Processing files matching: *_player_receiving_stats.csv: 100%|██████████| 2/2 [00:00<00:00, 213.83it/s]
2025-07-02 15:13:57,162 - src.processor - INFO - Saved player_receiving_stats.csv to ../data/silver/player_receiving_stats.csv
Processing files matching: *_player_rushing_stats.csv: 100%|██████████| 2/2 [00:00<00:00, 242.12it/s]
2025-07-02 15:13:57,293 - src.processor - INFO - Saved player_rushing_stats.csv to ../data/silver/player_rushing_stats.csv
Processing files matching: *_player_passing_stats.csv: 100%|██████████| 2/2 [00:00<00:00, 259.37it/s]
2025-07-02 15:13:57,341 - src.processor - INFO - Saved player_passing_stats.csv to ../data/silver/player_passing_stats.csv
Processing files matching: *_team_offense.csv: 100%|██████████| 2/2 [00:00<00:00, 350.64it/s]


In [3]:
# Join the stats through the processor. What is joined and where the result
# is written is defined in src.processor (not shown here) — TODO confirm and
# describe the output for readers.
processor.join_stats()

In [5]:
# Load and preview the most recent player stats file.
#
# The cell builds its own listing of the raw directory instead of relying on a
# `raw_files` variable left behind by some other cell, so it also runs on a
# fresh kernel (Restart & Run All).
# NOTE(review): the cells above log writes to ../data/bronze and
# ../data/silver with names such as `2023_player_fantasy_stats.csv`, which the
# 'player_stats' filter below does not match — confirm that ../data/raw and
# this filter are still the intended source.
RAW_DIR = Path("../data/raw")
raw_files = sorted(p.name for p in RAW_DIR.iterdir() if p.is_file()) if RAW_DIR.is_dir() else []

player_files = [f for f in raw_files if 'player_stats' in f]
if player_files:
    # File names are compared as strings; the last one in sorted order is
    # treated as the latest.
    latest_player_file = sorted(player_files)[-1]
    print(f"Loading {latest_player_file}...")

    player_data = pd.read_csv(RAW_DIR / latest_player_file)
    print(f"Shape: {player_data.shape}")

    # A bare expression inside an `if` body is not auto-rendered by Jupyter,
    # so the preview has to be displayed explicitly.
    display(player_data.head())
else:
    # Say so explicitly instead of silently skipping the preview.
    print(f"No player stats files found in {RAW_DIR}")

In [6]:
# Load and preview the most recent team stats file.
#
# The cell builds its own listing of the raw directory instead of relying on a
# `raw_files` variable left behind by some other cell, so it also runs on a
# fresh kernel (Restart & Run All).
# NOTE(review): the recorded processing output refers to `*_team_offense.csv`
# files, which the 'team_stats' filter below does not match — confirm that
# ../data/raw and this filter are still the intended source.
RAW_DIR = Path("../data/raw")
raw_files = sorted(p.name for p in RAW_DIR.iterdir() if p.is_file()) if RAW_DIR.is_dir() else []

team_files = [f for f in raw_files if 'team_stats' in f]
if team_files:
    # File names are compared as strings; the last one in sorted order is
    # treated as the latest.
    latest_team_file = sorted(team_files)[-1]
    print(f"Loading {latest_team_file}...")

    team_data = pd.read_csv(RAW_DIR / latest_team_file)
    print(f"Shape: {team_data.shape}")

    # A bare expression inside an `if` body is not auto-rendered by Jupyter,
    # so the preview has to be displayed explicitly.
    display(team_data.head())
else:
    # Say so explicitly instead of silently skipping the preview.
    print(f"No team stats files found in {RAW_DIR}")

# Data Quality Check

Let's check the quality of the collected data.

In [7]:
# Data-quality summary for the player table; skipped when `player_data` was
# never created by the loading cell.
if 'player_data' in locals():
    # Null count per column
    print("Missing values in player data:", player_data.isna().sum(), sep="\n")

    # dtype of each column
    print("\nData types in player data:", player_data.dtypes, sep="\n")

    # Rows that exactly repeat an earlier row
    print(f"\nDuplicate rows in player data: {player_data.duplicated().sum()}")

In [8]:
# Data-quality summary for the team table; skipped when `team_data` was
# never created by the loading cell.
if 'team_data' in locals():
    # Null count per column
    print("Missing values in team data:", team_data.isna().sum(), sep="\n")

    # dtype of each column
    print("\nData types in team data:", team_data.dtypes, sep="\n")

    # Rows that exactly repeat an earlier row
    print(f"\nDuplicate rows in team data: {team_data.duplicated().sum()}")