In [1]:
import sys
from pathlib import Path

import pandas as pd

sys.path.append('../')
from src.processor import FantasyDataProcessor
from src.scraper import ProFootballReferenceScraper

# Data Collection

Let's use the ProFootballReferenceScraper to collect NFL player and team statistics.

In [2]:
# Scrape Pro Football Reference for the requested seasons. Per the log output
# below, each stats page is requested from pro-football-reference.com and
# saved as a CSV under ../data/bronze.
pfr = ProFootballReferenceScraper(data_dir="../data")
# NOTE(review): end_year appears to be inclusive (the progress bar counts 2
# years for 2023-2024) — confirm in src.scraper.
pfr.scrape_years(start_year=2023, end_year=2024)

Scraping years:   0%|          | 0/2 [00:00<?, ?it/s]2025-07-09 10:23:00,476 - src.scraper - INFO - Scraping data from 2023
2025-07-09 10:23:04,954 - src.scraper - INFO - Requesting https://www.pro-football-reference.com/years/2023/fantasy.htm
2025-07-09 10:23:05,377 - src.scraper - INFO - Adjusted headers from 43 to 33
2025-07-09 10:23:05,382 - src.scraper - INFO - Saved fantasy stats for 2023 to ../data/bronze/2023_player_fantasy_stats.csv
2025-07-09 10:23:09,672 - src.scraper - INFO - Requesting https://www.pro-football-reference.com/years/2023/passing.htm
2025-07-09 10:23:09,859 - src.scraper - INFO - Headers and data columns match for 2023
2025-07-09 10:23:09,862 - src.scraper - INFO - Saved passing stats for 2023 to ../data/bronze/2023_player_passing_stats.csv
2025-07-09 10:23:14,144 - src.scraper - INFO - Requesting https://www.pro-football-reference.com/years/2023/rushing.htm
2025-07-09 10:23:14,461 - src.scraper - INFO - Adjusted headers from 22 to 18
2025-07-09 10:23:14,464 -

# Data Processing

Build the silver layer (`../data/silver`) from the scraped bronze-layer CSV files.

In [2]:
# Processor rooted at the same data directory the scraper was given; the log
# output of the cells below shows its results being saved under ../data/silver.
processor = FantasyDataProcessor(data_dir="../data")

In [3]:
# Build a single silver table. Only the passing builder is enabled; the other
# per-table builders are left commented out so they can be toggled on while
# iterating on one table at a time.
# NOTE(review): the log of the process_all_data() cell shows the fantasy,
# receiving and rushing files being processed as well, so this cell may be
# redundant — confirm and consider deleting it.
# processor.build_player_fantasy_stats()
# processor.build_player_receiving_stats()
# processor.build_player_receiving_advanced_stats()
# processor.build_player_rushing_stats()
# processor.build_player_rushing_advanced_stats()
processor.build_player_passing_stats()
# processor.build_team_stats()


Processing files matching: *_player_passing_stats.csv: 100%|██████████| 2/2 [00:00<00:00, 156.83it/s]
2025-07-09 11:53:25,618 - src.processor - INFO - Saved player_passing_stats.csv to ../data/silver/player_passing_stats.csv


In [4]:
# Build the silver tables in one call; the log below shows each group of
# per-season *_player_*_stats.csv files being processed and saved to
# ../data/silver.
processor.process_all_data()
# NOTE(review): join_stats() presumably merges the per-category tables into a
# combined table; its output is not visible in the log shown here — confirm
# in src.processor.
processor.join_stats()

Processing files matching: *_player_fantasy_stats.csv: 100%|██████████| 2/2 [00:00<00:00, 153.42it/s]
2025-07-09 11:54:04,795 - src.processor - INFO - Saved player_fantasy_stats.csv to ../data/silver/player_fantasy_stats.csv
Processing files matching: *_player_receiving_stats.csv: 100%|██████████| 2/2 [00:00<00:00, 111.45it/s]
2025-07-09 11:54:05,267 - src.processor - INFO - Saved player_receiving_stats.csv to ../data/silver/player_receiving_stats.csv
Processing files matching: *_player_receiving_advanced_stats.csv: 100%|██████████| 2/2 [00:00<00:00, 196.33it/s]
2025-07-09 11:54:05,611 - src.processor - INFO - Saved player_receiving_advanced_stats.csv to ../data/silver/player_receiving_advanced_stats.csv
Processing files matching: *_player_rushing_stats.csv: 100%|██████████| 2/2 [00:00<00:00, 155.91it/s]
2025-07-09 11:54:05,939 - src.processor - INFO - Saved player_rushing_stats.csv to ../data/silver/player_rushing_stats.csv
Processing files matching: *_player_rushing_advanced_stats.cs

In [5]:
# Load and preview the most recent player stats file.
# The scraper log earlier in this notebook shows per-season player files being
# saved as ../data/bronze/<year>_player_<category>_stats.csv, so list that
# directory directly instead of relying on a `raw_files` variable that no cell
# defines.
# NOTE(review): the original read from ../data/raw and matched the substring
# 'player_stats', which never occurs in those file names — confirm that
# ../data/bronze is the intended source.
bronze_dir = Path("../data/bronze")
player_files = sorted(path.name for path in bronze_dir.glob("*_player_*_stats.csv"))
if player_files:
    # File names start with the season year, so the last name in sorted order
    # belongs to the most recent season.
    latest_player_file = player_files[-1]
    print(f"Loading {latest_player_file}...")

    player_data = pd.read_csv(bronze_dir / latest_player_file)
    print(f"Shape: {player_data.shape}")

    # A bare expression inside an `if` block is not rendered by Jupyter, so
    # display() is needed to actually show the first few rows.
    display(player_data.head())
else:
    # Make the empty case visible instead of silently skipping the preview.
    print(f"No player stats files found in {bronze_dir}")

In [6]:
# Load and preview the most recent team stats file.
# List the scraper's output directory directly instead of relying on a
# `raw_files` variable that no cell defines.
# NOTE(review): the scraper log earlier in this notebook shows files being
# saved under ../data/bronze (the original read from ../data/raw). The team
# file naming is not visible in that log; this keeps the original
# 'team_stats' substring match — confirm against src.scraper.
bronze_dir = Path("../data/bronze")
team_files = sorted(path.name for path in bronze_dir.glob("*team_stats*.csv"))
if team_files:
    # Last name in sorted order, as in the original selection.
    latest_team_file = team_files[-1]
    print(f"Loading {latest_team_file}...")

    team_data = pd.read_csv(bronze_dir / latest_team_file)
    print(f"Shape: {team_data.shape}")

    # A bare expression inside an `if` block is not rendered by Jupyter, so
    # display() is needed to actually show the first few rows.
    display(team_data.head())
else:
    # Make the empty case visible instead of silently skipping the preview.
    print(f"No team stats files found in {bronze_dir}")

# Data Quality Check

Let's check the quality of the collected data.

In [7]:
# Quality report for the player table; skipped entirely when no
# `player_data` frame exists in the notebook namespace.
if 'player_data' in locals():
    # Per-column count of missing values
    print("Missing values in player data:", player_data.isnull().sum(), sep="\n")

    # Dtype of every column
    print("\nData types in player data:", player_data.dtypes, sep="\n")

    # Number of rows flagged as duplicates
    print(f"\nDuplicate rows in player data: {player_data.duplicated().sum()}")

In [8]:
# Quality report for the team table; skipped entirely when no
# `team_data` frame exists in the notebook namespace.
if 'team_data' in locals():
    # Per-column count of missing values
    print("Missing values in team data:", team_data.isnull().sum(), sep="\n")

    # Dtype of every column
    print("\nData types in team data:", team_data.dtypes, sep="\n")

    # Number of rows flagged as duplicates
    print(f"\nDuplicate rows in team data: {team_data.duplicated().sum()}")