In [1]:
# Import necessary libraries
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Add the src directory to the path so we can import our modules
sys.path.append('../')
from src.scraper import main as scraper_main

# Data Collection

Let's use the ProFootballReferenceScraper to collect NFL player and team statistics.

In [2]:
# Ensure every stage directory of the data pipeline exists.
# exist_ok=True keeps the cell safe to re-run when the folders are already there.
for stage_dir in ('../data/raw', '../data/processed', '../data/final'):
    os.makedirs(stage_dir, exist_ok=True)

In [3]:
# Run the scraper's main method
# scraper_main is the `main` function imported from src.scraper in the first cell.
# The log recorded under this cell shows it requesting pages from
# pro-football-reference.com and saving CSV files, so running this cell
# needs network access.
# NOTE(review): requests in the recorded log are several seconds apart, so a
# full run is presumably slow — consider skipping it when the data already exists.
print("Starting data collection...")
scraper_main()
print("Data collection complete!")

2025-02-27 22:55:41,929 - src.scraper - INFO - Starting scrape for years 2015 to 2024


Starting data collection...


Scraping years:   0%|          | 0/10 [00:00<?, ?it/s]2025-02-27 22:55:41,955 - src.scraper - INFO - Scraping data for 2015
2025-02-27 22:55:46,243 - src.scraper - INFO - Requesting https://www.pro-football-reference.com/years/2015/fantasy.htm
2025-02-27 22:55:46,944 - src.scraper - INFO - Saved fantasy stats for 2015 to /Users/vivek/Code/ff-predictor-2025/data/processed/fantasy_stats_2015.csv
2025-02-27 22:55:50,674 - src.scraper - INFO - Requesting https://www.pro-football-reference.com/years/2015/
2025-02-27 22:55:50,819 - src.scraper - ERROR - Team stats table not found for 2015
2025-02-27 22:55:54,870 - src.scraper - INFO - Requesting https://www.pro-football-reference.com/years/2015/opp.htm
  dfs = pd.read_html(str(table))
2025-02-27 22:55:55,024 - src.scraper - INFO - Saved advanced team stats for 2015 to /Users/vivek/Code/ff-predictor-2025/data/processed/advanced_team_stats_2015.csv
2025-02-27 22:55:55,027 - src.scraper - INFO - Found 0 relevant players for 2015
Scraping player

Data collection complete!


# Data Exploration

Let's explore the collected data to make sure it looks good.

In [4]:
# Enumerate everything currently sitting in the raw data directory
raw_files = os.listdir('../data/raw')
print(f"Found {len(raw_files)} files in the raw data directory:")
# Emit one bullet per entry; skip the join entirely for an empty directory so
# no stray blank line is printed.
if raw_files:
    print("\n".join(f" - {file}" for file in raw_files))

Found 30 files in the raw data directory:
 - years_2017_opp.htm.html
 - years_2020_fantasy.htm.html
 - years_2021_opp.htm.html
 - years_2015_fantasy.htm.html
 - years_2016_opp.htm.html
 - years_2024_fantasy.htm.html
 - years_2020_opp.htm.html
 - years_2021_fantasy.htm.html
 - years_2017.html
 - years_2021.html
 - years_2020.html
 - years_2016.html
 - years_2015.html
 - years_2017_fantasy.htm.html
 - years_2019.html
 - years_2022_fantasy.htm.html
 - years_2023.html
 - years_2018_opp.htm.html
 - years_2022.html
 - years_2018_fantasy.htm.html
 - years_2018.html
 - years_2019_opp.htm.html
 - years_2016_fantasy.htm.html
 - years_2022_opp.htm.html
 - years_2023_fantasy.htm.html
 - years_2019_fantasy.htm.html
 - years_2015_opp.htm.html
 - years_2024.html
 - years_2023_opp.htm.html
 - years_2024_opp.htm.html


In [5]:
# Load and preview a player stats file
#
# The scraper log shows its CSV output going to data/processed (for example
# fantasy_stats_2015.csv), while data/raw only holds the cached .html pages.
# The CSVs are therefore looked up in the processed directory.
processed_dir = '../data/processed'
processed_files = os.listdir(processed_dir) if os.path.isdir(processed_dir) else []

# NOTE(review): 'fantasy_stats' is assumed to be the per-player table;
# 'player_stats' is kept in case the scraper also writes files under that
# name — confirm against src.scraper.
player_files = sorted(
    f for f in processed_files
    if f.endswith('.csv') and ('player_stats' in f or 'fantasy_stats' in f)
)

if player_files:
    # File names embed the season year, so the last name in sorted order is
    # the most recent season within a given file family.
    latest_player_file = player_files[-1]
    print(f"Loading {latest_player_file}...")

    player_data = pd.read_csv(os.path.join(processed_dir, latest_player_file))
    print(f"Shape: {player_data.shape}")

    # A bare expression inside an `if` block is not rendered by Jupyter, so
    # the preview is displayed explicitly (display() is provided by IPython).
    display(player_data.head())
else:
    # Say so explicitly instead of silently skipping every later player-data cell.
    print(f"No player stats CSV files found in {processed_dir}")

In [6]:
# Load and preview a team stats file
#
# The scraper log shows its CSV output going to data/processed (for example
# advanced_team_stats_2015.csv), while data/raw only holds the cached .html
# pages. The CSVs are therefore looked up in the processed directory.
processed_dir = '../data/processed'
processed_files = os.listdir(processed_dir) if os.path.isdir(processed_dir) else []

# 'team_stats' also matches the advanced_team_stats_<year>.csv files.
team_files = sorted(
    f for f in processed_files
    if f.endswith('.csv') and 'team_stats' in f
)

if team_files:
    # File names embed the season year, so the last name in sorted order is
    # the most recent season within a given file family.
    latest_team_file = team_files[-1]
    print(f"Loading {latest_team_file}...")

    team_data = pd.read_csv(os.path.join(processed_dir, latest_team_file))
    print(f"Shape: {team_data.shape}")

    # A bare expression inside an `if` block is not rendered by Jupyter, so
    # the preview is displayed explicitly (display() is provided by IPython).
    display(team_data.head())
else:
    # Say so explicitly instead of silently skipping every later team-data cell.
    print(f"No team stats CSV files found in {processed_dir}")

# Data Quality Check

Let's check the quality of the collected data.

In [7]:
# Data-quality summary for the player table.
# Runs only when an earlier cell actually created `player_data`.
if 'player_data' in locals():
    # Per-column count of null entries
    print("Missing values in player data:", player_data.isnull().sum(), sep="\n")

    # Column dtypes as inferred when the CSV was read
    print("\nData types in player data:", player_data.dtypes, sep="\n")

    # Count of rows flagged by DataFrame.duplicated()
    print(f"\nDuplicate rows in player data: {player_data.duplicated().sum()}")

In [8]:
# Data-quality summary for the team table.
# Runs only when an earlier cell actually created `team_data`.
if 'team_data' in locals():
    # Per-column count of null entries
    print("Missing values in team data:", team_data.isnull().sum(), sep="\n")

    # Column dtypes as inferred when the CSV was read
    print("\nData types in team data:", team_data.dtypes, sep="\n")

    # Count of rows flagged by DataFrame.duplicated()
    print(f"\nDuplicate rows in team data: {team_data.duplicated().sum()}")