In [1]:
import sys
sys.path.append('../')
from src.processor import FantasyDataProcessor
from src.scraper import ProFootballReferenceScraper
from src.feature_engineering import FantasyFeatureEngineer

# Single source of truth for the data root shared by every pipeline stage,
# so relocating the data directory is a one-line change.
DATA_DIR = "../data"

# One helper object per pipeline stage, all pointed at the same data root.
scraper = ProFootballReferenceScraper(data_dir=DATA_DIR)
processor = FantasyDataProcessor(data_dir=DATA_DIR)
engineer = FantasyFeatureEngineer(data_dir=DATA_DIR)

# Data Collection

Let's use the ProFootballReferenceScraper to collect NFL player and team statistics.

In [None]:
# Scrape seasons 2000 through 2022 (23 seasons, per the progress bar) using the
# `scraper` instance created in the setup cell. The name `pfr` is not defined
# anywhere in this notebook, so calling it fails with NameError on a fresh kernel.
scraper.scrape_years(start_year=2000, end_year=2022)

Scraping years:   0%|          | 0/23 [00:00<?, ?it/s]2025-07-15 12:37:59,218 - src.scraper - INFO - Scraping data from 2000
2025-07-15 12:38:03,707 - src.scraper - INFO - Requesting https://www.pro-football-reference.com/years/2000/fantasy.htm
2025-07-15 12:38:04,379 - src.scraper - INFO - Adjusted headers from 43 to 33
2025-07-15 12:38:04,389 - src.scraper - INFO - Saved fantasy stats for 2000 to ../data/bronze/2000_player_fantasy_stats.csv
2025-07-15 12:38:07,923 - src.scraper - INFO - Requesting https://www.pro-football-reference.com/years/2000/passing.htm
2025-07-15 12:38:08,130 - src.scraper - INFO - Headers and data columns match for 2000
2025-07-15 12:38:08,132 - src.scraper - INFO - Saved passing stats for 2000 to ../data/bronze/2000_player_passing_stats.csv
2025-07-15 12:38:12,015 - src.scraper - INFO - Requesting https://www.pro-football-reference.com/years/2000/rushing.htm
2025-07-15 12:38:12,357 - src.scraper - INFO - Adjusted headers from 22 to 18
2025-07-15 12:38:12,360 

# Data Processing

Build the silver layer from the scraped (bronze) data.

In [2]:
# Run the full processing step over the scraped files. Per the log output
# below, each file pattern (e.g. *_player_fantasy_stats.csv) is processed and
# saved as a single CSV under ../data/silver/.
# NOTE(review): the log shows 25 files per pattern, while the scraping cell
# above requests 23 seasons — confirm the bronze directory is what you expect.
processor.process_all_data()

Processing files matching: *_player_fantasy_stats.csv: 100%|██████████| 25/25 [00:00<00:00, 251.34it/s]
2025-07-17 14:43:00,809 - src.processor - INFO - Saved player_fantasy_stats.csv to ../data/silver/player_fantasy_stats.csv
Processing files matching: *_player_receiving_stats.csv: 100%|██████████| 25/25 [00:00<00:00, 188.71it/s]
2025-07-17 14:43:07,050 - src.processor - INFO - Saved player_receiving_stats.csv to ../data/silver/player_receiving_stats.csv
Processing files matching: *_player_receiving_advanced_stats.csv: 100%|██████████| 7/7 [00:00<00:00, 94.97it/s]
2025-07-17 14:43:08,542 - src.processor - INFO - Saved player_receiving_advanced_stats.csv to ../data/silver/player_receiving_advanced_stats.csv
Processing files matching: *_player_rushing_stats.csv: 100%|██████████| 25/25 [00:00<00:00, 231.86it/s]
2025-07-17 14:43:12,826 - src.processor - INFO - Saved player_rushing_stats.csv to ../data/silver/player_rushing_stats.csv
Processing files matching: *_player_rushing_advanced_sta

## Feature Engineering

- Determine which features have the most predictive potential.
- Prepare the features for use in the model.


Let's see what the silver table looks like.

In [2]:
# Load the silver table that every analysis cell below reads from
# (13235 rows in the recorded run, per the log line underneath).
silver_data = engineer.load_silver_table()

2025-07-17 15:57:59,741 - src.feature_engineering - INFO - Loaded silver table: 13235 rows


In [None]:
# Quick data-quality summary of the silver table.

# Number of missing values in each column.
print("Missing values in silver data:", silver_data.isna().sum(), sep="\n")

# Storage dtype of each column.
print("\nData types in silver data:", silver_data.dtypes, sep="\n")

# Count of rows that exactly repeat an earlier row.
print(f"\nDuplicate rows in silver data: {silver_data.duplicated().sum()}")

Missing values in silver data:
player                          0
year                            0
team                            0
age                             0
standard_fantasy_points         0
                               ..
team_points_3_yr_avg            0
team_yards_3_yr_avg             0
team_plays_3_yr_avg             0
team_yards_per_play_3_yr_avg    0
awards                          0
Length: 185, dtype: int64

Data types in silver data:
player                           object
year                              int64
team                             object
age                               int64
standard_fantasy_points         float64
                                 ...   
team_points_3_yr_avg            float64
team_yards_3_yr_avg             float64
team_plays_3_yr_avg             float64
team_yards_per_play_3_yr_avg    float64
awards                          float64
Length: 185, dtype: object

Duplicate rows in silver data: 0


In [3]:
# Identifier columns: they describe a row but are not model inputs.
metadata_cols = ["player", "year", "team"]

# Candidate prediction targets.
target_cols = [
    "standard_fantasy_points",
    "standard_fantasy_points_per_game",
    "ppr_fantasy_points",
    "ppr_fantasy_points_per_game",
    "value_over_replacement",
]

# Everything that is neither an identifier nor a target is a candidate feature;
# the boolean mask keeps the original column order.
non_feature_cols = [*metadata_cols, *target_cols]
features = silver_data.columns[~silver_data.columns.isin(non_feature_cols)].tolist()

#### Let's see how strongly features of the same stat are correlated with each other

In [4]:
# Correlation matrices among features sharing a stat prefix, computed in the
# order receiving, rushing, passing.
rec_stats_corr_matrix, rush_stats_corr_matrix, pass_stats_corr_matrix = [
    engineer.get_correlation_between_similar_stats(silver_data, stat_prefix)
    for stat_prefix in ("rec", "rush", "pass")
]

# Plot each matrix with its title and image file name.
engineer.plot_correlation_matrix(
    rec_stats_corr_matrix,
    "Receiving Stats Correlation Matrix",
    "rec_stats_corr_matrix.png",
)
engineer.plot_correlation_matrix(
    rush_stats_corr_matrix,
    "Rushing Stats Correlation Matrix",
    "rush_stats_corr_matrix.png",
)
engineer.plot_correlation_matrix(
    pass_stats_corr_matrix,
    "Passing Stats Correlation Matrix",
    "pass_stats_corr_matrix.png",
)


#### Let's see how strongly each feature correlates with the different targets

In [5]:
# Pearson correlation of every candidate feature against each target column.
# Iterating over `target_cols` (defined with the feature lists above) replaces
# five copy-pasted calls; the file name passed to the helper is derived from
# the target name as <target>_corr_matrix.csv.
pearson_corr_by_target = {
    target: engineer.pearsons_correlation_with_target(
        silver_data, features, target, f"{target}_corr_matrix.csv"
    )
    for target in target_cols
}

# Show the value_over_replacement result as the cell output.
pearson_corr_by_target["value_over_replacement"]

Unnamed: 0,value_over_replacement
awards,0.35
rush_first_downs,0.32
rush_yards,0.32
rush_first_downs_2_yr_avg,0.31
rush_touchdowns,0.31
...,...
team_fumbles_lost,-0.02
team_penalties,-0.02
team_turnover_percent,-0.03
team_turnovers,-0.03


In [6]:
# Mutual information of every candidate feature with each target column.
# Iterating over `target_cols` (defined with the feature lists above) replaces
# five copy-pasted calls; the file name passed to the helper is derived from
# the target name as <target>_mutual_info.csv.
# NOTE(review): if the helper wraps a randomized estimator (e.g. scikit-learn's
# mutual_info_regression), scores vary between runs unless a seed is fixed —
# confirm inside src.feature_engineering.
mutual_info_by_target = {
    target: engineer.mutual_information_with_target(
        silver_data, features, target, f"{target}_mutual_info.csv"
    )
    for target in target_cols
}

# Show the value_over_replacement result as the cell output.
mutual_info_by_target["value_over_replacement"]

games_2_yr_avg                      0.07
rec_receptions_3_yr_avg             0.07
rec_yards                           0.06
rec_targets_per_game_2_yr_avg       0.06
rec_receptions_per_game_2_yr_avg    0.06
                                    ... 
team_yards                          0.00
team_plays                          0.00
team_first_downs                    0.00
team_pass_touchdowns                0.00
pass_sacks_3_yr_avg                 0.00
Length: 177, dtype: float64