In [1]:
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

import sys
sys.path.append('../')
from src.processor import FantasyDataProcessor
from src.scraper import ProFootballReferenceScraper
from src.feature_engineering import FantasyFeatureEngineer
from src.modelling import FantasyModel

# Shared configuration for every pipeline stage created in this cell.
DATA_DIR = "../data"
METADATA_COLS = ["player", "year", "team"]
TARGET_COLS = [
    "standard_fantasy_points",
    "standard_fantasy_points_per_game",
    "ppr_fantasy_points",
    "ppr_fantasy_points_per_game",
    "value_over_replacement",
]

# One object per stage: scrape -> process -> feature engineering -> modelling.
# NOTE(review): the recorded output of this cell shows src.feature_engineering
# logging "Loaded silver table", so constructing the engineer presumably reads
# the silver layer right away -- confirm this works before the scraping and
# processing cells have produced it.
scraper = ProFootballReferenceScraper(data_dir=DATA_DIR)
processor = FantasyDataProcessor(data_dir=DATA_DIR)
engineer = FantasyFeatureEngineer(
    data_dir=DATA_DIR,
    metadata_cols=METADATA_COLS,
    target_cols=TARGET_COLS,
    must_include_features=["age"],
    redundancy_threshold=0.75,
)
model = FantasyModel(data_dir=DATA_DIR, target_col="ppr_fantasy_points")

2025-07-27 08:11:11,507 - src.feature_engineering - INFO - Loaded silver table: 13740 rows


# Data Collection

Let's use the ProFootballReferenceScraper to collect NFL player and team statistics.

In [3]:
# Scrape pro-football-reference.com one season at a time. The recorded run
# iterates 20 seasons starting at 2003 and saves per-year CSVs under
# ../data/bronze/ (fantasy, passing, rushing, ... stats).
# NOTE(review): the processing cell's recorded output reports 25-27 bronze files
# per stat type, including a 2001 file, so the bronze layer presumably also holds
# seasons scraped in earlier runs -- confirm the intended year range.
scraper.scrape_years(start_year=2003, end_year=2022)

Scraping years:   0%|          | 0/20 [00:00<?, ?it/s]2025-07-26 14:24:39,949 - src.scraper - INFO - Scraping data from 2003
2025-07-26 14:24:44,237 - src.scraper - INFO - Requesting https://www.pro-football-reference.com/years/2003/fantasy.htm
2025-07-26 14:24:44,598 - src.scraper - INFO - Adjusted headers from 43 to 33
2025-07-26 14:24:44,601 - src.scraper - INFO - Saved fantasy stats for 2003 to ../data/bronze/2003_player_fantasy_stats.csv
2025-07-26 14:24:48,348 - src.scraper - INFO - Requesting https://www.pro-football-reference.com/years/2003/passing.htm
2025-07-26 14:24:48,530 - src.scraper - INFO - Headers and data columns match for 2003
2025-07-26 14:24:48,532 - src.scraper - INFO - Saved passing stats for 2003 to ../data/bronze/2003_player_passing_stats.csv
2025-07-26 14:24:52,763 - src.scraper - INFO - Requesting https://www.pro-football-reference.com/years/2003/rushing.htm
2025-07-26 14:24:53,115 - src.scraper - INFO - Adjusted headers from 22 to 18
2025-07-26 14:24:53,117 

# Data Processing

Build the silver layer from the scraped bronze-layer data.

In [3]:
# Combine the per-year bronze CSVs into one silver CSV per stat type. Per the
# recorded log, files are matched by patterns such as *_player_fantasy_stats.csv,
# results are saved under ../data/silver/, and a missing pass_qbr column is added
# to passing files that lack it (e.g. the 2001 file).
processor.process_all_data()

Processing files matching: *_player_fantasy_stats.csv: 100%|██████████| 27/27 [00:00<00:00, 143.27it/s]
2025-07-26 15:52:04,863 - src.processor - INFO - Saved player_fantasy_stats.csv to ../data/silver/player_fantasy_stats.csv
Processing files matching: *_player_receiving_stats.csv: 100%|██████████| 25/25 [00:00<00:00, 197.00it/s]
2025-07-26 15:52:11,221 - src.processor - INFO - Saved player_receiving_stats.csv to ../data/silver/player_receiving_stats.csv
Processing files matching: *_player_rushing_stats.csv: 100%|██████████| 25/25 [00:00<00:00, 248.75it/s]
2025-07-26 15:52:15,480 - src.processor - INFO - Saved player_rushing_stats.csv to ../data/silver/player_rushing_stats.csv
Processing files matching: *_player_passing_stats.csv:   0%|          | 0/25 [00:00<?, ?it/s]2025-07-26 15:52:15,485 - src.processor - INFO - Added missing pass_qbr column to ../data/bronze/2001_player_passing_stats.csv
2025-07-26 15:52:15,487 - src.processor - INFO - Added missing pass_qbr column to ../data/bro

## Feature Engineering

- Determine which features have the most potential.
- Prepare the features for use in the model.


Let's see what the silver table looks like.

In [4]:
# Load the silver table for inspection; the cells below index it as a pandas
# DataFrame. The recorded run logged 13740 rows.
silver_data = engineer.load_silver_table()

2025-07-26 15:52:41,475 - src.feature_engineering - INFO - Loaded silver table: 13740 rows


In [5]:
# Spot-check a single player: identity columns plus the games_* columns.
nabers_mask = silver_data['player'] == 'malik_nabers'
games_cols = ['player', 'year', 'age', 'team', 'games', 'games_2yr_avg', 'games_3yr_avg']
print(silver_data.loc[nabers_mask, games_cols])

            player  year  age team  games  games_2yr_avg  games_3yr_avg
3764  malik_nabers  2024   21  NYG    0.0            0.0            0.0


In [4]:
# Data-quality overview of the silver table: nulls, dtypes, exact duplicates.
null_counts = silver_data.isna().sum()  # isna() is the pandas alias of isnull()
print("Missing values in silver data:")
print(null_counts)

# Column dtypes
column_dtypes = silver_data.dtypes
print("\nData types in silver data:")
print(column_dtypes)

# Fully duplicated rows
duplicate_row_count = silver_data.duplicated().sum()
print(f"\nDuplicate rows in silver data: {duplicate_row_count}")

Missing values in silver data:
player                          0
year                            0
team                            0
age                             0
standard_fantasy_points         0
                               ..
team_points_3_yr_avg            0
team_yards_3_yr_avg             0
team_plays_3_yr_avg             0
team_yards_per_play_3_yr_avg    0
awards                          0
Length: 185, dtype: int64

Data types in silver data:
player                           object
year                              int64
team                             object
age                               int64
standard_fantasy_points         float64
                                 ...   
team_points_3_yr_avg            float64
team_yards_3_yr_avg             float64
team_plays_3_yr_avg             float64
team_yards_per_play_3_yr_avg    float64
awards                          float64
Length: 185, dtype: object

Duplicate rows in silver data: 0


In [3]:
# Build the gold table. The recorded progress bars show a feature-selection pass
# for each of the five configured targets followed by a combined "all targets"
# pass (about 22 s in the recorded run). The return value, if any, is discarded.
engineer.build_gold_table()

Selecting features for standard_fantasy_points: 100%|██████████| 2/2 [00:00<00:00, 1458.89it/s]
Selecting features for standard_fantasy_points_per_game: 100%|██████████| 2/2 [00:00<00:00, 1681.76it/s]
Selecting features for ppr_fantasy_points: 100%|██████████| 2/2 [00:00<00:00, 1974.72it/s]
Selecting features for ppr_fantasy_points_per_game: 100%|██████████| 2/2 [00:00<00:00, 1791.67it/s]
Selecting features for value_over_replacement: 100%|██████████| 2/2 [00:00<00:00, 1633.93it/s]
Selecting features for all targets: 100%|██████████| 5/5 [00:22<00:00,  4.59s/it]


## Modelling

- Start with a linear regression baseline
- Move on to more advanced models
- Try forward and backward feature selection
- Train a separate model per target

In [3]:
# Load the gold table through the model wrapper.
# NOTE(review): gold_data is not referenced by the modelling cell shown below,
# which calls model.split_data() directly -- presumably the model keeps its own
# copy of the table; confirm whether this variable is needed.
gold_data = model.load_gold_table()

In [2]:
# Baseline: fit a plain linear regression through the project's FantasyModel wrapper.
# The split is delegated to model.split_data(); its result is passed unchanged
# to run_pipeline.
# NOTE(review): no random seed is visible here -- if split_data() shuffles, the
# score may differ between runs; confirm it is seeded.
data = model.split_data()

estimator = LinearRegression()
# run_pipeline returns a (score, predictions) pair.
# NOTE(review): the metric behind `score` is not shown here (presumably R^2) -- confirm.
score, preds = model.run_pipeline(estimator, data)

print(f"Score: {score}")
# Per the recorded output, preds has an `id` column (player_year_team) and the
# predicted ppr_fantasy_points.
print(preds.head(10))

Score: 0.5678538625478929
                               id  ppr_fantasy_points
12730          cam_akers_2023_2TM          106.555934
12480       devin_asiasi_2022_CIN           44.315182
3946        jerame_tuman_2006_PIT           26.479530
3059        arlen_harris_2005_STL           44.221448
3559   cadillac_williams_2006_TAM          175.827075
1225           jon_kitna_2001_CIN          185.186720
5538        john_gilmore_2010_TAM           36.400243
10623       alvin_kamara_2019_NOR          239.632750
7585          chase_ford_2013_MIN           39.528647
9639       albert_wilson_2017_KAN           87.555506
