In [1]:
# Standard library first, then third-party packages.
import sys

import pandas as pd

# Put the parent directory on the import path so `src.*` modules resolve
# when the kernel is started from this notebook's folder.
sys.path.append("../")

# Data Collection

Let's use the ProFootballReferenceScraper to collect NFL player and team statistics.

In [None]:
from src.scraper import ProFootballReferenceScraper

# Year bounds handed to the scraper; whether the end year is inclusive is
# decided by scrape_years itself.
first_season, last_season = 1999, 2024

scraper = ProFootballReferenceScraper(data_dir="../data")
scraper.scrape_years(start_year=first_season, end_year=last_season)

# Data Processing

Build the silver layer from the scraped (bronze) data.

In [2]:

from src.processor import FantasyDataProcessor

# Point the processor at ../data, then run its full processing pass.
processor = FantasyDataProcessor(
    data_dir="../data",
)
processor.process_all_data()

Processing files matching: *_player_fantasy_stats.csv: 100%|██████████| 25/25 [00:00<00:00, 242.94it/s]
2025-07-27 22:13:18,751 - src.processor - INFO - Saved player_fantasy_stats.csv to ../data/silver/player_fantasy_stats.csv
Processing files matching: *_player_receiving_stats.csv: 100%|██████████| 25/25 [00:00<00:00, 186.90it/s]
2025-07-27 22:13:24,967 - src.processor - INFO - Saved player_receiving_stats.csv to ../data/silver/player_receiving_stats.csv
Processing files matching: *_player_rushing_stats.csv: 100%|██████████| 25/25 [00:00<00:00, 187.33it/s]
2025-07-27 22:13:29,538 - src.processor - INFO - Saved player_rushing_stats.csv to ../data/silver/player_rushing_stats.csv
Processing files matching: *_player_passing_stats.csv:   0%|          | 0/25 [00:00<?, ?it/s]2025-07-27 22:13:29,544 - src.processor - INFO - Added missing pass_qbr column to ../data/bronze/2001_player_passing_stats.csv
2025-07-27 22:13:29,547 - src.processor - INFO - Added missing pass_qbr column to ../data/bro

## Feature Analysis 

- Do some data quality checks on the final stats
- Do some sanity checks on the final stats
- Look at which features are the most and least informative

In [3]:

from src.feature_engineering import FantasyFeatureEngineer

# Columns passed to the engineer as target_cols.
fantasy_target_columns = [
    "standard_fantasy_points",
    "standard_fantasy_points_per_game",
    "ppr_fantasy_points",
    "ppr_fantasy_points_per_game",
    "value_over_replacement",
]

engineer = FantasyFeatureEngineer(
    data_dir="../data",
    metadata_cols=["id"],
    target_cols=fantasy_target_columns,
)

# The gold table exposed by the engineer is what the checks below inspect.
final_stats = engineer.gold_data

2025-07-27 22:14:22,035 - src.feature_engineering - INFO - Loaded gold table: 12749 rows


In [None]:
# Data-quality summary of final_stats: dimensions, missing values,
# column dtypes and duplicated rows.

# See shape
print(final_stats.shape)

# Check null values
# null_values keeps the per-column null counts. The printed figure counts rows
# holding at least one null (not columns with nulls), so it matches its label.
null_mask = final_stats.isnull()
null_values = null_mask.sum()
rows_with_nulls = null_mask.any(axis=1).sum()
print(f"Rows with null values: {rows_with_nulls}")

# Check data types
# Number of columns whose dtype is anything other than float64.
data_types = final_stats.dtypes
print(f"Non float data types: {data_types[data_types != 'float64'].count()}")

# Check for duplicates
# duplicated() flags rows that repeat an earlier row across all columns.
print(f"\nDuplicate rows in final stats: {final_stats.duplicated().sum()}")

(12749, 194)
Rows with null values: 0
Non float data types: 1

Duplicate rows in final stats: 0


In [5]:
# Sanity check on the games join: a rookie should show 0 games played,
# so every games column for this player-season is expected to be 0.
rookie_mask = final_stats['id'] == 'malik_nabers_2024'
games_columns = ['id', 'age', 'games', 'games_2yr_avg', 'games_3yr_avg']
print(final_stats.loc[rookie_mask, games_columns])

# TODO: check that the correct year in player fantasy stats is joined with the correct year in player stats
# TODO: check that the last year of data is dropped
# TODO: brainstorm other sanity checks

                     id   age  games  games_2yr_avg  games_3yr_avg
3278  malik_nabers_2024  21.0    0.0            0.0            0.0


In [6]:

# Run the feature analysis step of the FantasyFeatureEngineer built above.
# NOTE(review): presumably this reports which features are most and least
# informative — confirm in src.feature_engineering.
engineer.generate_feature_analysis()

## Modelling

- Start with linear regression
- Move on to more advanced models
- Try forward and backward feature selection
- Train a separate model per target

In [2]:
from src.modelling import FantasyModel

# Model wrapper configured with ../data as its data directory and
# ppr_fantasy_points as the target column.
ppr_model = FantasyModel(
    data_dir="../data",
    target_col="ppr_fantasy_points",
)

In [3]:
# Split the data, build the evaluation pipeline, then hand both to the
# model's grid-search helper.
data = ppr_model.split_data()
eval_pipeline = ppr_model.create_pipeline()
grid_search = ppr_model.create_grid_search(eval_pipeline, data)

# Report the winning configuration, one item per line.
print(
    f"Best params: {grid_search.best_params_}",
    f"Best score: {grid_search.best_score_}",
    f"Best estimator: {grid_search.best_estimator_}",
    sep="\n",
)

Best params: {'model': Lasso()}
Best score: 0.49134632913422394
Best estimator: Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', StandardScaler()),
                ('model', Lasso())])


In [6]:
# Tabulate the cross-validation results and show each candidate model's mean
# and standard deviation of the test score, best mean first.
cv_results_df = pd.DataFrame(grid_search.cv_results_)
display(
    cv_results_df
    .loc[:, ['param_model', 'mean_test_score', 'std_test_score']]
    .sort_values('mean_test_score', ascending=False)
)

Unnamed: 0,param_model,mean_test_score,std_test_score
2,Lasso(),0.491346,0.009205
5,HistGradientBoostingRegressor(),0.486934,0.010359
1,Ridge(),0.485777,0.008934
0,LinearRegression(),0.482249,0.009866
3,RandomForestRegressor(),0.461368,0.010171
4,SVR(),0.382666,0.010358
