In [1]:
import mlflow
import os
import pandas as pd
import sys

# Put the parent directory on the import path so the `src` package imported by later cells resolves.
# Guarded so that re-running this cell does not keep appending duplicate entries.
if '../' not in sys.path:
    sys.path.append('../')

# Data Collection

Let's use the ProFootballReferenceScraper to collect NFL player and team statistics.

In [2]:
from src.scraper import ProFootballReferenceScraper

# First and last season handed to the scraper.
first_season, last_season = 1999, 2024

# The scraper works against the shared data directory one level above this notebook.
scraper = ProFootballReferenceScraper(data_dir="../data")
scraper.scrape_years(start_year=first_season, end_year=last_season)

Scraping years:   0%|          | 0/26 [00:00<?, ?it/s]2025-07-27 23:13:20,313 - src.scraper - INFO - scrape_years - Scraping data from 1999
Scraping years:   0%|          | 0/26 [00:01<?, ?it/s]


KeyboardInterrupt: 

# Data Processing

Build the silver layer from the scraped (bronze) data.

In [2]:

from src.processor import FantasyDataProcessor

# Processor settings: same data directory as the scraper, with drop_rookies switched on.
processor_options = {"data_dir": "../data", "drop_rookies": True}

processor = FantasyDataProcessor(**processor_options)
processor.process_all_data()

Processing files matching: *_player_fantasy_stats.csv: 100%|██████████| 25/25 [00:00<00:00, 284.02it/s]
2025-07-27 23:08:53,316 - src.processor - INFO - Saved player_fantasy_stats.csv to ../data/silver/player_fantasy_stats.csv
Processing files matching: *_player_receiving_stats.csv: 100%|██████████| 25/25 [00:00<00:00, 205.73it/s]
2025-07-27 23:08:59,281 - src.processor - INFO - Saved player_receiving_stats.csv to ../data/silver/player_receiving_stats.csv
Processing files matching: *_player_rushing_stats.csv: 100%|██████████| 25/25 [00:00<00:00, 210.38it/s]
2025-07-27 23:09:03,708 - src.processor - INFO - Saved player_rushing_stats.csv to ../data/silver/player_rushing_stats.csv
Processing files matching: *_player_passing_stats.csv:   0%|          | 0/25 [00:00<?, ?it/s]2025-07-27 23:09:03,714 - src.processor - INFO - Added missing pass_qbr column to ../data/bronze/2001_player_passing_stats.csv
2025-07-27 23:09:03,716 - src.processor - INFO - Added missing pass_qbr column to ../data/bro

## Feature Analysis 

- Run data quality checks on the final stats (shape, nulls, dtypes, duplicates)
- Run sanity checks on the final stats
- Identify which features are the most and least informative

In [3]:

from src.feature_engineering import FantasyFeatureEngineer

# Columns passed to the engineer as target_cols.
target_columns = [
    "standard_fantasy_points",
    "standard_fantasy_points_per_game",
    "ppr_fantasy_points",
    "ppr_fantasy_points_per_game",
    "value_over_replacement",
]

engineer = FantasyFeatureEngineer(data_dir="../data", metadata_cols=["id"], target_cols=target_columns)

# The engineer exposes the loaded gold table as `gold_data`; keep a handle for the checks below.
final_stats = engineer.gold_data

2025-07-27 23:09:30,180 - src.feature_engineering - INFO - Loaded gold table: 9071 rows


In [4]:
# Basic data-quality summary of `final_stats`: shape, nulls, dtypes and duplicates.

# Shape as (rows, columns)
print(final_stats.shape)

# isnull().sum() aggregates per COLUMN, so filtering it for > 0 counts columns that contain nulls
null_values = final_stats.isnull().sum()
print(f"Columns with null values: {null_values[null_values > 0].count()}")

# Row-level count: rows that have a null in at least one column
print(f"Rows with null values: {final_stats.isnull().any(axis=1).sum()}")

# Count the columns whose dtype is anything other than float64
data_types = final_stats.dtypes
print(f"Non float data types: {data_types[data_types != 'float64'].count()}")

# Fully identical rows (all columns equal)
print(f"\nDuplicate rows in final stats: {final_stats.duplicated().sum()}")

(12749, 194)
Rows with null values: 0
Non float data types: 1

Duplicate rows in final stats: 0


In [5]:
# Sanity check: games should be joined correctly (a rookie should show 0 games played).
# NOTE(review): the processor cell is configured with drop_rookies=True, so a rookie id may be
# absent from the table and this would print an empty frame — confirm the check is still meaningful.
player_mask = final_stats['id'] == 'malik_nabers_2024'
games_columns = ['id', 'age', 'games', 'games_2_yr_avg', 'games_3_yr_avg']
print(final_stats[player_mask][games_columns])

# TODO: check that the correct year in player fantasy stats is joined with the correct year in player stats

# TODO: check that the last year of data is dropped

# TODO: brainstorm other sanity checks

Empty DataFrame
Columns: [id, age, games, games_2_yr_avg, games_3_yr_avg]
Index: []


In [14]:
# Show every row with no prior playing history: zero games and zero 2-/3-year game averages
# (presumably rookies — confirm against the feature engineering logic).
# The rolling-average columns appear under two spellings in this notebook
# ('games_2yr_avg' and 'games_2_yr_avg'), so use whichever one the loaded table contains.
games_2yr_col = 'games_2yr_avg' if 'games_2yr_avg' in final_stats.columns else 'games_2_yr_avg'
games_3yr_col = 'games_3yr_avg' if 'games_3yr_avg' in final_stats.columns else 'games_3_yr_avg'

no_history_mask = (
    (final_stats['games'] == 0)
    & (final_stats[games_2yr_col] == 0)
    & (final_stats[games_3yr_col] == 0)
)
display(final_stats[no_history_mask])

Unnamed: 0,id,age,standard_fantasy_points,ppr_fantasy_points,value_over_replacement,standard_fantasy_points_per_game,ppr_fantasy_points_per_game,rec_targets,rec_receptions,rec_yards,...,team_plays_2_yr_avg,team_yards_per_play_2_yr_avg,team_points_3_yr_avg,team_yards_3_yr_avg,team_plays_3_yr_avg,team_yards_per_play_3_yr_avg,awards,games,games_2yr_avg,games_3yr_avg
4,matt_forte_2008,23.0,242.0,304.5,95.0,15.12,19.03,0.0,0.0,0.0,...,1038.50,4.75,340.33,4664.00,1004.67,4.63,0.0,0.0,0.0,0.0
5,adrian_peterson_2008,23.0,241.0,261.5,94.0,15.06,16.34,0.0,0.0,0.0,...,994.50,5.20,317.67,4978.33,978.00,5.10,0.0,0.0,0.0,0.0
12,steve_slaton_2008,22.0,222.0,271.9,76.0,13.88,16.99,0.0,0.0,0.0,...,961.50,5.10,302.00,4617.67,959.00,4.80,0.0,0.0,0.0,0.0
20,chris_johnson_2008,23.0,207.0,249.8,61.0,13.80,16.65,0.0,0.0,0.0,...,991.00,4.95,308.00,4973.00,1001.33,4.97,0.0,0.0,0.0,0.0
27,antonio_bryant_2008,27.0,167.0,250.0,50.0,10.44,15.62,0.0,0.0,0.0,...,973.50,4.90,281.67,4755.33,977.33,4.87,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12740,jmon_moore_2018,23.0,-1.0,1.5,0.0,-0.08,0.12,0.0,0.0,0.0,...,1014.00,5.30,373.33,5381.33,1028.00,5.23,0.0,0.0,0.0,0.0
12742,rod_streater_2018,30.0,-1.0,-1.1,0.0,-0.33,-0.37,0.0,0.0,0.0,...,995.50,5.00,258.67,5076.33,1011.00,5.03,0.0,0.0,0.0,0.0
12745,jj_jones_2018,26.0,-2.0,-0.7,0.0,-0.50,-0.18,0.0,0.0,0.0,...,1018.41,5.38,359.50,5537.81,1022.20,5.42,0.0,0.0,0.0,0.0
12746,kyle_lauletta_2018,23.0,-2.0,-2.2,0.0,-1.00,-1.10,0.0,0.0,0.0,...,1027.00,5.05,325.33,5423.67,1035.67,5.27,0.0,0.0,0.0,0.0


In [6]:

# Run the engineer's feature analysis on the gold table.
# NOTE(review): what it produces (plots, files, logs) is defined in src.feature_engineering and not visible here.
engineer.generate_feature_analysis()

## Modelling

- Start with linear regression
- Move on to more advanced models
- Forward and backward feature selection
- Train a separate model per target

In [2]:
from src.modelling import FantasyModel

# Target column to model. Switch to "standard_fantasy_points_per_game" to model standard scoring instead.
target_column = "ppr_fantasy_points_per_game"

model = FantasyModel(data_dir="../data", target_col=target_column)

In [3]:

# Split the modelling data; the result is passed unchanged to the eval, tuning and prediction cells.
# NOTE(review): the structure of `data` is defined by FantasyModel.split_data — not visible here.
data = model.split_data()

In [4]:
# Run the model comparison search and tabulate its cross-validation results.
model_eval_search = model.run_model_eval(data)
model_eval_results_df = pd.DataFrame(model_eval_search.cv_results_)

# Rank the candidates by mean R^2, best first.
eval_columns = ['param_model', 'mean_test_r2', 'mean_test_rmse', 'std_test_r2']
ranked_eval_results = model_eval_results_df[eval_columns].sort_values(by='mean_test_r2', ascending=False)
display(ranked_eval_results)

KeyboardInterrupt: 

In [5]:
# Hyper-parameter tuning for the "random_forest" model, with its cv_results_ tabulated for inspection.
# NOTE(review): the `ridge_` prefix on these variables is kept so any other cell using them still works,
# even though the model tuned here is the random forest — consider renaming.
ridge_eval_search = model.run_model_tuning(data, "random_forest")

ridge_eval_results_df = pd.DataFrame(ridge_eval_search.cv_results_)

# mean_test_rmse is shown as a negative number, i.e. it appears to be a negated, higher-is-better score.
# Both keys are therefore sorted descending so that R^2 ties are broken in favour of the
# smaller error, not the larger one.
display(
    ridge_eval_results_df[['param_model__n_estimators', 'mean_test_r2', 'mean_test_rmse', 'std_test_r2']]
    .sort_values(by=['mean_test_r2', 'mean_test_rmse'], ascending=[False, False])
)

Successfully registered model 'ppr_fantasy_points_per_game_random_forest'.
Created version '1' of model 'ppr_fantasy_points_per_game_random_forest'.


Unnamed: 0,param_model__n_estimators,mean_test_r2,mean_test_rmse,std_test_r2
103,200,0.592749,-3.680684,0.018762
89,300,0.592097,-3.683651,0.018668
107,300,0.591809,-3.684792,0.019597
98,300,0.591793,-3.684880,0.019567
90,100,0.591679,-3.685555,0.018587
...,...,...,...,...
19,200,0.497461,-4.088390,0.024731
1,200,0.497313,-4.088810,0.026106
24,100,0.496596,-4.091614,0.026916
6,100,0.496446,-4.092226,0.026736


In [None]:
# Generate test predictions with the "ridge" model.
# NOTE(review): the tuning cell tunes "random_forest" while this cell predicts with "ridge" — confirm this is intended.
preds_df = model.make_test_predictions(data, "ridge")

# Narrow the predictions to a single season and save them under the model's predictions directory.
view_year = 2024
year_preds = model.view_year_test_predictions(preds_df, view_year)

predictions_filename = f"{model.target_col}_{view_year}_predictions.csv"
predictions_path = os.path.join(model.predictions_dir, predictions_filename)
year_preds.to_csv(predictions_path, index=False)

# Preview the first ten rows of that season's predictions.
print(f"Predictions for {model.target_col} in {view_year}:")
print(year_preds.head(10))

R^2 score: 0.613019172537814
RMSE: 3.563290710365429
Predictions for ppr_fantasy_points_per_game in 2024:
                         id  predictions  actual
2420       jalen_hurts_2024    20.817577   21.01
2446       tyreek_hill_2024    20.660520   12.84
2399  justin_jefferson_2024    19.795615   18.68
2458         cj_stroud_2024    19.520420   12.96
2485      keenan_allen_2024    18.774634   12.29
2483     justin_fields_2024    17.214695   11.91
2392      jamarr_chase_2024    16.411604   23.71
2611     brandon_aiyuk_2024    15.679273    8.91
2543      stefon_diggs_2024    15.566576   15.24
2455        geno_smith_2024    15.519014   15.65


  latest_version = client.get_latest_versions(self.target_col, stages=["None"])[0].version
