In [1]:
import mlflow
import os
import pandas as pd
import sys

# Make the parent directory importable so the `src.*` imports in later cells
# resolve. Guarded so that re-running this cell does not stack duplicate
# entries on sys.path.
if '../' not in sys.path:
    sys.path.append('../')

# Data Collection

- Scrape season data for 1999 through 2024 from Pro Football Reference

In [None]:
from src.scraper import ProFootballReferenceScraper

# Season bounds handed to the scraper; named here so the range is easy to spot
# and adjust.
FIRST_SEASON, LAST_SEASON = 1999, 2024

# NOTE(review): presumably the scraped tables are written under data_dir --
# confirm against src/scraper.py.
scraper = ProFootballReferenceScraper(data_dir="../data")
scraper.scrape_years(start_year=FIRST_SEASON, end_year=LAST_SEASON)

# Data Processing

- Combine the stat tables for all years and save them to the silver layer
- Build the gold table from the different stat tables

In [None]:
from src.processor import DataProcessor

# Build the processor against the same data directory the scraper was pointed
# at, then run the full processing pipeline in one call.
# NOTE(review): the silver/gold layer layout described in the markdown above is
# implemented inside src/processor.py, which is not visible here.
processor = DataProcessor(data_dir="../data")
processor.process_all_data()

## Data Analysis 

- Do some data quality checks on the final stats
- Look at which features are the most and least informative

In [2]:
from src.analysis import DataAnalysis

# Constructing DataAnalysis logs that it loaded the training and live datasets
# (see the log lines in this cell's output).
analysis = DataAnalysis()

2025-07-30 19:39:37,503 - src.analysis - INFO - Loaded training data: 9431 rows
2025-07-30 19:39:37,510 - src.analysis - INFO - Loaded live data: 374 rows


In [3]:
# Data quality checks, first on the training data and then on the live data.
# NOTE(review): what each check verifies is defined in src/analysis.py.
analysis.run_training_data_quality_checks()
analysis.run_live_data_quality_checks()

In [None]:

# Feature analysis step: used to see which features are the most and least
# informative.
# NOTE(review): output format/location is defined in src/analysis.py.
analysis.generate_feature_analysis()

## Modelling

- Start with linear regression
- Then compare more advanced models (regularized linear models, tree ensembles, SVR)
- Try forward and backward feature selection
- Models per target

In [2]:
from src.modelling import FantasyModel

# Target column to model. Swap in "standard_fantasy_points_per_game" to model
# standard scoring instead of PPR.
TARGET_COL = "ppr_fantasy_points_per_game"

# Constructing the model logs that it loaded the training and live datasets
# (see the log lines in this cell's output).
model = FantasyModel(data_dir="../data", target_col=TARGET_COL)

2025-07-30 19:55:52,848 - src.modelling - INFO - Loaded training data: 9431 rows
2025-07-30 19:55:52,853 - src.modelling - INFO - Loaded live data: 374 rows


In [3]:

# Split the loaded data once; the resulting object is passed unchanged to every
# evaluation, tuning and prediction call further down.
# NOTE(review): its structure is defined by FantasyModel.split_data (not shown).
data = model.split_data()

In [6]:
# Cross-validated comparison of the candidate model families.
model_eval_search = model.run_model_eval(data)
model_eval_results_df = pd.DataFrame(model_eval_search.cv_results_)

# Show only the headline metrics, best mean R^2 first.
eval_summary_cols = ['param_model', 'mean_test_r2', 'mean_test_rmse', 'std_test_r2']
display(
    model_eval_results_df[eval_summary_cols]
    .sort_values(by='mean_test_r2', ascending=False)
)

2025/07/30 19:40:40 INFO mlflow.tracking.fluent: Experiment with name 'ppr_fantasy_points_per_game' does not exist. Creating a new experiment.


Unnamed: 0,param_model,mean_test_r2,mean_test_rmse,std_test_r2
1,Ridge(),0.584414,-3.69492,0.022259
0,LinearRegression(),0.579847,-3.715279,0.021968
3,RandomForestRegressor(),0.577159,-3.727369,0.021417
5,HistGradientBoostingRegressor(),0.573703,-3.741997,0.024334
4,SVR(),0.561753,-3.794068,0.024292
2,Lasso(),0.439427,-4.293155,0.014534


In [7]:
# Hyper-parameter search for the ridge model. The captured output shows this
# call also registers a new model version in MLflow.
ridge_eval_search = model.run_model_tuning(data, "ridge")

ridge_eval_results_df = pd.DataFrame(ridge_eval_search.cv_results_)

# Rank candidates best-first. `mean_test_rmse` is reported as a negated score
# (values are negative in the output), so larger is better for both columns and
# the tie-break must also sort in descending order.
display(
    ridge_eval_results_df[['param_model__alpha', 'mean_test_r2', 'mean_test_rmse', 'std_test_r2']]
    .sort_values(by=['mean_test_r2', 'mean_test_rmse'], ascending=[False, False])
)

Successfully registered model 'ppr_fantasy_points_per_game_ridge'.
Created version '1' of model 'ppr_fantasy_points_per_game_ridge'.


Unnamed: 0,param_model__alpha,mean_test_r2,mean_test_rmse,std_test_r2
7,166.810054,0.59136,-3.663744,0.022616
8,1291.549665,0.589934,-3.670037,0.023319
6,21.544347,0.589337,-3.672794,0.02282
5,2.782559,0.586115,-3.6873,0.022453
4,0.359381,0.582968,-3.701371,0.022181
3,0.046416,0.581095,-3.709679,0.0223
2,0.005995,0.580401,-3.712755,0.022341
1,0.000774,0.580113,-3.714056,0.022228
0,0.0001,0.579887,-3.715085,0.022088
9,10000.0,0.573754,-3.741577,0.024944


In [None]:
# Hyper-parameter search for the random forest model.
random_forest_eval_search = model.run_model_tuning(data, "random_forest")

random_forest_eval_results_df = pd.DataFrame(random_forest_eval_search.cv_results_)

# Display the random-forest results (not the ridge frame from the earlier
# cell). Every column name needs its own comma: adjacent string literals are
# silently concatenated into a single, non-existent column name.
# NOTE(review): assumes the search grid tunes n_estimators and max_depth --
# confirm against the grid defined in src/modelling.py.
# `mean_test_rmse` is a negated score (larger is better), so both sort keys are
# descending.
display(
    random_forest_eval_results_df[
        ['param_model__n_estimators', 'param_model__max_depth', 'mean_test_r2', 'mean_test_rmse', 'std_test_r2']
    ]
    .sort_values(by=['mean_test_r2', 'mean_test_rmse'], ascending=[False, False])
)

In [None]:
# Hyper-parameter search for the support vector regressor.
svr_eval_search = model.run_model_tuning(data, "svr")

svr_eval_results_df = pd.DataFrame(svr_eval_search.cv_results_)

# Display the SVR results (not the ridge frame from the earlier cell).
# NOTE(review): assumes the search grid tunes C, kernel and gamma -- confirm
# against the grid defined in src/modelling.py.
# `mean_test_rmse` is a negated score (larger is better), so both sort keys are
# descending.
display(
    svr_eval_results_df[
        ['param_model__C', 'param_model__kernel', 'param_model__gamma', 'mean_test_r2', 'mean_test_rmse', 'std_test_r2']
    ]
    .sort_values(by=['mean_test_r2', 'mean_test_rmse'], ascending=[False, False])
)

## Final Predictions!

#### First on the test set

In [4]:
# Season to inspect from the held-out test predictions.
view_year = 2024

# Score the test split with the ridge model, then narrow to the chosen season.
test_preds_df = model.make_test_predictions(data, "ridge")
year_test_preds = model.view_year_test_predictions(test_preds_df, view_year)

# Persist that season's predictions next to the model's other prediction files.
year_preds_path = os.path.join(
    model.predictions_dir,
    f"{model.target_col}_{view_year}_predictions.csv",
)
year_test_preds.to_csv(year_preds_path, index=False)

print(f"Predictions for {model.target_col} in {view_year}:")
print(year_test_preds.head(10))

R^2 score: 0.6443421370314563
RMSE: 3.4813927470810007
Predictions for ppr_fantasy_points_per_game in 2024:
                        id  predictions  actual
2446      tyreek_hill_2024    20.947287   12.84
2420      jalen_hurts_2024    20.871811   21.01
2453  patrick_mahomes_2024    20.375811   17.69
2484     dak_prescott_2024    18.652807   14.56
2458        cj_stroud_2024    18.405447   12.96
2392     jamarr_chase_2024    17.358314   23.71
2485     keenan_allen_2024    17.116867   12.29
2611    brandon_aiyuk_2024    15.202578    8.91
2543     stefon_diggs_2024    15.027564   15.24
2465       derek_carr_2024    13.916413   15.09


  latest_version = client.get_latest_versions(f"{self.target_col}_{model_type}", stages=["None"])[0].version


#### Now for 2025

In [5]:
# Predict the upcoming season from the live data using the ridge model.
live_preds_df = model.make_live_predictions(data, "ridge")

# Persist the live predictions alongside the test-set prediction files.
live_preds_path = os.path.join(
    model.predictions_dir,
    f"{model.target_col}_live_predictions.csv",
)
live_preds_df.to_csv(live_preds_path, index=False)

print(f"Live predictions for {model.target_col}:")
print(live_preds_df.head(10))

Live predictions for ppr_fantasy_points_per_game:
    predictions            player position
24    23.657430     lamar_jackson       qb
22    22.577369        josh_allen       qb
29    21.890221    jayden_daniels       qb
0     20.867830      jamarr_chase       wr
43    19.903659        joe_burrow       qb
36    19.709070       jalen_hurts       qb
3     19.246498  justin_jefferson       wr
67    18.903988            bo_nix       qb
58    18.495768   patrick_mahomes       qb
8     18.356523  amon_ra_st_brown       wr


  latest_version = client.get_latest_versions(f"{self.target_col}_{model_type}", stages=["None"])[0].version
