In [None]:
import mlflow
import os
import pandas as pd
import sys

# Make the parent directory importable so the `src` package imported by the
# cells below can be resolved. The membership check keeps this cell idempotent:
# re-running it does not pile duplicate entries onto sys.path.
if '../' not in sys.path:
    sys.path.append('../')

# Data Collection

- Scrape the 1999–2024 seasons from Pro Football Reference

In [None]:
from src.scraper import ProFootballReferenceScraper

# Season bounds handed to the scraper as start_year / end_year.
first_season, last_season = 1999, 2024

# The scraper is pointed at the shared ../data directory.
scraper = ProFootballReferenceScraper(data_dir="../data")
scraper.scrape_years(start_year=first_season, end_year=last_season)

# Data Processing

- Combine the stat tables for all years and save them to the silver layer
- Build the gold table from the different stat tables

In [None]:
from src.processor import DataProcessor

# Same data directory the scraper was pointed at.
processor_data_dir = "../data"

# Run the full processing pass described in the notes above.
processor = DataProcessor(data_dir=processor_data_dir)
processor.process_all_data()

## Data Analysis 

- Do some data quality checks on the final stats
- Look at which features are the most and least informative

In [None]:
from src.analysis import DataAnalysis

# Analysis helper used by the data-quality and feature-analysis cells below.
# NOTE(review): the scraper, processor and model are all given
# data_dir="../data", but no data_dir is passed here — confirm that
# DataAnalysis's default resolves to the same directory.
analysis = DataAnalysis()

In [None]:
# Run the training-data quality checks first, then the live-data ones.
# What each check covers is defined in src.analysis.
analysis.run_training_data_quality_checks()
analysis.run_live_data_quality_checks()

In [None]:

# Generate the feature analysis (per the notes above: which features are the
# most and least informative).
analysis.generate_feature_analysis()

## Modelling

- Start with linear regression
- Then move on to more advanced models
- Try forward and backward feature selection
- Train a separate model per target

In [None]:
from src.modelling import FantasyModel

# Target column to model. Alternative target: "standard_fantasy_points_per_game".
target_col = "ppr_fantasy_points_per_game"

model = FantasyModel(data_dir="../data", target_col=target_col)

In [None]:

# Split the dataset once; the resulting `data` object is passed to the
# evaluation, tuning and prediction cells below.
data = model.split_data()

In [None]:
# Compare the candidate models and load the cross-validation results into a frame.
model_eval_search = model.run_model_eval(data)
model_eval_results_df = pd.DataFrame(model_eval_search.cv_results_)

# Columns shown in the comparison table, ranked by mean test R^2 (best first).
model_eval_cols = ['param_model', 'mean_test_r2', 'mean_test_rmse', 'std_test_r2']
model_eval_ranked = model_eval_results_df[model_eval_cols].sort_values(
    by='mean_test_r2',
    ascending=False,
)

display(model_eval_ranked)

In [None]:
# Tune the ridge model and collect the cross-validation results.
# NOTE(review): this cell previously passed "random_forest" even though every
# name in it says ridge, and the next cell already tunes the random forest.
# "ridge" is the key used for the final predictions below — confirm that
# run_model_tuning accepts it as well.
ridge_eval_search = model.run_model_tuning(data, "ridge")

ridge_eval_results_df = pd.DataFrame(ridge_eval_search.cv_results_)

# Show whichever hyperparameters were searched (the `param_*` columns of
# cv_results_) instead of hard-coding another model's parameter names.
ridge_param_cols = [col for col in ridge_eval_results_df.columns if col.startswith('param_')]

display(
    ridge_eval_results_df[ridge_param_cols + ['mean_test_r2', 'mean_test_rmse', 'std_test_r2']]
    .sort_values(by=['mean_test_r2', 'mean_test_rmse'], ascending=[False, True])
)

In [None]:
# Tune the random forest and collect the cross-validation results.
random_forest_eval_search = model.run_model_tuning(data, "random_forest")

random_forest_eval_results_df = pd.DataFrame(random_forest_eval_search.cv_results_)

# Display this cell's own results frame (not the ridge one). Each column name
# is a separate list element: a missing comma would make Python concatenate two
# adjacent string literals into a single, nonexistent column name.
random_forest_cols = [
    'param_model__n_estimators',
    'param_model__max_depth',
    'mean_test_r2',
    'mean_test_rmse',
    'std_test_r2',
]

display(
    random_forest_eval_results_df[random_forest_cols]
    .sort_values(by=['mean_test_r2', 'mean_test_rmse'], ascending=[False, True])
)

In [None]:
# Tune the SVR model and collect the cross-validation results.
svr_eval_search = model.run_model_tuning(data, "svr")

svr_eval_results_df = pd.DataFrame(svr_eval_search.cv_results_)

# Display this cell's own results frame (not the ridge one), which is the one
# that carries the SVR hyperparameter columns.
svr_cols = [
    'param_model__C',
    'param_model__kernel',
    'param_model__gamma',
    'mean_test_r2',
    'mean_test_rmse',
    'std_test_r2',
]

display(
    svr_eval_results_df[svr_cols]
    .sort_values(by=['mean_test_r2', 'mean_test_rmse'], ascending=[False, True])
)

## Final Predictions!

#### First on the test set

In [None]:
# Predict on the test set with the ridge model.
test_preds_df = model.make_test_predictions(data, "ridge")

# Pull out one year's predictions and save them under the model's predictions directory.
view_year = 2024
year_test_preds = model.view_year_test_predictions(test_preds_df, view_year)
year_test_preds.to_csv(os.path.join(model.predictions_dir, f"{model.target_col}_{view_year}_predictions.csv"), index=False)

# Preview the same frame that was just written to disk (the previous
# `year_preds` name was never defined and raised a NameError).
print(f"Predictions for {model.target_col} in {view_year}:")
print(year_test_preds.head(10))

#### Now for 2025

In [None]:
# Produce the live predictions with the ridge model.
live_preds_df = model.make_live_predictions(data, "ridge")

# Save them next to the other prediction files, named after the target column.
live_preds_path = os.path.join(
    model.predictions_dir,
    f"{model.target_col}_live_predictions.csv",
)
live_preds_df.to_csv(live_preds_path, index=False)

# Preview the first ten rows.
print(f"Live predictions for {model.target_col}:")
print(live_preds_df.head(10))