In [None]:
import mlflow
import os
import pandas as pd
import sys

# Add the parent directory to the module search path so the `src.*` imports in
# later cells can be resolved (assumes the notebook is run from a subdirectory
# of the project root — TODO confirm).
sys.path.append('../')

# Data Collection

- Scrape season data for 1999 through 2024 from Pro Football Reference

In [None]:
from src.scraper import ProFootballReferenceScraper

# Season range handed to the scraper as start_year / end_year.
first_season, last_season = 1999, 2024

# The scraper is pointed at the shared ../data directory.
scraper = ProFootballReferenceScraper(data_dir="../data")
scraper.scrape_years(start_year=first_season, end_year=last_season)

# Data Processing

- Combine the stat tables for all years and save them to the silver layer
- Build the gold table from the different stat tables

In [None]:

from src.processor import FantasyDataProcessor

# Processor settings: operate on ../data with the drop_rookies option enabled.
processor_options = {"data_dir": "../data", "drop_rookies": True}
processor = FantasyDataProcessor(**processor_options)

# Run the complete processing pass (presumably builds the silver and gold
# tables described above — see src.processor).
processor.process_all_data()

## Data Analysis 

- Do some data quality checks on the final stats
- Do some sanity checks on the final stats
- Look at which features are the most and least informative

In [None]:
from src.data_analysis import DataAnalysis

# Columns passed to DataAnalysis as target_cols.
target_columns = [
    "standard_fantasy_points",
    "standard_fantasy_points_per_game",
    "ppr_fantasy_points",
    "ppr_fantasy_points_per_game",
    "value_over_replacement",
]

data_analysis = DataAnalysis(data_dir="../data", metadata_cols=["id"], target_cols=target_columns)

# Gold table exposed by the analysis object; the checks in the following cells read it.
final_stats = data_analysis.gold_data

#### Data Quality Checks

In [None]:
# Shape of the gold table as (rows, columns)
print(final_stats.shape)

# isnull().sum() yields one null count per column, so counting the positive
# entries gives the number of COLUMNS (not rows) that contain a null.
null_values = final_stats.isnull().sum()
print(f"Columns with null values: {null_values[null_values > 0].count()}")

# Rows are counted separately: a row qualifies if any of its cells is null.
print(f"Rows with null values: {final_stats.isnull().any(axis=1).sum()}")

# Count the columns whose dtype is anything other than float64
data_types = final_stats.dtypes
print(f"Non float data types: {data_types[data_types != 'float64'].count()}")

# Count rows that are exact duplicates of an earlier row
print(f"\nDuplicate rows in final stats: {final_stats.duplicated().sum()}")

#### Feature Sanity Checks

In [None]:
# Spot-check that games are joined correctly for a single player-season
# (rookies should have 0 games played).
# NOTE(review): the processor above was built with drop_rookies=True — confirm
# this rookie row is still present in the gold table.
spot_check_id = 'malik_nabers_2024'
spot_check_cols = ['id', 'age', 'games', 'games_2_yr_avg', 'games_3_yr_avg']
print(final_stats.loc[final_stats['id'] == spot_check_id, spot_check_cols])

# TODO: check that each year of player fantasy stats is joined with the matching year of player stats

# TODO: check that the last year of data is dropped

# TODO: brainstorm other sanity checks

In [None]:

# Run the feature analysis on the DataAnalysis object created above
# (presumably reports which features are most/least informative — see src.data_analysis).
data_analysis.generate_feature_analysis()

## Modelling

- Start with linear regression
- Move on to more advanced models
- Try forward and backward feature selection
- Train a separate model per target

In [None]:
from src.modelling import FantasyModel

# Target column to model. Swap in "standard_fantasy_points_per_game" to model
# standard scoring instead of PPR.
target_col = "ppr_fantasy_points_per_game"

model = FantasyModel(data_dir="../data", target_col=target_col)

In [None]:

# Build the data object consumed by the evaluation, tuning, and prediction
# cells below (presumably a train/test split — confirm in src.modelling).
data = model.split_data()

In [None]:
# Evaluate the candidate models and tabulate the search results.
model_eval_search = model.run_model_eval(data)
model_eval_results_df = pd.DataFrame(model_eval_search.cv_results_)

# Keep only the model identifier and the headline metrics, best R^2 first.
summary_cols = ['param_model', 'mean_test_r2', 'mean_test_rmse', 'std_test_r2']
model_eval_summary = model_eval_results_df[summary_cols].sort_values(by='mean_test_r2', ascending=False)

display(model_eval_summary)

In [None]:
# Hyperparameter search for the ridge model (the model used for the test-set
# predictions further down). Requesting "random_forest" here would only repeat
# the random-forest tuning cell.
# NOTE(review): assumes "ridge" is an accepted model name for run_model_tuning,
# as it is for make_test_predictions — confirm in src.modelling.
ridge_eval_search = model.run_model_tuning(data, "ridge")

ridge_eval_results_df = pd.DataFrame(ridge_eval_search.cv_results_)

# The ridge search grid is defined in src.modelling, so pick up whichever
# hyperparameters were searched via their "param_<name>" result columns
# instead of hard-coding names that belong to another model.
ridge_param_cols = [col for col in ridge_eval_results_df.columns if col.startswith("param_")]

display(
    ridge_eval_results_df[ridge_param_cols + ['mean_test_r2', 'mean_test_rmse', 'std_test_r2']]
    .sort_values(by=['mean_test_r2', 'mean_test_rmse'], ascending=[False, True])
)

In [None]:
# Hyperparameter search for the random forest model.
random_forest_eval_search = model.run_model_tuning(data, "random_forest")

random_forest_eval_results_df = pd.DataFrame(random_forest_eval_search.cv_results_)

# Display this cell's own results frame. Each column name is a separate list
# item: without the comma, adjacent string literals are concatenated into one
# non-existent column name.
display(
    random_forest_eval_results_df[['param_model__n_estimators', 'param_model__max_depth', 'mean_test_r2', 'mean_test_rmse', 'std_test_r2']]
    .sort_values(by=['mean_test_r2', 'mean_test_rmse'], ascending=[False, True])
)

In [None]:
# Hyperparameter search for the SVR model.
svr_eval_search = model.run_model_tuning(data, "svr")

svr_eval_results_df = pd.DataFrame(svr_eval_search.cv_results_)

# Display this cell's own results frame; the SVR parameter columns
# (C, kernel, gamma) only exist in the SVR search results.
display(
    svr_eval_results_df[['param_model__C', 'param_model__kernel', 'param_model__gamma', 'mean_test_r2', 'mean_test_rmse', 'std_test_r2']]
    .sort_values(by=['mean_test_r2', 'mean_test_rmse'], ascending=[False, True])
)

## Final Predictions!

#### First on the test set

In [None]:
# Predict on the test set with the ridge model.
preds_df = model.make_test_predictions(data, "ridge")

# Narrow the predictions to a single season for inspection.
view_year = 2024
year_preds = model.view_year_test_predictions(preds_df, view_year)

# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing (no-op when it is already there).
os.makedirs(model.predictions_dir, exist_ok=True)
predictions_path = os.path.join(model.predictions_dir, f"{model.target_col}_{view_year}_predictions.csv")
year_preds.to_csv(predictions_path, index=False)

print(f"Predictions for {model.target_col} in {view_year}:")
print(year_preds.head(10))

#### Now for 2025

In [None]:
# Make final predictions