In [None]:
import duckdb
import polars as pl
import seaborn as sns

# Open a connection to the project's DuckDB database file (path is relative to
# the notebook's working directory).
DB_PATH = "../../data/ff_platform.duckdb"
conn = duckdb.connect(DB_PATH)

# Goal
In this workbook, my main goal is to assess some benchmarks around what metrics determine a "starter" in the NFL - with a specific focus on Fantasy Football relevant positions: `QB`, `RB`, `WR`, & `TE`. 

## Method & Process
Essentially, I am assuming that `snap_counts` is the best signal of whether someone is a starter or not. Since I'm mostly interested in understanding position-wide trends, I can somewhat ignore `injuries` - if a starter is injured, a backup will certainly take their place. 

I'm totally open to this assumption being wrong. But let's start here before over-optimizing.


In [None]:
# Pull every row of the player/game fact table into a polars DataFrame.
# The unfiltered frame keeps its own name so the filter below can be re-run
# without hitting the database again.
query = """
select * 
from core.fct_player_game_stats 
"""
player_game_stats = conn.execute(query).pl()

# Keep only rows for the fantasy-relevant positions that pass the snap cutoff.
# NOTE(review): the 0.01 cutoff is applied to `offense_snaps`, while the later
# cells analyse `offense_pct` -- confirm which column the cutoff is meant for.
target_stats = player_game_stats.filter(
    (pl.col("offense_snaps") >= 0.01)
    & pl.col("position").is_in(["QB", "RB", "WR", "TE"])
)

# Preview the first rows. A bare `.head()` as the cell's last expression gets
# the notebook's rich table rendering and, unlike `DataFrame.show()`, is
# available on older polars releases as well.
target_stats.head()

In [None]:
# Quantify the data being worked with, starting with the date range:
# earliest week of the earliest season through latest week of the latest season.
season_min = target_stats.get_column("season").min()
season_max = target_stats.get_column("season").max()

min_week = (
    target_stats
    .filter(pl.col("season") == season_min)
    .get_column("week")
    .min()
)
max_week = (
    target_stats
    .filter(pl.col("season") == season_max)
    .get_column("week")
    .max()
)

# Report the range covered by the filtered dataset.
print(f"Dataset Range: {season_min} Week {min_week} - {season_max} Week {max_week} ")

In [None]:
# Summary statistics of `offense_pct` for each position, pooled across all
# seasons, ordered alphabetically by position.
aggregates_by_position = (
    target_stats
    .group_by("position")
    .agg(
        pl.col("offense_pct").min().alias("min"),
        pl.col("offense_pct").median().alias("median"),
        pl.col("offense_pct").mean().alias("mean"),
        pl.col("offense_pct").max().alias("max"),
    )
    .sort("position")
)

# Display the summary table.
aggregates_by_position

In [None]:
# Distribution of each player's average `offense_pct` for a single position.
# Change TARGET_POSITION and re-run the cell to inspect another position.
TARGET_POSITION = 'WR'

# One row per player: the mean of `offense_pct` over that player's rows.
player_avg_pct = (
    target_stats
    .filter(pl.col('position') == TARGET_POSITION)
    .group_by('player_id')
    .agg(pl.mean('offense_pct'))
)

player_dist = sns.displot(player_avg_pct, x='offense_pct')

# axis labels
player_dist.set_axis_labels(x_var="perc", y_var="players")

# Title: `FacetGrid.set_titles()` only writes per-facet titles from a
# row/col template, and this grid has no row or col facets, so it never drew
# anything. `FacetGrid.set()` forwards to `Axes.set()` on every axes, which
# does apply the title.
player_dist.set(title=f"{TARGET_POSITION} Offensive Snap Count Perc")

# display chart
player_dist


In [None]:
# alright - so this is growing towards helpful. 