In [None]:
def simulate_data(
    size: int,
    logit_mu_mu: float=0.5,
    logit_mu_sd: float=0.1,
    intercept: float=0.0,
    beta: float=4.0,
    x_mu: float=0.0,
    x_sd: float=1.0,
    n_mu: float=10.0,
    seed=None
) -> pl.DataFrame:
    """Simulate beta-binomial counts whose dispersion depends on a covariate.

    For each of the ``size`` rows:

    * ``logit_mu`` is drawn from a normal with mean ``logit_mu_mu`` and
      standard deviation ``logit_mu_sd``, and ``mu = expit(logit_mu)``;
    * ``x`` is drawn from a normal with mean ``x_mu`` and standard deviation
      ``x_sd``;
    * ``nu = exp(-(intercept + beta * x))`` scales the beta parameters
      ``a = mu * nu`` and ``b = (1 - mu) * nu`` (so ``a + b == nu``);
    * ``n`` is drawn from a Poisson with mean ``n_mu``;
    * ``y`` is drawn from a beta-binomial with parameters ``(n, a, b)``.

    Parameters
    ----------
    size : number of rows to simulate.
    logit_mu_mu, logit_mu_sd : mean / sd of the normal draw for ``logit_mu``.
    intercept, beta : coefficients of the linear predictor that drives ``nu``.
    x_mu, x_sd : mean / sd of the normal draw for ``x``.
    n_mu : Poisson mean for the number of trials ``n``.
    seed : optional seed for reproducible draws. With the default ``None`` the
        ``rvs`` calls behave exactly as they do without a ``random_state``.

    Returns
    -------
    pl.DataFrame with columns ``mu``, ``x``, ``n`` and ``y``.

    Notes
    -----
    Assumes ``norm``, ``poisson``, ``betabinom`` are the ``scipy.stats``
    distributions and ``expit`` is ``scipy.special.expit`` — TODO confirm
    against the notebook's import cell.
    """
    # Local import: the notebook's import cell is not visible from this block.
    import numpy as np

    # One shared generator for every draw. Passing the same integer seed to each
    # `rvs` call instead would restart the stream each time and make `logit_mu`
    # and `x` perfectly correlated.
    rng = None if seed is None else np.random.default_rng(seed)

    logit_mu = norm.rvs(
        loc = logit_mu_mu,
        scale = logit_mu_sd,
        size = size,
        random_state = rng
    )

    x = norm.rvs(
        loc = x_mu,
        scale = x_sd,
        size = size,
        random_state = rng
    )

    mu = expit(logit_mu)

    # 1 / expit(z) - 1 == exp(-z). The closed form avoids the cancellation in
    # "1/p - 1" for large z, where expit(z) rounds to 1.0 and nu would become
    # exactly 0 (an invalid beta-binomial parameter).
    nu = np.exp(-(intercept + beta * x))
    a = mu * nu
    b = (1 - mu) * nu

    n = poisson.rvs(n_mu, size=size, random_state=rng)
    y = betabinom.rvs(n, a, b, random_state=rng)

    return pl.DataFrame({
        'mu': mu,
        'x': x,
        'n': n,
        'y': y
    })

In [None]:
# Number of rows to simulate.
N = 10_000
# NOTE(review): N_BINS is not referenced in any of the visible cells — confirm it is still needed.
N_BINS = 10
# Simulated data set; every other simulate_data argument keeps its default.
df = simulate_data(N)

In [None]:
def beta_binom_variance_model(df:pl.DataFrame) -> pm.Model:
    """Build a beta-binomial model whose dispersion depends on ``x``.

    The mean ``mu`` is taken as known data. Only the scaling parameter
    ``nu = exp(-(intercept + beta * x))`` is learned, through Normal(0, 1)
    priors on ``intercept`` and ``beta``. The likelihood is
    ``BetaBinomial(n, alpha = mu * nu, beta = (1 - mu) * nu)``, so
    ``alpha + beta == nu``.

    Parameters
    ----------
    df : frame with columns ``mu``, ``y``, ``n`` and ``x``.

    Returns
    -------
    pm.Model : the (unsampled) model, with the four columns registered as
        ``mu_data``, ``y_data``, ``n_data`` and ``x_data``.

    Raises
    ------
    AssertionError : if any required column is missing.
    """
    # Checked in the same order as before so the first failure message is unchanged.
    for column in ('mu', 'y', 'n', 'x'):
        assert column in df.columns, f'df must have column "{column}"'

    with pm.Model() as model:

        # Data
        mu_data = pm.Data('mu_data', df['mu'].to_numpy())
        y_data = pm.Data('y_data', df['y'].to_numpy())
        n_data = pm.Data('n_data', df['n'].to_numpy())
        x_data = pm.Data('x_data', df['x'].to_numpy())

        # Priors
        intercept = pm.Normal('intercept', 0, 1)
        beta = pm.Normal('beta', 0, 1)

        # Variance Scaling Parameter
        # 1 / invlogit(z) - 1 == exp(-z); the closed form cannot collapse to an
        # exact 0 when invlogit(z) rounds to 1.0 for large z.
        nu = pm.math.exp(-(intercept + beta * x_data))

        # Outcomes
        pm.BetaBinomial(
            'y_obs',
            n = n_data,
            alpha = mu_data * nu,
            beta = (1 - mu_data) * nu,
            observed = y_data
        )

    return model

# Build the model on the simulated data and run PyMC's `Model.debug()`
# diagnostic on it before sampling.
beta_binom_model = beta_binom_variance_model(df)
beta_binom_model.debug()

In [None]:
# Draw posterior samples for the simulated-data model with PyMC's default
# sampler settings (no random seed is set, so draws differ between runs).
idata = pm.sample(model=beta_binom_model)

In [None]:
# NOTE(review): this whole cell is commented out, yet later cells read
# `mean_precip` and the `precipitation_prob{i}` columns that only this code
# would create, and `data` itself is not defined in the visible cells.
# Confirm they are defined elsewhere; otherwise a fresh-kernel run will fail.

# def add_precipitation_probability(data, threshold):

#     data = data.with_columns(
#         y = (pl.col('precipitation_sum') > threshold).cast(pl.Int16)
#     )
#     y = data['y'].to_numpy()

#     for i in range(6):
#         X = data[f'precipitation_sum_pred{i}'].to_numpy().reshape(-1,1)
#         mod = LogisticRegression().fit(X = X, y = y)
#         data = data.with_columns(pl.Series(f'precipitation_prob{i}', mod.predict_proba(X)[:,1]))

#     return data

# mean_precip = data.select(pl.col('precipitation_sum').mean()).to_numpy()[0][0]
# print('Mean Precipitation:', mean_precip)
# data = add_precipitation_probability(data, mean_precip)

In [None]:
def calibration_plot_multi(models, n_bins=5, figsize=(7,7)):
    """Draw calibration curves for several sets of predictions on one axis.

    Each set of predictions is cut into (up to) ``n_bins`` quantile bins. For
    every bin the mean prediction is plotted against the mean of ``y``. A
    dashed diagonal marks perfect calibration.

    Parameters
    ----------
    models : list of tuples
        [(pred1, y1, "label1"),
        (pred2, y2, "label2"),
        ...]
        where ``pred`` and ``y`` are equal-length array-likes and ``label`` is
        the legend entry.
    n_bins : number of quantile bins requested; duplicate bin edges are
        dropped, so fewer bins may be produced.
    figsize : size passed to ``plt.subplots``.

    Returns
    -------
    (fig, ax) : the matplotlib figure and axes.
    """
    fig, ax = plt.subplots(figsize=figsize)

    # Perfect calibration diagonal
    ax.plot([0,1], [0,1], "k--", alpha=0.4, label="Perfect")

    for pred, y, label in models:
        binned = pd.DataFrame({"pred": pred, "y": y})
        binned["bin"] = pd.qcut(binned["pred"], q=n_bins, duplicates="drop")

        # observed=True: "bin" is categorical, and quantile edges can leave a
        # bin with no rows when there are few distinct predictions. Without it
        # pandas emits NaN rows for those bins (breaking the plotted line) and
        # raises a FutureWarning about the changing default.
        agg = binned.groupby("bin", observed=True).agg(
            pred_mean=("pred", "mean"),
            obs_rate=("y", "mean")
        ).reset_index()

        ax.plot(
            agg["pred_mean"],
            agg["obs_rate"],
            marker="o",
            linewidth=2,
            label=label
        )

    ax.set_xlabel("Predicted Probability")
    ax.set_ylabel("Observed Frequency")
    ax.set_title("Calibration Plot (Multiple Models)")
    ax.legend()
    ax.grid(alpha=0.3)

    return fig, ax



# One calibration curve per forecast horizon (0-5 days out), each scored
# against the same outcome column `y`.
# NOTE(review): `data` and its `precipitation_prob{i}` columns are not created
# in any visible, uncommented cell — confirm where they come from.
calibration_plot_multi(
    [
        (
            data[f'precipitation_prob{i}'].to_numpy(),
            data['y'].to_numpy(),
            f'{i} days out',
        )
        for i in range(6)
    ]
);


In [None]:
# Reshape to long format: one row per (date, forecast horizon). The horizon is
# parsed from the `precipitation_prob{i}` column name, which unpivot stores in
# its default `variable` column.
data_long = (
    data
    .unpivot(
        on=[f'precipitation_prob{i}' for i in range(6)],
        index=['date', 'y', 'precipitation_sum'],
        value_name='pred',
    )
    .with_columns(
        pl.col('variable')
        .str.replace('precipitation_prob', '')
        .cast(pl.Int16)
        .alias('days_ahead')
    )
    .select(['date', 'days_ahead', 'pred', 'y', 'precipitation_sum'])
)

In [None]:
def add_precipitation_probability_long(data, threshold):
    """Replace ``pred`` with a fitted probability that precipitation exceeds ``threshold``.

    Sets ``y`` to 1 where ``precipitation_sum > threshold`` (0 otherwise), fits
    a single ``LogisticRegression`` of ``y`` on the existing ``pred`` column
    across all rows, and overwrites ``pred`` with column 1 of ``predict_proba``.

    Parameters
    ----------
    data : polars frame with ``precipitation_sum`` and ``pred`` columns.
    threshold : cut-off applied to ``precipitation_sum``.

    Returns
    -------
    The frame with ``y`` (re)computed and ``pred`` replaced.

    Notes
    -----
    Presumably ``LogisticRegression`` is scikit-learn's — TODO confirm against
    the notebook's import cell.
    """
    labelled = data.with_columns(
        (pl.col('precipitation_sum') > threshold).cast(pl.Int16).alias('y')
    )

    features = labelled['pred'].to_numpy().reshape(-1, 1)
    target = labelled['y'].to_numpy()

    classifier = LogisticRegression().fit(X=features, y=target)
    fitted_prob = classifier.predict_proba(features)[:, 1]

    return labelled.with_columns(pl.Series('pred', fitted_prob))

# Convert the long-format `pred` values into fitted exceedance probabilities.
# NOTE(review): `mean_precip` is only assigned inside the commented-out cell
# above — confirm it is defined before this line runs on a fresh kernel.
data_long2 = add_precipitation_probability_long(data_long, mean_precip)

In [None]:
# Probability buckets applied to `pred`:
#   low          pred < 0.2
#   low-medium   0.2 <= pred < 0.5
#   medium-high  0.5 <= pred < 0.8
#   high         pred >= 0.8
#
# The result has one row per (bucket, days_ahead) in the shape
# beta_binom_variance_model expects: mu, x, n, y.
data_mod = (
    data_long2
    .with_columns(
        pl.when(pl.col('pred') < 0.2).then(pl.lit('low'))
        .when(pl.col('pred') < 0.5).then(pl.lit('low-medium'))
        .when(pl.col('pred') < 0.8).then(pl.lit('medium-high'))
        .when(pl.col('pred') >= 0.8).then(pl.lit('high'))
        .alias('pred_bucket')
    )
    # Bucket mean of `pred`, taken over every horizon (including days_ahead == 0
    # rows, which are only filtered out at the very end).
    .with_columns(
        pl.col('pred').mean().over('pred_bucket').alias('mu')
    )
    .group_by('pred_bucket', 'days_ahead')
    .agg(
        pl.col('mu').mean().alias('mu'),
        pl.col('y').count().alias('n'),
        pl.col('y').sum().alias('y'),
    )
    .select('mu', pl.col('days_ahead').alias('x'), 'n', 'y')
    .filter(pl.col('x') >= 1)
)



In [None]:
# Build the same model on the bucketed forecast data (columns mu, x, n, y)
# and run PyMC's `Model.debug()` diagnostic on it.
mod = beta_binom_variance_model(data_mod)
mod.debug()