In [1]:
import numpy as np
from scipy.stats import t

In [2]:
# scores = np.array([0.8, 0.85, 0.82, 0.8, 0.83])
scores = np.array([0.8, 0.85])
score_mean = scores.mean()
score_std = scores.std(ddof=1)
score_se = score_std / np.sqrt(len(scores))
score_ci = t.interval(0.95, len(scores) - 1, loc=score_mean, scale=score_se)
print(f"Mean: {score_mean:.3f}")
print(f"Std: {score_std:.3f}")
print(f"Std Error: {score_se:.3f}")
print(f"CI: ({score_ci[0]:.3f}, {score_ci[1]:.3f})")

Mean: 0.825
Std: 0.035
Std Error: 0.025
CI: (0.507, 1.143)


In [3]:
score_mean - score_std, score_mean + score_std

(0.7896446609406726, 0.8603553390593273)

In [4]:
scores.size

2

In [5]:
a = np.array([0.8, 0.85])
b = np.array([100, 120])
a @ b, 0.8 * 100 + 0.85 * 120


(182.0, 182.0)

In [6]:
import numpy as np
from scipy.stats import norm, t

# --- Primary: Wilson CI on micro accuracy ---
def wilson_ci(num_correct, num_samples, alpha=0.05):
    """Wilson CI on micro accuracy
    
    Args:
        num_correct: number correct from ONE out-of-fold prediction per sample (possibly after averaging probs across repeats)
        num_samples: total samples
        alpha: significance level (default: 0.05)

    Returns:
        p: proportion correct
        ci: tuple of lower and upper bounds of the Wilson CI
    """
    z = norm.ppf(1 - alpha/2)
    p = num_correct / num_samples  # proportion correct
    denom = 1 + z**2 / num_samples
    center = (p + z**2/(2*num_samples)) / denom
    half = (z / denom) * np.sqrt(p*(1-p)/num_samples + z**2/(4*num_samples**2))
    ci = (center - half, center + half)
    return p, ci

# x = number correct from ONE OOF prediction per sample (possibly after averaging probs across repeats)
# n = total samples
# acc = x/n; wilson_ci(x,n)

# --- Optional: t-interval across repeats ---
def repeat_mean_t_ci(scores, alpha=0.05):
    scores = np.asarray(scores)
    R = scores.size
    assert R >= 2
    m = scores.mean()
    sd = scores.std(ddof=1)
    se = sd / np.sqrt(R)
    tcrit = t.ppf(1 - alpha/2, R-1)
    return m, (m - tcrit*se, m + tcrit*se)


wilson_ci(451, 618), 451/618

((0.7297734627831716, (0.6934218906320906, 0.7632861592599165)),
 0.7297734627831716)

In [7]:
# dummy test classes
from dataclasses import dataclass
import dataclasses
from typing import List


@dataclass
class RepeatResult:
    """Result from a single repeat (collection of folds)"""

    repeat_id: int
    mean_score: float
    std_score: float

@dataclass
class EvaluationResult:
    """Complete evaluation result for a model"""

    model_name: str
    model_format: str
    metric_name: str
    repeat_results: List[RepeatResult]

repeat_results = [
    RepeatResult(repeat_id=id, mean_score=score, std_score=std)
    for id, score, std in
    zip(range(3), [0.8, 0.85, 0.82], [0.05, 0.03, 0.04])
]

evaluation_result = EvaluationResult(
    model_name="test_model",
    model_format="test_format",
    metric_name="test_metric",
    repeat_results=repeat_results)

evaluation_result

EvaluationResult(model_name='test_model', model_format='test_format', metric_name='test_metric', repeat_results=[RepeatResult(repeat_id=0, mean_score=0.8, std_score=0.05), RepeatResult(repeat_id=1, mean_score=0.85, std_score=0.03), RepeatResult(repeat_id=2, mean_score=0.82, std_score=0.04)])

In [8]:
dataclasses.asdict(evaluation_result)

{'model_name': 'test_model',
 'model_format': 'test_format',
 'metric_name': 'test_metric',
 'repeat_results': [{'repeat_id': 0, 'mean_score': 0.8, 'std_score': 0.05},
  {'repeat_id': 1, 'mean_score': 0.85, 'std_score': 0.03},
  {'repeat_id': 2, 'mean_score': 0.82, 'std_score': 0.04}]}

In [9]:
# reconstruct from dict
EvaluationResult(**dataclasses.asdict(evaluation_result))


EvaluationResult(model_name='test_model', model_format='test_format', metric_name='test_metric', repeat_results=[{'repeat_id': 0, 'mean_score': 0.8, 'std_score': 0.05}, {'repeat_id': 1, 'mean_score': 0.85, 'std_score': 0.03}, {'repeat_id': 2, 'mean_score': 0.82, 'std_score': 0.04}])

In [10]:
scores

array([0.8 , 0.85])

In [11]:
import numpy as np

repeat_scores = np.array([
    [.67, .70],
    [.70, .73],
    [.48, .47],
    [.53, .52]
])

repeat_scores.mean(axis=0)


array([0.595, 0.605])

In [12]:
import numpy as np
from scipy.stats import t

def repeat_mean_t_ci(scores, alpha=0.05):
    """Repeat mean t-confidence interval
    
    Args:
        scores: array of scores
        alpha: significance level (default: 0.05)

    Returns:
        m: mean score
        ci: tuple of lower and upper bounds of the t-confidence interval
    """
    scores = np.asarray(scores)
    m = scores.mean()
    R = scores.size
    # assert R >= 2
    if R < 2:
        return m, (np.nan, np.nan)
    sd = scores.std(ddof=1)
    se = sd / np.sqrt(R)
    tcrit = t.ppf(1 - alpha/2, R-1)
    return m, (m - tcrit*se, m + tcrit*se)


# scores = np.array([0.8, 0.85, 0.82, 0.8, 0.83])
scores = np.array([0.8])
print(f"repeat_mean_t_ci(scores): {repeat_mean_t_ci(scores)}")

score_macro_avg = scores.mean()
score_macro_std = scores.std(ddof=1)  # sample SD (unbiased)
score_macro_se = score_macro_std / np.sqrt(scores.shape[0])  # sample Std Error of the mean
score_macro_t_95ci = t.interval(0.95, scores.shape[0] - 1, loc=score_macro_avg, scale=score_macro_se)
print(f"manual: {score_macro_avg}, {score_macro_t_95ci}")

# check if equal
repeat_mean_t_ci(scores) == (score_macro_avg, score_macro_t_95ci)

repeat_mean_t_ci(scores): (0.8, (nan, nan))
manual: 0.8, (nan, nan)


  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  ret = ret.dtype.type(ret / rcount)


False

In [13]:
np.nan == np.nan

False

In [14]:
score_macro_t_95ci[0] == np.nan


False

In [20]:
from typing import Iterable
import numpy as np

def bootstrap_mean_ci(values: Iterable[float], B=10_000, alpha=0.05, seed=0) -> tuple[float, tuple[float, float]]:
    """Bootstrap mean confidence interval

    Args:
        values: array of values
        B: number of bootstrap samples (default: 10_000)
        alpha: significance level (default: 0.05)
        seed: random seed (default: 0)
    
    Returns:
        m: mean of values
        ci: tuple of lower and upper bounds of the bootstrap confidence interval
    """
    rng = np.random.default_rng(seed)
    values = np.array(values)
    n = len(values)
    boots = np.empty(B)
    for b in range(B):
        idx = rng.integers(0, n, size=n)
        boots[b] = values[idx].mean()
    lo, hi = np.percentile(boots, [100*alpha/2, 100*(1-alpha/2)])
    return values.mean(), (lo, hi)

values = np.array([0.8, 0.85, 0.82, 0.8, 0.83])
bootstrap_mean_ci(values)


(0.82, (0.8039999999999999, 0.836))

In [23]:
from scipy import stats

spb = stats.bootstrap(values.reshape(1, -1), np.mean, confidence_level=0.95, method='basic')
spb

BootstrapResult(confidence_interval=ConfidenceInterval(low=0.8039999999999999, high=0.836), bootstrap_distribution=array([0.808, 0.804, 0.81 , ..., 0.814, 0.828, 0.826]), standard_error=0.00853231052416974)

In [24]:
spb.confidence_interval

ConfidenceInterval(low=0.8039999999999999, high=0.836)

In [25]:
spb.standard_error

0.00853231052416974

In [28]:
values.std(ddof=0) / np.sqrt(values.size)

0.008485281374238558

In [16]:
values = np.array([0.8, 0.85, 0.82, 0.8, 0.83])

rng = np.random.default_rng(0)
n = len(values)
print(f"n: {n}")
idx = rng.integers(0, n, size=n)
print(f"idx: {idx}")
print(f"values[idx]: {values[idx]}")
print(f"values[idx].mean(): {values[idx].mean()}")




n: 5
idx: [4 3 2 1 1]
values[idx]: [0.83 0.8  0.82 0.85 0.85]
values[idx].mean(): 0.8299999999999998


In [18]:
from typing import Any, Iterable
import numpy as np

from tqdm import trange


def stratified_bootstrap_mean_ci(
    groups: Iterable[Any],
    values: Iterable[float],
    B: int = 10_000,
    alpha: float = 0.05,
    seed: int = 0
) -> tuple[float, tuple[float, float]]:
    """Stratified bootstrap mean confidence interval

    Args:
        groups: array of group labels. Shape: (n,)
        values: array of values. Shape: (n,)
        B: number of bootstrap samples (default: 10_000)
        alpha: significance level (default: 0.05)
        seed: random seed (default: 0)
    
    Returns:
        m: mean of values
        ci: tuple of lower and upper bounds of the stratified bootstrap confidence interval
    """
    rng = np.random.default_rng(seed)
    groups = np.asarray(groups)
    values = np.asarray(values)

    assert len(groups) == len(values), (
        f"groups and values must have the same length, "
        f"got {len(groups)} and {len(values)}"
    )

    uniq = np.unique(groups)
    boots = np.empty(B)
    
    for b in trange(B, desc=f"Stratified Bootstrap CI ({len(uniq)} groups)"):
        parts = []
        # resample within each group (stratum)
        for g in uniq:
            idx = np.where(groups == g)[0]  # indices for this group
            bs = rng.choice(idx, size=len(idx), replace=True)  # n random indices w/ replacement
            parts.append(values[bs])
        boots[b] = np.concatenate(parts).mean()  # combine groups and compute mean
    
    # compute confidence interval from bootstrap distribution
    lo, hi = np.percentile(boots, [100*alpha/2, 100*(1-alpha/2)])
    return values.mean(), (lo, hi)


groups = np.array([0, 0, 1, 1, 1, 2, 3, 3])
values = np.array([0.8, 0.85, 0.62, 0.6, 0.63, 0.91, 0.74, 0.76])
stratified_bootstrap_mean_ci(groups, values)

Stratified Bootstrap CI (4 groups): 100%|██████████| 10000/10000 [00:00<00:00, 25391.52it/s]


(0.73875, (0.72875, 0.74875))