In [1]:
import math

import numpy as np
import scipy
from scipy import special, stats

import plotly
import plotly.express as px
import plotly.graph_objects as go
import os
import plotly.io as pio

# Plotly defaults: white template; the renderer falls back to "notebook"
# unless the PLOTLY_RENDERER environment variable is set.
pio.templates.default = "plotly_white"
pio.renderers.default = os.environ.get("PLOTLY_RENDERER", "notebook")

# Compact float display for printed arrays.
np.set_printoptions(precision=4, suppress=True)

# One seeded generator shared by every stochastic cell below.
SEED = 7
rng = np.random.default_rng(SEED)

# Record the library versions this run used.
for _label, _module in (("numpy ", np), ("scipy ", scipy), ("plotly", plotly)):
    print(_label, _module.__version__)


numpy  1.26.2
scipy  1.15.0
plotly 6.5.2


In [2]:
# sqrt(2π), the constant in the denominator of the normal density.
SQRT_2PI = math.sqrt(2.0 * math.pi)


def norm_pdf(x: np.ndarray, loc: float = 0.0, scale: float = 1.0) -> np.ndarray:
    """Density of N(loc, scale²) evaluated at ``x``.

    Parameters
    ----------
    x : array-like, converted to a float array.
    loc : mean of the distribution.
    scale : standard deviation; must be strictly positive.

    Raises
    ------
    ValueError
        If ``scale <= 0``.
    """
    points = np.asarray(x, dtype=float)
    if scale <= 0:
        raise ValueError("scale must be > 0")

    standardized = (points - loc) / scale
    kernel = np.exp(-0.5 * standardized**2)
    normalizer = scale * SQRT_2PI
    return kernel / normalizer


def norm_cdf(x: np.ndarray, loc: float = 0.0, scale: float = 1.0) -> np.ndarray:
    """Cumulative distribution function of N(loc, scale²) at ``x``.

    ``x`` is standardised and passed to ``scipy.special.ndtr`` (the standard
    normal CDF).

    Raises
    ------
    ValueError
        If ``scale <= 0``.
    """
    points = np.asarray(x, dtype=float)
    if scale <= 0:
        raise ValueError("scale must be > 0")
    return special.ndtr((points - loc) / scale)


def norm_logpdf(x: np.ndarray, loc: float = 0.0, scale: float = 1.0) -> np.ndarray:
    """Log-density of N(loc, scale²) at ``x``.

    Computed directly in log space as
    ``-z²/2 - log(scale) - log(2π)/2`` with ``z = (x - loc) / scale``,
    rather than as the log of the density.

    Raises
    ------
    ValueError
        If ``scale <= 0``.
    """
    points = np.asarray(x, dtype=float)
    if scale <= 0:
        raise ValueError("scale must be > 0")

    standardized = (points - loc) / scale
    quadratic_term = -0.5 * standardized**2
    log_scale = math.log(scale)
    half_log_2pi = 0.5 * math.log(2.0 * math.pi)
    return quadratic_term - log_scale - half_log_2pi


def norm_loglik(loc: float, scale: float, x: np.ndarray) -> float:
    """Total log-likelihood of the sample ``x`` under N(loc, scale²).

    Returns ``-inf`` instead of raising when ``scale <= 0`` or when ``x``
    contains any non-finite value; otherwise returns the sum of
    ``norm_logpdf`` over the observations as a Python float.
    """
    obs = np.asarray(x, dtype=float)

    if scale <= 0:
        return -np.inf
    if not np.all(np.isfinite(obs)):
        return -np.inf

    total = np.sum(norm_logpdf(obs, loc=loc, scale=scale))
    return float(total)


def norm_mle(x: np.ndarray) -> tuple[float, float]:
    """Maximum-likelihood estimates (μ̂, σ̂) for an iid N(μ, σ²) sample.

    μ̂ is the sample mean. σ̂ is the root mean squared deviation about μ̂,
    i.e. the ddof=0 standard deviation, which is biased as an estimator of σ.
    """

    obs = np.asarray(x, dtype=float)
    center = float(np.mean(obs))
    deviations = obs - center
    mean_sq_dev = np.mean(deviations**2)
    return center, float(np.sqrt(mean_sq_dev))


def sample_norm_box_muller(
    n: int,
    loc: float = 0.0,
    scale: float = 1.0,
    rng: np.random.Generator | None = None,
) -> np.ndarray:
    """NumPy-only sampling via the Box–Muller transform.

    Returns n iid samples from N(loc, scale^2).
    """

    if rng is None:
        rng = np.random.default_rng()
    if n < 0:
        raise ValueError("n must be >= 0")
    if scale <= 0:
        raise ValueError("scale must be > 0")

    m = (n + 1) // 2  # number of (Z0, Z1) pairs
    u1 = rng.random(m)
    u2 = rng.random(m)

    # Avoid log(0) when u1 is exactly 0.
    u1 = np.maximum(u1, np.nextafter(0.0, 1.0))

    r = np.sqrt(-2.0 * np.log(u1))
    theta = 2.0 * math.pi * u2

    z0 = r * np.cos(theta)
    z1 = r * np.sin(theta)

    z = np.empty(2 * m, dtype=float)
    z[0::2] = z0
    z[1::2] = z1
    z = z[:n]

    return loc + scale * z


In [3]:
# Overlay the normal density for several (μ, σ) pairs on a common grid.
x = np.linspace(-8, 8, 800)

# (μ, σ) combinations to compare.
params = [(0.0, 1.0), (0.0, 2.0), (1.5, 1.0), (-2.0, 0.6)]

fig = go.Figure()
for mu, sigma in params:
    curve = go.Scatter(
        x=x,
        y=norm_pdf(x, loc=mu, scale=sigma),
        mode="lines",
        name=f"μ={mu:g}, σ={sigma:g}",
    )
    fig.add_trace(curve)
    # Faint dotted guide at each curve's mean.
    fig.add_vline(x=mu, line_dash="dot", opacity=0.25)

fig.update_layout(
    title="Normal PDFs for different (μ, σ)",
    xaxis_title="x",
    yaxis_title="f(x)",
)
fig.show()


In [4]:
# MLE demo on simulated data
true_mu, true_sigma = 1.5, 0.8
n = 600

x = sample_norm_box_muller(n, loc=true_mu, scale=true_sigma, rng=rng)
mu_hat, sigma_hat = norm_mle(x)

# Log-likelihood at the generating parameters, then at the fitted ones.
loglik_true, loglik_hat = (
    norm_loglik(loc_, scale_, x)
    for loc_, scale_ in ((true_mu, true_sigma), (mu_hat, sigma_hat))
)

true_mu, true_sigma, mu_hat, sigma_hat, loglik_true, loglik_hat


(1.5,
 0.8,
 1.4563135974860988,
 0.7969433986282367,
 -716.0835277507142,
 -715.1801474751153)

In [5]:
# Sampling: compare histogram to the true PDF
mu, sigma = 0.7, 1.3
n = 60_000

samples = sample_norm_box_muller(n, loc=mu, scale=sigma, rng=rng)

# Evaluate the reference density on μ ± 4.5σ.
half_width = 4.5 * sigma
x_grid = np.linspace(mu - half_width, mu + half_width, 500)

fig = px.histogram(
    samples,
    nbins=70,
    histnorm="probability density",
    title=f"Monte Carlo samples vs PDF (n={n}, μ={mu:g}, σ={sigma:g})",
    labels={"value": "x"},
)
pdf_curve = go.Scatter(
    x=x_grid,
    y=norm_pdf(x_grid, mu, sigma),
    mode="lines",
    name="true pdf",
)
fig.add_trace(pdf_curve)
fig.update_layout(yaxis_title="density")
fig.show()

# Sample moments for comparison with (μ, σ).
samples.mean(), samples.std(ddof=0)


(0.7031752075013465, 1.2937846790607508)

In [6]:
# PDF and CDF for multiple scales
mu = 0.0
sigmas = [0.5, 1.0, 2.0]
x = np.linspace(-8, 8, 800)

fig_pdf, fig_cdf = go.Figure(), go.Figure()

# One curve per σ in each figure, labelled the same way in both.
for s in sigmas:
    label = f"σ={s:g}"
    for target, curve_fn in ((fig_pdf, norm_pdf), (fig_cdf, norm_cdf)):
        target.add_trace(go.Scatter(x=x, y=curve_fn(x, mu, s), mode="lines", name=label))

fig_pdf.update_layout(title="Normal PDF (μ=0)", xaxis_title="x", yaxis_title="f(x)")
fig_cdf.update_layout(title="Normal CDF (μ=0)", xaxis_title="x", yaxis_title="F(x)")

fig_pdf.show()
fig_cdf.show()


In [7]:
# Empirical CDF vs true CDF
mu = -0.5
sigma = 1.2
n = 25_000
samples = sample_norm_box_muller(n, loc=mu, scale=sigma, rng=rng)

# ECDF: at the i-th order statistic the empirical CDF jumps to i / n.
xs = np.sort(samples)
ys = np.arange(1, n + 1) / n

x_grid = np.linspace(mu - 4.5 * sigma, mu + 4.5 * sigma, 600)

fig = go.Figure()
# The ECDF is a right-continuous step function, so draw it with "hv" steps
# (hold the value until the next observation, then jump) rather than the
# default straight-line interpolation between order statistics.
fig.add_trace(
    go.Scatter(x=xs, y=ys, mode="lines", line_shape="hv", name="empirical CDF")
)
fig.add_trace(go.Scatter(x=x_grid, y=norm_cdf(x_grid, mu, sigma), mode="lines", name="true CDF"))
fig.update_layout(
    title=f"Empirical CDF vs true CDF (n={n}, μ={mu:g}, σ={sigma:g})",
    xaxis_title="x",
    yaxis_title="F(x)",
)
fig.show()


In [8]:
# The same quantities through scipy.stats.norm (frozen distribution).
mu, sigma = 0.7, 1.3
dist = stats.norm(loc=mu, scale=sigma)

# Seven evenly spaced points spanning μ ± 3σ.
x = np.linspace(mu - 3 * sigma, mu + 3 * sigma, 7)
pdf_vals, cdf_vals = dist.pdf(x), dist.cdf(x)

# Sampling: the small draw comes first, then the large one. Both consume the
# shared `rng`, so this order determines the values shown.
samples = dist.rvs(size=5, random_state=rng)
big_sample = dist.rvs(size=5_000, random_state=rng)

# Fit (MLE)
mu_fit, sigma_fit = stats.norm.fit(big_sample)

x, pdf_vals, cdf_vals, samples, (mu_fit, sigma_fit)


(array([-3.2, -1.9, -0.6,  0.7,  2. ,  3.3,  4.6]),
 array([0.0034, 0.0415, 0.1861, 0.3069, 0.1861, 0.0415, 0.0034]),
 array([0.0013, 0.0228, 0.1587, 0.5   , 0.8413, 0.9772, 0.9987]),
 array([-0.3951,  1.3869,  0.9551,  0.3398, -0.1837]),
 (0.6852965688237751, 1.3020645349649365))

In [9]:
# Tail-stability: logcdf/logsf vs log(cdf/sf)
z = -40.0
cdf_direct = stats.norm.cdf(z)
logcdf_stable = stats.norm.logcdf(z)

z2 = 40.0
sf_direct = stats.norm.sf(z2)
logsf_stable = stats.norm.logsf(z2)

(cdf_direct, logcdf_stable), (sf_direct, logsf_stable)


((0.0, -804.6084420137539), (0.0, -804.6084420137539))

In [10]:
# Hypothesis test example: two-sided z-test for a mean (σ known)
mu0 = 0.0
sigma_known = 2.0
n = 40

# Simulated measurements with true mean != mu0
true_mu = 0.9
data = sample_norm_box_muller(n, loc=true_mu, scale=sigma_known, rng=rng)

xbar = data.mean()
z_obs = (xbar - mu0) / (sigma_known / math.sqrt(n))
p_two_sided = 2.0 * stats.norm.sf(abs(z_obs))

alpha = 0.05
z_crit = stats.norm.ppf(1 - alpha / 2)
ci = (
    xbar - z_crit * sigma_known / math.sqrt(n),
    xbar + z_crit * sigma_known / math.sqrt(n),
)

xbar, z_obs, p_two_sided, ci


(1.1249403105684774,
 3.5573736131335747,
 0.0003745812231249413,
 (0.5051452782639159, 1.744735342873039))

In [11]:
# Bayesian update for μ with known σ (Normal–Normal)
mu0 = 0.0
tau0 = 1.5  # prior std dev
sigma = sigma_known

xbar = data.mean()
tau_n2 = 1.0 / (1.0 / tau0**2 + n / sigma**2)
mu_n = tau_n2 * (mu0 / tau0**2 + n * xbar / sigma**2)
tau_n = math.sqrt(tau_n2)

mu_n, tau_n


(1.077070510118755, 0.309426373877638)

In [12]:
# Visualize prior vs posterior over μ
mu_grid = np.linspace(mu_n - 5 * tau0, mu_n + 5 * tau0, 600)

prior = stats.norm(loc=mu0, scale=tau0)
post = stats.norm(loc=mu_n, scale=tau_n)

fig = go.Figure()
fig.add_trace(go.Scatter(x=mu_grid, y=prior.pdf(mu_grid), mode="lines", name="prior"))
fig.add_trace(go.Scatter(x=mu_grid, y=post.pdf(mu_grid), mode="lines", name="posterior"))
fig.update_layout(title="Bayesian update for μ (σ known)", xaxis_title="μ", yaxis_title="density")
fig.show()


In [13]:
# Generative modeling example: 2D correlated Gaussian via a linear transform
n = 3_000
mu_vec = np.array([1.0, -1.0])
Sigma = np.array([[1.0, 0.8], [0.8, 2.0]])
L = np.linalg.cholesky(Sigma)

z = sample_norm_box_muller(2 * n, loc=0.0, scale=1.0, rng=rng).reshape(n, 2)
x = mu_vec + z @ L.T

df = {"x1": x[:, 0], "x2": x[:, 1]}
fig = px.scatter(df, x="x1", y="x2", opacity=0.35, title="Samples from a correlated 2D Gaussian")
fig.update_layout(xaxis_title="x1", yaxis_title="x2")
fig.show()

x.mean(axis=0), np.cov(x.T)


(array([ 0.9797, -1.0441]),
 array([[1.0095, 0.8152],
        [0.8152, 1.9817]]))