# Project 1 — Berry–Esseen Rate for Fixed Degree _d_

Empirical confirmation that the overlap  
$$
X_N := \sqrt{N}\,\langle q, u_2\rangle
$$
between a fixed test vector $q\perp\mathbf 1$ and the second eigenvector $u_2$ of a **random $d$-regular graph** converges to 𝒩(0, 1) at the **optimal Berry–Esseen rate**
$$
\sup_x \bigl|\mathbb P(X_N\le x)-\Phi(x)\bigr| \;=\; \Theta\!\bigl(N^{-1/6}\bigr),
$$
as established in Nagel (2025) and Huang–Yau (2023).

**Credits**: This notebook was written by [Hershraj Niranjani](https://hershrajn.com)

---

## Experimental Design

| Stage | What we do | Key parameters |
|-------|------------|----------------|
| **Graph generation** | Build $M \approx 2\,000$ simple $d$‑regular graphs for every $(N, d)$. | $d \in \{3,5,10,20\};\; N \in \{5\,000, 10\,000, 20\,000, 40\,000\}$; `igraph.Graph.K_Regular`. |
| **Spectral step** | Form the normalised adjacency $\tilde A = A / \sqrt{d-1}$ and extract $u_2$. | `scipy.sparse.linalg.eigsh` |
| **Statistic** | Compute $X_N$ with $q = e_1 - \dfrac{1}{N}\mathbf 1$ (normalised). | Any deterministic $q \perp \mathbf 1$ works. |
| **Distance metric** | Kolmogorov–Smirnov distance $D_N = \sup_x \lvert F_N(x) - \Phi(x) \rvert$. | Use SciPy’s `stats.kstest` or a manual CDF grid. |
| **Rate extraction** | Linear regression of $\log D_N$ on $\log N$. | Slope $\approx -1/6$ corroborates theory. |


In [None]:
# Note: this notebook was run in a Python virtual environment with Python 3.13.5.
# Install the necessary dependencies listed in requirements.txt.
%pip install -r requirements.txt

In [None]:
from pathlib import Path
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import math
import os 
from multiprocessing import cpu_count
from joblib import Parallel, delayed
from tqdm.auto import tqdm
import re
import scipy.sparse.linalg as spla
from scipy.stats import kstest, norm, gaussian_kde
import pandas as pd
from typing import List, Sequence

# Seed value kept for reproducibility.
RANDOM_SEED = 42

# Maximum number of graph files analysed per (n, d) configuration.
NUM_GRAPHS_TO_LOAD = 2_000

# All outputs and inputs live next to the notebook: abspath('') resolves to
# the current working directory.
BASE = Path(os.path.abspath(''))
FIGURES = BASE / "figures"
DATA = BASE / "data"

# Create the folders on first run; exist_ok makes re-running the cell harmless.
FIGURES.mkdir(exist_ok=True)
DATA.mkdir(exist_ok=True)

print("Setup complete.")

In [None]:
def load_graphs(file_names: Sequence[str], in_dir: str | Path) -> List[nx.Graph]:
    """Load .npz edge lists (saved by generate_graphs.py) into NetworkX graphs.

    Args:
        file_names: Names of the ``.npz`` archives, relative to ``in_dir``.
        in_dir: Directory that contains the archives.

    Returns:
        One undirected ``nx.Graph`` per entry of ``file_names``, in the same
        order.  Each graph is built from the archive's ``"edges"`` array,
        whose rows are treated as (u, v) vertex pairs.
    """
    base_dir = Path(in_dir)
    graphs: List[nx.Graph] = []
    for file_name in file_names:
        # np.load on a .npz returns an NpzFile that holds an open file handle.
        # Using it as a context manager closes the handle per file instead of
        # leaking one descriptor for every graph loaded.
        with np.load(base_dir / file_name) as data:
            edges = data["edges"]      # expected shape (m, 2), dtype uint32
        g = nx.Graph()
        g.add_edges_from(map(tuple, edges))
        graphs.append(g)
    return graphs

## Run the analysis
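Make sure that you have first generated the graphs with the `generate_graphs.py` script; the cells below read the resulting `.npz` files from `data/d3`, `data/d5`, `data/d10` and `data/d20`.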

In [None]:
# Number of graph files submitted to the worker pool per outer-loop step;
# tune this for the available RAM / I/O throughput.
BATCH_SIZE = 4 * 2_048

# Use every core but one, and never fewer than one worker.
N_JOBS = max(1, cpu_count() - 1)

def _x_stat_from_file(path: Path, q: np.ndarray) -> float:
    """Return X_N = sqrt(N) * <q, u_2> for the graph stored in one .npz file.

    The archive's ``"edges"`` array is read as an (m, 2) list of undirected
    edges of a simple regular graph.  The adjacency matrix is scaled by
    ``1 / sqrt(d - 1)`` and ``u_2`` is the eigenvector of its second-largest
    (algebraic) eigenvalue.

    Args:
        path: Location of the ``.npz`` file.
        q: Test vector of length N (the number of distinct vertices in the
            edge list).  Coordinate ``i`` is paired with the ``i``-th smallest
            vertex label.

    Returns:
        The overlap ``sqrt(N) * <q, u_2>``.  Its sign is arbitrary because an
        eigenvector is only defined up to sign.

    Raises:
        ValueError: If ``q`` does not have one entry per vertex.
    """
    # Function-scope import: at file level only scipy.sparse.linalg is imported.
    from scipy.sparse import coo_array

    # NpzFile keeps the archive open; the context manager releases the handle
    # so that processing thousands of files does not leak descriptors.
    with np.load(path) as data:
        edges = np.asarray(data["edges"])

    # Map vertex labels to 0..n-1 in sorted-label order.  This fixes which
    # vertex each coordinate of q refers to, independent of the order in which
    # vertices happen to appear in the edge list.
    labels, inverse = np.unique(edges, return_inverse=True)
    idx = inverse.reshape(edges.shape)
    n = labels.size
    if q.shape != (n,):
        raise ValueError(
            f"q has shape {q.shape}, expected ({n},) for the graph in {path}"
        )

    # Symmetric 0/1 adjacency matrix: every edge is stored in both directions.
    src, dst = idx[:, 0], idx[:, 1]
    rows = np.concatenate((src, dst))
    cols = np.concatenate((dst, src))
    A = coo_array((np.ones(rows.size), (rows, cols)), shape=(n, n)).tocsr()
    A.data[:] = 1.0                    # repeated edges collapse to one entry

    # For a simple d-regular graph the matrix holds n * d non-zeros.
    d = A.nnz // n
    A = A / math.sqrt(d - 1)

    # which="LA" asks for the two algebraically largest eigenvalues.  "LM"
    # (largest magnitude) would return the most negative eigenvalue in place
    # of lambda_2 whenever |lambda_min| > lambda_2, i.e. the wrong eigenvector.
    # NOTE(review): tol=1e-2 is kept from the original; confirm it is tight
    # enough for the eigenvector accuracy the experiment needs.
    vals, vecs = spla.eigsh(A, k=2, which="LA", tol=1e-2)
    u2 = vecs[:, np.argmin(vals)]      # the smaller of the top two
    return math.sqrt(n) * float(q @ u2)

# Collect every saved graph.  Path.glob yields entries in an arbitrary,
# filesystem-dependent order, so each directory listing is sorted; otherwise
# slicing a group to a fixed number of files would pick a different subset
# from run to run.
graph_files = [
    p
    for d in ("d3", "d5", "d10", "d20")
    for p in sorted((DATA / d).glob("*.npz"))
]

# The graph size is encoded in the file name as "_n<digits>".
pat_n = re.compile(r"_n(\d+)")

# (n, d) -> list of files for that configuration.
groups: dict[tuple[int, int], list[Path]] = {}

for p in graph_files:
    d = int(p.parent.name.removeprefix("d"))        # folder "d5" -> 5
    size_match = pat_n.search(p.stem)
    if size_match is None:
        # Fail with a readable message instead of an AttributeError on None.
        raise ValueError(f"cannot read the graph size from file name {p.name!r}")
    n = int(size_match.group(1))
    groups.setdefault((n, d), []).append(p)

# One summary record per (n, d) configuration.
rows = []
for (n, d), paths in sorted(groups.items()):
    # Cap the sample size; the slice keeps the order recorded in `groups`.
    paths = paths[:NUM_GRAPHS_TO_LOAD]

    # Test vector q = e_1 - (1/n) * 1, rescaled to unit length.
    q = np.append([1.0], np.zeros(n - 1)) - 1.0 / n
    q /= np.linalg.norm(q)

    # Evaluate X_N for every file, one batch of parallel jobs at a time.
    X_vals = []
    for start in range(0, len(paths), BATCH_SIZE):
        batch = paths[start : start + BATCH_SIZE]
        pool = Parallel(
            n_jobs=N_JOBS,
            backend="loky",
            pre_dispatch="2*n_jobs",   # limits how many tasks are queued ahead
        )
        X_vals.extend(pool(delayed(_x_stat_from_file)(p, q) for p in batch))

        tqdm.write(f"[{n=}, {d=}] processed {start + len(batch):,}/{len(paths):,}")

    # Kolmogorov–Smirnov distance between the sample and the N(0, 1) CDF.
    D_n = kstest(X_vals, norm.cdf).statistic
    # Sample variance with the ddof=1 correction.
    var_n = float(np.var(X_vals, ddof=1))
    # Density histogram: np.histogram returns (densities, bin edges).
    hist_y, hist_x = np.histogram(X_vals, bins="auto", density=True)
    # Gaussian KDE evaluated on 200 points spanning the observed range.
    kde_x = np.linspace(min(X_vals), max(X_vals), 200)
    kde_y = gaussian_kde(X_vals)(kde_x)

    rows.append(
        {
            "n": n,
            "d": d,
            "D_n": D_n,
            "var_n": var_n,
            "n_graphs": len(X_vals),
            "hist_x": hist_x,      # bin edges  (len = k+1)
            "hist_y": hist_y,      # densities  (len = k)
            "kde_x": kde_x,        # KDE evaluation grid
            "kde_y": kde_y,        # KDE density on that grid
        }
    )
    tqdm.write(
        f"[n={n:,}, d={d}] D_n={D_n:.4f}, Var={var_n:.3f} from {len(X_vals):,} graphs"
    )

print("All batches done.")
full_df = pd.DataFrame(rows)
# Bare expression: it is not the last statement of the cell, so its value is
# simply discarded.
full_df.head(50)

# Independent copy used by the plotting cells.
res_df = full_df.copy()

In [None]:
# Berry–Esseen rate for fixed d: KS distance against N on log10–log10 axes.
fig, ax = plt.subplots(figsize=(6, 4))

for d, sub in res_df.groupby("d"):
    by_size = sub.sort_values("n")

    log_n = np.log10(by_size["n"].values)
    # Clip before taking the log so a KS distance of 0 cannot yield -inf.
    log_ks = np.log10(by_size["D_n"].clip(lower=1e-12).values)

    # Degree-1 least-squares fit in log–log space; only the slope is shown.
    slope, _intercept = np.polyfit(log_n, log_ks, 1)
    ax.plot(log_n, log_ks, "o-", label=fr"$d={d}$  slope={slope:.3f}")

# Dashed guide with slope −1/6, starting at the smallest N from the largest
# KS distance in the table.
N_min, N_max = res_df["n"].min(), res_df["n"].max()
log_ref = np.log10(np.array([N_min, N_max]))
guide_y = np.log10(res_df["D_n"].max()) - (1/6) * (log_ref - np.log10(N_min))
ax.plot(
    log_ref,
    guide_y,
    "--",
    color="gray",
    lw=1.2,
    label=r"reference slope $-1/6$",
)

ax.set(
    xlabel=r"$\log_{10} N$",
    ylabel=r"$\log_{10} D_N$",
    title="Berry-Esseen rate for fixed $d$ (KS distance)",
)
ax.legend()
plt.tight_layout()
plt.show()

In [None]:
# Sample variance of X_N against log10 N, one curve per degree d.
fig, ax = plt.subplots(figsize=(6, 4))

for d, sub in res_df.groupby("d"):
    by_size = sub.sort_values("n")
    log_n = np.log10(by_size["n"])
    ax.plot(log_n, by_size["var_n"], "o-", label=f"d={d}")

# Horizontal guide at the variance of the standard normal limit.
ax.axhline(1.0, ls="--", color="gray", lw=1.2, label="theory σ² = 1")

ax.set(
    xlabel=r"$\log_{10} N$",
    ylabel=r"sample variance  $\operatorname{Var}[X_N]$",
    title="Variance of $X_N$ for fixed $d$",
)
ax.legend()
plt.tight_layout()
plt.show()

In [None]:
# KDE of X_N at the largest available N for each degree, against the N(0, 1) pdf.
fig, ax = plt.subplots(figsize=(6, 4))

for d in sorted(res_df["d"].unique()):
    same_degree = res_df[res_df["d"] == d]
    # After sorting by n the last row is the one with the largest N.
    largest = same_degree.sort_values("n").iloc[-1]
    ax.plot(largest["kde_x"], largest["kde_y"], label=f"d={d}, N={largest['n']:,}")

# Standard normal density drawn as the reference curve.
x_grid = np.linspace(-4, 4, 400)
ax.plot(x_grid, norm.pdf(x_grid), "k--", lw=1.5, label="N(0, 1)")

ax.set(
    xlabel=r"$x$",
    ylabel="density",
    title="Empirical density of $X_N$ vs Standard Normal",
)
ax.legend(fontsize=8)
plt.tight_layout()
plt.show()

# Project 2 — Berry–Esseen Rate in the Sparse‑Growing‑Degree Regime

We now let the degree grow slowly with graph size,  
$$
d = d(N) \;\le\; N^{1/4},
$$
and test the refined prediction (Nagel 2025; Huang–Yau 2023) that the **rescaled KS error**

$$
\tilde D_N \;=\; \frac{D_N}{\sqrt{d}}
$$

still obeys the optimal rate  

$$
\tilde D_N \;=\; \Theta\!\bigl(N^{-1/6}\bigr).
$$

Equivalently,

$$
\sup_x \Bigl|\,\mathbb P\!\bigl(X_N \le x\bigr) - \Phi(x)\Bigr|
\;=\;
\Theta\!\bigl(\sqrt{d}\,N^{-1/6}\bigr),
\qquad
X_N \;=\; \sqrt{N}\,\langle q, u_2\rangle.
$$

---

## Experimental Design

| Stage | What we do | Key parameters |
|-------|------------|----------------|
| **Graph generation** | Build simple $d(N)$‑regular graphs with degrees that scale as powers of $N$. | $d(N) \in \{N^{0.00},\, N^{0.10},\, N^{0.25}\}$; &nbsp;$N \in \{5\,000,\;10\,000,\;20\,000,\;40\,000\}$; &nbsp;≈ 50 k graphs per $(N,d)$. |
| **Spectral step** | Form the normalised adjacency $\tilde A = A / \sqrt{d-1}$ and extract $u_2$. | `scipy.sparse.linalg.eigsh` |
| **Statistic** | $X_N = \sqrt{N}\,\langle q, u_2\rangle$ with $q = e_1 - \tfrac1N\mathbf 1$ (normalised). | Same deterministic $q \perp \mathbf 1$ as in Project 1. |
| **Distance metric** | $\tilde D_N = D_N / \sqrt{d}$ where $D_N$ is the KS distance between $X_N$ and $\mathcal N(0,1)$. | `scipy.stats.kstest` |
| **Rate extraction** | Regress $\log_{10}\tilde D_N$ on $\log_{10}N$. | Slope $\approx -1/6$ confirms universality after the $\sqrt{d}$ factor. |


In [None]:
# Keep only the configurations inside the bound d(N) ≤ N^0.25.
within_bound = full_df["d"] <= full_df["n"] ** 0.25
limited_df = full_df[within_bound]

In [None]:
# Visualize the scaled KS distance for growing degree d(N) ≤ N^0.25
fig, ax = plt.subplots(figsize=(6, 4))

# log10 coordinates of every (n, d) pair: x = log10 N, y = log10(D_N / sqrt(d))
xs = np.log10(limited_df["n"].values)
ys = np.log10(limited_df["D_n"].values / np.sqrt(limited_df["d"].values))

# points: all (n,d) pairs, colour‑by‑d for readability
scatter = ax.scatter(xs, ys, c=limited_df["d"], cmap="viridis", s=35)

# global least‑squares slope in log–log space
m, b = np.polyfit(xs, ys, 1)
ax.plot(xs, m * xs + b, "k--", lw=1.5, label=f"fit slope = {m:.3f}")

# reference – theoretical slope −1/6.  The line starts where the fitted line
# is at the smallest N.  `b` alone is the fit's value at log10 N = 0, so
# anchoring at `b` would draw the reference far away from the data.
N_min, N_max = limited_df["n"].min(), limited_df["n"].max()
ref_x = np.array([N_min, N_max])
log_ref_x = np.log10(ref_x)
y_anchor = m * log_ref_x[0] + b
ref_y = y_anchor - (1 / 6) * (log_ref_x - log_ref_x[0])
ax.plot(log_ref_x, ref_y, "r:", lw=1.2, label="expected slope -1/6")

ax.set_xlabel("log10 N")
ax.set_ylabel("log10 [D_N / sqrt(d)]")
ax.set_title("Scaled KS distance for growing degree d(N) ≤ N^0.25")
ax.legend()
cbar = plt.colorbar(scatter, ax=ax, label="degree d")
plt.tight_layout()
plt.show()

In [None]:
# See how this looks for points not covered by the bound d(N) ≤ N^0.25
# ── Points with d(N) > N^0.25  ──────────────────────────────────────────────
outside_df = full_df[full_df["d"] > full_df["n"] ** 0.25]

fig, ax = plt.subplots(figsize=(6, 4))

# log10 coordinates of every (n, d) pair: x = log10 N, y = log10(D_N / sqrt(d))
xs = np.log10(outside_df["n"].values)
ys = np.log10(outside_df["D_n"].values / np.sqrt(outside_df["d"].values))

# scatter: log‑log of scaled KS distance, colour by degree
scatter = ax.scatter(
    xs,
    ys,
    c=outside_df["d"],
    cmap="plasma",
    s=35,
    marker="o",
    label="outside bound"
)

# least‑squares slope for these out‑of‑bound points
m, b = np.polyfit(xs, ys, 1)
ax.plot(xs, m * xs + b, "k--", lw=1.5, label=f"fit slope = {m:.3f}")

# theoretical reference: slope −1/6 anchored at smallest N.  The anchor is the
# fitted value at N_min; `b` alone is the fit's value at log10 N = 0 and would
# place the reference line far away from the data.
N_min, N_max = outside_df["n"].min(), outside_df["n"].max()
ref_x = np.array([N_min, N_max])
log_ref_x = np.log10(ref_x)
y_anchor = m * log_ref_x[0] + b
ref_y = y_anchor - (1 / 6) * (log_ref_x - log_ref_x[0])
ax.plot(log_ref_x, ref_y, "r:", lw=1.2, label="expected slope -1/6")

ax.set_xlabel("log10 N")
ax.set_ylabel("log10 [D_N / sqrt(d)]")
ax.set_title("Scaled KS distance for d(N) > N^0.25 (outside bound)")
ax.legend()
plt.colorbar(scatter, ax=ax, label="degree d")
plt.tight_layout()
plt.show()
