# Project 1 — Berry–Esseen Rate for Fixed Degree _d_

Empirical confirmation that the overlap  
$$
X_N := \sqrt{N}\,\langle q, u_2\rangle
$$
between a fixed test vector $q\perp\mathbf 1$ and the second eigenvector $u_2$ of a **random $d$-regular graph** converges to 𝒩(0, 1) at the **optimal Berry–Esseen rate**
$$
\sup_x \bigl|\mathbb P(X_N\le x)-\Phi(x)\bigr| \;=\; \Theta\!\bigl(N^{-1/6}\bigr),
$$
as established in Nagel (2025) and Huang–Yau (2023).

**Credits**: This notebook was written by [Hershraj Niranjani](https://hershrajn.com)

---

## Experimental Design

| Stage | What we do | Key parameters |
|-------|------------|----------------|
| **Graph generation** | Build $M \approx 1000$ simple $d$‑regular graphs for every size $N$. | $d \in \{3,5,10\};\; N \in \{5\,000, 10\,000, 20\,000, 40\,000\}$; `networkx.random_regular_graph` + rejection. |
| **Spectral step** | Form the normalised adjacency $\tilde A = A / \sqrt{d-1}$ and extract $u_2$. | `scipy.sparse.linalg.eigsh` (k = 2, which='LM'). |
| **Statistic** | Compute $X_N$ with $q = e_1 - \dfrac{1}{N}\mathbf 1$ (normalised). | Any deterministic $q \perp \mathbf 1$ works. |
| **Distance metric** | Kolmogorov–Smirnov distance $D_N = \sup_x \lvert F_N(x) - \Phi(x) \rvert$. | Use SciPy’s `stats.kstest` or a manual CDF grid. |
| **Rate extraction** | Linear regression of $\log D_N$ on $\log N$. | Slope $\approx -1/6$ corroborates theory. |


In [None]:
# Note: this notebook was run in a Python virtual environment with Python 3.13.5.
# Install the necessary dependencies listed in requirements.txt.
%pip install -r requirements.txt

In [None]:
from pathlib import Path
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import math
import os 
from multiprocessing import cpu_count
from pathlib import Path
from joblib import Parallel, delayed
from tqdm.auto import tqdm
import re
import scipy.sparse.linalg as spla
from scipy.stats import kstest, norm
import pandas as pd
from typing import Union, List, Dict, Tuple, Sequence

# Fixed seed (42) so that repeated runs are reproducible.
RANDOM_SEED = 42

# Number of graphs to load per (n, d) configuration.
NUM_GRAPHS_TO_LOAD = 10_000

# All paths are resolved relative to the current working directory.
BASE = Path(os.getcwd())

# Output directory for plots; created on first run if it is missing.
FIGURES = BASE.joinpath("figures")
FIGURES.mkdir(exist_ok=True)

# Input directory holding the generated graphs; created if it is missing.
DATA = BASE.joinpath("data")
DATA.mkdir(exist_ok=True)

print("Setup complete.")

In [None]:
def load_graphs(file_names: Sequence[str], in_dir: str | Path) -> List[nx.Graph]:
    """Load .npz edge lists (saved by generate_graphs.py) into NetworkX graphs.

    Parameters
    ----------
    file_names
        Names of the ``.npz`` files to read, relative to ``in_dir``.
    in_dir
        Directory that contains the files.

    Returns
    -------
    List[nx.Graph]
        One undirected graph per file, in the same order as ``file_names``.
        An empty ``file_names`` yields an empty list.

    Raises
    ------
    KeyError
        If an archive has no ``"edges"`` entry.
    """
    base_dir = Path(in_dir)
    graphs: List[nx.Graph] = []
    for file_name in file_names:
        # np.load on an .npz returns an NpzFile that holds the archive open;
        # close it per file so that loading thousands of graphs does not
        # accumulate open file descriptors.
        with np.load(base_dir / file_name) as data:
            edges = data["edges"]      # expected shape (m, 2), dtype uint32
        g = nx.Graph()
        # .tolist() turns the rows into pairs of plain Python ints, so node
        # labels are ordinary ints rather than NumPy scalars.
        g.add_edges_from(edges.tolist())
        graphs.append(g)
    return graphs

## Run the analysis
Make sure that you have generated the graphs with the `generate_graphs.py` script before running the cells below.

In [None]:
# Collect every saved graph file. Each folder's listing is sorted because
# glob yields entries in arbitrary (filesystem-dependent) order, which would
# make the slice taken below differ between machines and runs.
graph_files = [
    p for d in ("d3", "d5", "d10") for p in sorted((DATA / d).glob("*.npz"))
]

# The graph size n is encoded in the file name as "_n<digits>".
pat_n = re.compile(r"_n(\d+)")
graphs_by_cfg: dict[tuple[int, int], list[Path]] = {}

# Group the files by configuration (n, d).
for p in graph_files:
    # folder name “d3” -> degree 3
    try:
        d = int(p.parent.name[1:])
    except ValueError as exc:
        raise ValueError(f"Bad folder name {p.parent.name!r}; expected 'dX'") from exc

    m = pat_n.search(p.stem)
    if m is None:
        raise ValueError(f"Cannot parse n from filename {p.name}")
    n = int(m.group(1))

    graphs_by_cfg.setdefault((n, d), []).append(p)

loaded_graphs: dict[tuple[int, int], list[nx.Graph]] = {}

for (n, d), paths in graphs_by_cfg.items():
    selected = paths[:NUM_GRAPHS_TO_LOAD]     # deterministic: paths are sorted
    in_dir = selected[0].parent               # all files share the same folder

    # Load one file per progress-bar tick so the bar advances while loading
    # instead of jumping to 100 % once the whole batch is done.
    graphs = [
        g
        for path in tqdm(selected, desc=f"n={n}, d={d}", unit="graph")
        for g in load_graphs([path.name], in_dir)   # load_graphs expects names
    ]

    loaded_graphs[(n, d)] = graphs
    print(f"Loaded {len(graphs):4d} graphs for n={n}, d={d}")

In [None]:
# Helper function to compute the overlap statistic X_N for a single graph
def _x_stat(graph: nx.Graph, q: np.ndarray, tol: float = 1e-2) -> float:
    """
    Compute X_N = sqrt(N) * <q, u2> for one graph.

    Parameters
    ----------
    graph
        Regular graph on N nodes; its degree d is inferred as 2|E| / N.
        Node labels must be sortable.
    q
        Test vector of length N. It must be L2-normalised and orthogonal
        to 1. Entry i is paired with the i-th smallest node label.
    tol
        Relative accuracy for the eigenvalues (stopping criterion) passed
        to ``eigsh``. NOTE(review): the default 1e-2 is loose; a smaller
        value gives a more accurate u2 at a higher cost.

    Returns
    -------
    float
        sqrt(N) times the inner product of q with the eigenvector of the
        second-largest (algebraic) eigenvalue of A / sqrt(d - 1). The sign
        of the eigenvector, and hence of the result, is arbitrary.

    Raises
    ------
    ValueError
        If the inferred degree is below 2, for which the normalisation
        1 / sqrt(d - 1) is undefined.
    """
    n = graph.number_of_nodes()
    d = int(2 * graph.number_of_edges() / n)
    if d < 2:
        raise ValueError(f"Expected a regular graph with degree >= 2, got d={d}")

    # Sparse adjacency, normalised. The node order is fixed explicitly:
    # the default is insertion order, which depends on how the edge list
    # happened to be stored and need not match the indexing of q.
    A = nx.to_scipy_sparse_array(
        graph, nodelist=sorted(graph.nodes), dtype=float
    ) / math.sqrt(d - 1)

    # Two largest *algebraic* eigenvalues. "LM" (largest magnitude) would
    # return the most negative eigenvalue instead of lambda_2 whenever
    # |lambda_min| > lambda_2.
    vals, vecs = spla.eigsh(A, k=2, which="LA", tol=tol)

    # Do not rely on the order in which eigsh returns the pairs: sort and
    # take the second-largest eigenvalue's vector.
    order = np.argsort(vals)
    u2 = vecs[:, order[-2]]

    return math.sqrt(n) * float(q @ u2)

In [None]:
# Build one normalised test vector q = e_1 - (1/n) * 1 per graph size n.
q_by_n = {}
for n, _ in loaded_graphs:
    q = np.full(n, -1.0 / n)             # the -(1/n) * 1 part
    q[0] += 1.0                          # add e_1; result is orthogonal to 1
    q_by_n[n] = q / np.linalg.norm(q)

# One result record per (n, d) configuration.
rows = []
n_jobs = max(1, cpu_count() - 1)         # leave one core free

for (n, d), graphs in loaded_graphs.items():
    q = q_by_n[n]

    # Evaluate X_N for every graph of this configuration in worker processes.
    progress = tqdm(graphs, desc=f"n={n}, d={d}", unit="graph")
    tasks = (delayed(_x_stat)(g, q) for g in progress)
    X_vals = Parallel(n_jobs=n_jobs, backend="loky")(tasks)

    # Kolmogorov–Smirnov distance between the empirical CDF and N(0,1)
    ks_result = kstest(X_vals, norm.cdf)
    D_n = ks_result.statistic

    rows.append({"n": n, "d": d, "D_n": D_n, "n_graphs": len(X_vals)})

print("Computed D_n for all (n, d) configurations.")

# Tidy data frame: one row per (n, d)
res_df = pd.DataFrame(rows)

In [None]:
fig, ax = plt.subplots(figsize=(6, 4))

# One curve per degree d, labelled with its fitted log–log slope.
for d, sub in res_df.groupby("d"):
    ordered = sub.sort_values("n")

    log_n = np.log10(ordered["n"].values)
    # Clip before taking the logarithm so that a zero distance does not give -inf.
    log_dist = np.log10(ordered["D_n"].clip(lower=1e-12).values)

    # Degree-1 least-squares fit; only the slope is reported.
    slope, _intercept = np.polyfit(log_n, log_dist, 1)
    ax.plot(log_n, log_dist, "o-", label=fr"$d={d}$  slope={slope:.3f}")

# Dashed guide line with slope -1/6, anchored at the smallest n and at the
# largest observed distance.
N_min = res_df["n"].min()
N_max = res_df["n"].max()
ref_x = np.array([N_min, N_max])
y0 = np.log10(res_df["D_n"].max())
ref_y = y0 - (1/6) * (np.log10(ref_x) - np.log10(N_min))
ax.plot(
    np.log10(ref_x),
    ref_y,
    "--",
    color="gray",
    lw=1.2,
    label=r"reference slope $-1/6$",
)

ax.set_xlabel(r"$\log_{10} N$")
ax.set_ylabel(r"$\log_{10} D_N$")
ax.set_title("Berry-Esseen rate for fixed $d$ (KS distance)")
ax.legend()
plt.tight_layout()
plt.show()