In [24]:
# %%
import json
from pathlib import Path
from datetime import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [25]:
# %%
def find_repo_root(start: Path | None = None) -> Path:
    start = start or Path.cwd()
    cur = start.resolve()
    for p in [cur] + list(cur.parents):
        if (p / "pyproject.toml").exists() or (p / "data").exists():
            return p
    return cur

ROOT = find_repo_root()
DATA_DIR = ROOT / "data"
PROCESSED_DIR = DATA_DIR / "processed"
RUNS_DIR = PROCESSED_DIR / "_runs"
CLEARED_DIR = PROCESSED_DIR / "data_cleared"

print("ROOT:", ROOT)
print("CLEARED_DIR:", CLEARED_DIR)
print("RUNS_DIR:", RUNS_DIR)


ROOT: /mnt/d/community-detection
CLEARED_DIR: /mnt/d/community-detection/data/processed/data_cleared
RUNS_DIR: /mnt/d/community-detection/data/processed/_runs


In [26]:
# %%
def find_repo_root(start: Path | None = None) -> Path:
    start = start or Path.cwd()
    cur = start.resolve()
    for p in [cur] + list(cur.parents):
        if (p / "pyproject.toml").exists() or (p / "data").exists():
            return p
    return cur

ROOT = find_repo_root()
DATA_DIR = ROOT / "data"
PROCESSED_DIR = DATA_DIR / "processed"
RUNS_DIR = PROCESSED_DIR / "_runs"
CLEARED_DIR = PROCESSED_DIR / "data_cleared"

print("ROOT:", ROOT)
print("CLEARED_DIR:", CLEARED_DIR)
print("RUNS_DIR:", RUNS_DIR)


ROOT: /mnt/d/community-detection
CLEARED_DIR: /mnt/d/community-detection/data/processed/data_cleared
RUNS_DIR: /mnt/d/community-detection/data/processed/_runs


In [27]:
# %%
COMPARE_PAIR = ("brightkite", "lbsn2vec")

RUN_ID_BY_DATASET = {
    "brightkite": "20251214_192049",
    "lbsn2vec": "20251214_183903",
}

def dataset_paths(dataset: str) -> dict:
    dataset = dataset.lower()
    cleared = CLEARED_DIR / dataset
    run_id = RUN_ID_BY_DATASET.get(dataset)
    run_dir = (RUNS_DIR / run_id / dataset) if run_id else None
    run_cfg = (RUNS_DIR / run_id / "run_config.json") if run_id else None
    return {
        "dataset": dataset,
        "cleared_dir": cleared,
        "run_id": run_id,
        "run_dir": run_dir,
        "run_config": run_cfg,
    }

for ds in COMPARE_PAIR:
    print(dataset_paths(ds))


{'dataset': 'brightkite', 'cleared_dir': PosixPath('/mnt/d/community-detection/data/processed/data_cleared/brightkite'), 'run_id': '20251214_192049', 'run_dir': PosixPath('/mnt/d/community-detection/data/processed/_runs/20251214_192049/brightkite'), 'run_config': PosixPath('/mnt/d/community-detection/data/processed/_runs/20251214_192049/run_config.json')}
{'dataset': 'lbsn2vec', 'cleared_dir': PosixPath('/mnt/d/community-detection/data/processed/data_cleared/lbsn2vec'), 'run_id': '20251214_183903', 'run_dir': PosixPath('/mnt/d/community-detection/data/processed/_runs/20251214_183903/lbsn2vec'), 'run_config': PosixPath('/mnt/d/community-detection/data/processed/_runs/20251214_183903/run_config.json')}


In [28]:
# %%
def read_json_safe(p: Path) -> dict | None:
    try:
        with open(p, "r", encoding="utf-8") as f:
            return json.load(f)
    except Exception as e:
        print(f"[WARN] JSON read failed: {p} -> {type(e).__name__}: {e}")
        return None

def read_parquet_safe(p: Path, columns=None) -> pd.DataFrame | None:
    try:
        return pd.read_parquet(p, columns=columns)
    except Exception as e:
        print(f"[WARN] Parquet read failed: {p} -> {type(e).__name__}: {e}")
        return None

def read_npy_safe(p: Path):
    try:
        return np.load(p, allow_pickle=False)
    except Exception as e:
        print(f"[WARN] NPY read failed: {p} -> {type(e).__name__}: {e}")
        return None


In [29]:
# %%
def normalize_user_id(s: pd.Series) -> pd.Series:
    s = s.astype(str).str.strip()
    s = s.str.replace(r"\.0$", "", regex=True)  # "123.0" -> "123"
    s = s.replace({"nan": np.nan, "None": np.nan})
    return s

def normalize_edges(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    if "u" not in df.columns or "v" not in df.columns:
        df = df.rename(columns={df.columns[0]: "u", df.columns[1]: "v"})
    df["u"] = normalize_user_id(df["u"])
    df["v"] = normalize_user_id(df["v"])
    df = df.dropna(subset=["u", "v"])
    return df

def normalize_users(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    if "user_id" not in df.columns:
        df = df.rename(columns={df.columns[0]: "user_id"})
    df["user_id"] = normalize_user_id(df["user_id"])
    df = df.dropna(subset=["user_id"])
    return df

def normalize_checkins(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    df["user_id"] = normalize_user_id(df["user_id"])
    df = df.dropna(subset=["user_id"])
    return df


In [30]:
# %%
def load_comm_df(run_dir: Path) -> pd.DataFrame:
    p_rep = run_dir / "comm_df.repaired.parquet"
    p_plain = run_dir / "comm_df.parquet"
    if p_rep.exists():
        df = pd.read_parquet(p_rep)
    elif p_plain.exists():
        df = pd.read_parquet(p_plain)
    else:
        raise FileNotFoundError(f"Missing comm_df(.repaired) in {run_dir}")

    if "user_id" not in df.columns:
        raise ValueError("comm_df missing user_id")

    if "community_id" not in df.columns:
        for alt in ["community", "comm_id", "cluster_id"]:
            if alt in df.columns:
                df = df.rename(columns={alt: "community_id"})
                break
    if "community_id" not in df.columns:
        raise ValueError("comm_df missing community_id (or alt)")

    df = df[["user_id", "community_id"]].copy()
    df["user_id"] = normalize_user_id(df["user_id"])
    df = df.dropna(subset=["user_id"])
    df["community_id"] = df["community_id"].astype(int)
    return df


In [31]:
# %%
REQUIRED_CLEARED = ["edges_final.parquet", "users_final.parquet", "checkins_final.parquet"]

def load_step8_bundle(ds: str) -> dict:
    info = dataset_paths(ds)
    if not info["cleared_dir"].exists():
        raise FileNotFoundError(info["cleared_dir"])
    if info["run_dir"] is None or not info["run_dir"].exists():
        raise FileNotFoundError(info["run_dir"])

    for fn in REQUIRED_CLEARED:
        p = info["cleared_dir"] / fn
        if not p.exists():
            raise FileNotFoundError(p)

    edges = read_parquet_safe(info["cleared_dir"] / "edges_final.parquet")
    users = read_parquet_safe(info["cleared_dir"] / "users_final.parquet")
    checkins = read_parquet_safe(info["cleared_dir"] / "checkins_final.parquet")
    if edges is None or users is None or checkins is None:
        raise RuntimeError("Failed to load cleared parquet(s)")

    edges = normalize_edges(edges)
    users = normalize_users(users)
    checkins = normalize_checkins(checkins)

    comm_df = load_comm_df(info["run_dir"])

    metrics_global = read_json_safe(info["run_dir"] / "metrics_global.json")
    run_config = read_json_safe(info["run_config"]) if info["run_config"] and info["run_config"].exists() else None

    return dict(
        dataset=ds,
        edges_final=edges,
        users_final=users,
        checkins_final=checkins,
        comm_df=comm_df,
        metrics_global_cache=metrics_global,
        run_config=run_config,
        paths=info,
    )

data_A = load_step8_bundle(COMPARE_PAIR[0])
data_B = load_step8_bundle(COMPARE_PAIR[1])

print("Loaded bundles:", data_A["dataset"], data_B["dataset"])
print("A:", data_A["users_final"].shape, data_A["edges_final"].shape, data_A["checkins_final"].shape, data_A["comm_df"].shape)
print("B:", data_B["users_final"].shape, data_B["edges_final"].shape, data_B["checkins_final"].shape, data_B["comm_df"].shape)


Loaded bundles: brightkite lbsn2vec
A: (15092, 1) (116506, 2) (3656191, 5) (8556, 2)
B: (47389, 1) (279816, 2) (10328914, 5) (11829, 2)


In [32]:
# %%
def load_X_users_from_cleared(cleared_dir: Path) -> np.ndarray:
    p = cleared_dir / "X_users.npy"
    if not p.exists():
        raise FileNotFoundError(p)
    X = np.load(p)
    if X.ndim != 2:
        raise ValueError(f"X_users must be 2D, got {X.shape}")
    return X

def attach_features(bundle: dict) -> dict:
    X = load_X_users_from_cleared(bundle["paths"]["cleared_dir"])
    user_ids = bundle["users_final"]["user_id"].astype(str).to_numpy()
    if len(user_ids) != X.shape[0]:
        raise RuntimeError(f"Mismatch users_final vs X_users: {len(user_ids)} vs {X.shape[0]}")
    out = bundle.copy()
    out["user_ids_full"] = user_ids
    out["X_users"] = X.astype(np.float32, copy=False)
    return out

data_A = attach_features(data_A)
data_B = attach_features(data_B)
print("A X:", data_A["X_users"].shape, "users:", len(data_A["user_ids_full"]))
print("B X:", data_B["X_users"].shape, "users:", len(data_B["user_ids_full"]))


A X: (15092, 43) users: 15092
B X: (47389, 43) users: 47389


In [33]:
# %%
def make_labeled_views(bundle: dict) -> dict:
    users_all = set(bundle["users_final"]["user_id"].unique())

    comm_df = bundle["comm_df"].copy()
    comm_df = comm_df[comm_df["user_id"].isin(users_all)].copy()
    labeled_users = set(comm_df["user_id"].unique())

    edges = bundle["edges_final"].copy()
    edges_labeled = edges[edges["u"].isin(labeled_users) & edges["v"].isin(labeled_users)].copy()

    checkins = bundle["checkins_final"].copy()
    checkins_labeled = checkins[checkins["user_id"].isin(labeled_users)].copy()

    cov = {
        "users_total": int(len(users_all)),
        "users_labeled": int(len(labeled_users)),
        "label_coverage": float(len(labeled_users) / max(1, len(users_all))),

        "edges_total": int(len(edges)),
        "edges_labeled": int(len(edges_labeled)),
        "edge_labeled_ratio": float(len(edges_labeled) / max(1, len(edges))),

        "checkins_total": int(len(checkins)),
        "checkins_labeled": int(len(checkins_labeled)),
        "checkins_labeled_ratio": float(len(checkins_labeled) / max(1, len(checkins))),
    }

    out = bundle.copy()
    out["comm_df_labeled"] = comm_df
    out["edges_labeled"] = edges_labeled
    out["checkins_labeled"] = checkins_labeled
    out["coverage"] = cov
    return out

data_A = make_labeled_views(data_A)
data_B = make_labeled_views(data_B)

print("Coverage A:", data_A["coverage"])
print("Coverage B:", data_B["coverage"])


Coverage A: {'users_total': 15092, 'users_labeled': 5346, 'label_coverage': 0.35422740524781343, 'edges_total': 116506, 'edges_labeled': 49536, 'edge_labeled_ratio': 0.42517981906511254, 'checkins_total': 3656191, 'checkins_labeled': 2068469, 'checkins_labeled_ratio': 0.565744240385691}
Coverage B: {'users_total': 47389, 'users_labeled': 771, 'label_coverage': 0.016269598430015405, 'edges_total': 279816, 'edges_labeled': 1227, 'edge_labeled_ratio': 0.004385024444635046, 'checkins_total': 10328914, 'checkins_labeled': 160359, 'checkins_labeled_ratio': 0.015525252703236759}


In [34]:
# %%
def l2_normalize_rows(X: np.ndarray, eps: float = 1e-12) -> np.ndarray:
    nrm = np.linalg.norm(X, axis=1, keepdims=True)
    return X / (nrm + eps)

def augment_labels_knn_majority(bundle: dict, k: int = 15, seed: int = 42) -> pd.DataFrame:

    user_ids = bundle["user_ids_full"]
    X = bundle["X_users"]
    comm = bundle["comm_df_labeled"][["user_id", "community_id"]].copy()

    labeled_set = set(comm["user_id"])
    user_to_idx = {u: i for i, u in enumerate(user_ids.tolist())}

    labeled_idx = np.array([user_to_idx[u] for u in comm["user_id"].tolist() if u in user_to_idx], dtype=int)
    labeled_comm = comm.set_index("user_id")["community_id"].to_dict()

    # normalize for cosine
    Xn = l2_normalize_rows(X.astype(np.float32, copy=False))

    # split indices
    all_idx = np.arange(len(user_ids))
    unlabeled_mask = np.array([u not in labeled_set for u in user_ids], dtype=bool)
    unlabeled_idx = all_idx[unlabeled_mask]

    # if no unlabeled or no labeled -> return original
    if len(unlabeled_idx) == 0 or len(labeled_idx) == 0:
        out = comm.copy()
        out["label_source"] = "original"
        return out

    # nearest neighbors (prefer sklearn)
    try:
        from sklearn.neighbors import NearestNeighbors
        nn = NearestNeighbors(n_neighbors=min(k, len(labeled_idx)), metric="cosine")
        nn.fit(Xn[labeled_idx])
        dist, nbr = nn.kneighbors(Xn[unlabeled_idx], return_distance=True)
        # cosine sim = 1 - dist
        sim = 1.0 - dist
    except Exception:
        # fallback: brute force (OK for smaller, but can be slower)
        # compute in blocks to avoid RAM blow
        rng = np.random.default_rng(seed)
        k2 = min(k, len(labeled_idx))
        sim = np.empty((len(unlabeled_idx), k2), dtype=np.float32)
        nbr = np.empty((len(unlabeled_idx), k2), dtype=np.int32)

        L = Xn[labeled_idx]
        for i, ui in enumerate(unlabeled_idx):
            s = L @ Xn[ui]
            top = np.argpartition(-s, k2-1)[:k2]
            top = top[np.argsort(-s[top])]
            sim[i] = s[top]
            nbr[i] = top

    labeled_users_list = user_ids[labeled_idx].tolist()

    preds = []
    for i, ui in enumerate(unlabeled_idx):
        neigh_pos = nbr[i].tolist()
        neigh_users = [labeled_users_list[j] for j in neigh_pos]
        neigh_labels = [labeled_comm.get(u, None) for u in neigh_users]

        # weighted vote by similarity
        weights = sim[i].astype(float)
        vote = {}
        for lab, w in zip(neigh_labels, weights):
            if lab is None:
                continue
            vote[lab] = vote.get(lab, 0.0) + float(w)

        if len(vote) == 0:
            # fallback: leave unlabeled
            continue

        pred_lab = max(vote.items(), key=lambda x: x[1])[0]
        preds.append((user_ids[ui], int(pred_lab)))

    aug = pd.DataFrame(preds, columns=["user_id", "community_id"])
    aug["label_source"] = "propagated"

    base = comm.copy()
    base["label_source"] = "original"

    comm_all = pd.concat([base, aug], ignore_index=True)
    comm_all = comm_all.drop_duplicates(subset=["user_id"], keep="first")
    comm_all["community_id"] = comm_all["community_id"].astype(int)
    return comm_all

commA_all = augment_labels_knn_majority(data_A, k=15)
commB_all = augment_labels_knn_majority(data_B, k=15)

print("Aug labels A:", commA_all.shape, "coverage =", commA_all["user_id"].nunique() / data_A["users_final"]["user_id"].nunique())
print("Aug labels B:", commB_all.shape, "coverage =", commB_all["user_id"].nunique() / data_B["users_final"]["user_id"].nunique())
print(commA_all["label_source"].value_counts())
print(commB_all["label_source"].value_counts())


Aug labels A: (15092, 3) coverage = 1.0
Aug labels B: (47389, 3) coverage = 1.0
label_source
propagated    9746
original      5346
Name: count, dtype: int64
label_source
propagated    46618
original        771
Name: count, dtype: int64


In [35]:
# %%
def l2_normalize_rows(X: np.ndarray, eps: float = 1e-12) -> np.ndarray:
    n = np.linalg.norm(X, axis=1, keepdims=True)
    return X / (n + eps)

def build_knn_graph_from_X(bundle: dict, k: int = 20, mutual: bool = True) -> pd.DataFrame:
    user_ids = bundle["user_ids_full"]
    X = l2_normalize_rows(bundle["X_users"])

    from sklearn.neighbors import NearestNeighbors
    nn = NearestNeighbors(n_neighbors=min(k + 1, len(user_ids)), metric="cosine")
    nn.fit(X)
    dist, nbr = nn.kneighbors(X, return_distance=True)
    sim = 1.0 - dist

    rows = []
    for i in range(len(user_ids)):
        ui = user_ids[i]
        for jpos in range(1, nbr.shape[1]):  # skip self
            j = int(nbr[i, jpos])
            vj = user_ids[j]
            w = float(sim[i, jpos])
            rows.append((ui, vj, w))

    df = pd.DataFrame(rows, columns=["u", "v", "w"])

    if mutual:
        key = (df["u"].astype(str) + "||" + df["v"].astype(str))
        inv = (df["v"].astype(str) + "||" + df["u"].astype(str))
        df = df[key.isin(set(inv))].copy()

    # undirected canonical + keep max weight
    umin = df[["u", "v"]].astype(str).min(axis=1)
    vmax = df[["u", "v"]].astype(str).max(axis=1)
    df["u"] = umin
    df["v"] = vmax
    df = df.sort_values("w", ascending=False).drop_duplicates(subset=["u", "v"], keep="first")
    return df[["u", "v", "w"]].reset_index(drop=True)

knnA = build_knn_graph_from_X(data_A, k=20, mutual=True)
knnB = build_knn_graph_from_X(data_B, k=20, mutual=True)

print("kNN A:", knnA.shape, "kNN B:", knnB.shape)
print("kNN A cols:", knnA.columns.tolist())


kNN A: (68426, 3) kNN B: (217074, 3)
kNN A cols: ['u', 'v', 'w']


In [36]:
# %%
def leiden_on_knn(knn_edges: pd.DataFrame, user_ids_full: np.ndarray, resolution: float = 1.0) -> pd.DataFrame:
    try:
        import igraph as ig
        import leidenalg
    except Exception as e:
        raise RuntimeError("Need igraph + leidenalg installed for Leiden-on-kNN.") from e

    nodes = pd.Index(user_ids_full.astype(str))
    node2idx = {u: i for i, u in enumerate(nodes.tolist())}

    u_idx = knn_edges["u"].astype(str).map(node2idx).to_numpy()
    v_idx = knn_edges["v"].astype(str).map(node2idx).to_numpy()
    w = knn_edges["w"].astype(float).to_numpy()

    # drop any NaN mapping (shouldn't happen if kNN built from same users)
    mask = (~pd.isna(u_idx)) & (~pd.isna(v_idx))
    u_idx = u_idx[mask].astype(int)
    v_idx = v_idx[mask].astype(int)
    w = w[mask]

    g = ig.Graph(n=len(nodes), edges=list(zip(u_idx, v_idx)), directed=False)
    g.es["weight"] = w
    g.simplify(multiple=True, loops=True, combine_edges="max")

    part = leidenalg.find_partition(
        g,
        leidenalg.RBConfigurationVertexPartition,
        weights="weight",
        resolution_parameter=float(resolution),
    )
    membership = np.array(part.membership, dtype=int)
    return pd.DataFrame({"user_id": nodes.astype(str), "community_id": membership})

commA_aug = leiden_on_knn(knnA, data_A["user_ids_full"], resolution=1.0)
commB_aug = leiden_on_knn(knnB, data_B["user_ids_full"], resolution=1.0)

print("Aug A:", commA_aug.shape, "coverage:", commA_aug["user_id"].nunique()/len(data_A["user_ids_full"]))
print("Aug B:", commB_aug.shape, "coverage:", commB_aug["user_id"].nunique()/len(data_B["user_ids_full"]))
print("A #communities:", commA_aug["community_id"].nunique())
print("B #communities:", commB_aug["community_id"].nunique())


Aug A: (15092, 2) coverage: 1.0
Aug B: (47389, 2) coverage: 1.0
A #communities: 262
B #communities: 651


In [37]:
# %%
def try_compute_modularity_igraph(edges_df: pd.DataFrame, comm_df: pd.DataFrame) -> float | None:
    try:
        import igraph as ig
    except Exception:
        return None

    nodes = pd.Index(pd.unique(pd.concat([edges_df["u"], edges_df["v"]], ignore_index=True)))
    node2idx = {u: i for i, u in enumerate(nodes.tolist())}
    edgelist = list(zip(edges_df["u"].map(node2idx), edges_df["v"].map(node2idx)))

    g = ig.Graph(n=len(nodes), edges=edgelist, directed=False)
    g.simplify(multiple=True, loops=True)

    comm_map = comm_df.set_index("user_id")["community_id"].to_dict()
    membership = [comm_map.get(u, -1) for u in nodes.tolist()]
    valid_idx = [i for i, m in enumerate(membership) if m != -1]

    if len(valid_idx) != len(membership):
        g = g.induced_subgraph(valid_idx)
        membership = [membership[i] for i in valid_idx]

    uniq = {c: i for i, c in enumerate(sorted(set(membership)))}
    membership = [uniq[c] for c in membership]
    return float(g.modularity(membership))

def compute_structural_comm_metrics(edges_df: pd.DataFrame, comm_df: pd.DataFrame) -> pd.DataFrame:
    u2c = comm_df.set_index("user_id")["community_id"]

    df = edges_df.copy()
    df["cu"] = df["u"].map(u2c)
    df["cv"] = df["v"].map(u2c)
    df = df.dropna(subset=["cu", "cv"])
    df["cu"] = df["cu"].astype(int)
    df["cv"] = df["cv"].astype(int)

    deg = pd.concat([edges_df["u"], edges_df["v"]], ignore_index=True).value_counts()
    deg = deg.rename_axis("user_id").reset_index(name="deg")
    deg["user_id"] = deg["user_id"].astype(str)
    deg = deg.merge(comm_df[["user_id","community_id"]], on="user_id", how="inner")

    vol_by_comm = deg.groupby("community_id")["deg"].sum().rename("volume")
    total_vol = float(vol_by_comm.sum())

    internal = (
        df[df["cu"] == df["cv"]]
        .groupby("cu").size()
        .rename("internal_edges")
        .rename_axis("community_id")
    )

    cut_df = df[df["cu"] != df["cv"]][["cu", "cv"]]
    cut_counts = (
        pd.concat([cut_df["cu"].value_counts(), cut_df["cv"].value_counts()], axis=0)
        .groupby(level=0).sum()
        .rename("cut_edges")
        .rename_axis("community_id")
    )

    sizes = comm_df.groupby("community_id")["user_id"].nunique().rename("n_nodes").rename_axis("community_id")

    out = pd.concat([sizes, vol_by_comm, internal, cut_counts], axis=1).fillna(0).reset_index()

    denom = np.minimum(out["volume"].values, total_vol - out["volume"].values)
    denom = np.where(denom <= 0, np.nan, denom)
    out["conductance"] = out["cut_edges"].values / denom

    n = out["n_nodes"].values.astype(float)
    denom_d = n * (n - 1.0) / 2.0
    denom_d = np.where(denom_d <= 0, np.nan, denom_d)
    out["density"] = out["internal_edges"].values / denom_d

    return out

def summarize_structural(comm_metrics: pd.DataFrame, modularity: float | None) -> dict:
    s = {"modularity": modularity}
    for col in ["conductance", "density"]:
        vals = comm_metrics[col].replace([np.inf, -np.inf], np.nan).dropna()
        s[f"{col}_median"] = float(vals.median()) if len(vals) else None
        s[f"{col}_iqr"] = float(vals.quantile(0.75) - vals.quantile(0.25)) if len(vals) else None
    return s


In [38]:
# %%
def haversine_km(lat1, lon1, lat2, lon2):
    R = 6371.0
    lat1 = np.radians(lat1); lon1 = np.radians(lon1)
    lat2 = np.radians(lat2); lon2 = np.radians(lon2)
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat/2)**2 + np.cos(lat1)*np.cos(lat2)*np.sin(dlon/2)**2
    c = 2*np.arcsin(np.sqrt(a))
    return R*c

def compute_user_centroids(checkins_df: pd.DataFrame) -> pd.DataFrame:
    return checkins_df.groupby("user_id", as_index=False)[["lat", "lon"]].mean()

def compute_spatial_comm_metrics(user_centroids: pd.DataFrame, comm_df: pd.DataFrame) -> pd.DataFrame:
    tmp = user_centroids.merge(comm_df[["user_id","community_id"]], on="user_id", how="inner")
    sizes = tmp.groupby("community_id")["user_id"].nunique().rename("n_nodes")
    comm_cent = tmp.groupby("community_id")[["lat", "lon"]].mean().rename(columns={"lat": "comm_lat", "lon": "comm_lon"})
    tmp = tmp.merge(comm_cent, on="community_id", how="left")

    dist = haversine_km(tmp["lat"].values, tmp["lon"].values, tmp["comm_lat"].values, tmp["comm_lon"].values)
    tmp["dist_km"] = dist

    spatial = tmp.groupby("community_id")["dist_km"].median().rename("spatial_median_km")
    out = pd.concat([sizes, comm_cent, spatial], axis=1).reset_index()
    return out

def spatial_global_stat(comm_spatial: pd.DataFrame) -> float:
    vals = comm_spatial["spatial_median_km"].replace([np.inf, -np.inf], np.nan).dropna()
    return float(vals.median()) if len(vals) else float("nan")

def spatial_baseline_zscore(user_centroids: pd.DataFrame,
                           comm_df: pd.DataFrame,
                           n_shuffles: int = 150,
                           seed: int = 42,
                           min_comm_size: int = 20,
                           sample_users: int = 200000) -> dict:
    rng = np.random.default_rng(seed)
    tmp = user_centroids.merge(comm_df[["user_id","community_id"]], on="user_id", how="inner")
    if len(tmp) == 0:
        return dict(observed=np.nan, mu=np.nan, sd=np.nan, z=np.nan, used_users=0, min_comm_size_used=None)

    # filter by comm size
    comm_sizes = tmp["community_id"].value_counts()
    keep = comm_sizes[comm_sizes >= min_comm_size].index.values
    if len(keep) == 0:
        # relax
        min_comm_size = max(5, int(comm_sizes.max())) if len(comm_sizes) else 2
        keep = comm_sizes[comm_sizes >= min_comm_size].index.values

    tmp = tmp[tmp["community_id"].isin(keep)].copy()
    if len(tmp) > sample_users:
        tmp = tmp.sample(n=sample_users, random_state=seed)

    obs_comm = compute_spatial_comm_metrics(tmp[["user_id","lat","lon"]], tmp[["user_id","community_id"]])
    observed = spatial_global_stat(obs_comm)

    # size-preserving shuffle
    comm_sizes2 = tmp["community_id"].value_counts().sort_index()
    comm_ids = comm_sizes2.index.values.astype(int)
    sizes = comm_sizes2.values.astype(int)
    labels = np.repeat(comm_ids, sizes)

    user_lat = tmp["lat"].values
    user_lon = tmp["lon"].values

    baseline_vals = []
    for _ in range(n_shuffles):
        rng.shuffle(labels)
        tcoords = pd.DataFrame({"community_id": labels, "lat": user_lat, "lon": user_lon})
        cc = tcoords.groupby("community_id")[["lat","lon"]].mean().rename(columns={"lat":"comm_lat","lon":"comm_lon"})
        comm_lat = pd.Series(labels).map(cc["comm_lat"]).values
        comm_lon = pd.Series(labels).map(cc["comm_lon"]).values
        dist = haversine_km(user_lat, user_lon, comm_lat, comm_lon)
        tcoords["dist_km"] = dist
        cm = tcoords.groupby("community_id")["dist_km"].median()
        baseline_vals.append(float(cm.median()))

    baseline_vals = np.array(baseline_vals, dtype=float)
    mu = float(np.mean(baseline_vals))
    sd = float(np.std(baseline_vals) + 1e-12)
    z = (observed - mu) / sd

    return dict(observed=observed, mu=mu, sd=sd, z=float(z),
                used_users=int(len(tmp)), min_comm_size_used=int(min_comm_size))


In [39]:
# %%
def _take_first_col_as_series(df: pd.DataFrame, col: str) -> pd.Series:
    """
    If df has duplicate column labels, df[col] returns a DataFrame.
    This helper always returns a 1D Series (take the first occurrence).
    """
    x = df.loc[:, col]
    if isinstance(x, pd.DataFrame):
        return x.iloc[:, 0]
    return x

def try_compute_modularity_igraph(edges_df: pd.DataFrame, comm_df: pd.DataFrame) -> float | None:
    try:
        import igraph as ig
    except Exception:
        return None

    # robust u/v extraction
    u = _take_first_col_as_series(edges_df, "u").astype(str)
    v = _take_first_col_as_series(edges_df, "v").astype(str)

    nodes = pd.Index(pd.unique(pd.concat([u, v], ignore_index=True)))
    node2idx = {node: i for i, node in enumerate(nodes.tolist())}
    edgelist = list(zip(u.map(node2idx), v.map(node2idx)))

    g = ig.Graph(n=len(nodes), edges=edgelist, directed=False)
    g.simplify(multiple=True, loops=True)

    comm_map = comm_df.set_index("user_id")["community_id"].to_dict()
    membership = [comm_map.get(node, -1) for node in nodes.tolist()]
    valid_idx = [i for i, m in enumerate(membership) if m != -1]

    if len(valid_idx) != len(membership):
        g = g.induced_subgraph(valid_idx)
        membership = [membership[i] for i in valid_idx]

    uniq = {c: i for i, c in enumerate(sorted(set(membership)))}
    membership = [uniq[c] for c in membership]
    return float(g.modularity(membership))


def compute_metrics(bundle: dict, comm_all: pd.DataFrame, knn_edges: pd.DataFrame,
                    baseline_shuffles: int = 150, seed: int = 42) -> dict:
    ds = bundle["dataset"]

    # coverage original vs augmented
    cov0 = bundle["coverage"]
    users_total = bundle["users_final"]["user_id"].nunique()
    cov1 = {
        "users_total": int(users_total),
        "users_labeled": int(comm_all["user_id"].nunique()),
        "label_coverage": float(comm_all["user_id"].nunique() / max(1, users_total)),
    }

    labeled_set = set(comm_all["user_id"].astype(str))

    # ---- Structural-A: friendship graph ----
    edgesA = bundle["edges_final"][["u", "v"]].copy()
    edgesA["u"] = edgesA["u"].astype(str)
    edgesA["v"] = edgesA["v"].astype(str)
    edgesA = edgesA[edgesA["u"].isin(labeled_set) & edgesA["v"].isin(labeled_set)].copy()

    modularity_A = try_compute_modularity_igraph(edgesA, comm_all) if len(edgesA) else None
    comm_struct_A = compute_structural_comm_metrics(edgesA, comm_all) if len(edgesA) else pd.DataFrame()
    sum_A = summarize_structural(comm_struct_A, modularity_A)

    # ---- Structural-B: kNN graph (from X_users) ----
    edgesB = knn_edges[["u", "v"]].copy()
    edgesB["u"] = edgesB["u"].astype(str)
    edgesB["v"] = edgesB["v"].astype(str)
    edgesB = edgesB[edgesB["u"].isin(labeled_set) & edgesB["v"].isin(labeled_set)].copy()

    modularity_B = try_compute_modularity_igraph(edgesB, comm_all) if len(edgesB) else None
    comm_struct_B = compute_structural_comm_metrics(edgesB, comm_all) if len(edgesB) else pd.DataFrame()
    sum_B = summarize_structural(comm_struct_B, modularity_B)

    # ---- Spatial (augmented labels) ----
    chk = bundle["checkins_final"][["user_id", "lat", "lon"]].copy()
    chk["user_id"] = chk["user_id"].astype(str)
    chk = chk[chk["user_id"].isin(labeled_set)].copy()

    if len(chk):
        user_cent = compute_user_centroids(chk)
        comm_spatial = compute_spatial_comm_metrics(user_cent, comm_all)
        base = spatial_baseline_zscore(
            user_centroids=user_cent,
            comm_df=comm_all,
            n_shuffles=baseline_shuffles,
            seed=seed,
            min_comm_size=20,
            sample_users=200000,
        )
        spatial_median_km = float(base["observed"])
        baseline_mean_km = float(base["mu"])
        baseline_std_km = float(base["sd"])
        spatial_z = float(base["z"])
        baseline_sample_users = int(base["used_users"])
        baseline_min_comm_size_used = base["min_comm_size_used"]
    else:
        comm_spatial = pd.DataFrame()
        spatial_median_km = float("nan")
        baseline_mean_km = float("nan")
        baseline_std_km = float("nan")
        spatial_z = float("nan")
        baseline_sample_users = 0
        baseline_min_comm_size_used = None

    # community size stats
    comm_sizes = comm_all["community_id"].value_counts()
    largest_comm = int(comm_sizes.max()) if len(comm_sizes) else 0
    median_comm = float(comm_sizes.median()) if len(comm_sizes) else float("nan")

    return dict(
        dataset=ds,
        coverage_original=cov0,
        coverage_augmented=cov1,

        communities_augmented=int(comm_all["community_id"].nunique()),
        largest_comm=largest_comm,
        median_comm=median_comm,

        # Structural-A
        structuralA_modularity=sum_A["modularity"],
        structuralA_conductance_median=sum_A["conductance_median"],
        structuralA_conductance_iqr=sum_A["conductance_iqr"],
        structuralA_density_median=sum_A["density_median"],
        structuralA_density_iqr=sum_A["density_iqr"],

        # Structural-B
        structuralB_modularity=sum_B["modularity"],
        structuralB_conductance_median=sum_B["conductance_median"],
        structuralB_conductance_iqr=sum_B["conductance_iqr"],
        structuralB_density_median=sum_B["density_median"],
        structuralB_density_iqr=sum_B["density_iqr"],

        # Spatial
        spatial_median_km=spatial_median_km,
        baseline_mean_km=baseline_mean_km,
        baseline_std_km=baseline_std_km,
        spatial_z=spatial_z,
        baseline_shuffles=int(baseline_shuffles),
        baseline_sample_users=baseline_sample_users,
        baseline_min_comm_size_used=baseline_min_comm_size_used,

        # details
        comm_all=comm_all,
        comm_struct_A=comm_struct_A,
        comm_struct_B=comm_struct_B,
        comm_spatial=comm_spatial,
        run_config=bundle.get("run_config"),
        metrics_global_cache=bundle.get("metrics_global_cache"),
    )


In [40]:
# %%
# Guard: ensure metrics_A / metrics_B exist before building table1

need = []
for var in ["metrics_A", "metrics_B"]:
    if var not in globals() or globals()[var] is None:
        need.append(var)

if need:
    print("[INFO] Missing:", need, "-> recomputing metrics now...")

    # sanity checks (must exist from previous cells)
    required = ["data_A", "data_B", "commA_all", "commB_all", "knnA", "knnB"]
    missing2 = [x for x in required if x not in globals()]
    if missing2:
        raise RuntimeError(f"Cannot compute metrics because missing objects: {missing2}. "
                           f"Run earlier cells that create: {missing2}")

    metrics_A = compute_metrics(data_A, commA_all, knnA)
    metrics_B = compute_metrics(data_B, commB_all, knnB)

print("[OK] metrics_A keys:", list(metrics_A.keys())[:10], "...")
print("[OK] metrics_B keys:", list(metrics_B.keys())[:10], "...")


[OK] metrics_A keys: ['dataset', 'coverage_original', 'coverage_augmented', 'communities_augmented', 'largest_comm', 'median_comm', 'structuralA_modularity', 'structuralA_conductance_median', 'structuralA_conductance_iqr', 'structuralA_density_median'] ...
[OK] metrics_B keys: ['dataset', 'coverage_original', 'coverage_augmented', 'communities_augmented', 'largest_comm', 'median_comm', 'structuralA_modularity', 'structuralA_conductance_median', 'structuralA_conductance_iqr', 'structuralA_density_median'] ...


In [41]:
# %%
def build_table1_row(m: dict) -> dict:
    cov0 = m["coverage_original"]
    cov1 = m["coverage_augmented"]

    return {
        "dataset": m["dataset"],

        # original coverage
        "users_total": cov0["users_total"],
        "users_labeled_original": cov0["users_labeled"],
        "label_coverage_original": cov0["label_coverage"],
        "edges_total": cov0["edges_total"],
        "edges_labeled_original": cov0["edges_labeled"],
        "edge_labeled_ratio_original": cov0["edge_labeled_ratio"],
        "checkins_total": cov0["checkins_total"],
        "checkins_labeled_original": cov0["checkins_labeled"],
        "checkins_labeled_ratio_original": cov0["checkins_labeled_ratio"],

        # augmented coverage
        "users_labeled_aug": cov1["users_labeled"],
        "label_coverage_aug": cov1["label_coverage"],

        "communities_aug": m["communities_augmented"],
        "largest_comm": m["largest_comm"],
        "median_comm": m["median_comm"],

        # Structural-A
        "A_modularity": m["structuralA_modularity"],
        "A_conductance_median": m["structuralA_conductance_median"],
        "A_conductance_iqr": m["structuralA_conductance_iqr"],
        "A_density_median": m["structuralA_density_median"],
        "A_density_iqr": m["structuralA_density_iqr"],

        # Structural-B
        "B_modularity": m["structuralB_modularity"],
        "B_conductance_median": m["structuralB_conductance_median"],
        "B_conductance_iqr": m["structuralB_conductance_iqr"],
        "B_density_median": m["structuralB_density_median"],
        "B_density_iqr": m["structuralB_density_iqr"],

        # Spatial
        "spatial_median_km": m["spatial_median_km"],
        "baseline_mean_km": m["baseline_mean_km"],
        "baseline_std_km": m["baseline_std_km"],
        "spatial_z": m["spatial_z"],
        "baseline_shuffles": m["baseline_shuffles"],
        "baseline_sample_users": m["baseline_sample_users"],
        "baseline_min_comm_size_used": m["baseline_min_comm_size_used"],
    }

table1 = pd.DataFrame([build_table1_row(metrics_A), build_table1_row(metrics_B)])
table1


Unnamed: 0,dataset,users_total,users_labeled_original,label_coverage_original,edges_total,edges_labeled_original,edge_labeled_ratio_original,checkins_total,checkins_labeled_original,checkins_labeled_ratio_original,...,B_conductance_iqr,B_density_median,B_density_iqr,spatial_median_km,baseline_mean_km,baseline_std_km,spatial_z,baseline_shuffles,baseline_sample_users,baseline_min_comm_size_used
0,brightkite,15092,5346,0.354227,116506,49536,0.42518,3656191,2068469,0.565744,...,0.153846,0.5,0.733333,2776.853813,9190.566185,227.701613,-28.16718,150,1291,20
1,lbsn2vec,47389,771,0.01627,279816,1227,0.004385,10328914,160359,0.015525,...,0.115058,0.036635,0.034977,1929.754706,9523.219344,22.845851,-332.378279,150,45476,20


In [42]:
# %%
def build_case_studies(m: dict) -> pd.DataFrame:
    comm_spatial = m["comm_spatial"].copy()
    if len(comm_spatial) == 0:
        return pd.DataFrame(columns=["dataset","community_id","n_nodes","spatial_median_km","comm_lat","comm_lon"])

    comm_spatial["dataset"] = m["dataset"]

    # top 2 by size
    top_size = comm_spatial.sort_values("n_nodes", ascending=False).head(2)

    # top 2 by spatial compactness (exclude too small)
    min_size = max(10, int(np.nanmedian(comm_spatial["n_nodes"].values)))
    cand = comm_spatial[comm_spatial["n_nodes"] >= min_size].copy()
    if len(cand) == 0:
        cand = comm_spatial.copy()

    top_spatial = cand.sort_values("spatial_median_km", ascending=True).head(2)

    picked = pd.concat([top_size, top_spatial], ignore_index=True)
    picked = picked.drop_duplicates(subset=["community_id"]).reset_index(drop=True)

    cols = ["dataset","community_id","n_nodes","spatial_median_km","comm_lat","comm_lon"]
    return picked[cols]

table2 = pd.concat([build_case_studies(metrics_A), build_case_studies(metrics_B)], ignore_index=True)
table2


Unnamed: 0,dataset,community_id,n_nodes,spatial_median_km,comm_lat,comm_lon
0,brightkite,1,136,6088.049378,29.947662,-27.186163
1,brightkite,0,118,4710.477367,38.353088,-48.468878
2,brightkite,2879,18,80.082683,35.695438,138.845531
3,brightkite,3055,37,147.932686,35.980826,138.091714
4,lbsn2vec,10767,533,1547.397299,14.393729,109.566631
5,lbsn2vec,11,466,3644.573385,-8.538571,-40.065792
6,lbsn2vec,1684,208,181.077759,40.988958,28.405977
7,lbsn2vec,1740,212,333.133407,41.007806,28.751232


In [43]:
# %%
def build_structural_tables(metrics_A: dict, metrics_B: dict) -> tuple[pd.DataFrame, pd.DataFrame]:
    rows_A = []
    rows_B = []

    for m in [metrics_A, metrics_B]:
        ds = m["dataset"]

        # Structural-A: social graph (friendship edges_final)
        rows_A.append({
            "dataset": ds,
            "modularity": m.get("A_modularity", m.get("structuralA_modularity")),
            "conductance_median": m.get("A_conductance_median", m.get("structuralA_conductance_median")),
            "conductance_iqr": m.get("A_conductance_iqr", m.get("structuralA_conductance_iqr")),
            "density_median": m.get("A_density_median", m.get("structuralA_density_median")),
            "density_iqr": m.get("A_density_iqr", m.get("structuralA_density_iqr")),
            "labels_used": "original/augmented (depends on your compute_metrics)",
        })

        # Structural-B: embedding graph (kNN graph)
        rows_B.append({
            "dataset": ds,
            "modularity": m.get("B_modularity", m.get("structuralB_modularity")),
            "conductance_median": m.get("B_conductance_median", m.get("structuralB_conductance_median")),
            "conductance_iqr": m.get("B_conductance_iqr", m.get("structuralB_conductance_iqr")),
            "density_median": m.get("B_density_median", m.get("structuralB_density_median")),
            "density_iqr": m.get("B_density_iqr", m.get("structuralB_density_iqr")),
            "labels_used": "augmented if Leiden-on-kNN-full was used",
        })

    structural_A = pd.DataFrame(rows_A)
    structural_B = pd.DataFrame(rows_B)

    # helper: highlight “đậm social hơn” = higher modularity + lower conductance
    def _winner(df: pd.DataFrame) -> pd.DataFrame:
        df = df.copy()
        # convert to numeric safely
        for c in ["modularity","conductance_median","density_median","conductance_iqr","density_iqr"]:
            df[c] = pd.to_numeric(df[c], errors="coerce")

        # score: z(modularity) - z(conductance_median)
        z_mod = (df["modularity"] - df["modularity"].mean()) / (df["modularity"].std() + 1e-12)
        z_con = (df["conductance_median"] - df["conductance_median"].mean()) / (df["conductance_median"].std() + 1e-12)
        df["social_cohesion_score"] = z_mod - z_con

        best = df.loc[df["social_cohesion_score"].idxmax(), "dataset"]
        df["winner_by_score"] = best
        return df.sort_values("social_cohesion_score", ascending=False).reset_index(drop=True)

    return _winner(structural_A), _winner(structural_B)

structural_A, structural_B = build_structural_tables(metrics_A, metrics_B)

print("Structural-A (friendship graph) — social ties cohesion")
display(structural_A)

print("\nStructural-B (kNN/embedding graph) — behavioral similarity cohesion")
display(structural_B)


Structural-A (friendship graph) — social ties cohesion


Unnamed: 0,dataset,modularity,conductance_median,conductance_iqr,density_median,density_iqr,labels_used,social_cohesion_score,winner_by_score
0,lbsn2vec,0.012983,0.995381,0.011864,0.000699,0.001932,original/augmented (depends on your compute_me...,1.414214,lbsn2vec
1,brightkite,0.003149,1.0,0.0,0.0,0.0,original/augmented (depends on your compute_me...,-1.414214,lbsn2vec



Structural-B (kNN/embedding graph) — behavioral similarity cohesion


Unnamed: 0,dataset,modularity,conductance_median,conductance_iqr,density_median,density_iqr,labels_used,social_cohesion_score,winner_by_score
0,lbsn2vec,0.270016,0.800289,0.115058,0.036635,0.034977,augmented if Leiden-on-kNN-full was used,1.414214,lbsn2vec
1,brightkite,0.142777,0.92,0.153846,0.5,0.733333,augmented if Leiden-on-kNN-full was used,-1.414214,lbsn2vec


In [45]:
# %%
import pandas as pd
import numpy as np

def _winner_higher(a, b):
    if pd.isna(a) or pd.isna(b): 
        return None
    return "brightkite" if a > b else ("lbsn2vec" if b > a else "tie")

def _winner_lower(a, b):
    if pd.isna(a) or pd.isna(b): 
        return None
    return "brightkite" if a < b else ("lbsn2vec" if b < a else "tie")

def build_structural_compare_table(metrics_A: dict, metrics_B: dict) -> pd.DataFrame:
    rows = []

    # Structural-A (friendship graph)
    A_mod_A = metrics_A.get("A_modularity")
    A_mod_B = metrics_B.get("A_modularity")
    A_cond_A = metrics_A.get("A_conductance_median")
    A_cond_B = metrics_B.get("A_conductance_median")

    rows.append({
        "structural_view": "Structural-A (friendship graph)",
        "brightkite_modularity": A_mod_A,
        "lbsn2vec_modularity": A_mod_B,
        "winner_modularity(higher=better)": _winner_higher(A_mod_A, A_mod_B),
        "brightkite_conductance_median": A_cond_A,
        "lbsn2vec_conductance_median": A_cond_B,
        "winner_conductance(lower=better)": _winner_lower(A_cond_A, A_cond_B),
    })

    # Structural-B (mutual-kNN embedding graph)
    B_mod_A = metrics_A.get("B_modularity")
    B_mod_B = metrics_B.get("B_modularity")
    B_cond_A = metrics_A.get("B_conductance_median")
    B_cond_B = metrics_B.get("B_conductance_median")

    rows.append({
        "structural_view": "Structural-B (mutual-kNN embedding graph)",
        "brightkite_modularity": B_mod_A,
        "lbsn2vec_modularity": B_mod_B,
        "winner_modularity(higher=better)": _winner_higher(B_mod_A, B_mod_B),
        "brightkite_conductance_median": B_cond_A,
        "lbsn2vec_conductance_median": B_cond_B,
        "winner_conductance(lower=better)": _winner_lower(B_cond_A, B_cond_B),
    })

    return pd.DataFrame(rows)

# --- build now (requires metrics_A, metrics_B already computed) ---
struct_table = build_structural_compare_table(metrics_A, metrics_B)
display(struct_table)

# --- save ---
(struct_table
 .to_csv(STEP8_OUT / "step8_structural_compare.csv", index=False))
(struct_table
 .to_parquet(STEP8_OUT / "step8_structural_compare.parquet", index=False))
print("Saved:", STEP8_OUT / "step8_structural_compare.csv")


Unnamed: 0,structural_view,brightkite_modularity,lbsn2vec_modularity,winner_modularity(higher=better),brightkite_conductance_median,lbsn2vec_conductance_median,winner_conductance(lower=better)
0,Structural-A (friendship graph),,,,,,
1,Structural-B (mutual-kNN embedding graph),,,,,,


Saved: /mnt/d/community-detection/data/processed/_runs/20251218_013727/step8_compare/step8_structural_compare.csv
