In [3]:
from __future__ import annotations

import json
import math
import re
from pathlib import Path
from typing import Any, Dict, Optional, Tuple, List

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

pd.set_option("display.max_columns", 200)
pd.set_option("display.width", 160)

def find_repo_root(start: Path | None = None) -> Path:
    """
    Tìm PROJECT_ROOT bằng cách scan lên trên cho tới khi thấy:
    - pyproject.toml hoặc
    - README.md hoặc
    - configs/default.yaml hoặc
    - src/
    """
    cur = (start or Path.cwd()).resolve()
    markers = [
        "pyproject.toml",
        "README.md",
        "configs/default.yaml",
        "src",
        ".git",
    ]
    for _ in range(20):
        for m in markers:
            if (cur / m).exists():
                return cur
        if cur.parent == cur:
            break
        cur = cur.parent
    # fallback: assume notebook under PROJECT_ROOT/notebooks
    return (Path.cwd().resolve().parent)

PROJECT_ROOT = find_repo_root()
print("PROJECT_ROOT =", PROJECT_ROOT)


PROJECT_ROOT = /mnt/d/community-detection


In [4]:
RUN_BRIGHTKITE = PROJECT_ROOT / "data/processed/_runs/20251214_192049"
RUN_LBSN2VEC   = PROJECT_ROOT / "data/processed/_runs/20251214_183903"

assert RUN_BRIGHTKITE.exists(), f"Không thấy run: {RUN_BRIGHTKITE}"
assert RUN_LBSN2VEC.exists(), f"Không thấy run: {RUN_LBSN2VEC}"

print("Brightkite run:", RUN_BRIGHTKITE)
print("LBSN2Vec++ run:", RUN_LBSN2VEC)


Brightkite run: /mnt/d/community-detection/data/processed/_runs/20251214_192049
LBSN2Vec++ run: /mnt/d/community-detection/data/processed/_runs/20251214_183903


In [5]:
def list_run_files(run_dir: Path, limit: int = 120) -> pd.DataFrame:
    rows = []
    for p in sorted([x for x in run_dir.rglob("*") if x.is_file()]):
        rows.append({
            "path": str(p.relative_to(run_dir)),
            "ext": p.suffix.lower(),
            "size_kb": round(p.stat().st_size / 1024, 2)
        })
    df = pd.DataFrame(rows).sort_values(["ext", "size_kb"], ascending=[True, False])
    return df.head(limit)

print("=== Brightkite run files (top) ===")
display(list_run_files(RUN_BRIGHTKITE))

print("=== LBSN2Vec++ run files (top) ===")
display(list_run_files(RUN_LBSN2VEC))


=== Brightkite run files (top) ===


Unnamed: 0,path,ext,size_kb
11,run_config.json,.json,1.34
9,brightkite/metrics_global.json,.json,0.76
1,brightkite/Z.npy,.npy,4292.34
0,brightkite/X_users.npy,.npy,1440.87
2,brightkite/checkins_clean.parquet,.parquet,70077.75
3,brightkite/checkins_final.parquet,.parquet,48604.95
8,brightkite/feat_df.parquet,.parquet,1892.42
6,brightkite/edges_clean.parquet,.parquet,477.91
7,brightkite/edges_final.parquet,.parquet,202.74
4,brightkite/comm_df.parquet,.parquet,56.64


=== LBSN2Vec++ run files (top) ===


Unnamed: 0,path,ext,size_kb
11,run_config.json,.json,1.34
9,lbsn2vec/metrics_global.json,.json,0.93
1,lbsn2vec/Z.npy,.npy,5934.79
0,lbsn2vec/X_users.npy,.npy,1992.68
2,lbsn2vec/checkins_clean.parquet,.parquet,494019.84
3,lbsn2vec/checkins_final.parquet,.parquet,115684.54
8,lbsn2vec/feat_df.parquet,.parquet,2663.53
6,lbsn2vec/edges_clean.parquet,.parquet,647.93
7,lbsn2vec/edges_final.parquet,.parquet,282.33
4,lbsn2vec/comm_df.parquet,.parquet,90.9


In [6]:
def read_json(path: Path) -> Dict[str, Any]:
    with path.open("r", encoding="utf-8") as f:
        return json.load(f)

def read_df(path: Path) -> pd.DataFrame:
    suf = path.suffix.lower()
    if suf == ".parquet":
        return pd.read_parquet(path)
    if suf == ".csv":
        return pd.read_csv(path)
    if suf in (".tsv", ".txt"):
        return pd.read_csv(path, sep="\t")
    raise ValueError(f"Unsupported file: {path}")


In [7]:
EXPECTED = {
    "run_config": ["run_config.json"],
    "comm_stats": ["comm_stats.json", "community_stats.json"],
    "metrics_global": ["metrics_global.json", "global_metrics.json"],
    "comm_metrics_df": ["comm_metrics_df.parquet", "community_metrics.parquet", "comm_metrics.parquet", "comm_metrics_df.csv", "community_metrics.csv"],
    "comm_df": ["comm_df.parquet", "community_labels.parquet", "labels.parquet", "comm_df.csv", "community_labels.csv", "labels.csv"],
    # optional for semantic:
    "checkins": ["checkins.parquet", "checkins_final.parquet", "checkins.csv", "checkins_final.csv"],
    "pois": ["pois.parquet", "poi.parquet", "venues.parquet", "pois.csv", "poi.csv", "venues.csv"],
    "feat_df": ["feat_df.parquet", "features.parquet", "user_features.parquet", "feat_df.csv", "features.csv", "user_features.csv"],
}

def find_first_existing(run_dir: Path, names: List[str]) -> Optional[Path]:
    for n in names:
        p = run_dir / n
        if p.exists():
            return p
    return None

def find_by_fuzzy(run_dir: Path, kind: str) -> Optional[Path]:
    """
    Fallback: tìm file có chứa keyword trong tên.
    """
    files = [p for p in run_dir.rglob("*") if p.is_file()]
    kind_l = kind.lower()

    # heuristic patterns
    patterns = {
        "comm_stats": [r"comm.*stats", r"community.*stats", r"cluster.*stats"],
        "metrics_global": [r"metrics.*global", r"global.*metrics", r"summary.*metrics"],
        "comm_metrics_df": [r"comm.*metrics", r"community.*metrics", r"cohesion", r"per.*community"],
        "comm_df": [r"comm.*df", r"community.*labels", r"labels", r"community_id"],
        "checkins": [r"checkins", r"checkin"],
        "pois": [r"\bpoi\b", r"pois", r"venues", r"venue"],
        "feat_df": [r"feat", r"features", r"x_users", r"user_features"],
    }

    pats = patterns.get(kind_l, [kind_l])

    # preference: json for json kinds, parquet then csv for df kinds
    if kind_l in ("comm_stats", "metrics_global"):
        cands = [p for p in files if p.suffix.lower() == ".json"]
    else:
        cands = [p for p in files if p.suffix.lower() in (".parquet", ".csv", ".tsv", ".txt")]

    scored: List[Tuple[int, Path]] = []
    for p in cands:
        name = p.name.lower()
        score = 1000
        if p.suffix.lower() == ".parquet":
            score -= 50
        if p.suffix.lower() == ".json":
            score -= 10

        for i, pat in enumerate(pats):
            if re.search(pat, name):
                score -= (300 - i)

        # ưu tiên file nằm gần root run_dir
        depth = len(p.relative_to(run_dir).parts)
        score += depth * 3
        scored.append((score, p))

    scored.sort(key=lambda x: x[0])
    return scored[0][1] if scored else None

def resolve_artifact(run_dir: Path, kind: str, required: bool = True) -> Optional[Path]:
    # 1) exact expected
    p = find_first_existing(run_dir, EXPECTED.get(kind, []))
    if p:
        return p
    # 2) fuzzy fallback
    p = find_by_fuzzy(run_dir, kind)
    if p:
        print(f"[WARN] {kind}: không thấy tên chuẩn, dùng fallback -> {p.relative_to(run_dir)}")
        return p

    if required:
        print(f"[ERROR] Missing required artifact '{kind}' in {run_dir}")
        print("Files available:")
        display(list_run_files(run_dir, limit=80))
    return None


In [8]:
def load_run(run_dir: Path, label: str) -> Dict[str, Any]:
    out: Dict[str, Any] = {"label": label, "run_dir": run_dir}

    # required
    p_cfg = resolve_artifact(run_dir, "run_config", required=True)
    p_cs  = resolve_artifact(run_dir, "comm_stats", required=False)       # đôi khi pipeline không lưu
    p_mg  = resolve_artifact(run_dir, "metrics_global", required=False)   # đôi khi pipeline gộp vào cs
    p_cm  = resolve_artifact(run_dir, "comm_metrics_df", required=True)

    out["run_config_path"] = p_cfg
    out["comm_stats_path"] = p_cs
    out["metrics_global_path"] = p_mg
    out["comm_metrics_df_path"] = p_cm

    out["run_config"] = read_json(p_cfg) if p_cfg else None
    out["comm_stats"] = read_json(p_cs) if p_cs else None
    out["metrics_global"] = read_json(p_mg) if p_mg else None
    out["comm_metrics_df"] = read_df(p_cm) if p_cm else None

    # optional
    p_comm = resolve_artifact(run_dir, "comm_df", required=False)
    p_feat = resolve_artifact(run_dir, "feat_df", required=False)
    p_chk  = resolve_artifact(run_dir, "checkins", required=False)
    p_poi  = resolve_artifact(run_dir, "pois", required=False)

    out["comm_df_path"] = p_comm
    out["feat_df_path"] = p_feat
    out["checkins_path"] = p_chk
    out["pois_path"] = p_poi

    out["comm_df"] = read_df(p_comm) if p_comm else None
    out["feat_df"] = read_df(p_feat) if p_feat else None
    out["checkins"] = read_df(p_chk) if p_chk else None
    out["pois"] = read_df(p_poi) if p_poi else None

    return out

bk = load_run(RUN_BRIGHTKITE, "brightkite")
lb = load_run(RUN_LBSN2VEC, "lbsn2vec")

def show_loaded(bundle: Dict[str, Any]) -> pd.DataFrame:
    rows = []
    for k in ["run_config_path","comm_stats_path","metrics_global_path","comm_metrics_df_path","comm_df_path","feat_df_path","checkins_path","pois_path"]:
        p = bundle.get(k)
        rows.append({"artifact": k, "path": str(p) if p else None})
    return pd.DataFrame(rows)

print("=== Loaded artifacts: Brightkite ===")
display(show_loaded(bk))
print("=== Loaded artifacts: LBSN2Vec++ ===")
display(show_loaded(lb))


[WARN] comm_stats: không thấy tên chuẩn, dùng fallback -> run_config.json
[WARN] metrics_global: không thấy tên chuẩn, dùng fallback -> brightkite/metrics_global.json
[WARN] comm_metrics_df: không thấy tên chuẩn, dùng fallback -> brightkite/comm_metrics.parquet


OSError: Could not open Parquet input source '<Buffer>': Couldn't deserialize thrift: invalid TType


In [None]:
def deep_get(d: Dict[str, Any], path: str) -> Any:
    cur = d
    for part in path.split("."):
        if not isinstance(cur, dict) or part not in cur:
            return None
        cur = cur[part]
    return cur

CONFIG_KEYS = {
    "dataset": ["dataset", "cfg.dataset"],
    "sample_frac": ["sample_frac", "run.sample_frac", "cfg.run.sample_frac"],
    "train_edge_frac": ["train_edge_frac", "run.train_edge_frac", "cfg.run.train_edge_frac"],
    "epochs": ["epochs", "train.epochs", "cfg.train.epochs"],
    "batch_size": ["batch_size", "train.batch_size", "cfg.train.batch_size"],
    "neg": ["neg", "train.neg", "cfg.train.neg"],
    "neighbor_sampling": ["neighbor_sampling", "train.neighbor_sampling", "cfg.train.neighbor_sampling"],
    "knn_k": ["knn_k", "knn.k", "cfg.community.knn_k", "cfg.knn.k"],
    "mutual_knn": ["mutual", "knn.mutual", "cfg.community.mutual_knn", "cfg.knn.mutual"],
    "leiden_resolution": ["resolution", "leiden.resolution", "cfg.community.resolution", "cfg.leiden.resolution"],
    "seed": ["seed", "run.seed", "cfg.run.seed"],
}

def extract_cfg(cfg: Optional[Dict[str, Any]]) -> Dict[str, Any]:
    if cfg is None:
        return {}
    out = {}
    for k, paths in CONFIG_KEYS.items():
        v = None
        for p in paths:
            vv = deep_get(cfg, p)
            if vv is not None:
                v = vv
                break
        out[k] = v
    return out

cfg_compare = pd.DataFrame(
    [extract_cfg(bk["run_config"]), extract_cfg(lb["run_config"])],
    index=["brightkite", "lbsn2vec"]
)
display(cfg_compare)


In [None]:
def guess_col(df: pd.DataFrame, candidates: List[str]) -> Optional[str]:
    cols_lower = {c.lower(): c for c in df.columns}
    for c in candidates:
        if c.lower() in cols_lower:
            return cols_lower[c.lower()]
    # contains heuristic
    for c in df.columns:
        lc = c.lower()
        for cand in candidates:
            if cand.lower() in lc:
                return c
    return None

def stability_metrics(comm_metrics_df: pd.DataFrame) -> Dict[str, Any]:
    size_col = guess_col(comm_metrics_df, ["size", "community_size", "n_users", "count", "n"])
    if size_col is None:
        raise ValueError(f"Không tìm thấy cột size trong comm_metrics_df. Columns={list(comm_metrics_df.columns)[:40]}")
    sizes = comm_metrics_df[size_col].dropna().astype(float).values
    n_comm = len(sizes)
    n_users = float(np.sum(sizes))
    largest = float(np.max(sizes)) if n_comm else np.nan
    median = float(np.median(sizes)) if n_comm else np.nan
    return {
        "n_communities": int(n_comm),
        "n_users_est": int(n_users),
        "largest_comm": largest,
        "median_comm": median,
        "largest_ratio": (largest / n_users) if n_users > 0 else np.nan,
        "largest_over_median": (largest / median) if median > 0 else np.nan,
        "pct_comm_lt_5": float(np.mean(sizes < 5) * 100.0),
        "pct_comm_lt_10": float(np.mean(sizes < 10) * 100.0),
        "size_col": size_col,
    }

stab_bk = stability_metrics(bk["comm_metrics_df"])
stab_lb = stability_metrics(lb["comm_metrics_df"])

stab_tbl = pd.DataFrame([stab_bk, stab_lb], index=["brightkite", "lbsn2vec"])
display(stab_tbl)


In [None]:
def plot_sizes(label: str, comm_metrics_df: pd.DataFrame):
    size_col = guess_col(comm_metrics_df, ["size", "community_size", "n_users", "count", "n"])
    sizes = comm_metrics_df[size_col].dropna().astype(int).values

    plt.figure()
    plt.hist(sizes, bins=60)
    plt.title(f"{label} — community size histogram")
    plt.xlabel("community size")
    plt.ylabel("count communities")
    plt.show()

plot_sizes("brightkite", bk["comm_metrics_df"])
plot_sizes("lbsn2vec", lb["comm_metrics_df"])


In [None]:
def flatten(d: Optional[Dict[str, Any]], prefix: str = "") -> Dict[str, Any]:
    if d is None:
        return {}
    out = {}
    for k, v in d.items():
        kk = f"{prefix}{k}"
        if isinstance(v, dict):
            out.update(flatten(v, prefix=f"{kk}."))
        else:
            out[kk] = v
    return out

def pick_first_numeric(flat: Dict[str, Any], keywords: List[str]) -> Tuple[Optional[float], Optional[str]]:
    for k, v in flat.items():
        lk = k.lower()
        if all(kw in lk for kw in keywords):
            try:
                return float(v), k
            except Exception:
                continue
    return None, None

def spatial_summary(bundle: Dict[str, Any]) -> Dict[str, Any]:
    df = bundle["comm_metrics_df"]
    # find spatial median column
    spatial_med_col = guess_col(df, ["spatial_median_km", "spatial_median", "median_km", "spatial_median_distance_km"])
    if spatial_med_col is None:
        # heuristic: any col containing 'spatial' and 'median'
        for c in df.columns:
            lc = c.lower()
            if "spatial" in lc and "median" in lc:
                spatial_med_col = c
                break

    out = {"spatial_median_col": spatial_med_col}
    if spatial_med_col is not None:
        vals = df[spatial_med_col].dropna().astype(float).values
        out["spatial_median_km__median_over_comm"] = float(np.median(vals)) if len(vals) else None
        out["spatial_median_km__mean_over_comm"] = float(np.mean(vals)) if len(vals) else None

    flat = {
        **flatten(bundle.get("metrics_global"), prefix="mg."),
        **flatten(bundle.get("comm_stats"), prefix="cs."),
    }

    z, zkey = pick_first_numeric(flat, ["z", "score"])
    if z is None:
        # try more specific
        z, zkey = pick_first_numeric(flat, ["spatial", "z"])
    out["z_score"] = z
    out["z_score_key"] = zkey

    # observed vs random (if available)
    obs, obskey = pick_first_numeric(flat, ["spatial", "median"])
    rnd_mean, rnd_mean_key = pick_first_numeric(flat, ["random", "mean"])
    rnd_std, rnd_std_key = pick_first_numeric(flat, ["random", "std"])
    out.update({
        "observed_key": obskey,
        "observed_val": obs,
        "random_mean_key": rnd_mean_key,
        "random_mean_val": rnd_mean,
        "random_std_key": rnd_std_key,
        "random_std_val": rnd_std,
    })
    return out

sp_bk = spatial_summary(bk)
sp_lb = spatial_summary(lb)

sp_tbl = pd.DataFrame([sp_bk, sp_lb], index=["brightkite", "lbsn2vec"])
display(sp_tbl)


In [None]:
def plot_spatial(label: str, comm_metrics_df: pd.DataFrame):
    col = guess_col(comm_metrics_df, ["spatial_median_km", "spatial_median", "median_km", "spatial_median_distance_km"])
    if col is None:
        for c in comm_metrics_df.columns:
            lc = c.lower()
            if "spatial" in lc and "median" in lc:
                col = c
                break
    if col is None:
        print(f"[{label}] Không thấy spatial median column trong comm_metrics_df.")
        print("Columns:", list(comm_metrics_df.columns)[:50])
        return

    vals = comm_metrics_df[col].dropna().astype(float).values
    plt.figure()
    plt.hist(vals, bins=60)
    plt.title(f"{label} — intra-community spatial median (km)")
    plt.xlabel("km")
    plt.ylabel("count communities")
    plt.show()

plot_spatial("brightkite", bk["comm_metrics_df"])
plot_spatial("lbsn2vec", lb["comm_metrics_df"])


In [None]:
def structural_summary(bundle: Dict[str, Any]) -> Dict[str, Any]:
    flat = {
        **flatten(bundle.get("metrics_global"), prefix="mg."),
        **flatten(bundle.get("comm_stats"), prefix="cs."),
    }
    modularity, mod_key = pick_first_numeric(flat, ["modular"])
    conductance, cond_key = pick_first_numeric(flat, ["conduct"])
    density, dens_key = pick_first_numeric(flat, ["density"])
    return {
        "modularity": modularity, "modularity_key": mod_key,
        "conductance": conductance, "conductance_key": cond_key,
        "density": density, "density_key": dens_key,
    }

st_bk = structural_summary(bk)
st_lb = structural_summary(lb)

struct_tbl = pd.DataFrame([st_bk, st_lb], index=["brightkite", "lbsn2vec"])
display(struct_tbl)


In [None]:
def semantic_specialization(lb_bundle: Dict[str, Any], topk: int = 5) -> Optional[pd.DataFrame]:
    checkins = lb_bundle.get("checkins")
    pois = lb_bundle.get("pois")
    comm_df = lb_bundle.get("comm_df")
    if checkins is None or pois is None or comm_df is None:
        print("[lbsn2vec] Thiếu checkins/pois/comm_df nên không làm semantic specialization được.")
        return None

    u = guess_col(checkins, ["user_id", "user"])
    v = guess_col(checkins, ["venue_id", "venue", "poi_id", "place_id"])
    cu = guess_col(comm_df, ["user_id", "user"])
    cc = guess_col(comm_df, ["community_id", "community", "cluster", "label"])
    pv = guess_col(pois, ["venue_id", "venue", "poi_id", "place_id"])
    cat = guess_col(pois, ["category", "cat", "type", "venue_category"])

    if None in (u, v, cu, cc, pv, cat):
        print("[lbsn2vec] Không tìm đủ cột cần thiết.")
        print("checkins cols:", list(checkins.columns)[:30])
        print("comm_df cols:", list(comm_df.columns)[:30])
        print("pois cols:", list(pois.columns)[:30])
        return None

    tmp = checkins[[u, v]].merge(comm_df[[cu, cc]], left_on=u, right_on=cu, how="inner")
    tmp = tmp.merge(pois[[pv, cat]], left_on=v, right_on=pv, how="left")
    tmp = tmp.dropna(subset=[cat])

    grp = tmp.groupby([cc, cat]).size().rename("n").reset_index()
    total = grp.groupby(cc)["n"].sum().rename("N").reset_index()
    grp = grp.merge(total, on=cc, how="left")
    grp["p"] = grp["n"] / grp["N"]

    # entropy per community
    ent = grp.groupby(cc).apply(lambda g: float(-(g["p"] * np.log(g["p"] + 1e-12)).sum())).rename("category_entropy").reset_index()

    # top-k categories per community
    top = grp.sort_values([cc, "p"], ascending=[True, False]).groupby(cc).head(topk)
    top = top.rename(columns={cc: "community_id", cat: "category"})

    ent = ent.rename(columns={cc: "community_id"})
    out = top.merge(ent, on="community_id", how="left")
    return out.sort_values(["category_entropy", "community_id", "p"], ascending=[True, True, False])

sem = semantic_specialization(lb, topk=5)
if sem is not None:
    display(sem.head(40))


In [None]:
final = pd.DataFrame([
    {
        "dataset": "brightkite",
        **{f"cfg.{k}": v for k, v in extract_cfg(bk["run_config"]).items()},
        **{f"stab.{k}": v for k, v in stab_bk.items() if k != "size_col"},
        **{f"spatial.{k}": v for k, v in sp_bk.items()},
        **{f"struct.{k}": v for k, v in st_bk.items()},
    },
    {
        "dataset": "lbsn2vec",
        **{f"cfg.{k}": v for k, v in extract_cfg(lb["run_config"]).items()},
        **{f"stab.{k}": v for k, v in stab_lb.items() if k != "size_col"},
        **{f"spatial.{k}": v for k, v in sp_lb.items()},
        **{f"struct.{k}": v for k, v in st_lb.items()},
    },
]).set_index("dataset")

display(final)

print("\nCách đọc nhanh (Step 8):")
print("- Stability: largest_ratio (≈ largest_comm / total_users) càng thấp càng ổn; pct_comm_lt_10 quá cao => nhiều cụm nhỏ lẻ.")
print("- Structural: modularity cao + conductance thấp => 'đậm social' hơn.")
print("- Spatial: spatial_median_km thấp + z_score cao => 'đậm địa lý' hơn (tốt hơn random baseline).")
print("- Semantic (LBSN2Vec++): category_entropy thấp => community chuyên môn hoá theo loại địa điểm mạnh hơn.")
