In [34]:
from pathlib import Path
import struct

# Inspect the 8-byte trailer of the file: the code decodes it as a 4-byte
# little-endian unsigned footer length followed by a 4-byte magic marker.
p = Path("/mnt/d/community-detection/data/processed/_runs/20251214_183903/lbsn2vec/edges_final.parquet")

with p.open("rb") as f:
    f.seek(-8, 2)  # whence=2: the offset is relative to the end of the file
    tail8 = f.read(8)

# One unpack call splits the trailer into both fields ("<I" + 4 raw bytes).
footer_len, magic = struct.unpack("<I4s", tail8)
print("tail8:", tail8, "footer_len:", footer_len, "magic:", magic)
# The footer occupies the footer_len bytes immediately before the trailer.
print("footer_start_offset:", p.stat().st_size - 8 - footer_len)


tail8: b'\xaa\x05\x00\x00PAR1' footer_len: 1450 magic: b'PAR1'
footer_start_offset: 287650


In [35]:
import pyarrow.parquet as pq
from pathlib import Path

# Open the file as a ParquetFile and report the row / row-group counts taken
# from its metadata; any exception is reported instead of propagating.
p = Path("/mnt/d/community-detection/data/processed/_runs/20251214_165847/lbsn2vec/edges_final.parquet")
try:
    pf = pq.ParquetFile(p)
    summary = ("OK", pf.metadata.num_rows, pf.metadata.num_row_groups)
    print(*summary)
except Exception as exc:
    print("FAIL:", type(exc).__name__, exc)


OK 54084 1


In [36]:
import pyarrow.parquet as pq
from pathlib import Path

# Read the whole table and convert it to pandas; report the shape, the column
# names and the first rows, or the exception type/message if the read fails.
p = Path("/mnt/d/community-detection/data/processed/_runs/20251214_165847/lbsn2vec/edges_final.parquet")

try:
    t = pq.read_table(p)
    df = t.to_pandas()
    print("READ FULL OK:", df.shape, "cols:", df.columns.tolist())
    print(df.head(3))
except Exception as exc:
    print("READ FULL FAIL:", type(exc).__name__, exc)


READ FULL FAIL: OSError Corrupt snappy compressed data.


In [37]:
# Echo the path under test. NOTE: `p` is not assigned in this cell, so this
# only works after a cell that defines `p` has been executed in the kernel.
print("TRY READ:", p)


TRY READ: /mnt/d/community-detection/data/processed/_runs/20251214_165847/lbsn2vec/edges_final.parquet


In [1]:
import argparse
import json
from pathlib import Path

import numpy as np
import pandas as pd


def _ok(msg):
    """Print ``msg`` to stdout behind the ``[OK]`` tag (padded to six columns)."""
    print("[OK]  {}".format(msg))


def _warn(msg):
    """Print ``msg`` to stdout directly after the ``[WARN]`` tag."""
    print("[WARN]{}".format(msg))


def _fail(msg):
    """Print ``msg`` to stdout directly after the ``[FAIL]`` tag."""
    print("[FAIL]{}".format(msg))


def read_manifest(manifest_path: Path):
    """Load the JSON manifest at ``manifest_path``.

    Args:
        manifest_path: Location of the manifest file (read as UTF-8).

    Returns:
        The decoded JSON object (a ``dict``), or ``None`` when the file is
        missing, unreadable, not valid JSON, or its top level is not an
        object.

    A missing file is the normal "no manifest" case and stays silent. Every
    other failure prints a ``[WARN]`` line so that a corrupt manifest is not
    mistaken for an absent one. This function never raises for those cases.
    """
    try:
        # Read directly instead of exists()+read: avoids a race between the
        # existence check and the actual read.
        raw = manifest_path.read_text(encoding="utf-8")
    except FileNotFoundError:
        return None
    except (OSError, ValueError) as exc:
        # OSError: permissions, path is a directory, ...
        # ValueError: UnicodeDecodeError from a non-UTF-8 file.
        print(f"[WARN]manifest unreadable: {manifest_path} ({type(exc).__name__}: {exc})")
        return None

    try:
        manifest = json.loads(raw)
    except ValueError as exc:  # json.JSONDecodeError is a ValueError subclass
        print(f"[WARN]manifest is not valid JSON: {manifest_path} ({exc})")
        return None

    if not isinstance(manifest, dict):
        # Callers index the manifest by dataset name, so anything but an
        # object at the top level is unusable.
        print(
            f"[WARN]manifest top level is {type(manifest).__name__}, expected object: {manifest_path}"
        )
        return None
    return manifest


def file_stat(p: Path):
    """Report whether ``p`` exists and is non-empty.

    Returns:
        ``(False, "missing")`` if the path does not exist,
        ``(False, "empty")`` if its size is zero, otherwise
        ``(True, "<size> MB")`` where the size is bytes / 1024 / 1024
        formatted with two decimals.
    """
    if p.exists():
        size_bytes = p.stat().st_size
        if size_bytes > 0:
            return True, f"{size_bytes/1024/1024:.2f} MB"
        return False, "empty"
    return False, "missing"


def check_parquet(p: Path, required_cols=None, head=5):
    """Load a Parquet file with pandas and run basic sanity checks.

    Args:
        p: Path of the Parquet file.
        required_cols: Optional iterable of column names that must be present.
        head: Number of leading rows returned on success.

    Returns:
        A 3-tuple ``(ok, message, frame)``:

        * read error        -> ``(False, "read_parquet error: ...", None)``
        * column(s) absent  -> ``(False, "missing cols: [...]", full frame)``
        * zero rows         -> ``(False, "dataframe is empty", full frame)``
        * otherwise         -> ``(True, summary, first ``head`` rows)``

        Note the third element is the full frame on the two validation
        failures but only the leading rows on success.
    """
    try:
        frame = pd.read_parquet(p)
    except Exception as exc:
        return False, f"read_parquet error: {exc}", None

    absent = [col for col in (required_cols or ()) if col not in frame.columns]
    if absent:
        return False, f"missing cols: {absent}", frame

    n_rows = len(frame)
    if n_rows == 0:
        return False, "dataframe is empty", frame

    # Quick sanity figure: the three columns with the highest share of nulls.
    null_share = frame.isna().mean(numeric_only=False)
    worst_three = null_share.sort_values(ascending=False).head(3).to_dict()
    summary = f"rows={n_rows:,} cols={len(frame.columns)} null_top3={worst_three}"
    return True, summary, frame.head(head)


def check_npy(p: Path):
    """Open a ``.npy`` file memory-mapped and sanity-check that it holds a 2-D array.

    Args:
        p: Path of the array file.

    Returns:
        A 3-tuple ``(ok, message, None)``; the third element is always
        ``None``. ``ok`` is ``False`` when loading fails, when the file does
        not contain a plain array, when the array is not 2-D, or when either
        dimension is not positive. On success the message carries the shape
        and the share of finite values among the first (up to) 2000 rows.
    """
    try:
        Z = np.load(p, mmap_mode="r")
    except Exception as e:
        return False, f"np.load error: {e}", None

    if not isinstance(Z, np.ndarray):
        # np.load hands back a non-array container (e.g. NpzFile for a zip
        # archive) that has no `.shape`; report it instead of crashing below.
        kind = type(Z).__name__
        closer = getattr(Z, "close", None)
        if callable(closer):
            closer()  # release the file handle held by the container
        return False, f"unexpected content: {kind} (expected a 2-D array)", None

    if Z.ndim != 2:
        return False, f"unexpected shape: {Z.shape}", None

    n, d = Z.shape
    if n <= 0 or d <= 0:
        return False, f"invalid shape: {Z.shape}", None

    # Quick numeric sanity on a leading slice only, so large files stay cheap.
    try:
        sample = np.asarray(Z[: min(2000, n)])
        finite_ratio = np.isfinite(sample).mean()
    except Exception:
        # Best effort: dtypes np.isfinite cannot handle just yield None here.
        finite_ratio = None

    return True, f"shape={Z.shape} finite_ratio(sample)={finite_ratio}", None


def main(argv=None):
    """Pre-flight check of the files the visualization step reads.

    Resolves the dataset directory and the run directory, then verifies the
    required base files, the required run artifacts (``comm_df`` and
    ``Z.npy``), the row alignment between them, and finally the optional
    files. A report is printed to stdout.

    Args:
        argv: Optional list of command-line tokens, e.g.
            ``["--dataset", "lbsn2vec"]``. ``None`` (the default) lets
            argparse read ``sys.argv[1:]``, which is the original behavior.
            Passing a list makes the function callable from a notebook, where
            ``sys.argv`` belongs to the kernel.

    Returns:
        ``0`` when every required check passed, ``2`` otherwise (including a
        missing run id, and the first failure when ``--strict`` is set).

    Raises:
        SystemExit: Raised by argparse itself on invalid or missing arguments.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--project-root", type=str, default=".")
    ap.add_argument("--dataset", type=str, choices=["brightkite", "lbsn2vec"], required=True)
    ap.add_argument("--run-id", type=str, default=None)
    ap.add_argument("--strict", action="store_true", help="fail fast if any required file fails")
    args = ap.parse_args(argv)

    root = Path(args.project_root).resolve()
    processed = root / "data" / "processed"
    data_cleared = processed / "data_cleared" / args.dataset
    runs_root = processed / "_runs"

    # Fall back to the manifest's run id only when none was given explicitly.
    manifest = read_manifest(processed / "viz_runs.json")
    run_id = args.run_id
    if run_id is None and isinstance(manifest, dict):
        entry = manifest.get(args.dataset)
        # Guard against a malformed entry (e.g. a bare string) instead of
        # crashing on `.get`.
        if isinstance(entry, dict):
            run_id = entry.get("run_id")

    if run_id is None:
        _fail("run_id not provided and viz_runs.json missing/does not contain dataset")
        return 2

    # str(): a manifest may carry the run id as a JSON number, which cannot
    # be joined onto a Path directly.
    run_ds = runs_root / str(run_id) / args.dataset

    print("=" * 80)
    print(f"PROJECT_ROOT : {root}")
    print(f"DATASET      : {args.dataset}")
    print(f"DATA_CLEARED : {data_cleared}")
    print(f"RUN_DIR      : {run_ds}")
    print("=" * 80)

    # ---- Required base files (data_cleared)
    base_files = {
        "edges_final": (data_cleared / "edges_final.parquet", ["u", "v"]),
        "users_final": (data_cleared / "users_final.parquet", ["user_id"]),
    }

    # ---- Optional base files
    optional_base = {
        "feat_df": (data_cleared / "feat_df.parquet", ["user_id"]),
        "checkins_clean": (data_cleared / "checkins_clean.parquet", None),
        "meta": (data_cleared / "data_cleared_meta.json", None),
    }

    # ---- Required run artifacts
    # Prefer the repaired community table when it exists.
    comm_df_rep = run_ds / "comm_df.repaired.parquet"
    comm_df_raw = run_ds / "comm_df.parquet"
    comm_df_path = comm_df_rep if comm_df_rep.exists() else comm_df_raw

    run_files = {
        "comm_df": (comm_df_path, ["user_id", "community_id"]),
        "Z": (run_ds / "Z.npy", None),
    }

    # ---- Optional run artifacts
    optional_run = {
        "comm_metrics": (run_ds / "comm_metrics.parquet", None),
        "metrics_global": (run_ds / "metrics_global.json", None),
    }

    # ---- Check existence + load
    failures = 0

    print("\n[BASE REQUIRED]")
    for k, (p, req_cols) in base_files.items():
        alive, stat = file_stat(p)
        if not alive:
            _fail(f"{k}: {p} -> {stat}")
            failures += 1
            if args.strict:
                return 2
            continue
        ok, msg, head = check_parquet(p, req_cols)
        if ok:
            _ok(f"{k}: {p.name} ({stat}) | {msg}")
            print(head)
        else:
            _fail(f"{k}: {p.name} ({stat}) | {msg}")
            failures += 1
            if args.strict:
                return 2

    print("\n[RUN REQUIRED]")
    # comm_df
    comm_df = None
    p, req_cols = run_files["comm_df"]
    alive, stat = file_stat(p)
    if not alive:
        _fail(f"comm_df: {p} -> {stat}")
        failures += 1
        if args.strict:
            return 2
    else:
        ok, msg, head = check_parquet(p, req_cols)
        if ok:
            _ok(f"comm_df: {p.name} ({stat}) | {msg}")
            print(head)
            # Only these two columns are used below; check_parquet has just
            # confirmed both exist, so read just them.
            comm_df = pd.read_parquet(p, columns=["user_id", "community_id"])
            comm_df["user_id"] = comm_df["user_id"].astype(str)
        else:
            _fail(f"comm_df: {p.name} ({stat}) | {msg}")
            failures += 1
            if args.strict:
                return 2

    # Z.npy
    Z = None
    pz, _ = run_files["Z"]
    alive, stat = file_stat(pz)
    if not alive:
        _fail(f"Z.npy: {pz} -> {stat}")
        failures += 1
        if args.strict:
            return 2
    else:
        ok, msg, _ = check_npy(pz)
        if ok:
            _ok(f"Z.npy: {pz.name} ({stat}) | {msg}")
            Z = np.load(pz, mmap_mode="r")
        else:
            _fail(f"Z.npy: {pz.name} ({stat}) | {msg}")
            failures += 1
            if args.strict:
                return 2

    # Alignment check (critical for embedding view)
    if comm_df is not None and Z is not None:
        if len(comm_df) != Z.shape[0]:
            # A mismatch is reported as a warning only; it does not count
            # toward `failures`.
            _warn(
                f"Alignment: len(comm_df)={len(comm_df):,} != Z.shape[0]={Z.shape[0]:,}. "
                "Embedding scatter có thể lệch màu. Nên lưu thêm users_in_run.parquet theo đúng thứ tự Z."
            )
        else:
            _ok(f"Alignment: len(comm_df) matches Z rows ({len(comm_df):,})")

        # community sanity
        n_comm = comm_df["community_id"].nunique()
        largest = comm_df["community_id"].value_counts().max()
        _ok(f"Community sanity: n_communities={n_comm} | largest_comm={largest}")

    print("\n[OPTIONAL FILES]")
    # Problems with optional files are warnings and never affect the result.
    for k, (p, req_cols) in {**optional_base, **optional_run}.items():
        alive, stat = file_stat(p)
        if not alive:
            _warn(f"{k}: {p} -> {stat} (optional)")
            continue

        if p.suffix == ".parquet":
            ok, msg, head = check_parquet(p, req_cols)
            if ok:
                _ok(f"{k}: {p.name} ({stat}) | {msg}")
                print(head)
            else:
                _warn(f"{k}: {p.name} ({stat}) | {msg}")
        elif p.suffix == ".json":
            try:
                obj = json.loads(p.read_text(encoding="utf-8"))
                _ok(f"{k}: {p.name} ({stat}) | json keys={list(obj)[:10]}")
            except Exception as e:
                _warn(f"{k}: {p.name} ({stat}) | json read error: {e}")
        else:
            _ok(f"{k}: {p.name} ({stat})")

    print("\n" + "=" * 80)
    if failures == 0:
        _ok("PRECHECK PASSED ✅ Bạn có thể bắt đầu visualization (Streamlit).")
        return 0
    _fail(f"PRECHECK FAILED ❌ failures={failures}")
    return 2


if __name__ == "__main__":
    # Script entry point: run the precheck and exit with its status code.
    # NOTE: this guard is also true inside a Jupyter kernel, where argparse
    # then parses the kernel's own command line; the recorded cell output
    # (usage error for --dataset, SystemExit: 2) is the result of that.
    exit_status = main()
    raise SystemExit(exit_status)


usage: ipykernel_launcher.py [-h] [--project-root PROJECT_ROOT] --dataset
                             {brightkite,lbsn2vec} [--run-id RUN_ID]
                             [--strict]
ipykernel_launcher.py: error: the following arguments are required: --dataset


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
