## Imports & Environment

In [None]:
from __future__ import annotations

import os
import sys
import json
import time
import math
import random
import logging
import platform
from dataclasses import dataclass, asdict
from pathlib import Path
from typing import Dict, Optional, Tuple, List, Literal, Any

import numpy as np
import pandas as pd


try:
    import yaml
    _HAS_YAML = True
except Exception:
    _HAS_YAML = False

try:
    import torch
    _HAS_TORCH = True
except Exception:
    torch = None
    _HAS_TORCH = False

def print_env_info() -> None:
    print("=== Environment Info ===")
    print(f"Python        : {sys.version.split()[0]}")
    print(f"Platform      : {platform.platform()}")
    print(f"NumPy         : {np.__version__}")
    print(f"Pandas        : {pd.__version__}")
    print(f"PyYAML        : {'OK' if _HAS_YAML else 'NOT INSTALLED'}")
    if _HAS_TORCH:
        print(f"PyTorch       : {torch.__version__}")
        cuda_ok = torch.cuda.is_available()
        print(f"CUDA available: {cuda_ok}")
        if cuda_ok:
            print(f"CUDA device   : {torch.cuda.get_device_name(0)}")
            print(f"CUDA version  : {torch.version.cuda}")
    else:
        print("PyTorch       : NOT INSTALLED")
    print("========================")

print_env_info()


=== Environment Info ===
Python        : 3.11.8
Platform      : Linux-6.6.87.2-microsoft-standard-WSL2-x86_64-with-glibc2.39
NumPy         : 2.3.5
Pandas        : 2.3.3
PyYAML        : OK
PyTorch       : 2.9.0+cu126
CUDA available: True
CUDA device   : NVIDIA GeForce RTX 4060 Laptop GPU
CUDA version  : 12.6


## Global Config

In [2]:

def find_repo_root(start: Optional[Path] = None) -> Path:

    env_root = os.environ.get("CD_PROJECT_ROOT")
    if env_root:
        p = Path(env_root).expanduser().resolve()
        if p.exists():
            return p
        raise FileNotFoundError(f"CD_PROJECT_ROOT is set but does not exist: {env_root}")

    if start is None:
        start = Path.cwd()

    start = start.resolve()
    for p in [start] + list(start.parents):
        if (p / "pyproject.toml").exists():
            return p
        if (p / "configs" / "default.yaml").exists():
            return p
        if (p / "README.md").exists():
            return p

    return start  # fallback

PROJECT_ROOT = find_repo_root()
CONFIG_PATH_DEFAULT = PROJECT_ROOT / "configs" / "default.yaml"

def _rel(p: Path) -> str:
    try:
        return str(p.resolve().relative_to(PROJECT_ROOT.resolve()))
    except Exception:
        return str(p)

print("[Paths]")
print("  CWD          :", _rel(Path.cwd()))
print("  PROJECT_ROOT :", PROJECT_ROOT.name)
print("  CONFIG_PATH  :", _rel(CONFIG_PATH_DEFAULT))

def load_config(config_path: Optional[Path] = None) -> Dict[str, Any]:
    cfg: Dict[str, Any] = {
        "project": {
            "name": "community-detection",
            "data_dir": "data",
            "processed_dir": "data/processed",
            "artifacts_format": "parquet",
        },
        "run": {
            "seed": 42,
            "log_level": "INFO",
            "save_run_config": True,
            "overwrite": False,
        },
        "preprocess": {
            "min_checkins": 10,
            "min_degree": 3,
            "iterative_filter": True,
            "drop_self_loops": True,
            "dedup_edges": True,
            "enforce_undirected": True,
            "lat_range": [-90.0, 90.0],
            "lon_range": [-180.0, 180.0],
        },
        "features": {
            "use_spatial": True,
            "use_temporal": True,
            "use_venue": True,
            "log1p_counts": True,
            "standardize": True,
        },
        "model": {
            "encoder": "graphsage",
            "num_layers": 2,
            "hidden_dim": 128,
            "embed_dim": 128,
            "neighbor_sampling": [25, 10],
        },
        "train": {
            "epochs": 10,
            "lr": 1e-3,
            "batch_size": 1024,
            "num_negative": 5,
            "positive_strategy": "random_walk",
            "rw_length": 10,
            "rw_window": 5,
        },
        "community": {
            "similarity": "cosine",
            "knn_k": 30,
            "mutual_knn": True,
            "clip_negative_weights": True,
            "leiden_resolution": 1.0,
        },
        "metrics": {
            "random_baseline_runs": 10,
            "distance_metric": "haversine",
            "aggregate": "median",
        },
        "datasets": {
            "active": ["brightkite", "gowalla", "lbsn2vec"],
            "lbsn2vec_snapshot": "old",
            "lbsn2vec_tier": "curated",
        }
    }

    if config_path is None:
        config_path = CONFIG_PATH_DEFAULT

    if config_path.exists():
        if not _HAS_YAML:
            raise RuntimeError("configs/default.yaml exists but PyYAML is not installed. Install pyyaml.")
        with open(config_path, "r", encoding="utf-8") as f:
            y = yaml.safe_load(f) or {}

        def _deep_update(d: Dict[str, Any], u: Dict[str, Any]) -> Dict[str, Any]:
            for k, v in u.items():
                if isinstance(v, dict) and isinstance(d.get(k), dict):
                    _deep_update(d[k], v)
                else:
                    d[k] = v
            return d

        cfg = _deep_update(cfg, y)
        print(f"[Config] Loaded YAML: {_rel(config_path)}")
    else:
        print(f"[Config] YAML not found, using notebook defaults: {_rel(config_path)}")

    return cfg

CFG = load_config()
CFG


[Paths]
  CWD          : notebooks
  PROJECT_ROOT : community-detection
  CONFIG_PATH  : configs/default.yaml
[Config] YAML not found, using notebook defaults: configs/default.yaml


{'project': {'name': 'community-detection',
  'data_dir': 'data',
  'processed_dir': 'data/processed',
  'artifacts_format': 'parquet'},
 'run': {'seed': 42,
  'log_level': 'INFO',
  'save_run_config': True,
  'overwrite': False},
 'preprocess': {'min_checkins': 10,
  'min_degree': 3,
  'iterative_filter': True,
  'drop_self_loops': True,
  'dedup_edges': True,
  'enforce_undirected': True,
  'lat_range': [-90.0, 90.0],
  'lon_range': [-180.0, 180.0]},
 'features': {'use_spatial': True,
  'use_temporal': True,
  'use_venue': True,
  'log1p_counts': True,
  'standardize': True},
 'model': {'encoder': 'graphsage',
  'num_layers': 2,
  'hidden_dim': 128,
  'embed_dim': 128,
  'neighbor_sampling': [25, 10]},
 'train': {'epochs': 10,
  'lr': 0.001,
  'batch_size': 1024,
  'num_negative': 5,
  'positive_strategy': 'random_walk',
  'rw_length': 10,
  'rw_window': 5},
 'community': {'similarity': 'cosine',
  'knn_k': 30,
  'mutual_knn': True,
  'clip_negative_weights': True,
  'leiden_resoluti

## Dataset Registry

In [None]:
DATA_DIR = PROJECT_ROOT / CFG["project"]["data_dir"]
PROCESSED_DIR = PROJECT_ROOT / CFG["project"]["processed_dir"]

print("[Derived Paths]")
print("  DATA_DIR      :", _rel(DATA_DIR))
print("  PROCESSED_DIR :", _rel(PROCESSED_DIR))

DatasetName = Literal["brightkite", "gowalla", "lbsn2vec"]
TierName = Literal["curated", "raw"]
SnapshotName = Literal["old", "new"]

@dataclass(frozen=True)
class DatasetSpec:
    name: DatasetName
    root: Path
    edges_path: Optional[Path] = None
    checkins_path: Optional[Path] = None

    # LBSN2Vec++ extras
    friendship_old_path: Optional[Path] = None
    friendship_new_path: Optional[Path] = None
    readme_path: Optional[Path] = None
    poi_path: Optional[Path] = None
    raw_checkins_path: Optional[Path] = None

    # meta flags
    source: Literal["SNAP", "LBSN2Vec"] = "SNAP"
    tier: Optional[TierName] = None
    snapshot: Optional[SnapshotName] = None

def build_dataset_registry(cfg: Dict[str, Any]) -> Dict[DatasetName, DatasetSpec]:
    brightkite_root = DATA_DIR / "Brightkite"
    gowalla_root = DATA_DIR / "Gowalla"
    lbsn_root = DATA_DIR / "LBSN2Vec"

    tier: TierName = cfg["datasets"].get("lbsn2vec_tier", "curated")
    snapshot: SnapshotName = cfg["datasets"].get("lbsn2vec_snapshot", "old")

    return {
        "brightkite": DatasetSpec(
            name="brightkite",
            root=brightkite_root,
            edges_path=brightkite_root / "Brightkite_edges.txt",
            checkins_path=brightkite_root / "Brightkite_totalCheckins.txt",
            source="SNAP",
        ),
        "gowalla": DatasetSpec(
            name="gowalla",
            root=gowalla_root,
            edges_path=gowalla_root / "Gowalla_edges.txt",
            checkins_path=gowalla_root / "Gowalla_totalCheckins.txt",
            source="SNAP",
        ),
        "lbsn2vec": DatasetSpec(
            name="lbsn2vec",
            root=lbsn_root,
            friendship_old_path=lbsn_root / "dataset_WWW_friendship_old.txt",
            friendship_new_path=lbsn_root / "dataset_WWW_friendship_new.txt",
            readme_path=lbsn_root / "dataset_WWW_readme.txt",
            checkins_path=lbsn_root / "dataset_WWW_Checkins_anonymized.txt",
            raw_checkins_path=lbsn_root / "raw_Checkins_anonymized.txt",
            poi_path=lbsn_root / "raw_POIs.txt",
            source="LBSN2Vec",
            tier=tier,
            snapshot=snapshot,
        ),
    }

DATASETS = build_dataset_registry(CFG)

def validate_dataset_files(datasets: Dict[DatasetName, DatasetSpec]) -> None:
    print("=== Dataset Registry Validation ===")
    ok = True

    if not DATA_DIR.exists():
        print(f"[FAIL] Missing DATA_DIR: {_rel(DATA_DIR)}")
        ok = False

    for name, spec in datasets.items():
        print(f"- {name}: root={_rel(spec.root)}")
        if not spec.root.exists():
            print(f"  [FAIL] Missing folder: {_rel(spec.root)}")
            ok = False
            continue

        if spec.source == "SNAP":
            for p in [spec.edges_path, spec.checkins_path]:
                if p is None or not p.exists():
                    print(f"  [FAIL] Missing file: {_rel(p) if p else p}")
                    ok = False
                else:
                    print(f"  [OK] {p.name}")

        if spec.source == "LBSN2Vec":
            for p in [spec.friendship_old_path, spec.friendship_new_path, spec.checkins_path]:
                if p is None or not p.exists():
                    print(f"  [FAIL] Missing file: {_rel(p) if p else p}")
                    ok = False
                else:
                    print(f"  [OK] {p.name}")

            for p in [spec.readme_path, spec.poi_path, spec.raw_checkins_path]:
                if p is None or not p.exists():
                    print(f"  [i] Optional missing: {_rel(p) if p else p}")
                else:
                    print(f"  [OK] {p.name}")

    if ok:
        print("[OK] Dataset folders/files look good.")
    else:
        print("[WARN] Dataset registry has missing paths. Fix before continuing.")
    print("===================================")

validate_dataset_files(DATASETS)
DATASETS


[Derived Paths]
  DATA_DIR      : data
  PROCESSED_DIR : data/processed
=== Dataset Registry Validation ===
- brightkite: root=data/Brightkite
  [OK] Brightkite_edges.txt
  [OK] Brightkite_totalCheckins.txt
- gowalla: root=data/Gowalla
  [OK] Gowalla_edges.txt
  [OK] Gowalla_totalCheckins.txt
- lbsn2vec: root=data/LBSN2Vec
  [OK] dataset_WWW_friendship_old.txt
  [OK] dataset_WWW_friendship_new.txt
  [OK] dataset_WWW_Checkins_anonymized.txt
  [OK] dataset_WWW_readme.txt
  [OK] raw_POIs.txt
  [OK] raw_Checkins_anonymized.txt
[OK] Dataset folders/files look good.


{'brightkite': DatasetSpec(name='brightkite', root=PosixPath('/mnt/d/community-detection/data/Brightkite'), edges_path=PosixPath('/mnt/d/community-detection/data/Brightkite/Brightkite_edges.txt'), checkins_path=PosixPath('/mnt/d/community-detection/data/Brightkite/Brightkite_totalCheckins.txt'), friendship_old_path=None, friendship_new_path=None, readme_path=None, poi_path=None, raw_checkins_path=None, source='SNAP', tier=None, snapshot=None),
 'gowalla': DatasetSpec(name='gowalla', root=PosixPath('/mnt/d/community-detection/data/Gowalla'), edges_path=PosixPath('/mnt/d/community-detection/data/Gowalla/Gowalla_edges.txt'), checkins_path=PosixPath('/mnt/d/community-detection/data/Gowalla/Gowalla_totalCheckins.txt'), friendship_old_path=None, friendship_new_path=None, readme_path=None, poi_path=None, raw_checkins_path=None, source='SNAP', tier=None, snapshot=None),
 'lbsn2vec': DatasetSpec(name='lbsn2vec', root=PosixPath('/mnt/d/community-detection/data/LBSN2Vec'), edges_path=None, checki

## Seed + Logging + Output Dirs

In [4]:

def set_global_seed(seed: int) -> None:
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)

    if _HAS_TORCH:
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

def setup_logging(log_level: str = "INFO") -> logging.Logger:
    logger = logging.getLogger("osnclusters")
    logger.setLevel(getattr(logging, log_level.upper(), logging.INFO))
    logger.handlers.clear()

    ch = logging.StreamHandler(stream=sys.stdout)
    ch.setLevel(getattr(logging, log_level.upper(), logging.INFO))
    fmt = logging.Formatter("[%(asctime)s] %(levelname)s - %(message)s", datefmt="%H:%M:%S")
    ch.setFormatter(fmt)
    logger.addHandler(ch)
    logger.propagate = False
    return logger

def ensure_dirs() -> Dict[str, Path]:
    PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
    run_id = time.strftime("%Y%m%d_%H%M%S")
    run_dir = PROCESSED_DIR / "_runs" / run_id
    run_dir.mkdir(parents=True, exist_ok=True)
    return {"processed": PROCESSED_DIR, "run_dir": run_dir}

set_global_seed(int(CFG["run"]["seed"]))
LOGGER = setup_logging(CFG["run"]["log_level"])
DIRS = ensure_dirs()

LOGGER.info(f"PROJECT_ROOT : {_rel(PROJECT_ROOT)}")
LOGGER.info(f"DATA_DIR     : {_rel(DATA_DIR)}")
LOGGER.info(f"Seed set to  : {CFG['run']['seed']}")
LOGGER.info(f"Processed dir: {_rel(DIRS['processed'])}")
LOGGER.info(f"Run dir      : {_rel(DIRS['run_dir'])}")

if CFG["run"].get("save_run_config", True):
    out = DIRS["run_dir"] / "run_config.json"
    with open(out, "w", encoding="utf-8") as f:
        json.dump(CFG, f, indent=2, ensure_ascii=False)
    LOGGER.info(f"Saved run config to: {_rel(out)}")


[05:38:03] INFO - PROJECT_ROOT : .
[05:38:03] INFO - DATA_DIR     : data
[05:38:03] INFO - Seed set to  : 42
[05:38:03] INFO - Processed dir: data/processed
[05:38:03] INFO - Run dir      : data/processed/_runs/20251214_053803
[05:38:03] INFO - Saved run config to: data/processed/_runs/20251214_053803/run_config.json


## Data Modeling

In [5]:

@dataclass(frozen=True)
class EdgesSchema:
    u: str
    v: str
    weight: Optional[float] = None

EDGES_COLUMNS = ["u", "v"]

def enforce_edges_schema(df: pd.DataFrame) -> pd.DataFrame:
    missing = [c for c in EDGES_COLUMNS if c not in df.columns]
    if missing:
        raise ValueError(f"Edges missing columns: {missing}. Found: {list(df.columns)}")
    out = df[EDGES_COLUMNS].copy()
    out["u"] = out["u"].astype(str)
    out["v"] = out["v"].astype(str)
    return out

print("Edges schema ready:", EDGES_COLUMNS)

@dataclass(frozen=True)
class CheckinsSchema:
    user_id: str
    ts: "pd.Timestamp"
    lat: float
    lon: float
    venue_id: Optional[str] = None
    category: Optional[str] = None

CHECKINS_REQUIRED = ["user_id", "ts", "lat", "lon"]
CHECKINS_OPTIONAL = ["venue_id", "category"]

def enforce_checkins_schema(df: pd.DataFrame) -> pd.DataFrame:
    missing = [c for c in CHECKINS_REQUIRED if c not in df.columns]
    if missing:
        raise ValueError(f"Checkins missing columns: {missing}. Found: {list(df.columns)}")

    out = df.copy()
    out["user_id"] = out["user_id"].astype(str)
    out["ts"] = pd.to_datetime(out["ts"], errors="coerce")
    out["lat"] = pd.to_numeric(out["lat"], errors="coerce")
    out["lon"] = pd.to_numeric(out["lon"], errors="coerce")

    for c in CHECKINS_OPTIONAL:
        if c in out.columns:
            out[c] = out[c].astype(str)

    return out

print("Checkins schema ready:", CHECKINS_REQUIRED, "+ optional", CHECKINS_OPTIONAL)


Edges schema ready: ['u', 'v']
Checkins schema ready: ['user_id', 'ts', 'lat', 'lon'] + optional ['venue_id', 'category']


In [None]:

@dataclass(frozen=True)
class ParserRule:
    dataset: DatasetName
    source: Literal["SNAP", "LBSN2Vec"]
    edges_format: str
    checkins_format: str
    notes: str

PARSER_RULES: Dict[DatasetName, ParserRule] = {
    "brightkite": ParserRule(
        dataset="brightkite",
        source="SNAP",
        edges_format="Edge list (2 cols): user_id_1 user_id_2 (whitespace-delimited)",
        checkins_format="SNAP checkins: user_id, timestamp, lat, lon, location_id (often 5 cols)",
        notes="Verify column order by reading first lines; SNAP formats can differ."
    ),
    "gowalla": ParserRule(
        dataset="gowalla",
        source="SNAP",
        edges_format="Edge list (2 cols): user_id_1 user_id_2 (whitespace-delimited)",
        checkins_format="SNAP checkins: user_id, timestamp, lat, lon, location_id (often 5 cols)",
        notes="Gowalla is larger; use chunked read later if needed."
    ),
    "lbsn2vec": ParserRule(
        dataset="lbsn2vec",
        source="LBSN2Vec",
        edges_format="Friendship snapshots old/new: edge list (u v)",
        checkins_format="Curated checkins (observed 9 cols): user_id venue_id + tokenized timestamp; lat/lon from POIs join",
        notes="Curated checkins need timestamp rebuild and POI join for lat/lon."
    ),
}

def preview_text_file(path: Path, n: int = 5) -> None:
    print(f"--- Preview: {path.name} ---")
    with open(path, "r", encoding="utf-8", errors="replace") as f:
        for _ in range(n):
            line = f.readline()
            if not line:
                break
            print(line.rstrip("\n"))
    print("--------------------------")

def parse_snap_edges(path: Path) -> pd.DataFrame:
    df = pd.read_csv(path, sep=r"\s+", header=None, names=["u", "v"], dtype=str, engine="python")
    return enforce_edges_schema(df)

def parse_snap_checkins(path: Path, column_guess: Optional[List[str]] = None) -> pd.DataFrame:
    df0 = pd.read_csv(path, sep=r"\s+", header=None, dtype=str, engine="python")

    if column_guess is None:
        if df0.shape[1] >= 5:
            column_guess = ["user_id", "ts", "lat", "lon", "venue_id"] + [f"extra_{i}" for i in range(df0.shape[1] - 5)]
        else:
            column_guess = ["user_id", "ts", "lat", "lon"] + [f"extra_{i}" for i in range(df0.shape[1] - 4)]

    df0.columns = column_guess[:df0.shape[1]]
    keep = [c for c in ["user_id", "ts", "lat", "lon", "venue_id"] if c in df0.columns]
    df = df0[keep].copy()
    return enforce_checkins_schema(df)

def parse_lbsn_friendship(path: Path) -> pd.DataFrame:
    df = pd.read_csv(path, sep=r"\s+", header=None, names=["u", "v"], dtype=str, engine="python")
    return enforce_edges_schema(df)

def parse_lbsn_pois(poi_path: Path) -> pd.DataFrame:
    poi0 = pd.read_csv(poi_path, sep=r"\s+", header=None, dtype=str, engine="python")
    if poi0.shape[1] < 3:
        raise ValueError(f"Unexpected POIs cols: {poi0.shape[1]} (need >=3)")
    poi0 = poi0.rename(columns={0: "venue_id", 1: "lat", 2: "lon"})
    poi = poi0[["venue_id", "lat", "lon"]].copy()
    poi["venue_id"] = poi["venue_id"].astype(str)
    poi["lat"] = pd.to_numeric(poi["lat"], errors="coerce")
    poi["lon"] = pd.to_numeric(poi["lon"], errors="coerce")
    return poi

def parse_lbsn_checkins_curated(checkins_path: Path, poi_path: Optional[Path] = None) -> pd.DataFrame:
    df0 = pd.read_csv(checkins_path, sep=r"\s+", header=None, dtype=str, engine="python")

    if df0.shape[1] != 9:
        raise ValueError(f"Unexpected curated checkins column count: {df0.shape[1]} (expected 9).")

    df0 = df0.iloc[:, :9].copy()
    df0.columns = ["user_id", "venue_id", "wday", "mon", "day", "time", "tz", "year", "utc_offset_min"]

    ts_raw = (
        df0["wday"].astype(str) + " " +
        df0["mon"].astype(str) + " " +
        df0["day"].astype(str) + " " +
        df0["time"].astype(str) + " " +
        df0["tz"].astype(str) + " " +
        df0["year"].astype(str)
    )
    df0["ts"] = pd.to_datetime(ts_raw, errors="coerce", format="%a %b %d %H:%M:%S %z %Y")


    df0["lat"] = np.nan
    df0["lon"] = np.nan

    if poi_path is not None and poi_path.exists():
        poi = parse_lbsn_pois(poi_path)
        df0 = df0.merge(poi, on="venue_id", how="left", suffixes=("", "_poi"))
        if "lat_poi" in df0.columns: df0["lat"] = df0["lat_poi"]
        if "lon_poi" in df0.columns: df0["lon"] = df0["lon_poi"]

    df = df0[["user_id", "ts", "lat", "lon", "venue_id"]].copy()
    return enforce_checkins_schema(df)

def parse_lbsn_checkins_raw(checkins_path: Path, poi_path: Path) -> pd.DataFrame:
    chk0 = pd.read_csv(checkins_path, sep=r"\s+", header=None, dtype=str, engine="python")
    if chk0.shape[1] < 3:
        raise ValueError(f"Unexpected raw checkins cols: {chk0.shape[1]} (need >=3)")

    user = chk0.iloc[:, 0].astype(str)
    venue = chk0.iloc[:, 1].astype(str)
    ts_tokens = chk0.iloc[:, 2:].astype(str).agg(" ".join, axis=1)
    ts = pd.to_datetime(ts_tokens, errors="coerce")

    poi = parse_lbsn_pois(poi_path)
    df0 = pd.DataFrame({"user_id": user, "venue_id": venue, "ts": ts})
    df0 = df0.merge(poi, on="venue_id", how="left")

    df = df0[["user_id", "ts", "lat", "lon", "venue_id"]].copy()
    return enforce_checkins_schema(df)

pd.DataFrame([asdict(v) for v in PARSER_RULES.values()])


Unnamed: 0,dataset,source,edges_format,checkins_format,notes
0,brightkite,SNAP,Edge list (2 cols): user_id_1 user_id_2 (white...,"SNAP checkins: user_id, timestamp, lat, lon, l...",Verify column order by reading first lines; SN...
1,gowalla,SNAP,Edge list (2 cols): user_id_1 user_id_2 (white...,"SNAP checkins: user_id, timestamp, lat, lon, l...",Gowalla is larger; use chunked read later if n...
2,lbsn2vec,LBSN2Vec,Friendship snapshots old/new: edge list (u v),Curated checkins (observed 9 cols): user_id ve...,Curated checkins need timestamp rebuild and PO...


## Validation utils + registry check

In [7]:

def _human_size(nbytes: int) -> str:
    units = ["B", "KB", "MB", "GB", "TB"]
    size = float(nbytes)
    for u in units:
        if size < 1024.0:
            return f"{size:.2f} {u}"
        size /= 1024.0
    return f"{size:.2f} PB"

def file_meta(path: Path) -> Dict[str, Any]:
    return {
        "exists": path.exists(),
        "size": _human_size(path.stat().st_size) if path.exists() else None,
        "path": _rel(path),
        "name": path.name,
    }

def safe_head_lines(path: Path, n: int = 3) -> List[str]:
    if path is None or (not path.exists()):
        return []
    lines = []
    with open(path, "r", encoding="utf-8", errors="replace") as f:
        for _ in range(n):
            line = f.readline()
            if not line:
                break
            lines.append(line.rstrip("\n"))
    return lines

def read_sample_whitespace(path: Path, nrows: int = 50) -> pd.DataFrame:
    return pd.read_csv(path, sep=r"\s+", header=None, dtype=str, engine="python", nrows=nrows)

def validate_edges_df(df: pd.DataFrame) -> Dict[str, Any]:
    report = {"ok": True, "issues": []}
    if df.shape[1] < 2:
        report["ok"] = False
        report["issues"].append(f"Edges sample has <2 columns: shape={df.shape}")
        return report
    u = df.iloc[:, 0].astype(str)
    v = df.iloc[:, 1].astype(str)
    report["n_rows"] = len(df)
    report["n_unique_u"] = u.nunique()
    report["n_unique_v"] = v.nunique()
    report["self_loop_ratio"] = float((u == v).mean())
    pairs = pd.DataFrame({"u": u, "v": v})
    report["dup_ratio_sample"] = float(pairs.duplicated().mean())
    return report

def validate_checkins_df(df: pd.DataFrame, dataset: str) -> Dict[str, Any]:
    report = {"ok": True, "issues": [], "shape": df.shape}
    ncol = df.shape[1]
    if ncol < 3:
        report["ok"] = False
        report["issues"].append(f"Checkins sample has too few columns: {ncol}")
        return report

    candidates = []
    if ncol >= 5:
        candidates.append(("A5:user,ts,lat,lon,venue", [0,1,2,3,4]))
    if ncol >= 4:
        candidates.append(("B4:user,ts,lat,lon", [0,1,2,3]))
    if ncol == 9:
        candidates.append(("L9:user,venue,wday,mon,day,time,tz,year,utc_offset", list(range(9))))
    candidates.append(("C3:user,venue,ts", [0,1,2]))

    best = None
    best_score = -1
    best_detail = None

    for name, idx in candidates:
        tmp = df.iloc[:, idx].copy()

        if name.startswith(("A5", "B4")):
            tmp.columns = ["user_id", "ts", "lat", "lon"] + (["venue_id"] if len(idx) == 5 else [])
            user = tmp["user_id"].astype(str)
            ts = pd.to_datetime(tmp["ts"], errors="coerce")
            lat = pd.to_numeric(tmp["lat"], errors="coerce")
            lon = pd.to_numeric(tmp["lon"], errors="coerce")

            ts_ok = ts.notna().mean()
            lat_ok = lat.notna().mean()
            lon_ok = lon.notna().mean()
            lat_in = ((lat >= -90) & (lat <= 90)).mean()
            lon_in = ((lon >= -180) & (lon <= 180)).mean()

            score = ts_ok + lat_ok + lon_ok + lat_in + lon_in
            detail = {
                "layout": name,
                "ts_parse_rate": float(ts_ok),
                "lat_parse_rate": float(lat_ok),
                "lon_parse_rate": float(lon_ok),
                "lat_in_range_rate": float(lat_in),
                "lon_in_range_rate": float(lon_in),
                "n_unique_users_sample": int(user.nunique())
            }

        elif name.startswith("L9"):
            tmp.columns = ["user_id", "venue_id", "wday", "mon", "day", "time", "tz", "year", "utc_offset_min"]
            user = tmp["user_id"].astype(str)

            ts_raw = (
                tmp["wday"].astype(str) + " " +
                tmp["mon"].astype(str) + " " +
                tmp["day"].astype(str) + " " +
                tmp["time"].astype(str) + " " +
                tmp["tz"].astype(str) + " " +
                tmp["year"].astype(str)
            )
            ts = pd.to_datetime(ts_raw, errors="coerce", format="%a %b %d %H:%M:%S %z %Y")
            ts_ok = ts.notna().mean()

            score = ts_ok
            detail = {
                "layout": name,
                "ts_parse_rate": float(ts_ok),
                "n_unique_users_sample": int(user.nunique()),
                "note": "LBSN2Vec curated detected; lat/lon require POI join by venue_id."
            }

        else:
            tmp.columns = ["user_id", "venue_id", "ts"]
            user = tmp["user_id"].astype(str)
            ts = pd.to_datetime(tmp["ts"], errors="coerce")
            ts_ok = ts.notna().mean()
            score = ts_ok
            detail = {
                "layout": name,
                "ts_parse_rate": float(ts_ok),
                "n_unique_users_sample": int(user.nunique()),
                "note": "lat/lon not present; requires POI join if needed."
            }

        if score > best_score:
            best_score = score
            best = name
            best_detail = detail

    report["best_layout_guess"] = best
    report["best_layout_detail"] = best_detail

    if best is None:
        report["ok"] = False
        report["issues"].append("Cannot infer checkins layout from sample.")
        return report

    if best.startswith(("A5", "B4")):
        d = best_detail
        if d["ts_parse_rate"] < 0.7 or d["lat_parse_rate"] < 0.7 or d["lon_parse_rate"] < 0.7:
            report["ok"] = False
            report["issues"].append("Low parse rate for ts/lat/lon in inferred layout.")
    elif best.startswith("L9"):
        if best_detail["ts_parse_rate"] < 0.7:
            report["ok"] = False
            report["issues"].append("Low timestamp parse rate in LBSN2Vec curated layout.")
        report["needs_poi_join_for_latlon"] = True
    else:
        if best_detail["ts_parse_rate"] < 0.7:
            report["ok"] = False
            report["issues"].append("Low timestamp parse rate in 3-col layout.")
        report["needs_poi_join_for_latlon"] = True

    return report

def validate_dataset_registry_and_load(
    datasets: Dict[DatasetName, DatasetSpec],
    active: Optional[List[str]] = None,
    sample_rows: int = 50,
    preview_lines: int = 2
) -> pd.DataFrame:
    rows = []
    active_set = set(active) if active else set(datasets.keys())

    for name, spec in datasets.items():
        if name not in active_set:
            continue

        row = {
            "dataset": name,
            "source": spec.source,
            "root_exists": spec.root.exists(),
            "root": _rel(spec.root),
        }

        paths = {}
        if spec.source == "SNAP":
            paths["edges"] = spec.edges_path
            paths["checkins"] = spec.checkins_path
        else:
            paths["friendship_old"] = spec.friendship_old_path
            paths["friendship_new"] = spec.friendship_new_path
            paths["checkins_curated"] = spec.checkins_path
            paths["readme"] = spec.readme_path
            paths["poi"] = spec.poi_path

        for k, p in paths.items():
            if p is None:
                row[f"{k}_exists"] = False
                row[f"{k}_size"] = None
                continue
            meta = file_meta(p)
            row[f"{k}_exists"] = meta["exists"]
            row[f"{k}_size"] = meta["size"]

        try:
            if spec.source == "SNAP":
                e_sample = read_sample_whitespace(spec.edges_path, nrows=sample_rows)
                e_val = validate_edges_df(e_sample)
                row["edges_ok"] = e_val["ok"]
                row["edges_self_loop_ratio"] = e_val.get("self_loop_ratio", None)
                row["edges_dup_ratio_sample"] = e_val.get("dup_ratio_sample", None)
                row["edges_head"] = " | ".join(safe_head_lines(spec.edges_path, n=preview_lines))

                c_sample = read_sample_whitespace(spec.checkins_path, nrows=sample_rows)
                c_val = validate_checkins_df(c_sample, dataset=name)
                row["checkins_ok"] = c_val["ok"]
                row["checkins_best_layout"] = c_val.get("best_layout_guess", None)
                row["checkins_ts_parse_rate"] = (c_val.get("best_layout_detail") or {}).get("ts_parse_rate", None)
                row["checkins_lat_parse_rate"] = (c_val.get("best_layout_detail") or {}).get("lat_parse_rate", None)
                row["checkins_lon_parse_rate"] = (c_val.get("best_layout_detail") or {}).get("lon_parse_rate", None)
                row["checkins_head"] = " | ".join(safe_head_lines(spec.checkins_path, n=preview_lines))
                row["issues"] = "; ".join((e_val.get("issues") or []) + (c_val.get("issues") or []))

            else:
                old_sample = read_sample_whitespace(spec.friendship_old_path, nrows=sample_rows)
                new_sample = read_sample_whitespace(spec.friendship_new_path, nrows=sample_rows)
                old_val = validate_edges_df(old_sample)
                new_val = validate_edges_df(new_sample)
                row["friendship_old_ok"] = old_val["ok"]
                row["friendship_new_ok"] = new_val["ok"]
                row["friendship_old_head"] = " | ".join(safe_head_lines(spec.friendship_old_path, n=preview_lines))
                row["friendship_new_head"] = " | ".join(safe_head_lines(spec.friendship_new_path, n=preview_lines))

                chk_sample = read_sample_whitespace(spec.checkins_path, nrows=sample_rows)
                chk_val = validate_checkins_df(chk_sample, dataset=name)
                row["checkins_ok"] = chk_val["ok"]
                row["checkins_best_layout"] = chk_val.get("best_layout_guess", None)
                row["checkins_ts_parse_rate"] = (chk_val.get("best_layout_detail") or {}).get("ts_parse_rate", None)
                row["checkins_lat_parse_rate"] = (chk_val.get("best_layout_detail") or {}).get("lat_parse_rate", None)
                row["checkins_lon_parse_rate"] = (chk_val.get("best_layout_detail") or {}).get("lon_parse_rate", None)
                row["checkins_needs_poi_join"] = chk_val.get("needs_poi_join_for_latlon", False)
                row["checkins_head"] = " | ".join(safe_head_lines(spec.checkins_path, n=preview_lines))

                if spec.readme_path and spec.readme_path.exists():
                    row["readme_head"] = " | ".join(safe_head_lines(spec.readme_path, n=min(10, preview_lines)))
                else:
                    row["readme_head"] = None

                row["issues"] = "; ".join((old_val.get("issues") or []) + (new_val.get("issues") or []) + (chk_val.get("issues") or []))

        except Exception as ex:
            row["issues"] = (row.get("issues", "") + f"; EXCEPTION: {type(ex).__name__}: {ex}").strip("; ")
            row.setdefault("edges_ok", False)
            row.setdefault("checkins_ok", False)

        rows.append(row)

    return pd.DataFrame(rows)

def summarize_validation(df_val: pd.DataFrame) -> None:
    print("=== Validation Summary ===")
    for _, r in df_val.iterrows():
        name = r["dataset"]
        src = r["source"]
        ok_root = bool(r["root_exists"])
        if not ok_root:
            print(f"[FAIL] {name} - missing root folder")
            continue

        if src == "SNAP":
            edges_ok = bool(r.get("edges_ok", False))
            chk_ok = bool(r.get("checkins_ok", False))
            print(f"{'[OK] ' if (edges_ok and chk_ok) else '[FAIL]'} {name} (SNAP) | edges_ok={edges_ok} | checkins_ok={chk_ok} | layout={r.get('checkins_best_layout')}")
            if not (edges_ok and chk_ok):
                print(f"  issues: {r.get('issues')}")
        else:
            old_ok = bool(r.get("friendship_old_ok", False))
            new_ok = bool(r.get("friendship_new_ok", False))
            chk_ok = bool(r.get("checkins_ok", False))
            need_join = bool(r.get("checkins_needs_poi_join", False))
            print(f"{'[OK] ' if (old_ok and new_ok and chk_ok) else '[FAIL]'} {name} (LBSN2Vec) | old_ok={old_ok} new_ok={new_ok} checkins_ok={chk_ok} need_POI_join={need_join} | layout={r.get('checkins_best_layout')}")
            if not (old_ok and new_ok and chk_ok):
                print(f"  issues: {r.get('issues')}")
    print("==========================")

# ---- Run validation ----
active = CFG["datasets"].get("active", ["brightkite", "gowalla", "lbsn2vec"])
df_val = validate_dataset_registry_and_load(DATASETS, active=active, sample_rows=100, preview_lines=2)
df_val


  ts = pd.to_datetime(tmp["ts"], errors="coerce")
  ts = pd.to_datetime(tmp["ts"], errors="coerce")
  ts = pd.to_datetime(tmp["ts"], errors="coerce")
  ts = pd.to_datetime(tmp["ts"], errors="coerce")


Unnamed: 0,dataset,source,root_exists,root,edges_exists,edges_size,checkins_exists,checkins_size,edges_ok,edges_self_loop_ratio,...,readme_exists,readme_size,poi_exists,poi_size,friendship_old_ok,friendship_new_ok,friendship_old_head,friendship_new_head,checkins_needs_poi_join,readme_head
0,brightkite,SNAP,True,data/Brightkite,True,4.37 MB,True,364.42 MB,True,0.0,...,,,,,,,,,,
1,gowalla,SNAP,True,data/Gowalla,True,21.09 MB,True,376.36 MB,True,0.0,...,,,,,,,,,,
2,lbsn2vec,LBSN2Vec,True,data/LBSN2Vec,,,,,,,...,True,1.85 KB,True,672.12 MB,True,True,15\t595326 | 19\t54,15\t595326 | 19\t54,True,This dataset includes long-term (about 22 mont...


In [8]:
summarize_validation(df_val)


=== Validation Summary ===
[OK]  brightkite (SNAP) | edges_ok=True | checkins_ok=True | layout=A5:user,ts,lat,lon,venue
[OK]  gowalla (SNAP) | edges_ok=True | checkins_ok=True | layout=A5:user,ts,lat,lon,venue
[OK]  lbsn2vec (LBSN2Vec) | old_ok=True new_ok=True checkins_ok=True need_POI_join=True | layout=L9:user,venue,wday,mon,day,time,tz,year,utc_offset


## Pipeline runner

In [None]:


from pathlib import Path
import logging, sys
import numpy as np
import pandas as pd


PROJECT_ROOT = Path.cwd()


if "LOGGER" not in globals():
    LOGGER = logging.getLogger("community-detection")
    LOGGER.setLevel(logging.INFO)
    LOGGER.handlers.clear()
    h = logging.StreamHandler(sys.stdout)
    h.setFormatter(logging.Formatter("[%(asctime)s] %(levelname)s - %(message)s", "%H:%M:%S"))
    LOGGER.addHandler(h)
    LOGGER.propagate = False


if "CFG" not in globals():
    CFG = {
        "datasets": {"lbsn2vec_snapshot": "old", "lbsn2vec_tier": "curated"},
        "preprocess": {
            "min_checkins": 10,
            "min_degree": 3,
            "iterative_filter": True,
            "lat_range": [-90.0, 90.0],
            "lon_range": [-180.0, 180.0],
        }
    }

DATASET_NAME = "lbsn2vec"


LBSN_ROOT = PROJECT_ROOT / "data" / "LBSN2Vec"
PATHS = {
    "friendship_old": LBSN_ROOT / "dataset_WWW_friendship_old.txt",
    "friendship_new": LBSN_ROOT / "dataset_WWW_friendship_new.txt",
    "checkins_curated": LBSN_ROOT / "dataset_WWW_Checkins_anonymized.txt",
    "poi": LBSN_ROOT / "raw_POIs.txt",
}

LOGGER.info(f"[C0] dataset={DATASET_NAME} | snapshot={CFG['datasets'].get('lbsn2vec_snapshot')} | tier={CFG['datasets'].get('lbsn2vec_tier')}")
LOGGER.info(f"[C0] LBSN_ROOT={LBSN_ROOT}")
for k, p in PATHS.items():
    LOGGER.info(f"[C0] {k}: exists={p.exists()} | path={p}")


[05:38:04] INFO - [C0] dataset=lbsn2vec | snapshot=old | tier=curated
[05:38:04] INFO - [C0] LBSN_ROOT=/mnt/d/community-detection/notebooks/data/LBSN2Vec
[05:38:04] INFO - [C0] friendship_old: exists=False | path=/mnt/d/community-detection/notebooks/data/LBSN2Vec/dataset_WWW_friendship_old.txt
[05:38:04] INFO - [C0] friendship_new: exists=False | path=/mnt/d/community-detection/notebooks/data/LBSN2Vec/dataset_WWW_friendship_new.txt
[05:38:04] INFO - [C0] checkins_curated: exists=False | path=/mnt/d/community-detection/notebooks/data/LBSN2Vec/dataset_WWW_Checkins_anonymized.txt
[05:38:04] INFO - [C0] poi: exists=False | path=/mnt/d/community-detection/notebooks/data/LBSN2Vec/raw_POIs.txt


### Step 1: load raw -> parse

In [None]:

LBSN_SAMPLE_FRAC = 0.10   # 10% để chạy thử; đặt = 1.0 để chạy full
LBSN_SAMPLE_SEED = int(CFG["run"].get("seed", 42))

def sample_users_from_checkins_chunked(
    checkins_path,
    sample_frac: float,
    seed: int,
    chunksize: int = 2_000_000
):
    
    import numpy as np
    import pandas as pd

    users = set()
    for chunk in pd.read_csv(checkins_path, sep=r"\s+", header=None, dtype=str, engine="python",
                             chunksize=chunksize):

        users.update(chunk.iloc[:, 0].astype(str).unique().tolist())

    users = np.array(list(users), dtype=object)
    rng = np.random.default_rng(seed)
    rng.shuffle(users)
    n = int(np.ceil(len(users) * float(sample_frac)))
    keep = set(users[:n].tolist())
    return keep

print(f"[C0] LBSN sample frac={LBSN_SAMPLE_FRAC} (set 1.0 for full run)")


[C0] LBSN sample frac=0.1 (set 1.0 for full run)


In [None]:


from pathlib import Path
import pandas as pd
import numpy as np

DATASET_NAME = "lbsn2vec"
spec = DATASETS[DATASET_NAME]

def read_edges_two_cols(path: Path) -> pd.DataFrame:
    df = pd.read_csv(path, sep=r"\s+", header=None, usecols=[0, 1], dtype=str, engine="python")
    df.columns = ["u", "v"]
    df["u"] = df["u"].astype(str)
    df["v"] = df["v"].astype(str)
    return df

def read_pois_minimal(path: Path) -> pd.DataFrame:
    poi = pd.read_csv(path, sep=r"\s+", header=None, usecols=[0, 1, 2], dtype=str, engine="python")
    poi.columns = ["venue_id", "lat", "lon"]
    poi["venue_id"] = poi["venue_id"].astype(str)
    poi["lat"] = pd.to_numeric(poi["lat"], errors="coerce")
    poi["lon"] = pd.to_numeric(poi["lon"], errors="coerce")
    return poi

def parse_lbsn_curated_checkins_9col_chunk(chunk: pd.DataFrame) -> pd.DataFrame:
    if chunk.shape[1] < 9:
        raise ValueError(f"Expected >=9 columns for curated checkins, got shape={chunk.shape}")

    df = chunk.iloc[:, :9].copy()
    df.columns = ["user_id", "venue_id", "dow", "mon", "day", "time", "tz", "year", "tz_offset_min"]

    ts_str = (
        df["dow"].astype(str) + " " +
        df["mon"].astype(str) + " " +
        df["day"].astype(str) + " " +
        df["time"].astype(str) + " " +
        df["tz"].astype(str) + " " +
        df["year"].astype(str)
    )
    ts = pd.to_datetime(ts_str, format="%a %b %d %H:%M:%S %z %Y", errors="coerce")

    out = pd.DataFrame({
        "user_id": df["user_id"].astype(str),
        "venue_id": df["venue_id"].astype(str),
        "ts": ts,
        "tz_offset_min": pd.to_numeric(df["tz_offset_min"], errors="coerce"),
    })
    return out

# ---- choose snapshot edges ----
snapshot = CFG["datasets"].get("lbsn2vec_snapshot", "old")
edges_path = spec.friendship_old_path if snapshot == "old" else spec.friendship_new_path
checkins_path = spec.checkins_path
poi_path = spec.poi_path

# ---- sanity check required files ----
required_files = [edges_path, checkins_path, poi_path]
missing = [p for p in required_files if (p is None or not Path(p).exists())]
if missing:
    raise FileNotFoundError("Missing required file(s) in registry:\n" + "\n".join([str(p) for p in missing]))

LOGGER.info(f"[C1] dataset={DATASET_NAME} | snapshot={snapshot} | sampling={LBSN_SAMPLE_FRAC:.2f}")

# ---- Step 1a: sample 10% users (fast enough; one pass by chunks) ----
if LBSN_SAMPLE_FRAC < 1.0:
    LOGGER.info("[C1] collecting unique users (chunked) to sample ...")
    keep_users = sample_users_from_checkins_chunked(
        checkins_path=checkins_path,
        sample_frac=LBSN_SAMPLE_FRAC,
        seed=LBSN_SAMPLE_SEED,
        chunksize=2_000_000
    )
    LOGGER.info(f"[C1] sampled users: {len(keep_users):,}")
else:
    keep_users = None
    LOGGER.info("[C1] sampling disabled (full dataset).")

# ---- Step 1b: load edges then filter by sampled users ----
LOGGER.info("[C1] loading edges ...")
edges_all = read_edges_two_cols(edges_path)

if keep_users is not None:
    edges_raw = edges_all[edges_all["u"].isin(keep_users) & edges_all["v"].isin(keep_users)].copy()
else:
    edges_raw = edges_all

LOGGER.info(f"[C1] edges_raw after sampling filter: {edges_raw.shape}")

# ---- Step 1c: load curated checkins chunked; keep only sampled users ----
LOGGER.info("[C1] loading curated checkins (chunked) ...")
chunks = []
for chunk in pd.read_csv(checkins_path, sep=r"\s+", header=None, dtype=str, engine="python",
                         chunksize=2_000_000):
    if keep_users is not None:
        chunk = chunk[chunk.iloc[:, 0].astype(str).isin(keep_users)]
    if len(chunk) == 0:
        continue
    chunks.append(parse_lbsn_curated_checkins_9col_chunk(chunk))

checkins_core = pd.concat(chunks, ignore_index=True) if chunks else pd.DataFrame(columns=["user_id","venue_id","ts","tz_offset_min"])
LOGGER.info(f"[C1] checkins_core after sampling filter: {checkins_core.shape}")

# ---- Step 1d: join POIs (full POI file might be big, but 672MB is still ok once)
# Optional optimization: filter POIs to only venues in sampled checkins.
LOGGER.info("[C1] loading POIs ...")
pois = read_pois_minimal(poi_path)

if len(checkins_core) > 0:
    needed_venues = set(checkins_core["venue_id"].unique().tolist())
    pois = pois[pois["venue_id"].isin(needed_venues)].copy()
    LOGGER.info(f"[C1] POIs filtered to used venues: {pois.shape}")

LOGGER.info("[C1] join checkins with POIs ...")
checkins_raw = checkins_core.merge(pois, on="venue_id", how="left")

checkins_raw["lat"] = pd.to_numeric(checkins_raw["lat"], errors="coerce")
checkins_raw["lon"] = pd.to_numeric(checkins_raw["lon"], errors="coerce")

LOGGER.info(f"[C1] edges_raw={edges_raw.shape} | checkins_raw={checkins_raw.shape}")
LOGGER.info(f"[C1] ts_parse_ok_rate={(checkins_raw['ts'].notna().mean()):.3f} | lat_ok_rate={(checkins_raw['lat'].notna().mean()):.3f} | lon_ok_rate={(checkins_raw['lon'].notna().mean()):.3f}")

edges_raw.head(), checkins_raw.head()


[05:38:04] INFO - [C1] dataset=lbsn2vec | snapshot=old | sampling=0.10
[05:38:04] INFO - [C1] collecting unique users (chunked) to sample ...
[05:41:36] INFO - [C1] sampled users: 11,433
[05:41:36] INFO - [C1] loading edges ...
[05:41:37] INFO - [C1] edges_raw after sampling filter: (3474, 2)
[05:41:37] INFO - [C1] loading curated checkins (chunked) ...
[05:45:14] INFO - [C1] checkins_core after sampling filter: (2284971, 4)
[05:45:14] INFO - [C1] loading POIs ...
[05:47:42] INFO - [C1] POIs filtered to used venues: (679734, 3)
[05:47:42] INFO - [C1] join checkins with POIs ...
[05:47:43] INFO - [C1] edges_raw=(3474, 2) | checkins_raw=(2284971, 6)
[05:47:43] INFO - [C1] ts_parse_ok_rate=1.000 | lat_ok_rate=1.000 | lon_ok_rate=1.000


(       u      v
 235  190  76637
 803  541   3063
 809  541   4562
 812  541   4826
 824  541   7199,
    user_id                  venue_id                        ts  tz_offset_min  \
 0  1583419  4f770f6be4b0f52db8b2976b 2012-04-03 18:00:13+00:00          180.0   
 1  1270024  4d273474915fa093c63df109 2012-04-03 18:00:24+00:00          480.0   
 2  1599762  4e482dde1f6e29f10dbd6e41 2012-04-03 18:00:28+00:00          180.0   
 3   295503  4df7a1d01838c789e8ba2af6 2012-04-03 18:00:39+00:00         -240.0   
 4  1583941  4e08b4fbfa767637fd324447 2012-04-03 18:00:47+00:00         -300.0   
 
          lat         lon  
 0  37.926483   23.709966  
 1   2.933741  101.766744  
 2  29.309856   47.968669  
 3  34.015523  -84.576258  
 4  34.806880  -87.679396  )

### Step 2: cleaning chuẩn hoá

In [None]:

def make_undirected_dedup(edges: pd.DataFrame) -> pd.DataFrame:
    u = edges["u"].astype(str).to_numpy()
    v = edges["v"].astype(str).to_numpy()
    u2 = np.where(u <= v, u, v)
    v2 = np.where(u <= v, v, u)
    out = pd.DataFrame({"u": u2, "v": v2})
    out = out[out["u"] != out["v"]]              
    out = out.drop_duplicates(["u", "v"]).reset_index(drop=True)
    return out

def clean_checkins(chk: pd.DataFrame, cfg: dict) -> pd.DataFrame:
    lat_lo, lat_hi = cfg["preprocess"]["lat_range"]
    lon_lo, lon_hi = cfg["preprocess"]["lon_range"]

    out = chk.copy()
    out["user_id"] = out["user_id"].astype(str)
    out["venue_id"] = out["venue_id"].astype(str)
    out["ts"] = pd.to_datetime(out["ts"], errors="coerce")
    out["lat"] = pd.to_numeric(out["lat"], errors="coerce")
    out["lon"] = pd.to_numeric(out["lon"], errors="coerce")


    out = out.dropna(subset=["user_id", "venue_id", "ts", "lat", "lon"])


    out = out[(out["lat"] >= lat_lo) & (out["lat"] <= lat_hi) & (out["lon"] >= lon_lo) & (out["lon"] <= lon_hi)]


    try:
        if hasattr(out["ts"].dt, "tz") and out["ts"].dt.tz is not None:
            out["ts"] = out["ts"].dt.tz_convert("UTC").dt.tz_localize(None)
        else:
            out["ts"] = pd.to_datetime(out["ts"], errors="coerce")
    except Exception:
        out["ts"] = pd.to_datetime(out["ts"], errors="coerce")

    out = out.dropna(subset=["ts"]).reset_index(drop=True)
    return out[["user_id", "ts", "lat", "lon", "venue_id"]].copy()

LOGGER.info("[C2] cleaning edges ...")
edges_clean = make_undirected_dedup(edges_raw)

LOGGER.info("[C2] cleaning checkins ...")
checkins_clean = clean_checkins(checkins_raw, CFG)

LOGGER.info(f"[C2] edges_clean={edges_clean.shape} | checkins_clean={checkins_clean.shape}")
LOGGER.info(f"[C2] checkins users={checkins_clean['user_id'].nunique()} | venues={checkins_clean['venue_id'].nunique()}")

edges_clean.head(), checkins_clean.head()


[05:47:43] INFO - [C2] cleaning edges ...
[05:47:43] INFO - [C2] cleaning checkins ...
[05:47:46] INFO - [C2] edges_clean=(3474, 2) | checkins_clean=(2284967, 5)
[05:47:47] INFO - [C2] checkins users=11433 | venues=679733


(      u      v
 0   190  76637
 1  3063    541
 2  4562    541
 3  4826    541
 4   541   7199,
    user_id                  ts        lat         lon  \
 0  1583419 2012-04-03 18:00:13  37.926483   23.709966   
 1  1270024 2012-04-03 18:00:24   2.933741  101.766744   
 2  1599762 2012-04-03 18:00:28  29.309856   47.968669   
 3   295503 2012-04-03 18:00:39  34.015523  -84.576258   
 4  1583941 2012-04-03 18:00:47  34.806880  -87.679396   
 
                    venue_id  
 0  4f770f6be4b0f52db8b2976b  
 1  4d273474915fa093c63df109  
 2  4e482dde1f6e29f10dbd6e41  
 3  4df7a1d01838c789e8ba2af6  
 4  4e08b4fbfa767637fd324447  )

### Step 3: filter users + induced subgraph

In [13]:

def degree_from_edges(edges: pd.DataFrame) -> pd.Series:
    u = edges["u"].astype(str)
    v = edges["v"].astype(str)
    return pd.concat([u, v]).value_counts()

def filter_induced_once(edges: pd.DataFrame, chk: pd.DataFrame, k: int, d: int):
    ccount = chk["user_id"].astype(str).value_counts()
    deg = degree_from_edges(edges)

    users_ok = ccount[ccount >= k].index
    deg_ok = deg[deg >= d].index
    v_keep = pd.Index(users_ok).intersection(pd.Index(deg_ok))

    edges2 = edges[edges["u"].isin(v_keep) & edges["v"].isin(v_keep)].copy().reset_index(drop=True)
    chk2 = chk[chk["user_id"].isin(v_keep)].copy().reset_index(drop=True)

    return v_keep, edges2, chk2

k = int(CFG["preprocess"].get("min_checkins", 10))
d = int(CFG["preprocess"].get("min_degree", 3))
iterative = bool(CFG["preprocess"].get("iterative_filter", True))

LOGGER.info(f"[C3] start filter: min_checkins(k)={k}, min_degree(d)={d}, iterative={iterative}")

edges_tmp = edges_clean.copy()
chk_tmp = checkins_clean.copy()

prev_users = -1
history = []
for r in range(1, 21):
    v_keep, edges_tmp, chk_tmp = filter_induced_once(edges_tmp, chk_tmp, k=k, d=d)
    n_users = len(v_keep)
    history.append((r, n_users, len(edges_tmp), len(chk_tmp)))
    LOGGER.info(f"[C3] round={r} | users={n_users} | edges={len(edges_tmp)} | checkins={len(chk_tmp)}")

    if (not iterative) or (n_users == prev_users):
        break
    prev_users = n_users

users_final = pd.DataFrame({"user_id": pd.Index(chk_tmp["user_id"].unique()).sort_values()})
edges_final = edges_tmp
checkins_final = chk_tmp

LOGGER.info(f"[C3] DONE | users_final={users_final.shape} | edges_final={edges_final.shape} | checkins_final={checkins_final.shape}")
history[:5], history[-1]


[05:47:47] INFO - [C3] start filter: min_checkins(k)=10, min_degree(d)=3, iterative=True
[05:47:48] INFO - [C3] round=1 | users=658 | edges=935 | checkins=159222
[05:47:48] INFO - [C3] round=2 | users=310 | edges=592 | checkins=77112
[05:47:48] INFO - [C3] round=3 | users=231 | edges=487 | checkins=56410
[05:47:48] INFO - [C3] round=4 | users=207 | edges=449 | checkins=48501
[05:47:48] INFO - [C3] round=5 | users=196 | edges=431 | checkins=46550
[05:47:48] INFO - [C3] round=6 | users=191 | edges=424 | checkins=45820
[05:47:48] INFO - [C3] round=7 | users=187 | edges=419 | checkins=44916
[05:47:48] INFO - [C3] round=8 | users=185 | edges=415 | checkins=44375
[05:47:48] INFO - [C3] round=9 | users=182 | edges=410 | checkins=43968
[05:47:48] INFO - [C3] round=10 | users=179 | edges=407 | checkins=43487
[05:47:48] INFO - [C3] round=11 | users=178 | edges=405 | checkins=43352
[05:47:48] INFO - [C3] round=12 | users=178 | edges=405 | checkins=43352
[05:47:48] INFO - [C3] DONE | users_final=(

([(1, 658, 935, 159222),
  (2, 310, 592, 77112),
  (3, 231, 487, 56410),
  (4, 207, 449, 48501),
  (5, 196, 431, 46550)],
 (12, 178, 405, 43352))

### Step 4: build X_users 

In [None]:


def _entropy_from_counts(counts: np.ndarray, eps: float = 1e-12) -> float:
    s = counts.sum()
    if s <= 0:
        return 0.0
    p = counts / (s + eps)
    p = p[p > 0]
    return float(-(p * np.log(p + eps)).sum())

def _haversine_km(lat1, lon1, lat2, lon2):

    R = 6371.0
    lat1 = np.radians(lat1); lon1 = np.radians(lon1)
    lat2 = np.radians(lat2); lon2 = np.radians(lon2)
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat/2.0)**2 + np.cos(lat1)*np.cos(lat2)*np.sin(dlon/2.0)**2
    c = 2*np.arctan2(np.sqrt(a), np.sqrt(1-a))
    return R * c

def build_user_features_from_checkins(
    users_final: pd.DataFrame,
    checkins_final: pd.DataFrame,
    log1p_counts: bool = True,
    standardize: bool = True
):

    chk = checkins_final.copy()
    chk["user_id"] = chk["user_id"].astype(str)
    chk["venue_id"] = chk["venue_id"].astype(str)
    chk["ts"] = pd.to_datetime(chk["ts"], errors="coerce")
    chk = chk.dropna(subset=["ts", "lat", "lon", "user_id"])
    chk["hour"] = chk["ts"].dt.hour.astype(int)
    chk["dow"]  = chk["ts"].dt.dayofweek.astype(int)  #
    chk["date"] = chk["ts"].dt.date

    g = chk.groupby("user_id", sort=False)


    num_checkins = g.size().rename("num_checkins")

    num_active_days = g["date"].nunique().rename("num_active_days")
    num_unique_venues = g["venue_id"].nunique().rename("num_unique_venues")

    mean_lat = g["lat"].mean().rename("mean_lat")
    mean_lon = g["lon"].mean().rename("mean_lon")
    std_lat  = g["lat"].std(ddof=0).fillna(0.0).rename("std_lat")
    std_lon  = g["lon"].std(ddof=0).fillna(0.0).rename("std_lon")

    rog = {}
    med_dist = {}
    for uid, sub in g:
        latc = float(sub["lat"].mean())
        lonc = float(sub["lon"].mean())
        d = _haversine_km(sub["lat"].to_numpy(), sub["lon"].to_numpy(), latc, lonc)
        rog[uid] = float(np.sqrt(np.mean(d**2))) if len(d) else 0.0
        med_dist[uid] = float(np.median(d)) if len(d) else 0.0
    rog = pd.Series(rog, name="radius_of_gyration_km")
    med_dist = pd.Series(med_dist, name="median_dist_to_centroid_km")


    hour_counts = pd.crosstab(chk["user_id"], chk["hour"])  
    for h in range(24):
        if h not in hour_counts.columns:
            hour_counts[h] = 0
    hour_counts = hour_counts[list(range(24))].sort_index(axis=1)

    dow_counts = pd.crosstab(chk["user_id"], chk["dow"])
    for d0 in range(7):
        if d0 not in dow_counts.columns:
            dow_counts[d0] = 0
    dow_counts = dow_counts[list(range(7))].sort_index(axis=1)

    hour_entropy = hour_counts.apply(lambda r: _entropy_from_counts(r.to_numpy()), axis=1).rename("hour_entropy")
    dow_entropy  = dow_counts.apply(lambda r: _entropy_from_counts(r.to_numpy()), axis=1).rename("dow_entropy")

    venue_entropy = g["venue_id"].apply(lambda s: _entropy_from_counts(s.value_counts().to_numpy())).rename("venue_entropy")

    feat = pd.concat([
        num_checkins, num_active_days, num_unique_venues,
        mean_lat, mean_lon, std_lat, std_lon,
        rog, med_dist,
        hour_entropy, dow_entropy, venue_entropy
    ], axis=1)

    hour_prop = hour_counts.div(hour_counts.sum(axis=1).replace(0, np.nan), axis=0).fillna(0.0)
    hour_prop.columns = [f"hour_{h:02d}_p" for h in hour_prop.columns]

    dow_prop = dow_counts.div(dow_counts.sum(axis=1).replace(0, np.nan), axis=0).fillna(0.0)
    dow_prop.columns = [f"dow_{d0}_p" for d0 in dow_prop.columns]

    feat = feat.join(hour_prop, how="left").join(dow_prop, how="left").fillna(0.0)

    if log1p_counts:
        for c in ["num_checkins", "num_active_days", "num_unique_venues"]:
            feat[c] = np.log1p(feat[c].astype(float))


    user_order = users_final["user_id"].astype(str).tolist()
    feat = feat.reindex(user_order).fillna(0.0)
    feat.index.name = "user_id"

    if standardize:
        mu = feat.mean(axis=0)
        sd = feat.std(axis=0, ddof=0).replace(0, 1.0)
        feat = (feat - mu) / sd

    X_users = feat.to_numpy(dtype=np.float32)
    feature_names = feat.columns.tolist()
    return X_users, feat, feature_names

X_users, feat_df, feature_names = build_user_features_from_checkins(
    users_final, checkins_final,
    log1p_counts=bool(CFG.get("features", {}).get("log1p_counts", True)),
    standardize=bool(CFG.get("features", {}).get("standardize", True)),
)

LOGGER.info(f"[C4] X_users shape = {X_users.shape} | #features={len(feature_names)}")
feat_df.head()


[05:47:48] INFO - [C4] X_users shape = (178, 43) | #features=43


Unnamed: 0_level_0,num_checkins,num_active_days,num_unique_venues,mean_lat,mean_lon,std_lat,std_lon,radius_of_gyration_km,median_dist_to_centroid_km,hour_entropy,...,hour_21_p,hour_22_p,hour_23_p,dow_0_p,dow_1_p,dow_2_p,dow_3_p,dow_4_p,dow_5_p,dow_6_p
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10067,0.590789,0.909083,1.322706,1.302186,0.049988,-0.305042,-0.256626,-0.316204,-0.208769,0.111082,...,-0.624574,-0.872944,-1.02342,1.215113,0.926638,-0.356168,0.820342,0.130334,-0.400101,-1.34065
1010482,1.700363,1.434874,2.172274,0.588717,-1.053078,-0.449577,-0.183486,-0.282021,-0.193406,1.415218,...,0.799747,0.299856,0.233436,-0.760165,-0.413037,-0.208375,0.458095,0.503226,0.283624,-0.004398
1020789,-0.038288,-0.094755,-0.720353,0.166448,1.019148,-0.467476,-0.144914,-0.232801,-0.190406,0.378636,...,-1.251141,-0.876722,-1.272655,2.166257,-0.608678,0.511357,0.197082,0.44561,-0.481001,-1.184623
103827,-0.40647,-0.151763,-1.216992,1.135293,1.70105,-0.443254,-0.229466,-0.340867,-0.162072,-1.714017,...,-1.251141,-0.964742,4.276102,-0.058243,-0.65662,2.876468,-1.747237,-0.359127,-0.432351,0.354759
1064906,2.149414,1.68661,2.318824,0.267539,-0.88117,-0.4208,-0.281658,-0.367026,-0.204562,1.232195,...,0.024037,0.232916,0.439276,-0.753861,1.469926,0.123061,-0.074554,-0.066282,-0.74777,0.295944


### Step 5: GraphSAGE unsupervised

In [None]:


try:
    import torch
    import torch.nn as nn
    import torch.nn.functional as F
except Exception as e:
    raise RuntimeError("Step C5 requires PyTorch. Please install torch (GPU if available) before running C5.") from e

from collections import defaultdict
import random
import numpy as np

def build_id_maps(users_final: pd.DataFrame):
    user_ids = users_final["user_id"].astype(str).tolist()
    id2idx = {u:i for i,u in enumerate(user_ids)}
    idx2id = user_ids
    return id2idx, idx2id

def build_adj_list(edges_final: pd.DataFrame, id2idx: dict):
    adj = defaultdict(list)
    for u, v in zip(edges_final["u"].astype(str), edges_final["v"].astype(str)):
        if u in id2idx and v in id2idx:
            ui = id2idx[u]; vi = id2idx[v]
            if ui != vi:
                adj[ui].append(vi)
                adj[vi].append(ui)
    return adj

def sample_neighbors(adj, nodes, sample_size: int):
    out = []
    for n in nodes.tolist():
        neigh = adj.get(n, [])
        if len(neigh) == 0:
            out.append([n])
        elif len(neigh) <= sample_size:
            out.append(neigh)
        else:
            out.append(random.sample(neigh, sample_size))
    return out

class MeanAggregator(nn.Module):
    def __init__(self, in_dim, out_dim):
        super().__init__()
        self.lin = nn.Linear(in_dim * 2, out_dim)

    def forward(self, self_h, neigh_h):
        return self.lin(torch.cat([self_h, neigh_h], dim=1))

class GraphSAGE(nn.Module):
    def __init__(self, in_dim, hidden_dim, embed_dim):
        super().__init__()
        self.agg1 = MeanAggregator(in_dim, hidden_dim)
        self.agg2 = MeanAggregator(hidden_dim, embed_dim)

    def forward_batch(self, x, adj, batch_nodes, sizes=(25,10)):
        B = batch_nodes.shape[0]
        S1, S2 = sizes

        # Layer 1
        neigh1 = sample_neighbors(adj, batch_nodes, S1)
        self_h0 = x[batch_nodes] 
        neigh1_mean0 = []
        for i in range(B):
            idxs = torch.tensor(neigh1[i], device=x.device, dtype=torch.long)
            neigh1_mean0.append(x[idxs].mean(dim=0))
        neigh1_mean0 = torch.stack(neigh1_mean0, dim=0)
        h1 = F.relu(self.agg1(self_h0, neigh1_mean0))  

        # Layer 2 
        neigh2 = sample_neighbors(adj, batch_nodes, S2)
        neigh2_mean1 = []
        for i in range(B):
            idxs = torch.tensor(neigh2[i], device=x.device, dtype=torch.long)
            n_nodes = idxs
            nB = n_nodes.shape[0]
            n_self0 = x[n_nodes]
            n_neigh1 = sample_neighbors(adj, n_nodes, S1)
            n_neigh_mean0 = []
            for j in range(nB):
                j_idxs = torch.tensor(n_neigh1[j], device=x.device, dtype=torch.long)
                n_neigh_mean0.append(x[j_idxs].mean(dim=0))
            n_neigh_mean0 = torch.stack(n_neigh_mean0, dim=0)
            n_h1 = F.relu(self.agg1(n_self0, n_neigh_mean0))  
            neigh2_mean1.append(n_h1.mean(dim=0))
        neigh2_mean1 = torch.stack(neigh2_mean1, dim=0)

        z = self.agg2(h1, neigh2_mean1)
        z = F.normalize(z, p=2, dim=1)
        return z

def train_graphsage_unsup(
    edges_final: pd.DataFrame,
    users_final: pd.DataFrame,
    X_users: np.ndarray,
    hidden_dim=128,
    embed_dim=128,
    neighbor_sampling=(25,10),
    epochs=3,
    batch_size=1024,
    num_negative=5,
    lr=1e-3,
    device=None,
    seed=42,
    train_edge_frac: float = 0.1,   
):
    if not (0.0 < train_edge_frac <= 1.0):
        raise ValueError("train_edge_frac must be in (0, 1].")

    random.seed(seed); np.random.seed(seed); torch.manual_seed(seed)

    id2idx, idx2id = build_id_maps(users_final)
    adj = build_adj_list(edges_final, id2idx)


    pos_u = []
    pos_v = []
    for u, v in zip(edges_final["u"].astype(str), edges_final["v"].astype(str)):
        if u in id2idx and v in id2idx:
            ui = id2idx[u]; vi = id2idx[v]
            if ui != vi:
                pos_u.append(ui); pos_v.append(vi)

    pos_u = np.array(pos_u, dtype=np.int64)
    pos_v = np.array(pos_v, dtype=np.int64)
    if len(pos_u) == 0:
        raise ValueError("No positive edges after filtering. Check Step 3 outputs.")


    if train_edge_frac < 1.0:
        m = len(pos_u)
        m_sub = max(1, int(m * train_edge_frac))
        idx = np.random.choice(m, size=m_sub, replace=False)
        pos_u = pos_u[idx]
        pos_v = pos_v[idx]

    N = len(idx2id)
    in_dim = X_users.shape[1]

    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"

    x = torch.tensor(X_users, dtype=torch.float32, device=device)
    model = GraphSAGE(in_dim=in_dim, hidden_dim=hidden_dim, embed_dim=embed_dim).to(device)
    opt = torch.optim.Adam(model.parameters(), lr=lr)

    LOGGER.info(f"[C5] device={device} | N={N} | train_pos_edges={len(pos_u)} (frac={train_edge_frac}) | in_dim={in_dim}")
    LOGGER.info(f"[C5] epochs={epochs} | batch_size={batch_size} | neg={num_negative} | neighbor_sampling={neighbor_sampling}")

    num_batches = int(np.ceil(len(pos_u) / batch_size))

    for ep in range(1, epochs+1):
        perm = np.random.permutation(len(pos_u))
        pu = pos_u[perm]; pv = pos_v[perm]
        total_loss = 0.0
        model.train()

        for b in range(num_batches):
            s = b * batch_size
            e = min((b+1) * batch_size, len(pu))
            bu = torch.tensor(pu[s:e], device=device)
            bv = torch.tensor(pv[s:e], device=device)

            zu = model.forward_batch(x, adj, bu, sizes=neighbor_sampling)
            zv = model.forward_batch(x, adj, bv, sizes=neighbor_sampling)

            pos_logits = (zu * zv).sum(dim=1)

            neg_v = torch.randint(low=0, high=N, size=(bu.shape[0], num_negative), device=device)
            neg_logits = []
            for j in range(num_negative):
                zvn = model.forward_batch(x, adj, neg_v[:, j], sizes=neighbor_sampling)
                neg_logits.append((zu * zvn).sum(dim=1))
            neg_logits = torch.stack(neg_logits, dim=1)

            loss_pos = F.binary_cross_entropy_with_logits(pos_logits, torch.ones_like(pos_logits))
            loss_neg = F.binary_cross_entropy_with_logits(neg_logits, torch.zeros_like(neg_logits))
            loss = loss_pos + loss_neg

            opt.zero_grad()
            loss.backward()
            opt.step()

            total_loss += float(loss.item())

        LOGGER.info(f"[C5] epoch {ep}/{epochs} | avg_loss={(total_loss/num_batches):.4f}")


    model.eval()
    Z = np.zeros((N, embed_dim), dtype=np.float32)
    with torch.no_grad():
        all_idx = torch.arange(N, device=device)
        bs = 2048
        for s in range(0, N, bs):
            idx = all_idx[s:s+bs]
            z = model.forward_batch(x, adj, idx, sizes=neighbor_sampling)
            Z[s:s+len(idx)] = z.detach().cpu().numpy()

    return Z, model


m_cfg = CFG.get("model", {})
t_cfg = CFG.get("train", {})


TRAIN_EDGE_FRAC = float(t_cfg.get("train_edge_frac", 0.1))  # default 10% if not set

Z, sage_model = train_graphsage_unsup(
    edges_final=edges_final,
    users_final=users_final,
    X_users=X_users,
    hidden_dim=int(m_cfg.get("hidden_dim", 128)),
    embed_dim=int(m_cfg.get("embed_dim", 128)),
    neighbor_sampling=tuple(m_cfg.get("neighbor_sampling", [25,10])),
    epochs=int(t_cfg.get("epochs", 3)),
    batch_size=int(t_cfg.get("batch_size", 1024)),
    num_negative=int(t_cfg.get("num_negative", 5)),
    lr=float(t_cfg.get("lr", 1e-3)),
    seed=int(CFG.get("run", {}).get("seed", 42)),
    train_edge_frac=TRAIN_EDGE_FRAC,   
)

LOGGER.info(f"[C5] Z shape = {Z.shape}")


[05:48:19] INFO - [C5] device=cuda | N=178 | train_pos_edges=40 (frac=0.1) | in_dim=43
[05:48:19] INFO - [C5] epochs=10 | batch_size=1024 | neg=5 | neighbor_sampling=(25, 10)
[05:48:20] INFO - [C5] epoch 1/10 | avg_loss=1.3778
[05:48:21] INFO - [C5] epoch 2/10 | avg_loss=1.2876
[05:48:21] INFO - [C5] epoch 3/10 | avg_loss=1.2168
[05:48:22] INFO - [C5] epoch 4/10 | avg_loss=1.1572
[05:48:22] INFO - [C5] epoch 5/10 | avg_loss=1.1414
[05:48:22] INFO - [C5] epoch 6/10 | avg_loss=1.1392
[05:48:23] INFO - [C5] epoch 7/10 | avg_loss=1.1192
[05:48:23] INFO - [C5] epoch 8/10 | avg_loss=1.1259
[05:48:23] INFO - [C5] epoch 9/10 | avg_loss=1.1031
[05:48:24] INFO - [C5] epoch 10/10 | avg_loss=1.1021
[05:48:24] INFO - [C5] Z shape = (178, 128)


### Step 6: kNN similarity graph + Leiden -> community labels

In [None]:

import numpy as np
import pandas as pd

def l2_normalize_rows(X: np.ndarray, eps: float = 1e-12) -> np.ndarray:
    n = np.linalg.norm(X, axis=1, keepdims=True)
    return X / (n + eps)

def build_knn_edges_cosine(Z: np.ndarray, k: int = 30, mutual: bool = True):

    Z = l2_normalize_rows(Z.astype(np.float32))

    try:
        from sklearn.neighbors import NearestNeighbors
        nn = NearestNeighbors(n_neighbors=min(k + 1, Z.shape[0]), metric="cosine", algorithm="auto")
        nn.fit(Z)
        dist, ind = nn.kneighbors(Z, return_distance=True)  
        sim = 1.0 - dist 
        ind = ind[:, 1:]
        sim = sim[:, 1:]
    except Exception:
        N = Z.shape[0]
        if N > 20000:
            raise RuntimeError("sklearn not available and N is large; please install scikit-learn for kNN.")
        S = Z @ Z.T  
        np.fill_diagonal(S, -np.inf)
        ind = np.argpartition(-S, kth=min(k, N-1), axis=1)[:, :k]

        row = np.arange(N)[:, None]
        sims = S[row, ind]
        order = np.argsort(-sims, axis=1)
        ind = ind[row, order]
        sim = sims[row, order]


    src = np.repeat(np.arange(Z.shape[0]), ind.shape[1])
    dst = ind.reshape(-1)
    w = sim.reshape(-1)

    if not mutual:
        return src, dst, w


    pairs = set(zip(src.tolist(), dst.tolist()))
    keep = np.array([ (j, i) in pairs for i, j in zip(src, dst) ], dtype=bool)
    return src[keep], dst[keep], w[keep]

def leiden_partition_from_edges(n_nodes: int, src: np.ndarray, dst: np.ndarray, w: np.ndarray, resolution: float = 1.0):

    try:
        import igraph as ig
    except Exception as e:
        raise RuntimeError("C6 requires python-igraph for Leiden. Install: pip install igraph") from e

    g = ig.Graph(n=n_nodes, edges=list(zip(src.tolist(), dst.tolist())), directed=False)
    g.es["weight"] = w.astype(float).tolist()


    try:
        import leidenalg
        part = leidenalg.find_partition(
            g,
            leidenalg.RBConfigurationVertexPartition,
            weights="weight",
            resolution_parameter=float(resolution),
        )
        labels = np.array(part.membership, dtype=np.int64)
        method = "leiden"
        quality = float(part.quality())
    except Exception:

        part = g.community_multilevel(weights="weight")
        labels = np.array(part.membership, dtype=np.int64)
        method = "igraph_multilevel"
        quality = float(part.modularity)

    return labels, {"method": method, "quality": quality, "n_communities": int(labels.max() + 1)}

# ---- run C6 ----
c_cfg = CFG.get("community", {}) if "CFG" in globals() else {}
knn_k = int(c_cfg.get("knn_k", 30))
mutual_knn = bool(c_cfg.get("mutual_knn", True))
resolution = float(c_cfg.get("leiden_resolution", 1.0))

LOGGER.info(f"[C6] building kNN graph: k={knn_k}, mutual={mutual_knn}, resolution={resolution}")

src, dst, w = build_knn_edges_cosine(Z, k=knn_k, mutual=mutual_knn)
LOGGER.info(f"[C6] kNN edges kept = {len(src)}")

labels, info = leiden_partition_from_edges(n_nodes=Z.shape[0], src=src, dst=dst, w=w, resolution=resolution)
LOGGER.info(f"[C6] method={info['method']} | n_communities={info['n_communities']} | quality={info['quality']:.4f}")

comm_df = pd.DataFrame({
    "user_id": users_final["user_id"].astype(str).tolist(),
    "community_id": labels.astype(int)
})


sizes = comm_df["community_id"].value_counts().sort_values(ascending=False)
comm_stats = {
    "n_users": int(len(comm_df)),
    "n_communities": int(info["n_communities"]),
    "largest_comm_size": int(sizes.iloc[0]) if len(sizes) else 0,
    "median_comm_size": float(sizes.median()) if len(sizes) else 0.0,
    "method": info["method"],
    "quality": info["quality"],
}
LOGGER.info(f"[C6] stats: {comm_stats}")

comm_df.head(), sizes.head()


[05:48:24] INFO - [C6] building kNN graph: k=30, mutual=True, resolution=1.0
[05:48:40] INFO - [C6] kNN edges kept = 3772
[05:48:45] INFO - [C6] method=leiden | n_communities=5 | quality=4218.8898
[05:48:45] INFO - [C6] stats: {'n_users': 178, 'n_communities': 5, 'largest_comm_size': 51, 'median_comm_size': 35.0, 'method': 'leiden', 'quality': 4218.88981186899}


(   user_id  community_id
 0    10067             4
 1  1010482             3
 2  1020789             0
 3   103827             0
 4  1064906             3,
 community_id
 0    51
 1    38
 2    35
 3    32
 4    22
 Name: count, dtype: int64)

### Step 7: metrics + random baselines

In [None]:

import numpy as np
import pandas as pd

def haversine_km_vec(lat, lon, lat2, lon2):
    R = 6371.0
    lat = np.radians(lat); lon = np.radians(lon)
    lat2 = np.radians(lat2); lon2 = np.radians(lon2)
    dlat = lat2 - lat
    dlon = lon2 - lon
    a = np.sin(dlat/2.0)**2 + np.cos(lat)*np.cos(lat2)*np.sin(dlon/2.0)**2
    c = 2*np.arctan2(np.sqrt(a), np.sqrt(1-a))
    return R * c

def compute_user_centroids(checkins_final: pd.DataFrame) -> pd.DataFrame:
    chk = checkins_final.copy()
    chk["user_id"] = chk["user_id"].astype(str)
    cent = chk.groupby("user_id")[["lat","lon"]].mean().reset_index()
    cent.columns = ["user_id", "user_lat", "user_lon"]
    return cent

def structural_metrics_igraph(edges_final: pd.DataFrame, comm_df: pd.DataFrame):

    import igraph as ig

    users = comm_df["user_id"].astype(str).tolist()
    id2idx = {u:i for i,u in enumerate(users)}

    # Build graph
    e = edges_final.copy()
    e["u"] = e["u"].astype(str); e["v"] = e["v"].astype(str)
    e = e[e["u"].isin(id2idx) & e["v"].isin(id2idx)]
    edges_idx = list(zip(e["u"].map(id2idx).tolist(), e["v"].map(id2idx).tolist()))
    g = ig.Graph(n=len(users), edges=edges_idx, directed=False)

    labels = comm_df.set_index("user_id").loc[users, "community_id"].to_numpy()
    mod = float(g.modularity(labels))

    deg = np.array(g.degree(), dtype=np.float64)
    total_vol = deg.sum()
    comm_to_nodes = {}
    for i, c in enumerate(labels):
        comm_to_nodes.setdefault(int(c), []).append(i)

    adj = [set(g.neighbors(i)) for i in range(g.vcount())]

    conductances = []
    densities = []
    for c, nodes in comm_to_nodes.items():
        nodes_set = set(nodes)
        cut = 0
        internal = 0
        for u in nodes:
            for v in adj[u]:
                if v in nodes_set:
                    internal += 1
                else:
                    cut += 1

        internal = internal / 2.0
        volS = deg[list(nodes)].sum()
        volT = total_vol - volS
        denom = min(volS, volT) if min(volS, volT) > 0 else np.nan
        phi = (cut / denom) if denom and not np.isnan(denom) else np.nan
        conductances.append(phi)

        n = len(nodes)
        possible = n * (n - 1) / 2.0
        dens = (internal / possible) if possible > 0 else np.nan
        densities.append(dens)

    return {
        "modularity": mod,
        "conductance_mean": float(np.nanmean(conductances)) if len(conductances) else np.nan,
        "conductance_median": float(np.nanmedian(conductances)) if len(conductances) else np.nan,
        "intra_density_mean": float(np.nanmean(densities)) if len(densities) else np.nan,
        "intra_density_median": float(np.nanmedian(densities)) if len(densities) else np.nan,
    }

def spatial_cohesion_metrics(comm_df: pd.DataFrame, user_centroids: pd.DataFrame):

    df = comm_df.merge(user_centroids, on="user_id", how="left").dropna(subset=["user_lat","user_lon"])
    g = df.groupby("community_id")

    rows = []
    for cid, sub in g:
        latc = sub["user_lat"].mean()
        lonc = sub["user_lon"].mean()
        d = haversine_km_vec(sub["user_lat"].to_numpy(), sub["user_lon"].to_numpy(), latc, lonc)
        rows.append({
            "community_id": int(cid),
            "comm_size": int(len(sub)),
            "spatial_median_km": float(np.median(d)) if len(d) else np.nan,
            "spatial_mean_km": float(np.mean(d)) if len(d) else np.nan,
        })
    return pd.DataFrame(rows)

def random_baseline_same_sizes(comm_df: pd.DataFrame, n_runs: int = 10, seed: int = 42):

    rng = np.random.default_rng(seed)
    labels = comm_df["community_id"].to_numpy()

    baselines = []
    for r in range(n_runs):
        perm = labels.copy()
        rng.shuffle(perm)
        baselines.append(perm)
    return baselines


seed = int(CFG.get("run", {}).get("seed", 42)) if "CFG" in globals() else 42
n_rand = int(CFG.get("metrics", {}).get("random_baseline_runs", 10)) if "CFG" in globals() else 10

LOGGER.info("[C7] computing user centroids from checkins ...")
user_centroids = compute_user_centroids(checkins_final)

LOGGER.info("[C7] spatial cohesion per community ...")
comm_spatial = spatial_cohesion_metrics(comm_df, user_centroids)


try:
    import igraph  
    LOGGER.info("[C7] structural metrics via igraph ...")
    struct = structural_metrics_igraph(edges_final, comm_df)
except Exception as e:
    struct = {
        "modularity": np.nan,
        "conductance_mean": np.nan,
        "conductance_median": np.nan,
        "intra_density_mean": np.nan,
        "intra_density_median": np.nan,
    }
    LOGGER.warning(f"[C7] igraph not available for structural metrics. Install igraph for modularity/conductance. Reason: {type(e).__name__}: {e}")


LOGGER.info(f"[C7] random baselines: runs={n_rand} (shuffle labels, keep size distribution)")
baseline_labels_list = random_baseline_same_sizes(comm_df, n_runs=n_rand, seed=seed)

baseline_spatial_medians = []
for perm_labels in baseline_labels_list:
    tmp = comm_df.copy()
    tmp["community_id"] = perm_labels
    tmp_sp = spatial_cohesion_metrics(tmp, user_centroids)
    baseline_spatial_medians.append(float(tmp_sp["spatial_median_km"].median()))

baseline_spatial_medians = np.array(baseline_spatial_medians, dtype=float)


obs_global_spatial = float(comm_spatial["spatial_median_km"].median()) if len(comm_spatial) else np.nan
rand_mean = float(np.nanmean(baseline_spatial_medians)) if len(baseline_spatial_medians) else np.nan
rand_std  = float(np.nanstd(baseline_spatial_medians)) if len(baseline_spatial_medians) else np.nan
z_score = (obs_global_spatial - rand_mean) / (rand_std + 1e-12) if np.isfinite(obs_global_spatial) else np.nan

metrics_global = {
    "n_users": int(len(comm_df)),
    "n_communities": int(comm_df["community_id"].nunique()),
    **struct,
    "spatial_median_km_global": obs_global_spatial,
    "spatial_random_median_km_mean": rand_mean,
    "spatial_random_median_km_std": rand_std,
    "spatial_zscore_vs_random": float(z_score),
}

LOGGER.info(f"[C7] global metrics: {metrics_global}")

comm_metrics_df = comm_spatial.sort_values(["comm_size"], ascending=False).reset_index(drop=True)

metrics_global, comm_metrics_df.head(10)


[05:48:45] INFO - [C7] computing user centroids from checkins ...
[05:48:45] INFO - [C7] spatial cohesion per community ...
[05:48:45] INFO - [C7] structural metrics via igraph ...
[05:48:46] INFO - [C7] random baselines: runs=10 (shuffle labels, keep size distribution)
[05:48:46] INFO - [C7] global metrics: {'n_users': 178, 'n_communities': 5, 'modularity': 0.6231763450693492, 'conductance_mean': 0.1660330879311771, 'conductance_median': 0.1, 'intra_density_mean': 0.1159609492917193, 'intra_density_median': 0.08907563025210084, 'spatial_median_km_global': 570.2890844457413, 'spatial_random_median_km_mean': 7788.158492197111, 'spatial_random_median_km_std': 270.8763507784968, 'spatial_zscore_vs_random': -26.64636239748222}


({'n_users': 178,
  'n_communities': 5,
  'modularity': 0.6231763450693492,
  'conductance_mean': 0.1660330879311771,
  'conductance_median': 0.1,
  'intra_density_mean': 0.1159609492917193,
  'intra_density_median': 0.08907563025210084,
  'spatial_median_km_global': 570.2890844457413,
  'spatial_random_median_km_mean': 7788.158492197111,
  'spatial_random_median_km_std': 270.8763507784968,
  'spatial_zscore_vs_random': -26.64636239748222},
    community_id  comm_size  spatial_median_km  spatial_mean_km
 0             0         51        2355.710506      2447.673152
 1             1         38         570.289084      1150.922755
 2             2         35         564.708539       961.085564
 3             3         32         173.740222       686.017419
 4             4         22        1381.782224      1401.957075)