# Shenzhen exposure-weight demo (Colab / local)

This notebook computes **user × grid exposure weights** in lunch/dinner windows using `poi_visit_aggregator`.

Outputs (per city): `user_grid_time_strict_filled_<city>.parquet` + `qa_summary_strict_filled_<city>.csv`.

Tip: for stability and performance, put `tmp_root` and the DuckDB temp directory on a local disk (Colab: `/tmp`; Windows: e.g. `C:\temp`).


In [None]:
from __future__ import annotations

from pathlib import Path
import os
import sys

import numpy as np
import pandas as pd

# --- Colab detection ---
# A successful import of `google.colab` is taken to mean "running in Colab";
# any failure while importing it is taken to mean "not in Colab".
try:
    from google.colab import drive  # type: ignore
except Exception:
    IN_COLAB = False
else:
    IN_COLAB = True


def _find_repo_root(start: Path) -> Path:
    """Locate the repository root at or above ``start``.

    A directory counts as the root when it contains both ``pyproject.toml``
    and a ``poi_visit_aggregator`` entry. The search begins at the resolved
    ``start`` and walks up through its parents; the nearest match wins.

    Args:
        start: Directory to begin searching from.

    Returns:
        The first matching directory, or the resolved ``start`` when no
        ancestor matches.
    """
    origin = start.resolve()
    candidates = (origin, *origin.parents)
    return next(
        (
            folder
            for folder in candidates
            if (folder / "pyproject.toml").exists()
            and (folder / "poi_visit_aggregator").exists()
        ),
        origin,
    )


# --- Colab bootstrap (Drive + optional clone) ---
# If you open this notebook directly in Colab (not opened from the repo),
# keep CLONE_REPO_IN_COLAB=True to clone the repo into Drive and add it to sys.path.
CLONE_REPO_IN_COLAB = True
COLAB_TARGET_DIR = Path("/content/drive/MyDrive/Script/Module")
COLAB_REPO_PATH = COLAB_TARGET_DIR / "poi_visit_aggregator"

if IN_COLAB and CLONE_REPO_IN_COLAB:
    drive.mount("/content/drive")
    COLAB_TARGET_DIR.mkdir(parents=True, exist_ok=True)
    # `git clone` below creates its checkout relative to the current directory.
    os.chdir(str(COLAB_TARGET_DIR))

    if not COLAB_REPO_PATH.exists():
        get_ipython().system("git clone https://github.com/weipengdeng/poi_visit_aggregator.git")
    else:
        # Updating an existing checkout is best-effort: keep going on failure,
        # but report it instead of hiding it. The path is quoted so that Drive
        # folders containing spaces still work.
        try:
            get_ipython().system(f'git -C "{COLAB_REPO_PATH}" pull')
        except Exception as exc:
            print(f"[WARN] git pull failed; continuing with the existing checkout: {exc!r}")

    # Fail with an actionable message rather than an opaque chdir error when the
    # clone did not produce the expected directory.
    if not COLAB_REPO_PATH.exists():
        raise FileNotFoundError(
            f"Repository not found at {COLAB_REPO_PATH}; the git clone above probably failed."
        )

    # Move the checkout to the front of sys.path so it takes import precedence.
    if str(COLAB_REPO_PATH) in sys.path:
        sys.path.remove(str(COLAB_REPO_PATH))
    sys.path.insert(0, str(COLAB_REPO_PATH))
    os.chdir(str(COLAB_REPO_PATH))

if not IN_COLAB:
    # Local kernels (VSCode/Jupyter) may start in `notebooks/`; anchor both the
    # working directory and the import path at the detected repo root.
    REPO_ROOT = _find_repo_root(Path.cwd())
    os.chdir(REPO_ROOT)
    if not any(entry == str(REPO_ROOT) for entry in sys.path):
        sys.path.insert(0, str(REPO_ROOT))

# psutil is optional: without it the memory helpers report NaN.
try:
    import psutil  # type: ignore
except Exception:
    psutil = None


def mem_gb():
    """Snapshot memory usage as ``(rss_gb, avail_gb, used_pct)``.

    ``rss_gb`` is this process's resident set size and ``avail_gb`` the
    system's available memory, both in units of 1e9 bytes; ``used_pct`` is
    the system-wide usage percentage. All three are NaN when psutil could
    not be imported.
    """
    if psutil is not None:
        system = psutil.virtual_memory()
        rss_bytes = psutil.Process(os.getpid()).memory_info().rss
        return (
            float(rss_bytes) / 1e9,
            float(system.available) / 1e9,
            float(system.percent),
        )
    return (np.nan, np.nan, np.nan)


# Raise pandas display limits (up to 200 columns, 140 characters wide).
pd.set_option("display.max_columns", 200)
pd.set_option("display.width", 140)

# One-off environment report: runtime flavour, working directory, memory snapshot.
print("IN_COLAB:", IN_COLAB)
print("CWD:", Path.cwd())
# mem_gb() returns NaN for every field when psutil is unavailable.
rss_gb, avail_gb, used_pct = mem_gb()
print(f"Memory: RSS={rss_gb:.2f} GB | avail={avail_gb:.2f} GB | used={used_pct:.0f}%")


In [None]:
# Install export deps (run once)
# If you rerun the notebook often, you can comment this cell after the first run.
#
# Colab:
# !pip -q install -e ".[export]"
#
# Local (VSCode/Jupyter):
# !{sys.executable} -m pip install -e ".[export]"


In [None]:
from pathlib import Path
import os
import tempfile

CITY = "Shenzhen"
# City code that matches staypoints `c_code` (also used in grid_uid).
GRID_UID_CODE = "440300"

# --- Configure paths ---
if IN_COLAB:
    DRIVE_ROOT = Path("/content/drive/MyDrive")
    DATA_ROOT = DRIVE_ROOT / "Project/202512_EFE"

    RUN_TAG = "access_k10_popcentroid_v1"
    OUT_ROOT = DATA_ROOT / "data" / "derived" / "accessibility" / RUN_TAG
    GRID_JSON_DIR = OUT_ROOT / "grid_json"

    UUID_TABLE = DATA_ROOT / "data/jike/uuid.csv"  # .csv or .parquet
    STAYPOINTS = [DATA_ROOT / "data/jike/track.csv"]
    GRID_META = GRID_JSON_DIR / f"grid_meta_{CITY}.json"
    OUT_DIR = OUT_ROOT / "out/poi_visit_aggregator"

    # Recommended: keep temp files off Google Drive.
    TMP_ROOT = Path("/tmp/poi_visit_aggregator_tmp")
    DUCKDB_TEMP_DIR = Path("/tmp/duckdb_tmp")
else:
    # TODO: edit these paths to your local disk.
    # Raw string: write single backslashes (doubling them inside r"..." keeps both).
    DATA_ROOT = Path(r"D:\Project\202512_EFE")
    UUID_TABLE = DATA_ROOT / "uuid.csv"  # .csv or .parquet
    STAYPOINTS = [DATA_ROOT / "staypoints.csv"]
    GRID_META = DATA_ROOT / f"grid_meta_{CITY}.json"
    OUT_DIR = DATA_ROOT / "out/poi_visit_aggregator"

    # Put temp on a fast local disk (avoid OneDrive folders).
    # Prefer %TEMP% when set; otherwise use the platform temp directory instead
    # of the current working directory (which may be the repo or a synced folder).
    temp_dir = Path(os.environ.get("TEMP") or tempfile.gettempdir())
    # Both locations can be overridden through environment variables.
    TMP_ROOT = Path(os.environ.get("POI_VISIT_TMP_ROOT", str(temp_dir / "poi_visit_aggregator_tmp")))
    DUCKDB_TEMP_DIR = Path(os.environ.get("POI_VISIT_DUCKDB_TMP", str(temp_dir / "duckdb_tmp")))

# If staypoints contain nationwide users, keep this on for speed.
FILTER_CITY_CODE = True
CITY_CODE_COL = "c_code"
CITY_CODE_VALUE = GRID_UID_CODE

# Per-city output locations, named after CITY.
OUT_CITY_DIR = OUT_DIR / CITY
OUT_FILE = OUT_CITY_DIR / f"user_grid_time_strict_filled_{CITY}.parquet"
QA_FILE = OUT_CITY_DIR / f"qa_summary_strict_filled_{CITY}.csv"

# Echo the output directory as the cell result.
OUT_CITY_DIR


In [None]:
# Optional: map your column names if they differ.
# Fill in only what you need.
# NOTE(review): the direction of each entry (expected field -> your column, or the
# reverse) is not visible here — confirm against poi_visit_aggregator's
# `schema_map` documentation before uncommenting entries.
# This dict is passed to the export call as `schema_map`.
SCHEMA_MAP = {
    "staypoints": {
        # "uuid": "uuid",
        # "start_time": "start_ms",
        # "end_time": "end_ms",
        # one of (x,y) or (lon,lat) or location
        # "lon": "lon",
        # "lat": "lat",
        # "source": "source",
    },
    "uuid_table": {
        # "uuid": "uuid",
    },
}
# Echo the mapping as the cell result.
SCHEMA_MAP


In [None]:
import os
import time
from contextlib import contextmanager

# psutil is optional: without it RAM figures are reported as NaN.
try:
    import psutil  # type: ignore
except Exception:
    psutil = None


def rss_mb() -> float:
    """Return this process's resident set size in MiB (bytes / 1024 / 1024).

    Returns NaN when psutil could not be imported.
    """
    if psutil is not None:
        rss_bytes = psutil.Process(os.getpid()).memory_info().rss
        return rss_bytes / 1024 / 1024
    return float("nan")


@contextmanager
def step(name: str):
    """Print timing and RAM usage around a block of work.

    Emits a ``[START]`` line with the current RSS on entry and an ``[ END ]``
    line with elapsed seconds, final RSS and the RSS change on exit. The end
    line is printed even when the wrapped block raises.

    Args:
        name: Label shown in both log lines.
    """
    started_at = time.perf_counter()
    ram_before = rss_mb()
    print(f"[START] {name} (RAM={ram_before:,.0f} MB)")
    try:
        yield
    finally:
        elapsed = time.perf_counter() - started_at
        ram_after = rss_mb()
        print(f"[ END ] {name} (dt={elapsed:,.1f}s, RAM={ram_after:,.0f} MB, Δ={ram_after - ram_before:,.0f} MB)")


In [None]:
from poi_visit_aggregator.export_user_grid_time_strict_filled import (
    export_user_grid_time_strict_filled,
)

# Run switches forwarded to the export call below.
RESUME_STAGE2 = False  # set True to reuse existing Stage-1 parts
KEEP_INTERMEDIATE = True  # keep Stage-1 parts (useful for resume/debug)

# Run the export inside `step(...)` so elapsed time and RSS change are printed.
with step("export_user_grid_time_strict_filled"):
    export_user_grid_time_strict_filled(
        city=CITY,
        staypoints=[Path(p) for p in STAYPOINTS],
        staypoints_format="auto",  # or csv/parquet
        uuid_table=UUID_TABLE,
        grid_meta_path=GRID_META,
        out_dir=OUT_DIR,
        tmp_root=TMP_ROOT,
        duckdb_temp_dir=DUCKDB_TEMP_DIR,
        schema_map=SCHEMA_MAP,
        output_grid_uid=True,
        output_grid_id=False,
        grid_uid_code=GRID_UID_CODE,
        grid_uid_prefix="grid",
        grid_uid_order="col_row",
        filter_city_code=FILTER_CITY_CODE,
        city_code_col=CITY_CODE_COL,
        city_code_value=CITY_CODE_VALUE,
        windows=["lunch", "dinner"],
        min_interval_minutes=5,
        point_source_filter=True,
        point_source_value="cell_appearance",
        drop_uuid_not_in_table=True,  # skip users not in UUID_TABLE
        # NOTE(review): timestamps are declared as UTC epoch milliseconds with a
        # +8 h offset — confirm these match your staypoints data.
        timestamps_are_utc=True,
        tz_offset_hours=8,
        epoch_unit="ms",
        coords_already_projected=False,
        uid64_hash_method="xxh64",  # faster if installed; else use sha256_64
        buckets=256,
        batch_size=1_000_000,
        log_every_batches=500,
        overlap_rounding="floor",
        oob_mode="drop",
        # All cores but one (7 when the CPU count is unknown), never fewer than 1.
        threads=max(1, (os.cpu_count() or 8) - 1),
        memory_limit="8GB",
        id_mode="uuid",  # uuid|uid64|both
        resume_stage2=RESUME_STAGE2,
        keep_intermediate=KEEP_INTERMEDIATE,
    )

# Echo the configured Parquet output path as the cell result.
OUT_FILE


In [None]:
import pandas as pd

# Load the exported table and its QA summary from the configured output paths.
df = pd.read_parquet(OUT_FILE)
qa = pd.read_csv(QA_FILE)

# Preview the first rows of the export, then the QA summary transposed
# (limited to the first 50 rows of the transpose).
display(df.head())
display(qa.T.head(50))


In [None]:
# Weekday/weekend + per-week average examples
import numpy as np

DATE_START = "2024-11-01"  # change to your month
DATE_END = "2024-11-30"
dates = pd.date_range(DATE_START, DATE_END, freq="D")
n_days = len(dates)
# An empty range (e.g. DATE_END before DATE_START) would make every average
# below a division by zero; fail early with a clear message instead.
if n_days == 0:
    raise ValueError(f"Empty date range: DATE_START={DATE_START!r} .. DATE_END={DATE_END!r}")
# weekday: Monday=0 ... Sunday=6, so values >= 5 are Saturday/Sunday.
n_weekdays = int((dates.weekday < 5).sum())
n_weekends = int((dates.weekday >= 5).sum())
n_weeks = n_days / 7.0

# Average minutes per (weekday/weekend) day.
# Each row is divided by the number of days of its own type in the period.
# If the period holds no days of a type, the divisor becomes NaN so those rows
# yield NaN (undefined) rather than inf.
df["tau_filled_min_per_day_type"] = df["tau_filled_min"] / np.where(
    df["is_weekend"], n_weekends or np.nan, n_weekdays or np.nan
)

# Average minutes per week (over the period)
df["tau_filled_min_per_week"] = df["tau_filled_min"] / n_weeks

# Preview the original and derived columns.
df[["window", "is_weekend", "tau_filled_min", "tau_filled_min_per_day_type", "tau_filled_min_per_week"]].head()
