# Shenzhen demo (Colab)

This notebook demonstrates how to compute **user × grid exposure weights** in lunch/dinner windows using `poi_visit_aggregator`.

Outputs (per city, written under `OUT_DIR/<city>/`): `user_grid_time_strict_filled_<city>.parquet` + `qa_summary_strict_filled_<city>.csv`.


In [None]:
# (Colab) Mount Google Drive at /content/drive so the input files and the output
# folder configured further down (DRIVE_ROOT = /content/drive/MyDrive) are reachable.
# The `google.colab` module is provided by the Colab runtime.
from google.colab import drive

drive.mount('/content/drive')


In [None]:
# Install repo + optional export deps
# Option A: clone your GitHub repo
!git clone https://github.com/weipengdeng/poi_visit_aggregator.git
%cd <YOUR_REPO>
!pip -q install -e ".[export]"


In [None]:
from pathlib import Path

# City label: used for the per-city output folder and file names, and (below) for the
# input folder and the grid-meta file name, so changing CITY updates every path at once.
CITY = "shenzhen"

# Code passed to the exporter as both `grid_uid_code` and `city_code_value`.
# NOTE(review): an earlier comment gave the example "Shenzhen=4403" while the value here
# is the 6-digit "440300" — confirm which form the exporter expects.
GRID_UID_CODE = "440300"  # set to your city code
DRIVE_ROOT = Path("/content/drive/MyDrive")

# Update these paths to your Google Drive files
DATA_DIR = DRIVE_ROOT / "data" / CITY
UUID_TABLE = DATA_DIR / "uuid_table.parquet"  # or .csv
STAYPOINTS = [
    DATA_DIR / "staypoints_2024-11.parquet",
    # or daily files:
    # *sorted((DATA_DIR / "staypoints_daily").glob("*.csv")),
]
GRID_META = DATA_DIR / f"grid_meta_{CITY}.json"
OUT_DIR = DRIVE_ROOT / "out/poi_visit_aggregator"

# Per-city output locations read back by the preview cell.
OUT_CITY_DIR = OUT_DIR / CITY
OUT_FILE = OUT_CITY_DIR / f"user_grid_time_strict_filled_{CITY}.parquet"
QA_FILE = OUT_CITY_DIR / f"qa_summary_strict_filled_{CITY}.csv"

# Echo the output folder as the cell result.
OUT_CITY_DIR


In [None]:
# Optional: map your column names if they differ.
# Fill in only what you need. Every entry below is commented out, so both inner dicts
# are empty as written; uncomment a line to activate that mapping.
# NOTE(review): the direction of each pair (expected name -> your column name, or the
# reverse) is not visible in this notebook — confirm against the package docs.
SCHEMA_MAP = {
    # Column mapping for the staypoints input(s).
    "staypoints": {
        # "uuid": "uuid",
        # "start_time": "start_ms",
        # "end_time": "end_ms",
        # one of (x,y) or (lon,lat) or location
        # "lon": "lon",
        # "lat": "lat",
        # "source": "source",
    },
    # Column mapping for the UUID table.
    "uuid_table": {
        # "uuid": "uuid",
    },
}
# Echo the mapping as the cell result.
SCHEMA_MAP


In [None]:
import os
import time
from contextlib import contextmanager

import psutil


def rss_mb() -> float:
    """Return the resident set size (RSS) of the current process in MiB."""
    current_process = psutil.Process(os.getpid())
    rss_bytes = current_process.memory_info().rss
    # bytes -> KiB -> MiB
    return rss_bytes / 1024 / 1024


@contextmanager
def step(name: str):
    """Log wall time and resident memory around a block of work.

    Prints a ``[START]`` line on entry and an ``[ END ]`` line on exit. The exit
    line sits in a ``finally`` clause, so it is printed even when the wrapped
    block raises; the exception then propagates unchanged.

    Args:
        name: Label included in both log lines.

    Yields:
        None. Intended use is ``with step("label"): ...``.
    """
    t0 = time.perf_counter()
    m0 = rss_mb()
    start_line = f"[START] {name} (RAM={m0:,.0f} MB)"
    print(start_line)
    try:
        yield
    finally:
        # Elapsed time is taken before the memory reading, as in the start sequence.
        dt = time.perf_counter() - t0
        m1 = rss_mb()
        end_line = f"[ END ] {name} (dt={dt:,.1f}s, RAM={m1:,.0f} MB, Δ={m1 - m0:,.0f} MB)"
        print(end_line)


In [None]:
from poi_visit_aggregator.export_user_grid_time_strict_filled import (
    export_user_grid_time_strict_filled,
)

# Run the exporter inside `step(...)` so its wall time and RSS change are printed.
with step("export_user_grid_time_strict_filled"):
    export_user_grid_time_strict_filled(
        # --- inputs / outputs (all defined in the config cell) ---
        city=CITY,
        staypoints=[Path(p) for p in STAYPOINTS],
        staypoints_format="auto",  # or csv/parquet
        uuid_table=UUID_TABLE,
        grid_meta_path=GRID_META,
        # NOTE(review): OUT_FILE / QA_FILE are defined under OUT_DIR / CITY, so this
        # presumably relies on the exporter creating the per-city subfolder — confirm.
        out_dir=OUT_DIR,
        schema_map=SCHEMA_MAP,
        # --- grid identifier options ---
        output_grid_uid=True,
        output_grid_id=False,
        grid_uid_code=GRID_UID_CODE,
        grid_uid_prefix="grid",
        grid_uid_order="col_row",
        # --- city-code filter (disabled here) ---
        filter_city_code=False,
        city_code_col="c_code",
        # presumably only consulted when filter_city_code=True — confirm
        city_code_value=GRID_UID_CODE,
        # --- time windows and record filters ---
        windows=["lunch", "dinner"],
        min_interval_minutes=5,
        point_source_filter=True,
        point_source_value="cell_appearance",
        drop_uuid_not_in_table=True,  # skip users not in UUID_TABLE
        # --- timestamp / coordinate interpretation ---
        timestamps_are_utc=True,
        tz_offset_hours=8,  # presumably local time = UTC+8 — confirm
        epoch_unit="ms",
        coords_already_projected=False,
        # --- hashing, batching and resource limits ---
        uid64_hash_method="xxh64",  # faster if installed; else use sha256_64
        buckets=256,
        batch_size=1_000_000,
        overlap_rounding="floor",
        oob_mode="drop",
        # NOTE(review): threads and memory_limit are hard-coded; confirm they suit the
        # Colab runtime in use.
        threads=8,
        memory_limit="8GB",
        id_mode="uuid",  # uuid|uid64|both
        keep_intermediate=False,
    )

# Echo the expected parquet output path as the cell result.
OUT_FILE


In [None]:
import pandas as pd

# Load the two exporter outputs from the paths defined in the config cell.
df = pd.read_parquet(OUT_FILE)
qa = pd.read_csv(QA_FILE)

# Preview: first rows of the result, then the QA summary transposed so that each
# metric gets its own row (capped at 50 rows).
display(df.head(), qa.T.head(50))


In [None]:
# Weekday/weekend + per-week average examples
import numpy as np

# Analysis period used as the denominator for the averages below (both ends inclusive).
DATE_START = "2024-11-01"  # change to your month
DATE_END = "2024-11-30"
dates = pd.date_range(DATE_START, DATE_END, freq="D")
n_days = len(dates)
if n_days == 0:
    # DATE_END before DATE_START gives an empty range; every divisor below would be 0
    # and the averages would silently become inf/NaN.
    raise ValueError(f"Empty date range: DATE_START={DATE_START!r}, DATE_END={DATE_END!r}")
n_weekdays = int((dates.weekday < 5).sum())  # Monday=0 ... Friday=4
n_weekends = int((dates.weekday >= 5).sum())  # Saturday=5, Sunday=6
n_weeks = n_days / 7.0

# Average minutes per (weekday/weekend) day.
# A day type with zero days in the period gets a NaN divisor, so its rows come out as
# NaN instead of +/-inf.
days_of_type = np.where(df["is_weekend"], n_weekends or np.nan, n_weekdays or np.nan)
df["tau_filled_min_per_day_type"] = df["tau_filled_min"] / days_of_type

# Average minutes per week (over the period)
df["tau_filled_min_per_week"] = df["tau_filled_min"] / n_weeks

# Preview the input column next to the two derived averages.
df[["window", "is_weekend", "tau_filled_min", "tau_filled_min_per_day_type", "tau_filled_min_per_week"]].head()
