01_user_activity_scale.pdf：每用户 stays 数分布（log-log）+ 每用户时间跨度（log-log）

02_user_entropy.pdf：user visit entropy + user conditional transition entropy

03_global_cond_entropy_by_grid.pdf：全局按 current grid 的 H(next|curr) 分布

04_topk_visit_ratio.pdf：Top-1/3/5 visit ratio across users（boxplot）

05_routine_vs_tail.pdf：每用户 routine stay fraction / routine next fraction 分布

06_global_routine_fractions.pdf：routine stay fraction / routine next fraction 的全局均值（对所有用户取平均）条形图

07_time_uncertainty.pdf：stay duration + inter-stay gap（log-log）

08_context_availability.pdf：fuzzy/precise/both 的出现比例

09_precise_alignment.pdf：precise context 的 step distance 分布 + 匹配率

10_fuzzy_alignment.pdf：fuzzy context 的 step distance 分布 + sigma 解析分布（若可解析）

In [1]:
# dataset_stats.py
# -*- coding: utf-8 -*-
"""
Dataset statistics + visualization for GeoLife_all.csv

Implements:
1) User activity & trajectory scale distribution
2) Transition entropy / uncertainty metrics
3) Routine vs Aperiodic quantification
4) Time uncertainty distributions
5) Context availability
6) Context alignment (precise/fuzzy)

Outputs figures to ./figures/
"""

import os
import re
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


# -----------------------------
# Utilities
# -----------------------------
def ensure_dir(path: str):
    """Create directory `path` (including missing parents) unless it already exists."""
    if os.path.isdir(path):
        return
    os.makedirs(path, exist_ok=True)


def entropy_from_counts(counts: np.ndarray, base: float = 2.0) -> float:
    """
    Shannon entropy of the distribution obtained by normalizing `counts`.

    Zero-count entries are ignored; a non-positive total yields 0.0.
    The logarithm is taken in `base` (2.0 gives bits).
    """
    weights = counts.astype(float)
    total = weights.sum()
    if total <= 0:
        return 0.0
    probs = weights / total
    nonzero = probs[probs > 0]
    log_probs = np.log(nonzero) / np.log(base)
    return float(-(nonzero * log_probs).sum())


def nice_hist(ax, x, bins=50, logx=False, logy=False, title="", xlabel="", ylabel="Count",
              color=None, alpha=0.85):
    """
    Draw a styled histogram of the finite values of `x` on `ax`.

    When `logx` is set, non-positive values are dropped (they cannot be placed
    on a log axis) and an integer `bins` is converted into that many
    geometrically spaced bins, so the bars have equal width on the log axis.
    An explicit sequence of bin edges is passed through unchanged.
    `logy` only switches the y axis to log scale.
    """
    x = np.asarray(x)
    x = x[np.isfinite(x)]
    if logx:
        x = x[x > 0]
        if isinstance(bins, (int, np.integer)) and x.size > 0:
            lo, hi = float(x.min()), float(x.max())
            if hi <= lo:
                # Only one distinct value: widen the range so the edges stay increasing.
                lo, hi = lo / 2.0, hi * 2.0
            bins = np.geomspace(lo, hi, int(bins) + 1)
    ax.hist(x, bins=bins, color=color, alpha=alpha, edgecolor="white", linewidth=0.6)
    if logx:
        ax.set_xscale("log")
    if logy:
        ax.set_yscale("log")
    ax.set_title(title, pad=10)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.grid(True, linestyle="--", linewidth=0.6, alpha=0.35)


def nice_box(ax, data_list, labels, title="", ylabel="", colors=None):
    """
    Draw a styled box plot (one box per entry of `data_list`, outliers hidden).

    `labels` are applied as x tick labels, `colors` (optional, one per box,
    `None` entries keep the default face color) fill the boxes.
    """
    bp = ax.boxplot(data_list, patch_artist=True, showfliers=False)
    # Label the ticks through the axes API instead of boxplot's `labels=`
    # keyword, which is deprecated since Matplotlib 3.9 (renamed `tick_labels`)
    # and emits a MatplotlibDeprecationWarning on every call.
    ax.set_xticks(list(range(1, len(bp["boxes"]) + 1)))
    ax.set_xticklabels(labels)
    if colors is None:
        colors = [None] * len(data_list)
    for patch, c in zip(bp["boxes"], colors):
        if c is not None:
            patch.set_facecolor(c)
        patch.set_alpha(0.85)
        patch.set_edgecolor("#333333")
        patch.set_linewidth(1.0)
    for k in ["whiskers", "caps", "medians"]:
        for line in bp[k]:
            line.set_color("#333333")
            line.set_linewidth(1.0)
    ax.set_title(title, pad=10)
    ax.set_ylabel(ylabel)
    ax.grid(True, linestyle="--", linewidth=0.6, alpha=0.35)


def parse_context_precise(text: str):
    """
    Extract the destination grid and timestamp from a precise context string.

    Example input:
    'User 0 will move from grid 14808 to grid 14800, at 2008-10-26 15:03:47.'

    Returns (to_grid:int, t:Timestamp); (None, None) when `text` is not a
    non-empty string, a field is missing, or the timestamp cannot be parsed.
    """
    if not (isinstance(text, str) and text):
        return None, None

    grid_hit = re.search(r"to grid\s+(\d+)", text)
    time_hit = re.search(r"at\s+(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2})", text)
    if grid_hit is None or time_hit is None:
        return None, None

    destination = int(grid_hit.group(1))
    when = pd.to_datetime(time_hit.group(1), errors="coerce")
    # errors="coerce" turns an unparseable timestamp into NaT.
    return (None, None) if pd.isna(when) else (destination, when)


def parse_context_fuzzy(text: str):
    """
    Extract destination grid, date and an optional lead-time from a fuzzy context string.

    Example input:
    'User 0 will move from grid 14808 to grid 14800, arriving around in about 3 days, on 2008-10-26.'

    Returns (to_grid:int, date:Timestamp at midnight, sigma_minutes:float or None).
    - sigma_minutes comes from "in about X (days|hours|mins|minutes)", converted to minutes;
      it is None when that phrase is absent.
    - (None, None, None) when `text` is not a non-empty string, the grid or
      date is missing, or the date cannot be parsed.
    """
    nothing = (None, None, None)
    if not (isinstance(text, str) and text):
        return nothing

    grid_hit = re.search(r"to grid\s+(\d+)", text)
    date_hit = re.search(r"on\s+(\d{4}-\d{2}-\d{2})", text)
    if grid_hit is None or date_hit is None:
        return nothing

    destination = int(grid_hit.group(1))
    day = pd.to_datetime(date_hit.group(1), errors="coerce")
    if pd.isna(day):
        return nothing
    day = day.normalize()

    sigma_hit = re.search(r"in about\s+(\d+)\s*(day|days|hour|hours|min|mins|minute|minutes)", text)
    if sigma_hit is None:
        return destination, day, None

    amount = float(sigma_hit.group(1))
    unit = sigma_hit.group(2)
    if unit.startswith("day"):
        minutes = amount * 24 * 60
    elif unit.startswith("hour"):
        minutes = amount * 60
    else:
        # Remaining alternatives are all minute spellings.
        minutes = amount
    return destination, day, minutes


def prepare_sequences(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a cleaned, per-user time-ordered copy of `df` with sequence columns.

    Rows lacking userID, grid, or a parseable stime/etime are dropped. Added
    columns: idx_in_user, next_grid, next_stime, next_etime,
    stay_duration_min, inter_stay_gap_min, inter_stay_gap_min_clip.
    The input frame is not modified.
    """
    seq = df.copy()
    for col in ("stime", "etime"):
        seq[col] = pd.to_datetime(seq[col], errors="coerce")

    seq = seq.dropna(subset=["userID", "stime", "etime", "grid"]).copy()
    for col in ("userID", "grid"):
        seq[col] = seq[col].astype(int)

    seq = seq.sort_values(["userID", "stime"]).reset_index(drop=True)

    # Position within the user's sequence and the following stay's fields.
    per_user = seq.groupby("userID")
    position = per_user.cumcount()
    following = {name: per_user[name].shift(-1) for name in ("grid", "stime", "etime")}
    seq["idx_in_user"] = position
    for name, values in following.items():
        seq[f"next_{name}"] = values

    def _minutes(delta):
        return delta.dt.total_seconds() / 60.0

    seq["stay_duration_min"] = _minutes(seq["etime"] - seq["stime"])
    seq["inter_stay_gap_min"] = _minutes(seq["next_stime"] - seq["etime"])
    # Overlapping stays give negative gaps; floor them at zero for plotting.
    seq["inter_stay_gap_min_clip"] = seq["inter_stay_gap_min"].clip(lower=0)

    return seq


# -----------------------------
# 1) User activity & scale
# -----------------------------
def plot_user_activity(df: pd.DataFrame, outdir: str):
    """
    Summarize and plot per-user activity scale.

    Prints and returns a dict of summary statistics (user/stay/transition
    counts, stays-per-user mean/median/std/IQR, time-span mean/median) and
    saves `01_user_activity_scale.pdf` in `outdir` with two histograms:
    stays per user and time span (days) per user.
    """
    ensure_dir(outdir)
    colors = plt.get_cmap("tab10").colors

    by_user = df.groupby("userID")
    stays_per_user = by_user.size().astype(int)
    last_end = by_user["etime"].max()
    first_start = by_user["stime"].min()
    span_days = (last_end - first_start).dt.total_seconds() / (24*3600)

    n_users = df["userID"].nunique()
    n_stays = len(df)
    n_trans = df["next_grid"].notna().sum()

    upper_quartile = stays_per_user.quantile(0.75)
    lower_quartile = stays_per_user.quantile(0.25)
    summary = {
        "n_users": int(n_users),
        "n_stays": int(n_stays),
        "n_transitions": int(n_trans),
        "stays_per_user_mean": float(stays_per_user.mean()),
        "stays_per_user_median": float(stays_per_user.median()),
        "stays_per_user_std": float(stays_per_user.std()),
        "stays_per_user_IQR": float(upper_quartile - lower_quartile),
        "span_days_mean": float(span_days.mean()),
        "span_days_median": float(span_days.median()),
    }
    print("[User Activity Summary]")
    for key, value in summary.items():
        print(f"  {key}: {value}")

    fig, (ax_stays, ax_span) = plt.subplots(1, 2, figsize=(12, 4.5))
    shared = dict(bins=50, logx=True, logy=True, ylabel="count of users")
    nice_hist(
        ax_stays,
        stays_per_user.values,
        title=f"Stays per user (log-log) | users={n_users}, stays={n_stays}",
        xlabel="#stays per user",
        color=colors[0],
        **shared,
    )
    nice_hist(
        ax_span,
        span_days.values,
        title="Time span per user (log-log)",
        xlabel="span (days)",
        color=colors[1],
        **shared,
    )

    fig.tight_layout()
    fig.savefig(os.path.join(outdir, "01_user_activity_scale.pdf"), dpi=220)
    plt.close(fig)

    return summary


# -----------------------------
# 2) Transition entropy / uncertainty
# -----------------------------
def compute_user_visit_entropy(df: pd.DataFrame) -> pd.Series:
    """Per-user entropy (bits) of the user's grid visit counts, indexed by userID."""
    per_user = {
        uid: entropy_from_counts(visits.value_counts().values, base=2.0)
        for uid, visits in df.groupby("userID")["grid"]
    }
    return pd.Series(per_user, name="visit_entropy_bits")


def compute_user_conditional_transition_entropy(df: pd.DataFrame) -> pd.Series:
    """
    Per-user conditional transition entropy in bits:
    H_u(next|curr) = sum_{curr} p(curr) * H(next|curr),
    where p(curr) is the share of the user's transitions leaving `curr`.
    Users without any transition get 0.0.
    """
    result = {}
    for uid, rows in df.groupby("userID"):
        moves = rows.dropna(subset=["next_grid"])
        n_moves = len(moves)
        if n_moves == 0:
            result[uid] = 0.0
            continue

        # Weighted sum of per-origin entropies, accumulated in group order.
        weighted = 0.0
        for _, bucket in moves.groupby("grid"):
            share = len(bucket) / n_moves
            successor_counts = bucket["next_grid"].value_counts().values
            weighted += share * entropy_from_counts(successor_counts, base=2.0)
        result[uid] = float(weighted)
    return pd.Series(result, name="cond_entropy_bits")


def compute_global_cond_entropy_by_curr(df: pd.DataFrame) -> pd.Series:
    """Entropy (bits) of the next-grid distribution for each current grid, pooled over all users."""
    moves = df.dropna(subset=["next_grid"])
    per_grid = {
        origin: entropy_from_counts(successors.value_counts().values, base=2.0)
        for origin, successors in moves.groupby("grid")["next_grid"]
    }
    return pd.Series(per_grid, name="H_next_given_curr_bits")


def plot_entropy(df: pd.DataFrame, outdir: str):
    """
    Compute and plot the entropy metrics.

    Saves `02_user_entropy.pdf` (per-user visit entropy and per-user
    conditional transition entropy) and `03_global_cond_entropy_by_grid.pdf`
    (per-grid H(next | current grid)) in `outdir`, and returns the three
    underlying Series in a dict.
    """
    ensure_dir(outdir)
    colors = plt.get_cmap("tab10").colors

    visit_H = compute_user_visit_entropy(df)
    cond_H = compute_user_conditional_transition_entropy(df)
    curr_H = compute_global_cond_entropy_by_curr(df)

    # User-level entropies, side by side.
    fig, (ax_visit, ax_cond) = plt.subplots(1, 2, figsize=(12, 4.5))
    user_panels = (
        (ax_visit, visit_H, "User visit entropy H(grid | user)", colors[2]),
        (ax_cond, cond_H, "User conditional transition entropy H(next | curr, user)", colors[3]),
    )
    for axis, series, heading, shade in user_panels:
        nice_hist(
            axis, series.values, bins=50, logx=False, logy=True,
            title=heading,
            xlabel="entropy (bits)", ylabel="users",
            color=shade
        )
    fig.tight_layout()
    fig.savefig(os.path.join(outdir, "02_user_entropy.pdf"), dpi=220)
    plt.close(fig)

    # Grid-level conditional entropy across all users.
    fig, ax = plt.subplots(figsize=(6.2, 4.5))
    nice_hist(
        ax, curr_H.values, bins=60, logx=False, logy=True,
        title="Global H(next | current grid)",
        xlabel="entropy (bits)", ylabel="grids",
        color=colors[4]
    )
    fig.tight_layout()
    fig.savefig(os.path.join(outdir, "03_global_cond_entropy_by_grid.pdf"), dpi=220)
    plt.close(fig)

    return {
        "visit_entropy_bits": visit_H,
        "cond_entropy_bits": cond_H,
        "global_cond_entropy_by_grid_bits": curr_H
    }


# -----------------------------
# 3) Routine vs Aperiodic
# -----------------------------
def compute_topk_visit_ratio(df: pd.DataFrame, ks=(1, 3, 5)) -> pd.DataFrame:
    """
    For every user and every k in `ks`, the share of the user's stays that
    fall in their k most-visited grids. One row per (userID, k) with columns
    userID, k, topk_ratio.
    """
    records = []
    for uid, visits in df.groupby("userID")["grid"]:
        freq = visits.value_counts()  # sorted most-visited first
        n_visits = freq.sum()
        for k in ks:
            head = freq.iloc[:k].sum() if len(freq) >= 1 else 0
            ratio = float(head / n_visits) if n_visits > 0 else 0.0
            records.append({"userID": uid, "k": k, "topk_ratio": ratio})
    return pd.DataFrame(records)


def routine_stats(df: pd.DataFrame, topN: int = 5):
    """
    Per-user routine fractions, where a user's routine set is their `topN`
    most-visited grids.

    Returns two Series indexed by userID:
    - routine_stay_frac: share of the user's stays located in the routine set
    - routine_next_frac: share of the user's transitions whose next grid is
      in the routine set (0.0 for users without transitions)
    """
    stay_share = {}
    next_share = {}

    for uid, rows in df.groupby("userID"):
        top_grids = set(rows["grid"].value_counts().index[:topN])

        in_routine = rows["grid"].isin(top_grids)
        stay_share[uid] = float(in_routine.mean())

        moves = rows.dropna(subset=["next_grid"])
        if len(moves) == 0:
            next_share[uid] = 0.0
            continue
        # next_grid holds NaN for the last stay, so cast back to int before matching.
        successors = moves["next_grid"].astype(int)
        next_share[uid] = float(successors.isin(top_grids).mean())

    stay_series = pd.Series(stay_share, name="routine_stay_frac")
    next_series = pd.Series(next_share, name="routine_next_frac")
    return stay_series, next_series


def plot_routine_vs_aperiodic(df: pd.DataFrame, outdir: str):
    """
    Plot routine-vs-tail statistics and return the underlying data.

    Saves in `outdir`:
    - `04_topk_visit_ratio.pdf`: box plot of Top-1/3/5 visit ratios across users
    - `05_routine_vs_tail.pdf`: histograms of per-user routine stay/next fractions (Top-5)
    - `06_global_routine_fractions.pdf`: bar chart of those fractions averaged over users
    """
    ensure_dir(outdir)
    colors = plt.get_cmap("tab10").colors

    # Top-k ratio boxplot
    ks = (1, 3, 5)
    topk_df = compute_topk_visit_ratio(df, ks=ks)
    ratios_by_k = [topk_df[topk_df["k"] == k]["topk_ratio"].values for k in ks]
    fig, ax = plt.subplots(figsize=(7.5, 4.8))
    nice_box(
        ax, ratios_by_k, labels=["Top-1", "Top-3", "Top-5"],
        title="Distribution of Top-k visit ratio across users",
        ylabel="visit ratio",
        colors=[colors[0], colors[1], colors[2]]
    )
    fig.tight_layout()
    fig.savefig(os.path.join(outdir, "04_topk_visit_ratio.pdf"), dpi=220)
    plt.close(fig)

    # Per-user routine fractions
    routine_stay, routine_next = routine_stats(df, topN=5)
    fig, (ax_stay, ax_next) = plt.subplots(1, 2, figsize=(12, 4.5))
    panels = (
        (ax_stay, routine_stay, "Fraction of stays in routine (Top-5 locations)",
         "routine stay fraction", colors[5]),
        (ax_next, routine_next, "Fraction of next-stays in routine (Top-5 locations)",
         "routine next fraction", colors[6]),
    )
    for axis, series, heading, x_caption, shade in panels:
        nice_hist(
            axis, series.values, bins=50, logx=False, logy=True,
            title=heading,
            xlabel=x_caption, ylabel="users",
            color=shade
        )
    fig.tight_layout()
    fig.savefig(os.path.join(outdir, "05_routine_vs_tail.pdf"), dpi=220)
    plt.close(fig)

    # Mean across users, one bar per fraction
    mean_fractions = [float(routine_stay.mean()), float(routine_next.mean())]
    fig, ax = plt.subplots(figsize=(5.8, 4.5))
    ax.bar(["stay in routine", "next in routine"], mean_fractions,
           color=[colors[5], colors[6]], alpha=0.85, edgecolor="white", linewidth=0.8)
    ax.set_ylim(0, 1.0)
    ax.set_title("Global routine fractions (mean across users)", pad=10)
    ax.set_ylabel("fraction")
    ax.grid(True, axis="y", linestyle="--", linewidth=0.6, alpha=0.35)
    fig.tight_layout()
    fig.savefig(os.path.join(outdir, "06_global_routine_fractions.pdf"), dpi=220)
    plt.close(fig)

    return {"topk_df": topk_df, "routine_stay_frac": routine_stay, "routine_next_frac": routine_next}


# -----------------------------
# 4) Time uncertainty distributions
# -----------------------------
def plot_time_uncertainty(df: pd.DataFrame, outdir: str):
    """
    Save `07_time_uncertainty.pdf` in `outdir`: histograms of stay duration
    and of the (zero-clipped) inter-stay gap, both in minutes on log-log axes.
    """
    ensure_dir(outdir)
    colors = plt.get_cmap("tab10").colors

    fig, (ax_dur, ax_gap) = plt.subplots(1, 2, figsize=(12, 4.5))
    panels = (
        (ax_dur, df["stay_duration_min"].values,
         "Stay duration distribution (log-log)", "duration (minutes)", "stays", colors[7]),
        # The last stay of each user has no successor, hence the dropna.
        (ax_gap, df["inter_stay_gap_min_clip"].dropna().values,
         "Inter-stay gap distribution (log-log)", "gap (minutes)", "transitions", colors[8]),
    )
    for axis, values, heading, x_caption, y_caption, shade in panels:
        nice_hist(
            axis, values, bins=70, logx=True, logy=True,
            title=heading,
            xlabel=x_caption, ylabel=y_caption,
            color=shade
        )

    fig.tight_layout()
    fig.savefig(os.path.join(outdir, "07_time_uncertainty.pdf"), dpi=220)
    plt.close(fig)


# -----------------------------
# 5-6) Context availability & alignment
# -----------------------------
def build_user_index(df: pd.DataFrame):
    """
    Build a per-user lookup keyed by userID. Each value is a dict with:
    - "exact": map stime (Timestamp) -> (idx_in_user, grid); if a user has
      several stays with the same stime, the last one iterated wins
    - "g": a copy of the user's rows restricted to idx_in_user, stime, grid
    """
    user_maps = {}
    for uid, g in df.groupby("userID"):
        g2 = g[["idx_in_user", "stime", "grid"]].copy()
        exact = {
            t: (int(i), int(gr))
            for i, t, gr in zip(g2["idx_in_user"], g2["stime"], g2["grid"])
        }
        user_maps[uid] = {"exact": exact, "g": g2}
    return user_maps


def match_precise_alignment(df: pd.DataFrame) -> pd.DataFrame:
    """
    Align every parsable `context_precise` text with the user's own stays.

    The text yields (to_grid, target_time); a context is matched when the
    same user has a stay whose stime equals target_time exactly.
    One output row per parsable context with columns:
      - userID, curr_idx, to_grid, target_time
      - matched (bool)
      - matched_idx, matched_grid (NaN when unmatched)
      - step_distance (matched_idx - curr_idx, NaN when unmatched)
      - time_error_min (0.0 for a match since matching is exact, else NaN)
      - grid_match (bool: matched stay's grid equals to_grid)
    """
    lookup = build_user_index(df)
    rows = []
    for rec in df.itertuples(index=False):
        uid = int(rec.userID)
        here = int(rec.idx_in_user)
        to_grid, when = parse_context_precise(rec.context_precise)
        if to_grid is None or when is None:
            continue

        hit = lookup[uid]["exact"].get(when)
        if hit is None:
            matched = False
            matched_idx = matched_grid = step = time_error = np.nan
            same_grid = False
        else:
            matched = True
            matched_idx, matched_grid = hit
            step = int(matched_idx - here)
            time_error = 0.0  # exact match
            same_grid = bool(int(matched_grid) == int(to_grid))

        rows.append({
            "userID": uid,
            "curr_idx": here,
            "to_grid": int(to_grid),
            "target_time": when,
            "matched": matched,
            "matched_idx": matched_idx,
            "matched_grid": matched_grid,
            "step_distance": step,
            "time_error_min": time_error,
            "grid_match": same_grid,
        })

    return pd.DataFrame(rows)


def match_fuzzy_alignment(df: pd.DataFrame) -> pd.DataFrame:
    """
    Align every parsable `context_fuzzy` text with the user's own stays.

    The text yields (to_grid, date, sigma_minutes); a context is matched when
    the same user has a stay in to_grid whose stime falls on that date. The
    earliest such stay (by stime) is used.
    One output row per parsable context with columns:
      - userID, curr_idx, to_grid, context_date
      - matched (bool)
      - matched_idx, matched_stime (NaN / NaT when unmatched)
      - step_distance (matched_idx - curr_idx, NaN when unmatched)
      - day_error (matched date - context_date in days, NaN when unmatched)
      - sigma_minutes (from the text, NaN when not present)
    """
    # Per user: (date, grid) -> (idx_in_user, stime) of the earliest stay.
    first_seen = {}
    for uid, rows in df.groupby("userID"):
        stays = rows[["idx_in_user", "stime", "grid"]].copy()
        stays["date"] = stays["stime"].dt.normalize()
        earliest = {}
        for stay in stays.sort_values("stime").itertuples(index=False):
            # setdefault keeps the first (earliest) entry per key.
            earliest.setdefault((stay.date, int(stay.grid)), (int(stay.idx_in_user), stay.stime))
        first_seen[int(uid)] = earliest

    out = []
    for rec in df.itertuples(index=False):
        uid = int(rec.userID)
        here = int(rec.idx_in_user)
        to_grid, day, sigma_min = parse_context_fuzzy(rec.context_fuzzy)
        if to_grid is None or day is None:
            continue

        sigma_value = np.nan if sigma_min is None else float(sigma_min)
        hit = first_seen.get(uid, {}).get((day, int(to_grid)))
        if hit is None:
            matched = False
            matched_idx = np.nan
            matched_stime = pd.NaT
            step = np.nan
            day_error = np.nan
        else:
            matched = True
            matched_idx, matched_stime = hit
            step = int(matched_idx - here)
            day_error = float((matched_stime.normalize() - day).days)

        out.append({
            "userID": uid,
            "curr_idx": here,
            "to_grid": int(to_grid),
            "context_date": day,
            "matched": matched,
            "matched_idx": matched_idx,
            "matched_stime": matched_stime,
            "step_distance": step,
            "day_error": day_error,
            "sigma_minutes": sigma_value,
        })
    return pd.DataFrame(out)


def plot_context_stats(df: pd.DataFrame, outdir: str):
    """
    Plot context availability and alignment, and return the underlying data.

    Saves in `outdir`:
    - `08_context_availability.pdf`: share of rows with non-null fuzzy / precise / both contexts
    - `09_precise_alignment.pdf`: step-distance histogram and match rates
      (only when at least one precise context was parsed)
    - `10_fuzzy_alignment.pdf`: step-distance histogram and parsed sigma
      histogram (only when at least one fuzzy context was parsed)
    Also prints the availability ratios.
    """
    ensure_dir(outdir)
    colors = plt.get_cmap("tab10").colors
    bar_style = dict(alpha=0.85, edgecolor="white", linewidth=0.8)
    grid_style = dict(axis="y", linestyle="--", linewidth=0.6, alpha=0.35)

    # Availability
    fuzzy_present = df["context_fuzzy"].notna()
    precise_present = df["context_precise"].notna()
    has_fuzzy = fuzzy_present.mean()
    has_precise = precise_present.mean()
    both = (fuzzy_present & precise_present).mean()

    fig, ax = plt.subplots(figsize=(6.2, 4.5))
    ax.bar(["fuzzy", "precise", "both"], [has_fuzzy, has_precise, both],
           color=[colors[0], colors[1], colors[2]], **bar_style)
    ax.set_ylim(0, 1.0)
    ax.set_title("Context availability ratio", pad=10)
    ax.set_ylabel("ratio")
    ax.grid(True, **grid_style)
    fig.tight_layout()
    fig.savefig(os.path.join(outdir, "08_context_availability.pdf"), dpi=220)
    plt.close(fig)

    # Precise alignment
    precise_df = match_precise_alignment(df)
    if len(precise_df) > 0:
        is_matched = precise_df["matched"]
        match_rate = is_matched.mean()
        grid_match_rate = (is_matched & precise_df["grid_match"]).mean()

        # Only matched contexts pointing at a later stay (future steps).
        steps = precise_df.loc[is_matched, "step_distance"].values
        steps = steps[np.isfinite(steps) & (steps >= 1)]

        fig, (ax_steps, ax_rates) = plt.subplots(1, 2, figsize=(12, 4.5))
        nice_hist(
            ax_steps, steps, bins=60, logx=False, logy=True,
            title=f"Precise context step distance (matched)\nmatch={match_rate:.3f}, grid_match={grid_match_rate:.3f}",
            xlabel="matched_idx - curr_idx (steps)", ylabel="contexts",
            color=colors[3]
        )

        ax_rates.bar(["matched", "grid matched"], [match_rate, grid_match_rate],
                     color=[colors[4], colors[5]], **bar_style)
        ax_rates.set_ylim(0, 1.0)
        ax_rates.set_title("Precise context matching rates", pad=10)
        ax_rates.set_ylabel("ratio")
        ax_rates.grid(True, **grid_style)

        fig.tight_layout()
        fig.savefig(os.path.join(outdir, "09_precise_alignment.pdf"), dpi=220)
        plt.close(fig)

    # Fuzzy alignment
    fuzzy_df = match_fuzzy_alignment(df)
    if len(fuzzy_df) > 0:
        match_rate = fuzzy_df["matched"].mean()

        steps = fuzzy_df.loc[fuzzy_df["matched"], "step_distance"].values
        steps = steps[np.isfinite(steps) & (steps >= 1)]
        sigma = fuzzy_df["sigma_minutes"].values
        sigma = sigma[np.isfinite(sigma) & (sigma > 0)]

        fig, (ax_steps, ax_sigma) = plt.subplots(1, 2, figsize=(12, 4.5))
        nice_hist(
            ax_steps, steps, bins=60, logx=False, logy=True,
            title=f"Fuzzy context step distance (matched)\nmatch={match_rate:.3f}",
            xlabel="matched_idx - curr_idx (steps)", ylabel="contexts",
            color=colors[6]
        )
        if len(sigma) == 0:
            # Nothing to plot: show a placeholder message instead of empty axes.
            ax_sigma.text(0.5, 0.5, "No parsable sigma in fuzzy texts", ha="center", va="center")
            ax_sigma.set_axis_off()
        else:
            nice_hist(
                ax_sigma, sigma, bins=60, logx=True, logy=True,
                title="Fuzzy context uncertainty proxy (sigma minutes)",
                xlabel="sigma (minutes, log)", ylabel="contexts",
                color=colors[7]
            )
        fig.tight_layout()
        fig.savefig(os.path.join(outdir, "10_fuzzy_alignment.pdf"), dpi=220)
        plt.close(fig)

    print("[Context Availability]")
    for label, ratio in (("fuzzy:  ", has_fuzzy), ("precise:", has_precise), ("both:   ", both)):
        print(f"  {label} {ratio:.4f}")

    return {
        "availability": {"fuzzy": has_fuzzy, "precise": has_precise, "both": both},
        "precise_alignment_df": precise_df,
        "fuzzy_alignment_df": fuzzy_df,
    }


# -----------------------------
# Main
# -----------------------------
def main(csv_path: str, outdir: str = "figures"):
    """
    Run the whole statistics pipeline: read `csv_path`, prepare per-user
    sequences, then produce every figure group in order into `outdir`.
    """
    ensure_dir(outdir)

    df = prepare_sequences(pd.read_csv(csv_path))

    # Same order as the numbered figure files:
    # 1) user activity, 2) entropies, 3) routine vs aperiodic,
    # 4) time uncertainty, 5-6) context availability & alignment
    stages = (
        plot_user_activity,
        plot_entropy,
        plot_routine_vs_aperiodic,
        plot_time_uncertainty,
        plot_context_stats,
    )
    for stage in stages:
        stage(df, outdir)

    print(f"\nAll figures saved to: {os.path.abspath(outdir)}")


if __name__ == "__main__":
    # Entry point: run the full pipeline on the combined-context CSV and
    # write the figures under ./Pictures/GeoLife/.
    # Change to your actual path if needed:
    CSV_PATH = "./Data/Output/all_users_context_combined.csv"
    main(CSV_PATH, outdir="./Pictures/GeoLife/")


[User Activity Summary]
  n_users: 168
  n_stays: 27914
  n_transitions: 27746
  stays_per_user_mean: 166.1547619047619
  stays_per_user_median: 46.0
  stays_per_user_std: 328.4789476041115
  stays_per_user_IQR: 146.0
  span_days_mean: 182.44300581459436
  span_days_median: 74.4909201388889


  bp = ax.boxplot(data_list, labels=labels, patch_artist=True, showfliers=False)


[Context Availability]
  fuzzy:   0.4151
  precise: 0.4151
  both:    0.4151

All figures saved to: d:\codeSpace\TrajectoryFeatureGeneration\Pictures\GeoLife


In [2]:
# Re-run the same pipeline on a second CSV, writing figures to a separate directory.
CSV_PATH = "./Data/MoreUser/all.csv"
main(CSV_PATH, outdir="./Pictures/MoreUser/")

[User Activity Summary]
  n_users: 9907
  n_stays: 6818428
  n_transitions: 6808521
  stays_per_user_mean: 688.2434642172201
  stays_per_user_median: 654.0
  stays_per_user_std: 398.82448697097675
  stays_per_user_IQR: 592.0
  span_days_mean: 168.68627095179616
  span_days_median: 170.24101851851853


  bp = ax.boxplot(data_list, labels=labels, patch_artist=True, showfliers=False)


[Context Availability]
  fuzzy:   0.2616
  precise: 0.2616
  both:    0.2616

All figures saved to: d:\codeSpace\TrajectoryFeatureGeneration\Pictures\MoreUser
