# 对数据进行抽样

1. 固定总样本数（Total stays 固定）
   - 改变 users 数 U
     - U in {500, 1k, 2k, 4k, 8k, 9.9k}
     - N_total in {0.5M, 1M, 2M}, choose 1M.
     - 每个用户至少有 (64 + 1) 个 stay，否则无法构造样本.
   - 每用户长度 L 自动变化（L ≈ Total/U）
   - 结论含义：用户多样性 vs 每用户密度 的 trade-off。
   - We study the effect of user diversity under a fixed training budget. We fix the total number of stay tokens to N_total and vary the number of users U ∈ {0.5k, 1k, 2k, 4k, 8k, 9.9k}. For each U, we randomly sample users that have at least max(65, ⌈N_total/U⌉) stays and take one temporally contiguous segment of ⌊N_total/U⌋ or ⌈N_total/U⌉ stays from each user, so that the total matches N_total exactly while temporal order is preserved.
2. 固定每用户长度（Per-user stays 固定）
   - 改变 users 数 U。
     - U in {500, 1k, 2k, 4k, 8k, 9.9k}
   - 轨迹长度确定为500.
   - 总样本数随之变化
   - 结论含义：用户多样性 + 总样本量 的纯增益（更接近你当前观察）。不会出现“某些用户贡献 10 倍样本”的情况，因为它排除了 per-user sample imbalance。
3. 固定 users 数（User count 固定）
   - 用户数量固定为 2000。
   - 改每用户长度 L。
     - L in {100, 200, 500, 800, 1000}
   - 观察是否存在“长度阈值”（比如超过某个长度模型突然变强）
   - 结论含义：模型是否依赖长程结构（你论文“轨迹像语言”的论证点）

细节：
1. 抽样采用 random span sampling 方式的原因：
   1. 如果使用整块的抽样容易学到位置偏置（position bias）。
   2. 随机 span 训练迫使模型理解「这一整段缺失，我该如何从上下文中恢复？」

In [None]:
import os
from pathlib import Path
import numpy as np
import pandas as pd


# -----------------------------
# 0) 列名自动识别（兼容 GeoLife 常见处理格式）
# -----------------------------
def _infer_columns(df: pd.DataFrame,
                   user_col: str | None,
                   time_col: str | None):
    if user_col is None:
        for c in ["user_id", "userID", "uid", "UserId", "user"]:
            if c in df.columns:
                user_col = c
                break
    if time_col is None:
        for c in ["timestamp", "time", "datetime", "stime", "t"]:
            if c in df.columns:
                time_col = c
                break

    if user_col is None or time_col is None:
        raise ValueError(
            f"无法自动识别 user/time 列。请显式传入 user_col/time_col。\n"
            f"当前列名：{list(df.columns)}"
        )
    return user_col, time_col


def _ensure_sorted(df: pd.DataFrame, user_col: str, time_col: str) -> pd.DataFrame:
    """Return the rows of ``df`` ordered by user, then by time.

    Rows of one user end up adjacent and in ascending time order. The result
    gets a fresh 0..n-1 index; the old index is discarded.
    """
    sort_keys = [user_col, time_col]
    ordered = df.sort_values(by=sort_keys)
    return ordered.reset_index(drop=True)


def _random_contiguous_slice(idx: np.ndarray, length: int, rng: np.random.Generator) -> np.ndarray:
    """Pick one random run of ``length`` consecutive entries from ``idx``.

    Args:
        idx: Row indices of a single user; the caller is expected to pass
            them already ordered by time.
        length: Number of consecutive entries to return.
        rng: Generator used to draw the start offset.

    Returns:
        ``idx`` itself when it has exactly ``length`` entries (no random
        number is drawn in that case), otherwise a slice of ``idx``.

    Raises:
        ValueError: If ``idx`` has fewer than ``length`` entries.
    """
    slack = len(idx) - length
    if slack < 0:
        raise ValueError("User length not enough for slicing.")
    if slack == 0:
        return idx
    # Valid start offsets are 0..slack inclusive; the upper bound is exclusive.
    begin = rng.integers(0, slack + 1)
    return idx[begin:begin + length]


def _sample_users_with_min_stays(counts: pd.Series,
                                 U: int,
                                 min_stays: int,
                                 rng: np.random.Generator) -> np.ndarray:
    """Draw ``U`` distinct users that each have at least ``min_stays`` rows.

    Args:
        counts: Series of per-user row counts, indexed by user id.
        U: Number of users to draw.
        min_stays: Minimum count a user needs to be eligible.
        rng: Generator used for the draw.

    Returns:
        Array of ``U`` user ids chosen without replacement.

    Raises:
        ValueError: If fewer than ``U`` users are eligible.
    """
    long_enough = (counts >= min_stays).to_numpy()
    candidates = counts.index.to_numpy()[long_enough]
    n_candidates = len(candidates)
    if U > n_candidates:
        raise ValueError(f"满足 min_stays={min_stays} 的用户只有 {n_candidates} 个，不足 U={U}")
    return rng.choice(candidates, size=U, replace=False)


def _save_df(df: pd.DataFrame, save_dir: str, filename: str):
    """Write ``df`` to ``save_dir/filename`` as CSV and print the saved path.

    ``save_dir`` (including missing parents) is created when needed. The
    frame's index is not written to the file.
    """
    target_dir = Path(save_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    path = target_dir / filename
    df.to_csv(path, index=False)
    print(f"[Saved] {path}")


# -----------------------------
# 1) 固定总样本数（Total stays 固定）
#    - 固定 N_total=1,000,000
#    - 改变 U
#    - 每个用户至少 65 stays
#    - 每用户长度自动变化，且总 token 精确匹配 N_total
# -----------------------------
def sample_fixed_total_stays(
    csv_path: str,
    U: int,
    N_total: int = 1_000_000,
    min_stays: int = 65,
    user_col: str | None = None,
    time_col: str | None = None,
    random_state: int = 42,
    slice_mode: str = "random_contiguous",  # "random_contiguous" or "prefix"
    save_dir: str = "./samples"
) -> pd.DataFrame:
    """Sample ``U`` users whose slices together hold ``N_total`` rows, then save.

    The CSV is read and sorted by (user, time). ``N_total`` is split as evenly
    as possible over ``U`` per-user lengths (``N_total % U`` of them get one
    extra row) and the lengths are shuffled so that the longer ones are not
    tied to the order in which users were drawn. Users are drawn without
    replacement among those with at least ``max(min_stays, largest length)``
    rows, and one slice of the allocated length is taken from each.

    Args:
        csv_path: Path of the input CSV.
        U: Number of users to sample; must be positive.
        N_total: Total number of rows in the result.
        min_stays: Minimum number of rows a user must have to be eligible.
        user_col: User column name, or None to auto-detect.
        time_col: Time column name, or None to auto-detect.
        random_state: Seed for the random generator.
        slice_mode: ``"prefix"`` takes each user's first rows;
            ``"random_contiguous"`` takes a random run of consecutive rows.
        save_dir: Directory the result CSV is written to.

    Returns:
        The sampled rows sorted by (user, time) with a fresh index. The same
        frame is written to ``save_dir`` under a name that encodes the
        strategy and its parameters.

    Raises:
        ValueError: If ``slice_mode`` is unknown, ``U`` is not positive, the
            columns cannot be inferred, or too few users are eligible.
    """
    # Fail fast on bad arguments, before the CSV is read.
    if slice_mode not in ("random_contiguous", "prefix"):
        raise ValueError("slice_mode must be 'random_contiguous' or 'prefix'")
    if U <= 0:
        # Previously U == 0 surfaced as a ZeroDivisionError from N_total // U.
        raise ValueError(f"U must be a positive integer, got {U}")

    rng = np.random.default_rng(random_state)
    df = pd.read_csv(csv_path)
    user_col, time_col = _infer_columns(df, user_col, time_col)
    df = _ensure_sorted(df, user_col, time_col)

    by_user = df.groupby(user_col)
    counts = by_user.size()
    # user id -> ascending row positions, built in one pass. df was just
    # sorted by (user, time), so each array is that user's rows in time order.
    # This replaces a full-frame boolean scan per sampled user.
    rows_by_user = by_user.indices

    # Spread N_total as evenly as possible: every user gets `base` rows and
    # `rem` users get one more, so the lengths sum to exactly N_total.
    base, rem = divmod(N_total, U)
    lengths = np.full(U, base, dtype=int)
    lengths[:rem] += 1

    # Every sampled user must be able to supply the longest allocated length.
    per_user_min_needed = max(min_stays, base + (1 if rem > 0 else 0))

    sampled_users = _sample_users_with_min_stays(counts, U, per_user_min_needed, rng)

    # Decouple "who gets the extra row" from the order users were drawn in.
    rng.shuffle(lengths)

    take_positions = []
    for uid, L in zip(sampled_users, lengths):
        idx = rows_by_user[uid]
        if slice_mode == "prefix":
            take_positions.append(idx[:L])
        else:
            take_positions.append(_random_contiguous_slice(idx, L, rng))

    # df has a fresh 0..n-1 index, so positions and labels coincide.
    out = df.iloc[np.concatenate(take_positions)]
    out = _ensure_sorted(out, user_col, time_col)

    # Defensive only: the lengths sum to N_total by construction, so this
    # branch is not expected to run.
    if len(out) != N_total:
        out = out.iloc[:N_total].copy()
        out = _ensure_sorted(out, user_col, time_col)

    # File name encodes the strategy and its key parameters.
    filename = f"geolife_S1_fixedTotal_N{N_total}_U{U}_min{min_stays}_seed{random_state}_{slice_mode}.csv"
    _save_df(out, save_dir, filename)
    return out


# -----------------------------
# 2) 固定每用户长度（Per-user stays 固定）
#    - 改变 U
#    - 每用户长度 L=500
#    - 总 token 随 U 变化
# -----------------------------
def sample_fixed_per_user_stays(
    csv_path: str,
    U: int,
    L: int = 500,
    user_col: str | None = None,
    time_col: str | None = None,
    random_state: int = 42,
    slice_mode: str = "random_contiguous",  # "random_contiguous" or "prefix"
    save_dir: str = "./samples"
) -> pd.DataFrame:
    """Sample ``U`` users and take exactly ``L`` rows from each, then save.

    The CSV is read and sorted by (user, time). Users are drawn without
    replacement among those with at least ``L`` rows, so the result has
    ``U * L`` rows.

    Args:
        csv_path: Path of the input CSV.
        U: Number of users to sample; must be positive.
        L: Number of rows taken from every sampled user.
        user_col: User column name, or None to auto-detect.
        time_col: Time column name, or None to auto-detect.
        random_state: Seed for the random generator.
        slice_mode: ``"prefix"`` takes each user's first ``L`` rows;
            ``"random_contiguous"`` takes a random run of ``L`` consecutive rows.
        save_dir: Directory the result CSV is written to.

    Returns:
        The sampled rows sorted by (user, time) with a fresh index. The same
        frame is written to ``save_dir``.

    Raises:
        ValueError: If ``slice_mode`` is unknown, ``U`` is not positive, the
            columns cannot be inferred, or too few users have ``L`` rows.
    """
    # Fail fast on bad arguments, before the CSV is read.
    if slice_mode not in ("random_contiguous", "prefix"):
        raise ValueError("slice_mode must be 'random_contiguous' or 'prefix'")
    if U <= 0:
        raise ValueError(f"U must be a positive integer, got {U}")

    rng = np.random.default_rng(random_state)
    df = pd.read_csv(csv_path)
    user_col, time_col = _infer_columns(df, user_col, time_col)
    df = _ensure_sorted(df, user_col, time_col)

    by_user = df.groupby(user_col)
    counts = by_user.size()
    # user id -> ascending row positions, built in one pass. df was just
    # sorted by (user, time), so each array is that user's rows in time order.
    # This replaces a full-frame boolean scan per sampled user.
    rows_by_user = by_user.indices

    sampled_users = _sample_users_with_min_stays(counts, U, L, rng)

    take_positions = []
    for uid in sampled_users:
        idx = rows_by_user[uid]
        if slice_mode == "prefix":
            take_positions.append(idx[:L])
        else:
            take_positions.append(_random_contiguous_slice(idx, L, rng))

    # df has a fresh 0..n-1 index, so positions and labels coincide.
    out = df.iloc[np.concatenate(take_positions)]
    out = _ensure_sorted(out, user_col, time_col)

    filename = f"geolife_S2_fixedPerUser_U{U}_L{L}_seed{random_state}_{slice_mode}.csv"
    _save_df(out, save_dir, filename)
    return out


# -----------------------------
# 3) 固定 users 数（User count 固定）
#    - 固定 U=2000
#    - 改每用户长度 L in {100,200,500,800,1000}
# -----------------------------
def sample_fixed_user_count(
    csv_path: str,
    U: int = 2000,
    L: int = 500,
    min_stays: int | None = None,  # optional extra floor, e.g. 65; None means L alone decides
    user_col: str | None = None,
    time_col: str | None = None,
    random_state: int = 42,
    slice_mode: str = "random_contiguous",  # "random_contiguous" or "prefix"
    save_dir: str = "./samples"
) -> pd.DataFrame:
    """Sample a fixed number of users and take ``L`` rows from each, then save.

    The CSV is read and sorted by (user, time). Users are drawn without
    replacement among those with at least ``max(L, min_stays)`` rows
    (``min_stays`` counts as 0 when None).

    Args:
        csv_path: Path of the input CSV.
        U: Number of users to sample; must be positive.
        L: Number of rows taken from every sampled user.
        min_stays: Optional additional lower bound on a user's row count.
        user_col: User column name, or None to auto-detect.
        time_col: Time column name, or None to auto-detect.
        random_state: Seed for the random generator.
        slice_mode: ``"prefix"`` takes each user's first ``L`` rows;
            ``"random_contiguous"`` takes a random run of ``L`` consecutive rows.
        save_dir: Directory the result CSV is written to.

    Returns:
        The sampled rows sorted by (user, time) with a fresh index. The same
        frame is written to ``save_dir``.

    Raises:
        ValueError: If ``slice_mode`` is unknown, ``U`` is not positive, the
            columns cannot be inferred, or too few users are eligible.
    """
    # Fail fast on bad arguments, before the CSV is read.
    if slice_mode not in ("random_contiguous", "prefix"):
        raise ValueError("slice_mode must be 'random_contiguous' or 'prefix'")
    if U <= 0:
        raise ValueError(f"U must be a positive integer, got {U}")

    rng = np.random.default_rng(random_state)
    df = pd.read_csv(csv_path)
    user_col, time_col = _infer_columns(df, user_col, time_col)
    df = _ensure_sorted(df, user_col, time_col)

    by_user = df.groupby(user_col)
    counts = by_user.size()
    # user id -> ascending row positions, built in one pass. df was just
    # sorted by (user, time), so each array is that user's rows in time order.
    # This replaces a full-frame boolean scan per sampled user.
    rows_by_user = by_user.indices

    need = max(L, (min_stays if min_stays is not None else 0))
    sampled_users = _sample_users_with_min_stays(counts, U, need, rng)

    take_positions = []
    for uid in sampled_users:
        idx = rows_by_user[uid]
        if slice_mode == "prefix":
            take_positions.append(idx[:L])
        else:
            take_positions.append(_random_contiguous_slice(idx, L, rng))

    # df has a fresh 0..n-1 index, so positions and labels coincide.
    out = df.iloc[np.concatenate(take_positions)]
    out = _ensure_sorted(out, user_col, time_col)

    # NOTE(review): min_stays is not part of the file name, so two runs that
    # differ only in min_stays write to the same file.
    filename = f"geolife_S3_fixedUsers_U{U}_L{L}_seed{random_state}_{slice_mode}.csv"
    _save_df(out, save_dir, filename)
    return out


# -----------------------------
# 4) 一键生成你列举的实验组
# -----------------------------
def run_all_sampling_grids(
    csv_path: str,
    save_dir: str = "./samples",
    random_state: int = 42,
    slice_mode: str = "random_contiguous",
    user_counts: tuple[int, ...] = (500, 1000, 2000, 4000, 8000, 9900),
    lengths: tuple[int, ...] = (100, 200, 500, 800, 1000),
):
    """Generate every sampled dataset of the three experiment grids.

    Runs, in this order:
      1. ``sample_fixed_total_stays`` for each U in ``user_counts`` with
         ``N_total=1_000_000`` and ``min_stays=65``;
      2. ``sample_fixed_per_user_stays`` for each U in ``user_counts`` with
         ``L=500``;
      3. ``sample_fixed_user_count`` for each L in ``lengths`` with ``U=2000``
         and no extra ``min_stays`` floor.

    Each call saves its own CSV into ``save_dir``; nothing is returned.

    Args:
        csv_path: Path of the input CSV, passed to every sampler.
        save_dir: Directory all result CSVs are written to.
        random_state: Seed passed to every sampler.
        slice_mode: How a user's rows are cut down when the user has more rows
            than needed: ``"prefix"`` keeps the first rows,
            ``"random_contiguous"`` keeps a random run of consecutive rows.
        user_counts: Values of U used by grids 1 and 2. The default is the
            original grid (9900 stands for "9.9k").
        lengths: Values of L used by grid 3. The default is the original grid.

    Raises:
        ValueError: If ``slice_mode`` is unknown, or propagated from a sampler.
    """
    # Reject a bad mode here instead of inside the first sampler call.
    if slice_mode not in ("random_contiguous", "prefix"):
        raise ValueError("slice_mode must be 'random_contiguous' or 'prefix'")

    # Arguments shared by every sampler call.
    shared = {
        "csv_path": csv_path,
        "random_state": random_state,
        "slice_mode": slice_mode,
        "save_dir": save_dir,
    }

    # (1) Fixed total number of rows, N_total = 1M.
    for U in user_counts:
        sample_fixed_total_stays(U=U, N_total=1_000_000, min_stays=65, **shared)

    # (2) Fixed per-user length, L = 500.
    for U in user_counts:
        sample_fixed_per_user_stays(U=U, L=500, **shared)

    # (3) Fixed user count, U = 2000, varying L.
    # Pass min_stays=65 instead of None to also enforce that floor.
    for L in lengths:
        sample_fixed_user_count(U=2000, L=L, min_stays=None, **shared)


if __name__ == "__main__":
    # Input CSV to sample from. An alternative input used elsewhere is
    # "./Data/Output/all_users_context_combined.csv".
    csv_path = "./Data/MoreUser/all.csv"
    # slice_mode may also be "prefix".
    run_all_sampling_grids(
        csv_path,
        slice_mode="random_contiguous",
        random_state=42,
        save_dir="./Data/MoreUser/Sampled/",
    )


## 单纯用户数量抽样

In [6]:
import pandas as pd
import numpy as np
import os

def sample_user_trajectories(
    csv_path,
    num_users,
    user_col="user_id",
    time_col="timestamp",
    random_state=42,
    save_dir=None,
    save_prefix="sampled_users"
):
    """Keep all rows of ``num_users`` randomly chosen users.

    Args:
        csv_path: Path of the input CSV; its first column is read as the index.
        num_users: Number of distinct users to keep.
        user_col: Name of the user id column.
        time_col: Name of the time column used for ordering.
        random_state: Seed for the random generator.
        save_dir: When not None, the result is also written to
            ``<save_dir>/<save_prefix>_<num_users>.csv``.
        save_prefix: File name prefix used when saving.

    Returns:
        Rows of the chosen users, sorted by (user, time) with a fresh index.

    Raises:
        ValueError: If ``num_users`` exceeds the number of distinct users.
    """
    # The first CSV column is consumed as the index, not as data.
    frame = pd.read_csv(csv_path, index_col=0)

    user_pool = frame[user_col].unique()
    if len(user_pool) < num_users:
        raise ValueError("抽样用户数量超过数据集中用户总数")

    chosen = np.random.default_rng(random_state).choice(
        user_pool, size=num_users, replace=False
    )

    # Keep the chosen users' rows and order them by user, then by time.
    keep_mask = frame[user_col].isin(chosen)
    sampled_df = frame.loc[keep_mask].sort_values(by=[user_col, time_col])
    sampled_df = sampled_df.reset_index(drop=True)

    if save_dir is None:
        return sampled_df

    os.makedirs(save_dir, exist_ok=True)
    save_path = os.path.join(save_dir, f"{save_prefix}_{num_users}.csv")
    sampled_df.to_csv(save_path, index=False)
    print(f"抽样数据已保存至: {save_path}")
    return sampled_df


In [None]:
if __name__ == "__main__":
    # Input CSV to sample from. An alternative input used elsewhere is
    # "./Data/Output/all_users_context_combined.csv".
    csv_path = "./Data/MoreUser/all.csv"

    for num_users in [10, 100, 500, 1000, 2000, 5000, 8000]:
        # Each call also writes MoreUser_sample_<num_users>.csv into the
        # output directory.
        sampled_df = sample_user_trajectories(
            csv_path,
            num_users,
            "userID",
            "stime",
            save_prefix="MoreUser_sample",
            save_dir="./Data/MoreUser/Sampled",
        )

        # Report how many distinct users ended up in the sample
        # (print sampled_df.head() here to inspect the rows).
        print(f"抽样用户数: {sampled_df['userID'].nunique()}")

抽样数据已保存至: ./Data/MoreUser/Sampled\MoreUser_sample_10.csv
   userID                stime                etime        lon        lat  \
0     280  2024-10-01 00:05:10  2024-10-01 11:26:49  114.02597  30.578440   
1     280  2024-10-01 11:52:48  2024-10-01 13:01:43  114.02151  30.575377   
2     280  2024-10-01 13:01:43  2024-10-01 16:43:06  114.02597  30.578440   
3     280  2024-10-01 17:19:35  2024-10-01 18:06:42  114.02151  30.575377   
4     280  2024-10-01 18:19:14  2024-10-01 19:04:05  113.99754  30.583616   

   duration    grid                                      context_fuzzy  \
0   40899.0  5220.0                                                NaN   
1    4135.0  5118.0  User 280 will move from grid 5118 to grid 5019...   
2   13283.0  5220.0  User 280 will move from grid 5220 to grid 4920...   
3    2827.0  5118.0                                                NaN   
4    2691.0  4920.0                                                NaN   

                                   

In [5]:
# Summary statistics of the numeric columns of `sampled_df`, i.e. whichever
# sampled frame is currently bound to that name.
sampled_df.describe()

Unnamed: 0.1,Unnamed: 0,userID,lon,lat,duration,grid
count,6810.0,6810.0,6810.0,6810.0,6810.0,6810.0
mean,507.912041,4580.37254,114.271337,30.655584,20329.75,9061.000441
std,407.741995,3482.125251,0.146627,0.143376,57909.19,3000.901608
min,0.0,280.0,113.68186,30.35869,1800.0,0.0
25%,181.0,487.0,114.23063,30.57737,2980.75,6721.0
50%,401.0,4921.0,114.29945,30.59836,5904.5,8462.0
75%,735.0,7920.0,114.36661,30.672186,16425.0,11094.0
max,1677.0,8681.0,114.89435,31.367685,1575181.0,17600.0
