# 合并样本

## 合并gird相同且连续的样本

发现数据中有很多样本的时间间隔非常小，同时grid相同，需要进行清洗。

对 同一用户 的轨迹，按时间排序后：

若满足 同时满足：

1. grid_i == grid_{i+1}
2. stime_{i+1} - etime_i < 30 minutes

则认为是 同一次停留的伪分割，应合并为一条：

1. grid：保持不变
2. stime ：取第一条的 stime
3. etime ：取最后一条的 etime
4. duration重新计算，通过合并之后的一条样本的 etime - stime 得到。
5. context_fuzzy, context_precise列中如果多个样本都有，那么选择保留第一个样本的内容；如果多个样本中只有一个样本 context_fuzzy, context_precise有值那么保留有值的样本的 context_fuzzy, context_precise；如果都没有那么继续保持为空。

In [1]:
import pandas as pd

def _has_value(x) -> bool:
    """判定 context 是否“有值”：非 NaN 且非空字符串。"""
    if pd.isna(x):
        return False
    if isinstance(x, str):
        return x.strip() != ""
    return True  # 非字符串（例如 dict/对象）只要非 NaN 就算有值


def _merge_context(cur_val, next_val):
    """
    context 合并规则：
    - 如果 cur 已有值：永远保留 cur（“多条都有值保留第一条”）
    - 如果 cur 没值但 next 有值：用 next（“只有一个有值则保留有值的”）
    - 否则保持空
    """
    if _has_value(cur_val):
        return cur_val
    if _has_value(next_val):
        return next_val
    return cur_val  # 都空，保持空（可能是 NaN 或 ""）


def merge_consecutive_stays(
    df,
    user_col="user_id",
    grid_col="grid",
    stime_col="stime",
    etime_col="etime",
    duration_col="duration",
    lat_col="lat",
    lon_col="lon",
    ctx_fuzzy_col="context_fuzzy",
    ctx_precise_col="context_precise",
    gap_minutes=30,
    duration_unit="minutes",  # "minutes" or "seconds"
):
    """
    合并同一用户中相邻 stay：
    - grid 相同
    - 且 (next.stime - cur.etime) < gap_minutes

    合并策略：
    - stime: 保留第一个
    - etime: 取合并段内最大
    - duration: 用 (etime - stime) 重算
    - lat/lon: 保留第一个样本的
    - context_fuzzy/context_precise:
        * 多条都有值 -> 保留第一条
        * 只有一条有值 -> 保留有值那条
        * 都无值 -> 保持为空
    """

    df = df.copy()

    # datetime
    df[stime_col] = pd.to_datetime(df[stime_col])
    df[etime_col] = pd.to_datetime(df[etime_col])

    # sort
    df = df.sort_values([user_col, stime_col]).reset_index(drop=True)

    merged_rows = []

    for uid, user_df in df.groupby(user_col, sort=False):
        user_df = user_df.reset_index(drop=True)
        cur_row = user_df.iloc[0].to_dict()

        # 记录“段起点”的 lat/lon（实际上 cur_row 初始就已经是第一个）
        # 后续合并时不更新 lat/lon，即自然满足“保留第一个样本的 lat/lon”。

        for i in range(1, len(user_df)):
            next_row = user_df.iloc[i]

            same_grid = next_row[grid_col] == cur_row[grid_col]
            gap = (next_row[stime_col] - cur_row[etime_col]).total_seconds() / 60

            if same_grid and gap < gap_minutes:
                # 1) 延长 etime
                cur_row[etime_col] = max(cur_row[etime_col], next_row[etime_col])

                # 2) lat/lon 保留第一个：啥也不做（不覆盖）

                # 3) 合并 context（按规则）
                if ctx_fuzzy_col in cur_row and ctx_fuzzy_col in next_row:
                    cur_row[ctx_fuzzy_col] = _merge_context(cur_row.get(ctx_fuzzy_col), next_row.get(ctx_fuzzy_col))
                if ctx_precise_col in cur_row and ctx_precise_col in next_row:
                    cur_row[ctx_precise_col] = _merge_context(cur_row.get(ctx_precise_col), next_row.get(ctx_precise_col))

            else:
                # 结束当前段：重算 duration
                delta_sec = (cur_row[etime_col] - cur_row[stime_col]).total_seconds()
                if duration_unit == "minutes":
                    cur_row[duration_col] = delta_sec / 60.0
                elif duration_unit == "seconds":
                    cur_row[duration_col] = delta_sec
                else:
                    raise ValueError("duration_unit must be 'minutes' or 'seconds'")

                merged_rows.append(cur_row)
                cur_row = next_row.to_dict()

        # 最后一段
        delta_sec = (cur_row[etime_col] - cur_row[stime_col]).total_seconds()
        if duration_unit == "minutes":
            cur_row[duration_col] = delta_sec / 60.0
        elif duration_unit == "seconds":
            cur_row[duration_col] = delta_sec
        else:
            raise ValueError("duration_unit must be 'minutes' or 'seconds'")

        merged_rows.append(cur_row)

    merged_df = pd.DataFrame(merged_rows)
    return merged_df


In [2]:
df = pd.read_csv("./Data/MoreUser/all.csv")

df_merged = merge_consecutive_stays(
    df,
    user_col="userID",
    grid_col="grid",
    stime_col="stime",
    etime_col="etime",
    duration_col="duration",
    lat_col="lat",
    lon_col="lon",
    ctx_fuzzy_col="context_fuzzy",
    ctx_precise_col="context_precise",
    gap_minutes=30,
    duration_unit="seconds",
)

df_merged.to_csv("./Data/MoreUser/all_merged30min.csv", index=False)

# moreuser consume 23m 10.7s .

## 合并gap（时间价格）等于或者约等于0的样本

合并规则如下：

1. grid：取样本中duration时间最长的。 
2. stime ：取第一条的 stime 
3. etime ：取最后一条的 etime 
4. duration重新计算，通过合并之后的一条样本的 etime - stime 得到。 
5. context_fuzzy, context_precise列中如果多个样本都有，那么选择保留第停留时间最长样本的内容；如果多个样本中只有一个样本包含 context_fuzzy, context_precise就只那么保留有值的样本的 context_fuzzy, context_precise；如果都没有那么继续保持为空。

In [3]:
import pandas as pd

def _has_value(x) -> bool:
    if pd.isna(x):
        return False
    if isinstance(x, str):
        return x.strip() != ""
    return True


def merge_stays_with_nonpositive_gap(
    df,
    user_col="user_id",
    grid_col="grid",
    stime_col="stime",
    etime_col="etime",
    duration_col="duration",
    ctx_fuzzy_col="context_fuzzy",
    ctx_precise_col="context_precise",
    duration_unit="minutes",   # or "seconds"
):
    df = df.copy()

    # datetime
    df[stime_col] = pd.to_datetime(df[stime_col])
    df[etime_col] = pd.to_datetime(df[etime_col])

    # sort
    df = df.sort_values([user_col, stime_col]).reset_index(drop=True)

    merged_rows = []

    for uid, user_df in df.groupby(user_col, sort=False):
        user_df = user_df.reset_index(drop=True)

        segment = [user_df.iloc[0]]

        for i in range(1, len(user_df)):
            prev = segment[-1]
            curr = user_df.iloc[i]

            gap = (curr[stime_col] - prev[etime_col]).total_seconds() / 60.0

            if gap <= 0:
                # 继续收集到当前 segment
                segment.append(curr)
            else:
                # 处理当前 segment
                merged_rows.append(
                    _merge_one_segment(
                        segment,
                        grid_col,
                        stime_col,
                        etime_col,
                        duration_col,
                        ctx_fuzzy_col,
                        ctx_precise_col,
                        duration_unit,
                    )
                )
                segment = [curr]

        # 最后一个 segment
        merged_rows.append(
            _merge_one_segment(
                segment,
                grid_col,
                stime_col,
                etime_col,
                duration_col,
                ctx_fuzzy_col,
                ctx_precise_col,
                duration_unit,
            )
        )

    return pd.DataFrame(merged_rows)


def _merge_one_segment(
    segment,
    grid_col,
    stime_col,
    etime_col,
    duration_col,
    ctx_fuzzy_col,
    ctx_precise_col,
    duration_unit,
):
    # segment: List[pd.Series]

    # 1. stime / etime
    stime = segment[0][stime_col]
    etime = segment[-1][etime_col]

    # 2. duration（重算）
    delta_sec = (etime - stime).total_seconds()
    duration = delta_sec / 60.0 if duration_unit == "minutes" else delta_sec

    # 3. 选 duration 最长的原始 stay
    longest = max(
        segment,
        key=lambda r: (
            r[duration_col]
            if pd.notna(r[duration_col])
            else -1
        ),
    )

    # 4. grid
    grid = longest[grid_col]

    # 5. context（按“最长 duration 优先”）
    def pick_context(col):
        candidates = [r for r in segment if _has_value(r[col])]
        if not candidates:
            return None
        if len(candidates) == 1:
            return candidates[0][col]
        return max(
            candidates,
            key=lambda r: r[duration_col] if pd.notna(r[duration_col]) else -1,
        )[col]

    ctx_fuzzy = pick_context(ctx_fuzzy_col)
    ctx_precise = pick_context(ctx_precise_col)

    # 6. 输出行（以第一条为模板）
    out = segment[0].to_dict()
    out[stime_col] = stime
    out[etime_col] = etime
    out[duration_col] = duration
    out[grid_col] = grid
    out[ctx_fuzzy_col] = ctx_fuzzy
    out[ctx_precise_col] = ctx_precise

    return out


In [5]:
import pandas as pd

# 1) 读入数据
df = pd.read_csv("./Data/MoreUser/all_merged30min.csv")

# 2) 合并 gap <= 0 的 stay（按你制定的规则）
df_merged = merge_stays_with_nonpositive_gap(
    df,
    user_col="userID",
    grid_col="grid",
    stime_col="stime",
    etime_col="etime",
    duration_col="duration",
    ctx_fuzzy_col="context_fuzzy",
    ctx_precise_col="context_precise",
    duration_unit="seconds",   # 如果你的 duration 是秒，改成 "seconds"
)

# 3) 保存
out_path = "./Data/MoreUser/all_gapLE0_merged.csv"
df_merged.to_csv(out_path, index=False)

print("Before:", len(df))
print("After :", len(df_merged))
print("Saved :", out_path)

Before: 5147648
After : 3007840
Saved : ./Data/MoreUser/all_gapLE0_merged.csv
