# Merge & Filter + Annotated Distributions
本 Notebook：
- 合并 & 异常筛除；
- 过滤后：绘制 **mean** 分布（直方图带百分比、x 轴标明每个 bin 的边界范围），并导出每个指标的分布表；
- 合并原始：绘制 **min/max** 分布（同样带百分比和边界范围），并导出分布表。

In [1]:

# ==== Parameters (edit as needed) ====
# Root folder that is searched recursively for "*_1st_segment_summary.csv" files.
BASE_DIR = r"/Volumes/weishanshan/Geo trax tool results/DJI_0031/step5 the result of 1st fillter outliers"
# Folder that receives the merged CSV files, histogram tables and plot folders.
OUTPUT_DIR = BASE_DIR
# When True, files ending in "_part0_1st_segment_summary.csv" are left out of the merge.
IGNORE_PART0 = True
BINS = 20  # moderate bin count to keep the plots readable; adjust to e.g. 10 or 30
# Base metric names; the plotting cells append a suffix such as "_min" / "_max"
# to build the actual column names looked up in the merged table.
KEY_BASES = [
    "headway_distance_m",
    "net_headway_distance_m",
    "time_headway_s",
    "net_time_headway_s",
    "rel_v_kph",
    "rel_a_mps2",
    "TTC_s",
    "leader_v",
    "leader_a",
    "follower_v",
    "follower_a",
]


In [2]:

import os, re, glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

def parse_lane_part(fname: str):
    """Extract the lane label and part number from a segment-summary file name.

    Only the base name of ``fname`` is inspected. Names of the form
    ``lane<id>_<anything>_part<N>_1st_segment_summary.csv`` are parsed in one
    step. Any other name goes through a lenient fallback that looks for the
    ``_part<N>_1st_segment_summary.csv`` tail and for a ``lane...`` token
    independently of each other.

    Args:
        fname: File name or path of a summary CSV.

    Returns:
        A ``(lane, part)`` tuple. ``lane`` is the lower-cased lane label or
        ``None`` when no ``lane`` token is present; ``part`` is the part
        number as ``int`` or ``None`` when the expected tail is missing.
    """
    def _to_int(digits):
        # The regex only captures digit runs, so this can fail only for
        # pathological inputs (e.g. beyond the interpreter's int-digit limit).
        try:
            return int(digits)
        except ValueError:
            return None

    base = os.path.basename(fname)

    strict = re.match(
        r'^(lane-?\w+)_.*_part(\d+)_1st_segment_summary\.csv$',
        base,
        flags=re.IGNORECASE,
    )
    if strict:
        return strict.group(1).lower(), _to_int(strict.group(2))

    lane, part = None, None
    stem = base
    tail = re.search(r'_part(\d+)_1st_segment_summary\.csv$', base, flags=re.IGNORECASE)
    if tail:
        part = _to_int(tail.group(1))
        # Drop the recognised tail before looking for the lane token: "\w"
        # also matches "_", so searching the full name would swallow
        # "_part<N>_1st_segment_summary" into the lane label.
        stem = base[:tail.start()]
    m_lane = re.search(r'(lane-?\w+)', stem, flags=re.IGNORECASE)
    if m_lane:
        lane = m_lane.group(1).lower()
    return lane, part

def flag_outliers(row):
    """Build a ';'-separated list of outlier tags for one summary row.

    ``row`` is any mapping-like object supporting ``in`` and ``[]`` (the
    notebook passes DataFrame rows). A threshold rule is only evaluated when
    its column is present and its value is not missing. The tag
    ``too_many_nans`` is appended when at least three of the four key
    ``*_mean`` columns are present but missing.

    Returns:
        The tags joined with ``";"``, or an empty string for a clean row.
    """
    # (column, True if the limit is a lower bound, limit, tag emitted on violation)
    threshold_rules = (
        ("headway_distance_m_min", True, 0, "headway_distance_m_min<0"),
        ("time_headway_s_min", True, 0, "time_headway_s_min<0"),
        ("net_headway_distance_m_min", True, 0, "net_headway_distance_m_min<0"),
        ("leader_v_max", False, 150, "leader_v_max>150kmh"),
        ("follower_v_max", False, 150, "follower_v_max>150kmh"),
        ("leader_v_min", True, 0, "leader_v_min<0kmh"),
        ("follower_v_min", True, 0, "follower_v_min<0kmh"),
        ("leader_a_min", True, -8, "leader_a_min<-8"),
        ("follower_a_min", True, -8, "follower_a_min<-8"),
        ("leader_a_max", False, 8, "leader_a_max>8"),
        ("follower_a_max", False, 8, "follower_a_max>8"),
        ("time_headway_s_mean", False, 8, "time_headway_s_mean>8s"),
    )

    tags = []
    for name, is_lower_bound, limit, tag in threshold_rules:
        if name not in row:
            continue
        value = row[name]
        if not pd.notna(value):
            continue
        if is_lower_bound:
            violated = value < limit
        else:
            violated = value > limit
        if violated:
            tags.append(tag)

    # A row is also suspicious when most of its core averages are missing.
    core_means = (
        "headway_distance_m_mean",
        "time_headway_s_mean",
        "leader_v_mean",
        "follower_v_mean",
    )
    missing = [name for name in core_means if name in row and pd.isna(row[name])]
    if len(missing) >= 3:
        tags.append("too_many_nans")

    return ";".join(tags)

def plot_hist_annot(series, title, out_path, bins=20):
    """Save an annotated histogram of ``series`` and return its bin table.

    Values are coerced to numbers; entries that cannot be parsed, are missing
    or are infinite are dropped before binning (``np.histogram`` cannot derive
    a bin range from non-finite data). Each non-empty bar is labelled with its
    share of the counted values, and the x tick labels show every bin's
    left-right boundaries.

    Args:
        series: Values to plot (a pandas Series or any sequence).
        title: Metric name used in the axis label and the figure title.
        out_path: Image path handed to ``Figure.savefig``.
        bins: Forwarded to ``np.histogram`` (bin count or explicit edges).

    Returns:
        A DataFrame with columns ``bin_left``, ``bin_right``, ``count`` and
        ``percent`` (one row per bin), or ``None`` when no usable value
        remains; in that case nothing is drawn or written.
    """
    values = pd.to_numeric(pd.Series(series), errors="coerce").dropna().to_numpy(dtype=float)
    arr = values[np.isfinite(values)]
    if arr.size == 0:
        return None

    counts, edges = np.histogram(arr, bins=bins)
    total = counts.sum()
    centers = (edges[:-1] + edges[1:]) / 2.0
    widths = np.diff(edges)
    # total can be 0 when explicit bin edges exclude every value.
    percents = (counts / total * 100.0) if total > 0 else np.zeros_like(counts, dtype=float)

    fig, ax = plt.subplots(figsize=(10, 5))
    try:
        ax.bar(centers, counts, width=widths, align='center')
        labels = [f"{l:.3g}-{r:.3g}" for l, r in zip(edges[:-1], edges[1:])]
        ax.set_xticks(centers)
        ax.set_xticklabels(labels, rotation=90, fontsize=7)

        # Put the percentage on top of every non-empty bar.
        for c, pct, x in zip(counts, percents, centers):
            if c > 0:
                ax.text(x, c, f"{pct:.1f}%", ha='center', va='bottom', fontsize=8)

        ax.set_xlabel(title + " (bin ranges on x-axis)")
        ax.set_ylabel("Count")
        ax.set_title(f"Histogram: {title} (n={total})")
        fig.tight_layout()
        fig.savefig(out_path, dpi=150)
    finally:
        # Always release the figure, even when drawing or saving fails.
        plt.close(fig)

    return pd.DataFrame({"bin_left": edges[:-1], "bin_right": edges[1:], "count": counts, "percent": percents})


In [3]:

# 1) Merge: collect every "*_1st_segment_summary.csv" below BASE_DIR into one table.
pattern = os.path.join(BASE_DIR, "**", "*_1st_segment_summary.csv")
files = sorted(glob.glob(pattern, recursive=True))
if IGNORE_PART0:
    files = [f for f in files if not re.search(r'_part0_1st_segment_summary\.csv$', os.path.basename(f), re.IGNORECASE)]
print(f"[INFO] Found {len(files)} files.")

dfs = []
for f in files:
    df = pd.read_csv(f)
    lane, part = parse_lane_part(f)
    if "lane" not in df.columns:
        df.insert(0, "lane", lane)
    else:
        # Remember which entries are missing BEFORE the string conversion:
        # astype(str) would turn them into the literal "nan", which a later
        # fillna() can no longer detect.
        lane_col = df["lane"]
        lane_missing = lane_col.isna()
        lowered = lane_col.astype(str).str.lower()
        # Missing entries take the lane parsed from the file name, or stay
        # missing when the file name carries no lane token either.
        df["lane"] = lowered.mask(lane_missing, np.nan if lane is None else lane)
    if "part" not in df.columns:
        df.insert(1, "part", part)
    dfs.append(df)

combined = pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame()
print(f"[INFO] Combined rows: {len(combined)}")


[INFO] Found 10 files.
[INFO] Combined rows: 902


In [4]:

# 2) Outlier screening: tag every merged row, then keep the untagged ones.
if combined.empty:
    # Typed empty columns keep "is_outlier" boolean, so the "~" mask below
    # stays a row filter even when there is nothing to filter.
    combined["outlier_reasons"] = pd.Series(dtype=object)
    combined["is_outlier"] = pd.Series(dtype=bool)
else:
    combined["outlier_reasons"] = combined.apply(flag_outliers, axis=1)
    # flag_outliers returns "" for a clean row, so any other value marks an outlier.
    combined["is_outlier"] = combined["outlier_reasons"].ne("")

filtered = combined[~combined["is_outlier"]].copy()
print(f"[INFO] Filtered rows: {len(filtered)}")


[INFO] Filtered rows: 817


In [5]:

# 3) Write the CSV outputs (raw merge, filtered rows, per-group counts) into OUTPUT_DIR.
os.makedirs(OUTPUT_DIR, exist_ok=True)

raw_out, flt_out, cnt_out = (
    os.path.join(OUTPUT_DIR, csv_name)
    for csv_name in (
        "combined_segment_summary_raw.csv",
        "combined_segment_summary_filtered.csv",
        "combined_segment_summary_counts_by_lane_part.csv",
    )
)

combined.to_csv(raw_out, index=False)
filtered.to_csv(flt_out, index=False)

# Row counts per (lane, part, is_outlier); groups with missing keys are kept.
if combined.empty:
    counts = pd.DataFrame(columns=["lane","part","is_outlier","n"])
else:
    counts = (
        combined.groupby(["lane","part","is_outlier"], dropna=False)
        .size()
        .rename("n")
        .reset_index()
    )
counts.to_csv(cnt_out, index=False)
counts


Unnamed: 0,lane,part,is_outlier,n
0,lane-1,1,False,52
1,lane-1,1,True,10
2,lane-1,2,False,41
3,lane-1,2,True,13
4,lane-2,1,False,169
5,lane-2,1,True,16
6,lane-2,2,False,122
7,lane-2,2,True,17
8,lane1_lane1_following,1,False,41
9,lane1_lane1_following,1,True,2


### 合并原始（combined）数据：`min/max` 分布（直方图带百分比与边界） + 汇总表

In [6]:

# Histograms of the "<metric>_min" / "<metric>_max" columns of the merged (unfiltered) table.
PLOT_DIR_C = os.path.join(OUTPUT_DIR, "plots_combined_min_max_annot")
os.makedirs(PLOT_DIR_C, exist_ok=True)

summaries2 = []
# Nothing is plotted for an empty merge.
for base in ([] if combined.empty else KEY_BASES):
    for suffix in ("min", "max"):
        col = f"{base}_{suffix}"
        if col not in combined.columns:
            continue
        s = pd.to_numeric(combined[col], errors="coerce").dropna()
        if s.empty:
            continue
        out_img = os.path.join(PLOT_DIR_C, f"hist_{col}_annot.png")
        df_sum = plot_hist_annot(s, col, out_img, bins=BINS)
        if df_sum is None:
            continue
        # Tag each bin table with its metric so all tables can share one CSV.
        df_sum.insert(0, "metric", col)
        summaries2.append(df_sum)

hist_csv_c = os.path.join(OUTPUT_DIR, "hist_summaries_combined_min_max.csv")
if summaries2:
    hist_table_c = pd.concat(summaries2, ignore_index=True)
else:
    hist_table_c = pd.DataFrame()
hist_table_c.to_csv(hist_csv_c, index=False)
hist_csv_c, PLOT_DIR_C


('/Volumes/weishanshan/Geo trax tool results/DJI_0031/step5 the result of 1st fillter outliers/hist_summaries_combined_min_max.csv',
 '/Volumes/weishanshan/Geo trax tool results/DJI_0031/step5 the result of 1st fillter outliers/plots_combined_min_max_annot')