
# Car-Following Data Filtering and Visualization

This notebook demonstrates how to process the `lane-1_following_parts less 25m.xlsx` dataset to
extract continuous car-following segments that satisfy the following criteria:

1. The follower vehicle speed (`follower_v`) is greater than 1 m/s.
2. The headway distance (`headway_distance_m`) is less than or equal to 100 m.
3. The frame IDs are consecutive, and the duration of the segment (difference of `time_s`)
   is at least 1 second.

For each part and each follower-leader pair, the notebook will:

- Split the data into segments based on the above conditions and compute statistics
  (minimum, maximum, mean) for each segment.
- Plot the original leader/follower speeds and the time headway against time, with
  shaded regions indicating where the conditions are not met (red) or where a
  segment is too short (yellow).
- Plot the filtered data showing only valid segments, with purple dotted lines
  marking segment boundaries and the right-hand axis showing time headway values.
- Save the filtered data and segment statistics as CSV files.



In [None]:

import os
import numpy as np
import pandas as pd
import matplotlib
matplotlib.use('Agg')  # use headless backend
import matplotlib.pyplot as plt
from typing import List, Tuple


In [None]:

# Function to split a follower-leader group into valid segments
# A segment is valid if follower_v>speed_threshold and headway_distance_m<=distance_threshold,
# frames are contiguous (frame_id increments by 1), and duration>=min_duration seconds

def find_valid_segments(group: pd.DataFrame,
                        speed_threshold: float = 1.0,
                        distance_threshold: float = 100.0,
                        min_duration: float = 1.0) -> List[pd.DataFrame]:
    """Split one follower-leader group into valid car-following segments.

    A row qualifies when ``follower_v > speed_threshold`` and
    ``headway_distance_m <= distance_threshold``. A segment is a maximal run
    of qualifying rows in which every row's ``frame_id`` equals the previous
    row's ``frame_id + 1``. A segment is kept only if the difference between
    its last and first ``time_s`` is at least ``min_duration``.

    Rows are processed in the order given; the caller is expected to have
    sorted ``group`` by ``frame_id``.

    Args:
        group: Rows of a single follower-leader pair with the columns
            ``follower_v``, ``headway_distance_m``, ``frame_id`` and
            ``time_s``.
        speed_threshold: Exclusive lower bound on ``follower_v``.
        distance_threshold: Inclusive upper bound on ``headway_distance_m``.
        min_duration: Minimum ``time_s`` span of a segment.

    Returns:
        Copies of the kept segments, in order of appearance. Each keeps the
        original index labels of its rows.
    """
    n_rows = len(group)
    if n_rows == 0:
        return []

    meets = ((group['follower_v'] > speed_threshold)
             & (group['headway_distance_m'] <= distance_threshold)).to_numpy(dtype=bool)
    frames = group['frame_id'].to_numpy()

    # follows_prev[i]: row i extends the run that row i-1 belongs to, i.e. the
    # previous row qualifies and frame_id advanced by exactly one.
    follows_prev = np.zeros(n_rows, dtype=bool)
    follows_prev[1:] = meets[:-1] & (frames[1:] == frames[:-1] + 1)

    # continues_next[i]: row i+1 qualifies and extends the run of row i.
    continues_next = np.zeros(n_rows, dtype=bool)
    continues_next[:-1] = meets[1:] & follows_prev[1:]

    run_starts = np.flatnonzero(meets & ~follows_prev)
    if run_starts.size == 0:
        return []
    run_ends = np.flatnonzero(meets & ~continues_next)

    times = group['time_s'].to_numpy()
    segments: List[pd.DataFrame] = []
    for first, last in zip(run_starts, run_ends):
        if times[last] - times[first] >= min_duration:
            # Positional slicing keeps this correct even when the index
            # labels are not unique (label-based .loc would duplicate rows).
            segments.append(group.iloc[first:last + 1].copy())
    return segments

# Compute statistics (min, max, mean) for given metrics in a segment

def compute_segment_statistics(segment: pd.DataFrame) -> dict:
    """Summarise a segment with the min, max and mean of each tracked metric.

    Missing values are dropped per column before aggregating. A column with
    no remaining values yields ``np.nan`` for all three statistics.

    Args:
        segment: Rows of one segment; must contain every tracked column.

    Returns:
        A dict with ``<column>_min``, ``<column>_max`` and ``<column>_mean``
        entries for each tracked column, in the order listed below.
    """
    tracked_columns = (
        'headway_distance_m',
        'net_headway_distance_m',
        'time_headway_s',
        'net_time_headway_s',
        'rel_v_kph',
        'rel_a_mps2',
        'TTC_s',
        'leader_v',
        'leader_a',
        'follower_v',
        'follower_a',
    )
    # Look every column up front so a missing one fails before any work is done.
    columns = [(label, segment[label]) for label in tracked_columns]

    summary = {}
    for label, values in columns:
        observed = values.dropna()
        if observed.empty:
            lowest = highest = average = np.nan
        else:
            lowest = observed.min()
            highest = observed.max()
            average = observed.mean()
        summary[f'{label}_min'] = lowest
        summary[f'{label}_max'] = highest
        summary[f'{label}_mean'] = average
    return summary

# Plot original and filtered relationships for one pair

def plot_pair(group: pd.DataFrame, segments: List[pd.DataFrame], out_dir: str,
              base_name: str, speed_threshold: float = 1.0,
              distance_threshold: float = 100.0,
              min_duration: float = 1.0) -> Tuple[str, str]:
    """Save the "original" and "filtered" figures for one follower-leader pair.

    The original figure plots follower and leader speed (converted with a
    factor of 3.6 and labelled km/h) on the left axis and ``time_headway_s``
    on the right axis for every row of ``group``. Runs of rows that violate
    the thresholds are shaded red; runs that satisfy them but span less than
    ``min_duration`` are shaded yellow.

    The filtered figure plots the same three curves for each entry of
    ``segments`` only, marks each segment's first and last sample with dots,
    and draws purple dotted vertical lines at the segment boundaries.

    Args:
        group: All rows of the pair, ordered by ``frame_id``.
        segments: Valid segments of ``group`` to draw in the filtered figure.
        out_dir: Existing directory that receives both PNG files.
        base_name: File-name stem; also used in the figure titles.
        speed_threshold: Exclusive lower bound on ``follower_v`` used for
            shading. Should match the value used to build ``segments``.
        distance_threshold: Inclusive upper bound on ``headway_distance_m``
            used for shading.
        min_duration: Minimum ``time_s`` span below which a qualifying run
            is shaded as too short.

    Returns:
        ``(original_path, filtered_path)`` of the two saved PNG files.
    """
    t = group['time_s']
    follower_kmh = group['follower_v'] * 3.6
    leader_kmh = group['leader_v'] * 3.6
    thw = group['time_headway_s']

    # ---- Original figure: all rows, with rejected runs shaded ----
    original_path = os.path.join(out_dir, f"{base_name}_original.png")
    fig, ax1 = plt.subplots(figsize=(9, 4))
    ax1.plot(t, follower_kmh, label='Follower speed', color='blue')
    ax1.plot(t, leader_kmh, label='Leader speed', color='orange')
    ax1.set_xlabel('Time (s)')
    ax1.set_ylabel('Speed (km/h)')
    ax2 = ax1.twinx()
    ax2.plot(t, thw, label='Time headway (s)', color='green', linestyle='--')
    ax2.set_ylabel('Time headway (s)')
    ax2.tick_params(axis='y', labelcolor='green')
    shade_colors = {'too_short': 'yellow', 'violation': 'red'}
    spans = _classify_spans(group, speed_threshold, distance_threshold, min_duration)
    for first, last, status in spans:
        if status != 'valid':
            ax1.axvspan(t.iloc[first], t.iloc[last],
                        color=shade_colors[status], alpha=0.3)
    handles1, labels1 = ax1.get_legend_handles_labels()
    handles2, labels2 = ax2.get_legend_handles_labels()
    ax1.legend(handles1 + handles2, labels1 + labels2, fontsize=6, loc='best')
    ax1.set_title(f"Original: {base_name}")
    fig.tight_layout()
    fig.savefig(original_path, dpi=150)
    plt.close(fig)

    # ---- Filtered figure: valid segments only ----
    filtered_path = os.path.join(out_dir, f"{base_name}_filtered.png")
    fig, ax1 = plt.subplots(figsize=(9, 4))
    ax2 = ax1.twinx()
    segment_boundaries = []
    for seg in segments:
        t_seg = seg['time_s']
        if t_seg.empty:
            continue
        follower_kmh_seg = seg['follower_v'] * 3.6
        leader_kmh_seg = seg['leader_v'] * 3.6
        thw_seg = seg['time_headway_s']
        # Label every line; duplicates are collapsed when the legend is built.
        # Without labels the legend would have no entries at all.
        ax1.plot(t_seg, follower_kmh_seg, color='blue', label='Follower speed')
        ax1.plot(t_seg, leader_kmh_seg, color='orange', label='Leader speed')
        ax2.plot(t_seg, thw_seg, color='green', linestyle='--',
                 label='Time headway (s)')
        start_t, end_t = t_seg.iloc[0], t_seg.iloc[-1]
        segment_boundaries.append((start_t, end_t))
        ax1.scatter([start_t, end_t],
                    [follower_kmh_seg.iloc[0], follower_kmh_seg.iloc[-1]],
                    color='blue', marker='o', s=20)
        ax1.scatter([start_t, end_t],
                    [leader_kmh_seg.iloc[0], leader_kmh_seg.iloc[-1]],
                    color='orange', marker='o', s=20)
        ax2.scatter([start_t, end_t], [thw_seg.iloc[0], thw_seg.iloc[-1]],
                    color='green', marker='o', s=20)
    ax1.set_xlabel('Time (s)')
    ax1.set_ylabel('Speed (km/h)')
    ax2.set_ylabel('Time headway (s)')
    ax2.tick_params(axis='y', labelcolor='green')
    for start_t, end_t in segment_boundaries:
        ax1.axvline(start_t, color='purple', linestyle=':', linewidth=0.8)
        ax1.axvline(end_t, color='purple', linestyle=':', linewidth=0.8)
    handles1, labels1 = ax1.get_legend_handles_labels()
    handles2, labels2 = ax2.get_legend_handles_labels()
    unique_entries = {}
    for handle, label in zip(handles1 + handles2, labels1 + labels2):
        unique_entries.setdefault(label, handle)
    if unique_entries:
        ax1.legend(list(unique_entries.values()), list(unique_entries.keys()),
                   fontsize=6, loc='best')
    ax1.set_title(f"Filtered: {base_name}")
    fig.tight_layout()
    fig.savefig(filtered_path, dpi=150)
    plt.close(fig)
    return original_path, filtered_path


def _classify_spans(group: pd.DataFrame, speed_threshold: float,
                    distance_threshold: float,
                    min_duration: float) -> List[Tuple[int, int, str]]:
    """Partition ``group`` into runs and label each 'valid', 'too_short' or 'violation'.

    Returns ``(first_pos, last_pos, status)`` tuples of inclusive row
    positions. A run of qualifying rows is broken wherever ``frame_id`` does
    not advance by exactly one, so the first row after a frame gap starts a
    new run and counts toward that run's duration.
    """
    n_rows = len(group)
    if n_rows == 0:
        return []
    meets = ((group['follower_v'] > speed_threshold)
             & (group['headway_distance_m'] <= distance_threshold)).to_numpy(dtype=bool)
    frames = group['frame_id'].to_numpy()
    times = group['time_s'].to_numpy()

    # A new run starts at row 0, wherever the threshold state flips, and at
    # any qualifying row whose frame_id does not directly follow the previous.
    starts_run = np.ones(n_rows, dtype=bool)
    starts_run[1:] = (meets[1:] != meets[:-1]) | (
        meets[1:] & (frames[1:] != frames[:-1] + 1))
    firsts = np.flatnonzero(starts_run)
    lasts = np.append(firsts[1:] - 1, n_rows - 1)

    spans: List[Tuple[int, int, str]] = []
    for first, last in zip(firsts, lasts):
        if not meets[first]:
            status = 'violation'
        elif times[last] - times[first] >= min_duration:
            status = 'valid'
        else:
            status = 'too_short'
        spans.append((int(first), int(last), status))
    return spans

# Process entire dataset: split segments, plot, and save summaries

def process_dataset(file_path: str, output_root: str) -> None:
    """Run the segmentation, plotting and export pipeline on one workbook.

    Reads the first sheet of ``file_path`` and handles each distinct ``part``
    value separately. Within a part, every follower-leader pair is split into
    valid segments. Pairs with at least one segment get an original and a
    filtered plot. Each part's segment rows and per-segment statistics are
    written as CSV files.

    Output layout under ``output_root``::

        part<part>/original_plots/<stem>_part<part>_<follower>_<leader>_original.png
        part<part>/filtered_plots/<stem>_part<part>_<follower>_<leader>_filtered.png
        part<part>/<stem>_part<part>_filtered_data.csv
        part<part>/<stem>_part<part>_segment_summary.csv

    Args:
        file_path: Path of the Excel workbook to read.
        output_root: Directory under which the per-part folders are created.
    """
    data = pd.read_excel(file_path, sheet_name=0)
    for uid_column in ('follower_uid', 'leader_uid'):
        data[uid_column] = data[uid_column].astype(str)

    # File-name stem shared by every output of this workbook.
    stem = os.path.basename(file_path).replace('.xlsx','')

    for part in sorted(data['part'].unique()):
        part_df = (data[data['part'] == part]
                   .copy()
                   .sort_values(['follower_uid','leader_uid','frame_id']))
        part_dir = os.path.join(output_root, f"part{part}")
        filtered_plots_dir = os.path.join(part_dir, 'filtered_plots')
        original_plots_dir = os.path.join(part_dir, 'original_plots')
        for directory in (filtered_plots_dir, original_plots_dir):
            os.makedirs(directory, exist_ok=True)

        kept_segments: List[pd.DataFrame] = []
        summary_rows = []
        for (follower, leader), pair_rows in part_df.groupby(['follower_uid','leader_uid']):
            ordered = pair_rows.sort_values('frame_id').reset_index(drop=True)
            segments = find_valid_segments(ordered)
            if not segments:
                continue  # nothing to export or plot for this pair

            for seg_id, segment in enumerate(segments, start=1):
                tagged = segment.assign(segment_id=seg_id)
                kept_segments.append(tagged)
                record = {
                    'segment_id': seg_id,
                    'follower_uid': follower,
                    'leader_uid': leader,
                }
                record.update(compute_segment_statistics(tagged))
                summary_rows.append(record)

            base_name = f"{stem}_part{part}_{follower}_{leader}"
            # plot_pair saves both figures into one directory; the filtered
            # one is then moved into its own folder.
            _, filtered_png = plot_pair(ordered, segments, original_plots_dir, base_name)
            os.replace(filtered_png,
                       os.path.join(filtered_plots_dir, os.path.basename(filtered_png)))

        if kept_segments:
            combined = pd.concat(kept_segments, ignore_index=True)
            if 'part' not in combined.columns:
                combined.insert(0, 'part', part)
            else:
                combined['part'] = part
            combined.to_csv(
                os.path.join(part_dir, f"{stem}_part{part}_filtered_data.csv"),
                index=False)

        if summary_rows:
            pd.DataFrame(summary_rows).to_csv(
                os.path.join(part_dir, f"{stem}_part{part}_segment_summary.csv"),
                index=False)


In [None]:

# Example execution (adjust paths as needed)
# input_file = '/home/oai/share/lane-1_following_parts less 25m.xlsx'
# output_dir = '/home/oai/share/lane-1_following_parts_less_25m_results_demo'
# process_dataset(input_file, output_dir)
