# 1Xuanpeng Merge Dataset Exploration

This notebook loads the provided dataset, performs lightweight cleaning, and produces a few visualizations to help you reason about which features may be useful for further analysis or modeling.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from matplotlib.patches import Patch, Rectangle, Polygon
from matplotlib.collections import LineCollection
from matplotlib.colors import Normalize

# Notebook-wide seaborn theme: "talk" context with the "ticks" style for every figure below.
sns.set_theme(context="talk", style="ticks")


In [None]:
data_path = Path('1Xuanpeng_Merge.csv')
df_raw = pd.read_csv(data_path)

# Strip stray whitespace from the header names.
df_raw.columns = [col.strip() for col in df_raw.columns]

# Drop empty trailing columns. pandas labels a blank header cell "Unnamed: N",
# so testing the name for emptiness alone never removes them; an auto-named
# column is discarded only when it also holds no data at all.
column_is_empty = df_raw.isna().all(axis=0).to_numpy()
keep_column = [
    bool(name) and not (name.startswith('Unnamed:') and is_empty)
    for name, is_empty in zip(df_raw.columns, column_is_empty)
]
# .copy() gives an independent frame, so the column assignments below do not
# raise SettingWithCopyWarning or depend on whether the selection was a view.
df = df_raw.loc[:, keep_column].copy()

# Coerce every column to numeric; values that cannot be parsed become NaN.
numeric_cols = df.columns
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')

# Elapsed time relative to the first *valid* timestamp, so one unparsable
# leading value does not turn the whole column into NaN.
# NOTE(review): the "_s" suffix assumes `time` is recorded in seconds - confirm.
valid_times = df['time'].dropna()
time_origin = valid_times.iloc[0] if not valid_times.empty else np.nan
df['elapsed_time_s'] = df['time'] - time_origin

df.head()

In [None]:
# Speed of the merge vehicle relative to the reference vehicle (Mspeed - Rspeed).
df['relative_speed'] = df['Mspeed'] - df['Rspeed']

# Origin of the local frame: the mean of every non-missing latitude / longitude
# logged for either vehicle, falling back to 0.0 when no fix is available.
ref_lat_candidates = pd.concat([df['Rlat'], df['Mlat']], ignore_index=True).dropna()
ref_lon_candidates = pd.concat([df['Rlon'], df['Mlon']], ignore_index=True).dropna()

if ref_lat_candidates.empty:
    ref_lat = 0.0
else:
    ref_lat = ref_lat_candidates.mean()

if ref_lon_candidates.empty:
    ref_lon = 0.0
else:
    ref_lon = ref_lon_candidates.mean()

# Sphere radius used to scale angular offsets into metres.
earth_radius_m = 6_371_000
# Origin expressed in radians, as consumed by latlon_to_local_xy.
lat0, lon0 = np.radians(ref_lat), np.radians(ref_lon)

def latlon_to_local_xy(lat_series, lon_series, origin_lat_rad=None, origin_lon_rad=None, radius_m=None):
    """Project latitude/longitude given in degrees onto a flat local x/y frame.

    The offset from the origin is scaled by ``radius_m``; the x offset is
    additionally scaled by the cosine of the origin latitude.

    Parameters
    ----------
    lat_series, lon_series : array-like
        Latitudes and longitudes in degrees (converted with ``np.radians``).
    origin_lat_rad, origin_lon_rad : float, optional
        Frame origin in radians. Default to the notebook-level ``lat0`` and
        ``lon0`` when omitted.
    radius_m : float, optional
        Sphere radius. Defaults to the notebook-level ``earth_radius_m``.

    Returns
    -------
    tuple
        ``(x, y)`` offsets in the same unit as ``radius_m``; x follows
        longitude and y follows latitude.
    """
    # Fall back to the notebook globals only for arguments the caller left out,
    # so existing two-argument calls behave exactly as before.
    if origin_lat_rad is None:
        origin_lat_rad = lat0
    if origin_lon_rad is None:
        origin_lon_rad = lon0
    if radius_m is None:
        radius_m = earth_radius_m

    lat_rad = np.radians(lat_series)
    lon_rad = np.radians(lon_series)
    x = (lon_rad - origin_lon_rad) * radius_m * np.cos(origin_lat_rad)
    y = (lat_rad - origin_lat_rad) * radius_m
    return x, y

# Local-frame coordinates for the reference (R) and merge (M) vehicles.
df['R_x_m'], df['R_y_m'] = latlon_to_local_xy(df['Rlat'], df['Rlon'])
df['M_x_m'], df['M_y_m'] = latlon_to_local_xy(df['Mlat'], df['Mlon'])

# Dis2End tracks the ramp car, Dis2End_M the mainline car - their difference tells us when the vehicles align longitudinally
# NOTE(review): other cells call M the "merge" vehicle; confirm which vehicle each distance column belongs to.
df['distance_gap_m'] = df['Dis2End'] - df['Dis2End_M']

distance_gap = df['distance_gap_m'].dropna()


def _merge_state_at(frame, row_label):
    """Read the merge state straight from one row of ``frame``.

    Returns ``(time, merge_xy, reference_xy, Dis2End, |distance gap|)``.
    """
    return (
        frame.at[row_label, 'elapsed_time_s'],
        frame.loc[row_label, ['M_x_m', 'M_y_m']].to_numpy(),
        frame.loc[row_label, ['R_x_m', 'R_y_m']].to_numpy(),
        frame.at[row_label, 'Dis2End'],
        abs(frame.at[row_label, 'distance_gap_m']),
    )


def _merge_state_between(frame, lower_label, upper_label, weight):
    """Linearly interpolate the merge state between two rows of ``frame``.

    ``weight`` is the fraction of the way from ``lower_label`` to
    ``upper_label``. Returns the same tuple layout as ``_merge_state_at``.
    """
    def blend(columns):
        # `columns` may be a single label (scalar result) or a list (array result).
        lower_vals = frame.loc[lower_label, columns]
        upper_vals = frame.loc[upper_label, columns]
        return lower_vals + weight * (upper_vals - lower_vals)

    return (
        blend('elapsed_time_s'),
        blend(['M_x_m', 'M_y_m']).to_numpy(),
        blend(['R_x_m', 'R_y_m']).to_numpy(),
        float(blend('Dis2End')),
        abs(blend('distance_gap_m')),
    )


if distance_gap.empty:
    # No usable gap samples at all: fall back to the first row of the frame.
    merge_idx = df.index[0]
    merge_state = _merge_state_at(df, merge_idx)
else:
    # A sign change (or an exact zero) between consecutive valid samples marks
    # the moment the two distances coincide. The first sample has no
    # predecessor, so its product is NaN and never counts as a crossing.
    crossing_mask = (distance_gap.shift() * distance_gap) <= 0
    crossing_indices = distance_gap[crossing_mask].index.tolist()
    if not crossing_indices:
        # The gap never changes sign: use the sample where it is smallest.
        merge_idx = distance_gap.abs().idxmin()
        merge_state = _merge_state_at(df, merge_idx)
    else:
        upper_idx = crossing_indices[0]
        pos = distance_gap.index.get_loc(upper_idx)
        lower_idx = distance_gap.index[max(pos - 1, 0)]
        lower_gap = df.at[lower_idx, 'distance_gap_m']
        upper_gap = df.at[upper_idx, 'distance_gap_m']
        # merge_idx is set in every branch so later cells never see a stale or
        # undefined value; here it is the first sample at/after the crossing.
        merge_idx = upper_idx
        cannot_interpolate = (
            lower_idx == upper_idx
            or not np.isfinite(lower_gap)
            or not np.isfinite(upper_gap)
            or np.isclose(lower_gap - upper_gap, 0.0)
        )
        if cannot_interpolate:
            merge_state = _merge_state_at(df, upper_idx)
        else:
            # Fraction of the step at which the linear gap reaches zero.
            alpha = lower_gap / (lower_gap - upper_gap)
            merge_state = _merge_state_between(df, lower_idx, upper_idx, alpha)

(
    merge_time,
    merge_point_merge,
    merge_point_reference,
    merge_longitudinal_distance,
    merge_distance_gap_abs,
) = merge_state

# Straight-line separation of the two vehicles at the merge moment; NaN when
# either position is unavailable.
merge_gap_distance = (
    np.linalg.norm(merge_point_merge - merge_point_reference)
    if np.all(np.isfinite(np.concatenate([merge_point_merge, merge_point_reference])))
    else np.nan
)

# Estimate the roadway direction as the mean unit step vector of the reference
# vehicle's path. Falls back to +X when it cannot be determined.
valid_ref = df[['R_x_m', 'R_y_m']].dropna().to_numpy()
forward_unit = np.array([1.0, 0.0])
if len(valid_ref) > 1:
    deltas = np.diff(valid_ref, axis=0)
    norms = np.linalg.norm(deltas, axis=1)
    # Zero-length steps carry no heading information and would divide by zero.
    forward_vectors = deltas[norms > 0] / norms[norms > 0][:, None]
    if len(forward_vectors) > 0:
        mean_heading = forward_vectors.mean(axis=0)
        mean_norm = np.linalg.norm(mean_heading)
        # Check the magnitude *before* normalising: headings that cancel out
        # give a zero mean, and dividing first would emit a RuntimeWarning and
        # produce NaN that then has to be patched up afterwards.
        if np.isfinite(mean_norm) and mean_norm > 0:
            forward_unit = mean_heading / mean_norm

# Perpendicular to the heading (forward rotated 90 degrees counter-clockwise).
lateral_unit = np.array([-forward_unit[1], forward_unit[0]])


In [None]:
# Descriptive statistics, transposed so each dataset column becomes one row.
overview = df.describe().transpose()
summary_fields = ['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max']
overview.loc[:, summary_fields]

In [None]:
# NaN count per column, largest first, keeping only columns that have any.
missing = (
    df.isna()
    .sum()
    .sort_values(ascending=False)
    .loc[lambda counts: counts > 0]
)
missing.to_frame(name='missing_values')

## Vehicle speed comparison

Compare the recorded speeds for the reference (R) and merge (M) vehicles over time to understand their relative dynamics, with annotations for the merge moment.


In [None]:
fig, ax = plt.subplots(figsize=(14, 6))
ax.plot(df['elapsed_time_s'], df['Rspeed'], label='Reference speed (Rspeed)')
ax.plot(df['elapsed_time_s'], df['Mspeed'], label='Merge speed (Mspeed)')
ax.plot(df['elapsed_time_s'], df['relative_speed'], label='Relative speed (Mspeed - Rspeed)', linestyle='--', color='tab:green')

# Only draw the merge marker and the pre/post shading when a merge time is
# actually known. With a NaN merge_time nothing would be visible, yet the
# legend would still advertise a merge moment and both phases.
pre_color, post_color = 'tab:blue', 'tab:orange'
phase_handles = []
if np.isfinite(merge_time):
    ax.axvline(merge_time, color='k', linestyle=':', linewidth=2, label='Merge moment (Dis2End≈Dis2End_M)')
    ax.axvspan(df['elapsed_time_s'].min(), merge_time, color=pre_color, alpha=0.12)
    ax.axvspan(merge_time, df['elapsed_time_s'].max(), color=post_color, alpha=0.12)
    # axvspan patches carry no label, so proxy patches stand in for them in the legend.
    pre_patch = Patch(facecolor=pre_color, alpha=0.12, label='Pre-merge')
    post_patch = Patch(facecolor=post_color, alpha=0.12, label='Post-merge')
    phase_handles = [pre_patch, post_patch]

# Text box summarising the merge-moment values that are available.
context_lines = []
if np.isfinite(merge_longitudinal_distance):
    context_lines.append(f"Distance to merge point ≈ {merge_longitudinal_distance:.1f} m")
if np.isfinite(merge_distance_gap_abs):
    context_lines.append(f"|Dis2End - Dis2End_M| ≈ {merge_distance_gap_abs:.2f} m")
if context_lines:
    context_text = "\n".join(context_lines)
    ax.text(0.02, 0.93, context_text,
            transform=ax.transAxes,
            fontsize=11, va='top', ha='left', bbox=dict(boxstyle='round,pad=0.4', facecolor='white', alpha=0.8, edgecolor="lightgray"))

ax.set_xlabel('Elapsed time [s]')
ax.set_ylabel('Speed')
ax.set_title('Speed profiles with merge context')
ax.grid(True, linestyle='--', alpha=0.4)
handles, labels = ax.get_legend_handles_labels()
handles.extend(phase_handles)
ax.legend(handles=handles, loc='upper right')

plt.show()


## Trajectory footprints (local frame)

Project the GPS trajectories into a local tangent-plane coordinate system (meters) and plot both vehicle paths. Markers show the merge vehicle's start and end points and each vehicle's position at the merge moment, with a dashed connector labelled with their separation.


In [None]:
fig, ax = plt.subplots(figsize=(10, 8))

# Paths go down first so the markers added afterwards sit on top of them.
ax.plot(df['R_x_m'], df['R_y_m'], label='Reference vehicle path', color='tab:blue', linewidth=2.2)
ax.plot(df['M_x_m'], df['M_y_m'], label='Merge vehicle path', color='tab:orange', linewidth=2.4)

# Merge-vehicle position at the smallest and largest index label.
merge_xy_cols = ['M_x_m', 'M_y_m']
first_fix = df.loc[df.index.min(), merge_xy_cols].to_numpy()
last_fix = df.loc[df.index.max(), merge_xy_cols].to_numpy()

if np.isfinite(first_fix).all():
    ax.scatter(first_fix[0], first_fix[1], color='white', edgecolor='tab:orange', s=80,
               zorder=5, label='Start (merge veh.)')
if np.isfinite(last_fix).all():
    ax.scatter(last_fix[0], last_fix[1], color='tab:orange', edgecolor='white', s=80,
               zorder=5, label='End (merge veh.)')

# Positions of both vehicles at the merge moment, drawn only when known.
have_merge_pt = bool(np.isfinite(merge_point_merge).all())
have_reference_pt = bool(np.isfinite(merge_point_reference).all())

if have_merge_pt:
    ax.scatter(merge_point_merge[0], merge_point_merge[1], color='tab:green', edgecolor='black',
               s=110, marker='X', zorder=6, label='Merge vehicle @ merge')
if have_reference_pt:
    ax.scatter(merge_point_reference[0], merge_point_reference[1], color='tab:blue', edgecolor='white',
               s=110, marker='D', zorder=6, label='Reference vehicle @ merge')

if have_merge_pt and have_reference_pt:
    # Dashed connector between the two vehicles, labelled with their separation when available.
    separation_known = np.isfinite(merge_gap_distance)
    if separation_known:
        connector_label = f'Separation at merge ({merge_gap_distance:.1f} m)'
    else:
        connector_label = 'Separation at merge'
    ax.plot([merge_point_merge[0], merge_point_reference[0]],
            [merge_point_merge[1], merge_point_reference[1]],
            color='tab:green', linestyle='--', linewidth=1.4, label=connector_label)
    if separation_known:
        midpoint = (merge_point_merge + merge_point_reference) / 2.0
        ax.text(midpoint[0], midpoint[1], f'{merge_gap_distance:.1f} m', color='tab:green',
                fontsize=10, ha='left', va='bottom')

# Annotate the merge location, anchored on the reference vehicle when its
# position is known and on the merge vehicle otherwise.
if have_reference_pt or have_merge_pt:
    anchor = merge_point_reference if have_reference_pt else merge_point_merge
    note = 'Merge (Dis2End≈Dis2End_M)'
    if np.isfinite(merge_longitudinal_distance):
        note += f"\nDistance to ramp end ≈ {merge_longitudinal_distance:.1f} m"
    # The label is offset 6 units along the lateral direction, away from the path.
    ax.annotate(note, xy=anchor, xytext=anchor + lateral_unit * 6,
                textcoords='data', arrowprops=dict(arrowstyle='->', color='tab:green'),
                ha='left', va='bottom', fontsize=11)

ax.set(xlabel='Local X [m]', ylabel='Local Y [m]', title='Trajectory footprints (local frame)')
ax.set_aspect(1.6, adjustable='box')
ax.grid(True, linestyle=':', alpha=0.4)

# Frame the view around every valid coordinate of either vehicle, padded by 5 on each side.
all_x = np.concatenate([df[name].dropna().to_numpy() for name in ('R_x_m', 'M_x_m')])
all_y = np.concatenate([df[name].dropna().to_numpy() for name in ('R_y_m', 'M_y_m')])
if all_x.size > 0 and all_y.size > 0:
    margin_x = 5
    margin_y = 5
    ax.set_xlim(all_x.min() - margin_x, all_x.max() + margin_x)
    ax.set_ylim(all_y.min() - margin_y, all_y.max() + margin_y)

ax.legend(loc='upper left', frameon=True, facecolor='white', edgecolor='lightgray')

plt.show()


## Acceleration distributions

Examine the distribution of acceleration measurements across axes for both vehicles.

In [None]:
# One histogram (with KDE overlay) per acceleration column, on a 2 x 3 grid.
acc_cols = ['RaccX', 'RaccY', 'RaccZ', 'MaccX', 'MaccY', 'MaccZ']
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(18, 10), sharey=False)
for ax, col in zip(axes.flat, acc_cols):
    samples = df[col].dropna()
    sns.histplot(samples, kde=True, ax=ax)
    ax.set(title=f'Distribution of {col}', xlabel='Acceleration')
plt.tight_layout()
plt.show()

## Distance-related features

Look at the relationships between distance-to-goal features and vehicle speeds.

In [None]:
# Pairwise correlation between both speeds and the distance columns.
distance_cols = ['Dis2End', 'Dis2Road', 'Dis2End_M']
feature_subset = ['Rspeed', 'Mspeed', *distance_cols]
corr = df.loc[:, feature_subset].corr()

plt.figure(figsize=(8, 6))
heatmap_options = dict(annot=True, fmt='.2f', cmap='coolwarm', square=True)
sns.heatmap(corr, **heatmap_options)
plt.title('Correlation between speeds and distance-based features')
plt.tight_layout()
plt.show()