# 1Xuanpeng Merge Dataset Exploration

This notebook loads the provided dataset, performs lightweight cleaning, and produces a few visualizations to help you reason about which features may be useful for further analysis or modeling.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from matplotlib.patches import Patch, Rectangle

# Set the seaborn theme once, up front, so every figure below shares the same
# "ticks" style and "talk" plotting context.
sns.set_theme(style="ticks", context="talk")

In [None]:
data_path = Path('1Xuanpeng_Merge.csv')
df_raw = pd.read_csv(data_path)

# Strip stray whitespace from the header names.
df_raw.columns = [col.strip() for col in df_raw.columns]

# Drop empty trailing columns. pandas labels a blank header cell "Unnamed: N",
# so testing the name for truthiness alone never matches anything. Instead drop
# columns whose name is blank or a pandas placeholder AND that hold no data.
# Boolean masks are positional, so this also works if stripping produced
# duplicate column names.
has_placeholder_name = np.array(
    [(not col) or col.startswith('Unnamed:') for col in df_raw.columns],
    dtype=bool,
)
has_no_data = df_raw.isna().all(axis=0).to_numpy()
# .copy() gives an independent frame: the assignments below neither raise
# SettingWithCopyWarning nor write through to df_raw.
df = df_raw.loc[:, ~(has_placeholder_name & has_no_data)].copy()

# Convert numeric-looking columns; values that cannot be parsed become NaN.
numeric_cols = df.columns
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')

# Elapsed time relative to the first *valid* timestamp, so an unparseable first
# row does not turn the whole column into NaN (and an empty file does not raise).
# NOTE(review): assumes `time` is recorded in seconds -- confirm against the logger.
valid_times = df['time'].dropna()
time_origin = valid_times.iloc[0] if not valid_times.empty else np.nan
df['elapsed_time_s'] = df['time'] - time_origin

df.head()

In [None]:
# Helper quantities shared by the visualizations below.
# Speed of the merge vehicle minus speed of the reference vehicle.
df['relative_speed'] = df['Mspeed'].sub(df['Rspeed'])

# Take the sample with the smallest |Dis2Road| as the merge moment.
merge_idx = df['Dis2Road'].abs().idxmin()
merge_time = df['elapsed_time_s'].loc[merge_idx]

# Origin of the local tangent-plane frame (meters): the mean of every recorded
# latitude / longitude from both vehicles, or 0.0 when no value is available.
ref_lat_candidates = pd.concat([df[name] for name in ('Rlat', 'Mlat')], ignore_index=True).dropna()
ref_lon_candidates = pd.concat([df[name] for name in ('Rlon', 'Mlon')], ignore_index=True).dropna()
ref_lat = 0.0 if ref_lat_candidates.empty else ref_lat_candidates.mean()
ref_lon = 0.0 if ref_lon_candidates.empty else ref_lon_candidates.mean()
earth_radius_m = 6_371_000
# Origin in radians, as consumed by the projection helper.
lat0, lon0 = np.radians(ref_lat), np.radians(ref_lon)

def latlon_to_local_xy(lat_series, lon_series, origin_lat_rad=None, origin_lon_rad=None, radius_m=None):
    """Project latitude/longitude given in degrees onto a flat local x/y frame.

    Uses an equirectangular approximation about a reference origin: ``x`` is the
    longitude offset scaled by ``radius * cos(origin latitude)`` and ``y`` is the
    latitude offset scaled by ``radius``.

    Parameters
    ----------
    lat_series, lon_series : scalar, array-like or pandas Series
        Latitude and longitude in degrees (converted with ``np.radians``).
    origin_lat_rad, origin_lon_rad : float, optional
        Origin of the local frame in radians. When omitted, the notebook-level
        ``lat0`` / ``lon0`` are used, which is the original behavior.
    radius_m : float, optional
        Sphere radius; the result is expressed in the same unit. When omitted,
        the notebook-level ``earth_radius_m`` is used.

    Returns
    -------
    tuple
        ``(x, y)`` with the same shape/type as the inputs.
    """
    # Fall back to the notebook globals only for arguments the caller left out,
    # so existing two-argument calls keep working unchanged.
    if origin_lat_rad is None:
        origin_lat_rad = lat0
    if origin_lon_rad is None:
        origin_lon_rad = lon0
    if radius_m is None:
        radius_m = earth_radius_m

    lat_rad = np.radians(lat_series)
    lon_rad = np.radians(lon_series)
    # Meridians converge towards the poles, hence the cos(latitude) factor on x.
    x = (lon_rad - origin_lon_rad) * radius_m * np.cos(origin_lat_rad)
    y = (lat_rad - origin_lat_rad) * radius_m
    return x, y

df['R_x_m'], df['R_y_m'] = latlon_to_local_xy(df['Rlat'], df['Rlon'])
df['M_x_m'], df['M_y_m'] = latlon_to_local_xy(df['Mlat'], df['Mlon'])


def _estimate_forward_unit(points, fallback=(1.0, 0.0)):
    """Return the mean unit heading of a polyline, or ``fallback`` if undefined.

    Parameters
    ----------
    points : array-like of shape (n, 2)
        Consecutive x/y positions.
    fallback : sequence of two floats
        Direction returned when no heading can be derived: fewer than two
        points, no movement between samples, or step headings that average
        out to a zero or non-finite vector.

    Returns
    -------
    numpy.ndarray
        Unit vector of shape (2,).
    """
    fallback_unit = np.asarray(fallback, dtype=float)
    points = np.asarray(points, dtype=float)
    if len(points) < 2:
        return fallback_unit

    steps = np.diff(points, axis=0)
    step_lengths = np.linalg.norm(steps, axis=1)
    moving = step_lengths > 0  # skip repeated positions (zero-length steps)
    if not moving.any():
        return fallback_unit

    mean_heading = (steps[moving] / step_lengths[moving][:, None]).mean(axis=0)
    mean_length = np.linalg.norm(mean_heading)
    # Check before dividing: opposite headings can average to a zero vector,
    # which would otherwise produce NaN plus a divide-by-zero RuntimeWarning.
    if not np.isfinite(mean_length) or mean_length == 0:
        return fallback_unit
    return mean_heading / mean_length


# Estimate roadway orientation for contextual overlays from the reference path.
valid_ref = df[['R_x_m', 'R_y_m']].dropna().to_numpy()
forward_unit = _estimate_forward_unit(valid_ref)

# Left-hand normal of the forward direction (forward rotated by +90 degrees).
lateral_unit = np.array([-forward_unit[1], forward_unit[0]])
lane_width_m = 3.7
lane_offset_vec = lateral_unit * lane_width_m / 2
merge_point = df.loc[merge_idx, ['M_x_m', 'M_y_m']].to_numpy()
merge_heading_deg = np.degrees(np.arctan2(forward_unit[1], forward_unit[0]))


In [None]:
# Summary statistics, one row per column of df.
summary_stats = ['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max']
overview = df.describe().transpose()
overview.loc[:, summary_stats]

In [None]:
# Per-column NaN counts, largest first, keeping only columns that have gaps.
missing = (
    df.isna()
    .sum()
    .sort_values(ascending=False)
    .loc[lambda counts: counts > 0]
)
missing.to_frame(name='missing_values')

## Vehicle speed comparison

Compare the recorded speeds for the reference (R) and merge (M) vehicles over time to understand their relative dynamics, with annotations for the merge moment.


In [None]:
fig, ax = plt.subplots(figsize=(14, 6))

# (column, legend label, extra line styling) for each trace, in draw order.
speed_traces = (
    ('Rspeed', 'Reference speed (Rspeed)', {}),
    ('Mspeed', 'Merge speed (Mspeed)', {}),
    ('relative_speed', 'Relative speed (Mspeed - Rspeed)', {'linestyle': '--', 'color': 'tab:green'}),
)
for column, trace_label, line_style in speed_traces:
    ax.plot(df['elapsed_time_s'], df[column], label=trace_label, **line_style)

# Dotted vertical line at the merge moment, with a shaded phase on either side.
ax.axvline(merge_time, color='k', linestyle=':', linewidth=2, label='Merge moment (Dis2Road ≈ 0)')
pre_color, post_color = 'tab:blue', 'tab:orange'
t_first, t_last = df['elapsed_time_s'].min(), df['elapsed_time_s'].max()
for span_start, span_end, span_color in (
    (t_first, merge_time, pre_color),
    (merge_time, t_last, post_color),
):
    ax.axvspan(span_start, span_end, color=span_color, alpha=0.12)
pre_patch = Patch(facecolor=pre_color, alpha=0.12, label='Pre-merge')
post_patch = Patch(facecolor=post_color, alpha=0.12, label='Post-merge')

ax.set(xlabel='Elapsed time [s]', ylabel='Speed', title='Speed profiles with merge context')
ax.grid(True, linestyle='--', alpha=0.4)

# The shaded spans carry no label, so proxy patches are appended to the legend.
handles, labels = ax.get_legend_handles_labels()
handles += [pre_patch, post_patch]
ax.legend(handles=handles, loc='upper right')

plt.show()


## Trajectory footprints (local frame)

Project the GPS trajectories into a local tangent-plane coordinate system and overlay lane boundaries plus the merge zone for added context. Merge vehicle markers are color-coded by speed.


In [None]:
fig, ax = plt.subplots(figsize=(9, 9))

# Reference path as a line; merge path as a faint line overlaid with markers
# colored by the merge vehicle's speed.
ax.plot(df['R_x_m'], df['R_y_m'], color='tab:blue', linewidth=2.0, alpha=0.8, label='Reference trajectory')
ax.plot(df['M_x_m'], df['M_y_m'], color='tab:orange', linewidth=1.6, alpha=0.7)
scatter = ax.scatter(
    df['M_x_m'], df['M_y_m'],
    c=df['Mspeed'], cmap='OrRd', s=35,
    edgecolor='black', linewidth=0.2,
    label='Merge trajectory speed',
)

# Lane edges: the reference path shifted by half a lane width to each side.
valid_mask = df[['R_x_m', 'R_y_m']].notna().all(axis=1)
lane_center = df.loc[valid_mask, ['R_x_m', 'R_y_m']].to_numpy()
if lane_center.shape[0] > 0:
    lane_left = lane_center + lane_offset_vec
    lane_right = lane_center - lane_offset_vec
    boundary_style = {'color': 'dimgray', 'linestyle': '--', 'linewidth': 1.2}
    # Only the first boundary is labeled so the legend shows a single entry.
    ax.plot(*lane_left.T, label='Lane boundaries', **boundary_style)
    ax.plot(*lane_right.T, **boundary_style)

# Merge zone: a rectangle centered on the merge point and rotated to the road
# heading. Rectangle is anchored at a corner, so step back half its length and
# half its width from the center to find that corner.
merge_zone_length = 20
half_length, half_width = merge_zone_length / 2, lane_width_m / 2
rect_origin = merge_point - forward_unit * half_length - lateral_unit * half_width
merge_rect = Rectangle(
    rect_origin, merge_zone_length, lane_width_m,
    angle=merge_heading_deg, facecolor='mediumseagreen', alpha=0.2, label='Merge zone',
)
ax.add_patch(merge_rect)
ax.scatter(merge_point[0], merge_point[1], color='mediumseagreen', edgecolor='black', zorder=5, s=80, label='Merge point')

ax.set(xlabel='Local X [m]', ylabel='Local Y [m]', title='Trajectory footprints with lane context')
# Equal aspect keeps distances comparable along both axes.
ax.set_aspect('equal', adjustable='box')
ax.grid(True, linestyle='--', alpha=0.3)
legend_handles, legend_labels = ax.get_legend_handles_labels()
ax.legend(legend_handles, legend_labels, loc='best')
cbar = fig.colorbar(scatter, ax=ax, fraction=0.046, pad=0.04)
cbar.set_label('Merge vehicle speed')

plt.show()


## Acceleration distributions

Examine the distribution of acceleration measurements across axes for both vehicles.

In [None]:
acc_cols = ['RaccX', 'RaccY', 'RaccZ', 'MaccX', 'MaccY', 'MaccZ']
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(18, 10), sharey=False)
# One histogram-plus-KDE panel per acceleration column, filled row by row.
for ax, col in zip(axes.ravel(), acc_cols):
    sns.histplot(df[col].dropna(), kde=True, ax=ax)
    ax.set(title=f'Distribution of {col}', xlabel='Acceleration')
fig.tight_layout()
plt.show()

## Distance-related features

Look at the pairwise correlations between the distance features (`Dis2End`, `Dis2Road`, `Dis2End_M`) and the two vehicle speeds (`Rspeed`, `Mspeed`).

In [None]:
distance_cols = ['Dis2End', 'Dis2Road', 'Dis2End_M']
feature_subset = ['Rspeed', 'Mspeed', *distance_cols]
# Pairwise correlation matrix of the two speeds and the distance features.
corr = df.loc[:, feature_subset].corr()

heatmap_fig = plt.figure(figsize=(8, 6))
# sns.heatmap draws on the current axes (the figure just created) and returns it.
heatmap_ax = sns.heatmap(corr, annot=True, fmt='.2f', cmap='coolwarm', square=True)
heatmap_ax.set_title('Correlation between speeds and distance-based features')
heatmap_fig.tight_layout()
plt.show()