# 1Xuanpeng Merge Dataset Exploration

This notebook loads the provided dataset, performs lightweight cleaning, and produces a few visualizations to help you reason about which features may be useful for further analysis or modeling.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Global seaborn defaults for every figure in this notebook: the "ticks" axes
# style combined with the "talk" plotting context.
sns.set_theme(context="talk", style="ticks")

In [None]:
# Load the raw CSV and derive the cleaned, all-numeric frame `df` that every
# later cell works from. `df_raw` keeps the values exactly as read (only its
# header names are stripped).
data_path = Path('1Xuanpeng_Merge.csv')
df_raw = pd.read_csv(data_path)

# Strip whitespace from column names so lookups such as df['time'] do not
# depend on stray spaces in the header row.
df_raw.columns = [col.strip() for col in df_raw.columns]

# Drop header-less columns. A whitespace-only header is empty after stripping,
# whereas a truly blank header cell is named "Unnamed: <n>" by pandas; the
# latter is only discarded when the column holds no data at all, so a
# header-less column that does carry values is kept.
keep_mask = [
    bool(col) and not (col.startswith('Unnamed:') and df_raw[col].isna().all())
    for col in df_raw.columns
]

# Convert every remaining column to numeric; values that cannot be parsed
# become NaN. `apply` returns a new frame, so `df` is independent of `df_raw`
# and later column assignments do not trigger SettingWithCopyWarning.
df = df_raw.loc[:, keep_mask].apply(pd.to_numeric, errors='coerce')
numeric_cols = df.columns

# Elapsed time relative to the first *valid* timestamp: anchoring on row 0
# would turn the whole column into NaN if that single value were missing.
# NOTE(review): the "_s" suffix assumes 'time' is recorded in seconds - confirm.
valid_times = df['time'].dropna()
if valid_times.empty:
    raise ValueError("Column 'time' has no numeric values; cannot compute elapsed time.")
df['elapsed_time_s'] = df['time'] - valid_times.iloc[0]

df.head()

In [None]:
# Summary statistics with one row per column of `df`, shown in a fixed
# column order.
summary_stats = ['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max']
overview = df.describe().transpose()
overview.loc[:, summary_stats]

In [None]:
# Number of NaN entries per column, largest first, restricted to the columns
# that actually contain missing values.
missing = (
    df.isna()
    .sum()
    .sort_values(ascending=False)
    .loc[lambda counts: counts > 0]
)
missing.to_frame(name='missing_values')

## Vehicle speed comparison

Compare the recorded speeds for the reference (R) and merge (M) vehicles over time to understand their relative dynamics.

In [None]:
# Overlay both speed columns on a shared elapsed-time axis.
fig, ax = plt.subplots(figsize=(14, 6))
speed_series = (
    ('Rspeed', 'Reference speed (Rspeed)'),
    ('Mspeed', 'Merge speed (Mspeed)'),
)
for speed_col, legend_label in speed_series:
    ax.plot(df['elapsed_time_s'], df[speed_col], label=legend_label)
ax.set(
    xlabel='Elapsed time [s]',
    ylabel='Speed',
    title='Speed profiles over time',
)
ax.legend()
ax.grid(True, linestyle='--', alpha=0.4)
plt.show()

## Trajectory footprints

Plot the latitude/longitude traces to see how both vehicles move through space.

In [None]:
# Draw each longitude/latitude trace on the same axes.
fig, ax = plt.subplots(figsize=(8, 8))
tracks = (
    ('Rlon', 'Rlat', 'Reference trajectory'),
    ('Mlon', 'Mlat', 'Merge trajectory'),
)
for lon_col, lat_col, legend_label in tracks:
    ax.plot(df[lon_col], df[lat_col], label=legend_label, alpha=0.8)
ax.set(
    xlabel='Longitude',
    ylabel='Latitude',
    title='Spatial trajectories',
)
ax.legend()
# Equal scaling on both axes so the traces are not stretched in one direction.
ax.axis('equal')
ax.grid(True, linestyle='--', alpha=0.4)
plt.show()

## Acceleration distributions

Examine the distribution of acceleration measurements across axes for both vehicles.

In [None]:
# One histogram (with KDE overlay) per acceleration column, laid out on a
# 2 x 3 grid; each panel keeps its own y-axis.
acc_cols = ['RaccX', 'RaccY', 'RaccZ', 'MaccX', 'MaccY', 'MaccZ']
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(18, 10), sharey=False)
for panel, column in zip(axes.ravel(), acc_cols):
    values = df[column].dropna()  # NaNs are excluded before plotting
    sns.histplot(values, kde=True, ax=panel)
    panel.set(title=f'Distribution of {column}', xlabel='Acceleration')
plt.tight_layout()
plt.show()

## Distance-related features

Look at the relationships between the distance-based features (`Dis2End`, `Dis2Road`, `Dis2End_M`) and the vehicle speeds.

In [None]:
# Pairwise correlation (DataFrame.corr defaults) between the two speed columns
# and the distance-based columns.
distance_cols = ['Dis2End', 'Dis2Road', 'Dis2End_M']
feature_subset = ['Rspeed', 'Mspeed'] + distance_cols
corr = df[feature_subset].corr()

# Explicit figure/axes, consistent with the other plotting cells.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    corr,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    # Pin the diverging colormap to the full correlation range so the neutral
    # colour sits at 0 and shades are comparable between runs/datasets.
    vmin=-1,
    vmax=1,
    square=True,
    ax=ax,
)
ax.set_title('Correlation between speeds and distance-based features')
fig.tight_layout()
plt.show()