# EGO Vehicle Trajectory Clustering

Extract features from the last 6 seconds of EGO vehicle trajectories from the first 1000 train files (`N_FILES` in the configuration cell) and perform clustering analysis.

In [None]:
from pathlib import Path
import sys

# Locate the project root: the closest directory, starting at the current
# working directory and walking upwards, that contains a "src" folder.
# If no such directory exists the working directory itself is kept.
_start_dir = Path.cwd().resolve()
project_root = next(
    (
        candidate
        for candidate in (_start_dir, *_start_dir.parents)
        if (candidate / "src").exists()
    ),
    _start_dir,
)

# Put the root at the front of sys.path so `src.*` imports resolve, but only
# when a "src" folder was really found and the entry is not already present.
_root_entry = str(project_root)
if (project_root / "src").exists() and _root_entry not in sys.path:
    sys.path.insert(0, _root_entry)

import torch
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
from src.utils.data_visualization import get_available_files, DataLoader

sns.set_style('whitegrid')
print("✅ Libraries loaded")

## 1. Load data and extract EGO features

In [None]:
# --- Configuration ---
# Location of the pre-processed scenarios and the cap on how many to read.
DATA_PATH = Path('/data1/xiaowei/code/DeMo/data/DeMo_processed')
N_FILES = 1000

# Timeline layout of one scenario (11 seconds sampled at 10 Hz).
HISTORY_TIMESTEPS = 50   # first 5 seconds
TOTAL_TIMESTEPS = 110    # 11 seconds * 10 Hz
FUTURE_TIMESTEPS = TOTAL_TIMESTEPS - HISTORY_TIMESTEPS  # last 6 seconds (60 steps)

# --- Pick the training files to process ---
_available_train_files = get_available_files(DATA_PATH, 'train')
train_files = _available_train_files[:N_FILES]
print(f"📂 Loaded {len(train_files)} files")

In [None]:
def normalize_trajectory(positions, velocity, angles):
    """
    Express a trajectory in a canonical frame: start at the origin, with the
    start->end displacement pointing along the positive y-axis.

    Positions are translated so the first point is (0, 0) and then rotated
    counter-clockwise by ``pi/2 - atan2(dy, dx)``, where (dx, dy) is the
    displacement from the first to the last point. Heading angles are rotated
    by the same amount and wrapped to [-pi, pi), so an agent heading along its
    own displacement gets a heading of about +pi/2 regardless of the direction
    it was travelling in the original frame. Velocity is passed through
    unchanged.

    If the first and last positions coincide, atan2(0, 0) evaluates to 0 and
    the trajectory is simply rotated by pi/2.

    Args:
        positions: array-like of shape (T, 2) - [x, y] positions
        velocity: array-like of shape (T,) - velocity values
        angles: array-like of shape (T,) - heading angles in radians.
            Assumed to be measured counter-clockwise from the +x axis (the
            np.arctan2 convention) - TODO confirm against the data loader.

    Returns:
        features: numpy array of shape (T, 4) -
            [aligned_x, aligned_y, velocity, aligned_angle]
    """
    positions = np.asarray(positions, dtype=float)
    angles = np.asarray(angles, dtype=float)

    # 1. Translation: move the start point to the origin
    positions_normalized = positions - positions[0]  # (T, 2)

    # 2. Rotation: turn the start->end vector onto the positive y-axis (90 degrees)
    start_to_end = positions_normalized[-1]
    target_angle = np.arctan2(start_to_end[1], start_to_end[0])
    rotation_angle = np.pi / 2 - target_angle

    cos_theta = np.cos(rotation_angle)
    sin_theta = np.sin(rotation_angle)
    rotation_matrix = np.array([
        [cos_theta, -sin_theta],
        [sin_theta, cos_theta],
    ])

    # Row vectors are rotated by right-multiplying with the transpose
    positions_aligned = positions_normalized @ rotation_matrix.T  # (T, 2)

    # Headings live in the same frame as the positions, so they receive the
    # SAME rotation (added, not subtracted). Wrapping keeps equivalent
    # headings (e.g. -3*pi/2 and pi/2) from ending up 2*pi apart.
    angles_aligned = (angles + rotation_angle + np.pi) % (2 * np.pi) - np.pi

    # 3. Feature combination: [aligned x, aligned y, velocity, aligned angle]
    features = np.column_stack([
        positions_aligned[:, 0],  # aligned x (lateral offset)
        positions_aligned[:, 1],  # aligned y (longitudinal progress)
        velocity,                 # original velocity
        angles_aligned,           # aligned heading angles
    ])  # (T, 4)

    return features


def extract_ego_features(file_path):
    """
    Load one scenario and build the feature matrix of its future segment.

    The scenario is loaded through DataLoader, the ego series are taken from
    plot_ego_velocity_analysis, the samples whose timestep is at least
    HISTORY_TIMESTEPS are kept, and those samples are passed through
    normalize_trajectory.

    Args:
        file_path: Path to the data file

    Returns:
        tuple: (features, agent_type_combined) where:
            - features: result of normalize_trajectory for the kept samples
            - agent_type_combined: the 'agent_type_combined' entry of the analysis data
        (None, None) is returned when the scenario cannot be loaded, the
        analysis yields nothing, no sample falls in the future window, or the
        window does not hold exactly FUTURE_TIMESTEPS samples.
    """
    nothing = (None, None)

    loader = DataLoader()
    if loader.load_scenario(file_path) is None:
        return nothing

    # The focal agent index is read here but not needed by the steps below
    focal_idx = loader.current_metadata['focal_agent_idx']

    # Imported at call time, as this helper is not part of the notebook-level imports
    from src.utils.data_visualization import plot_ego_velocity_analysis

    analysis = plot_ego_velocity_analysis(loader, show_acceleration=False, time_window=None)
    if analysis is None:
        return nothing

    agent_type_combined = analysis['agent_type_combined']
    all_positions = analysis['positions']
    all_velocities = analysis['velocities']
    all_angles = analysis['angles']
    all_timesteps = analysis['timesteps']

    # Keep only the samples at or after the end of the history window
    in_future = all_timesteps >= HISTORY_TIMESTEPS
    if not in_future.any():
        return nothing

    positions, velocity, angles = (
        all_positions[in_future],
        all_velocities[in_future],
        all_angles[in_future],
    )

    # An incomplete future window is rejected rather than padded
    if len(positions) != FUTURE_TIMESTEPS:
        return nothing

    return normalize_trajectory(positions, velocity, angles), agent_type_combined


# Extract all features and agent types
from collections import Counter

# Parallel lists: entry k of each one describes the k-th successfully processed scenario
all_features = []
all_agent_types = []
valid_files = []

print("Extracting features using DataLoader with plot_ego_velocity_analysis...")
print("💡 This provides comprehensive data: positions, velocities, angles from the data loader")
for i, file_path in enumerate(train_files):
    # Progress report every 100 files
    if (i + 1) % 100 == 0:
        print(f"  Processing {i + 1}/{len(train_files)}...")

    features, agent_type = extract_ego_features(file_path)
    if features is None:
        # Scenario could not be loaded or has no complete future window
        continue

    all_features.append(features)
    all_agent_types.append(agent_type)
    valid_files.append(file_path.name)

print(f"\n✅ Successfully extracted features from {len(all_features)} scenarios")

# Fail with an actionable message instead of an IndexError on all_features[0]
if not all_features:
    raise RuntimeError(
        "No valid scenarios were extracted; check DATA_PATH and the input files "
        "before running the following cells."
    )

print(f"Feature dimensions: {all_features[0].shape}")
print(f"💡 Features include: [normalized_x, normalized_y, velocity, angle]")
print(f"💡 All data sourced from DataLoader's get_agent_trajectory method")

# Show agent type distribution
agent_type_counts = Counter(all_agent_types)
print(f"\n📊 Agent Type Distribution:")
for agent_type, count in sorted(agent_type_counts.items()):
    print(f"  {agent_type}: {count} ({count/len(all_agent_types)*100:.1f}%)")


## 1.5 Heuristic Rule-Based Pre-Classification

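Apply heuristic rules to label **Pedestrian**, **Cyclist**, **Stop**, **Start from Stop**, and **Lane Keeping** scenarios before clustering; only the remaining **Other** scenarios go on to DTW clustering.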

In [None]:
# Heuristic rule-based pre-classification with agent type support
def classify_by_heuristics(features, agent_types):
    """
    Classify trajectories using heuristic rules, checked in this order:
    - Pedestrian: agent_type_combined == 'Pedestrian'
    - Cyclist: agent_type_combined == 'Cyclist'
    - Stop: final speed is low and the speed dropped significantly
    - Start from Stop: initial speed is low and the speed rose significantly
    - Lane Keeping: low speed variance, low heading variance and small
      lateral spread

    Initial/final speed are averages over the first/last WINDOW_SIZE samples.
    The heading spread is computed on the unwrapped heading series, so a
    heading that merely crosses the +/-pi seam is not mistaken for a turn.

    Args:
        features: sequence of n_samples arrays of shape (T, 4) containing
            [x, y, velocity, angle]
        agent_types: list of agent_type_combined strings, one per sample

    Returns:
        labels: array of shape (n_samples,) with values:
                -1 = Stop
                -2 = Lane Keeping
                -3 = Start from Stop
                -4 = Other (needs clustering)
                -5 = Pedestrian
                -6 = Cyclist/Motorcyclist
    """
    n_samples = len(features)
    labels = np.full(n_samples, -4, dtype=int)  # Default: Other

    # Thresholds for classification
    WINDOW_SIZE = 5  # Number of timesteps to average for initial/final speed
    STOP_FINAL_SPEED_THRESHOLD = 2.0  # m/s, final speed close to 0
    STOP_SPEED_DECREASE_THRESHOLD = -3.0  # m/s, significant speed decrease
    START_INITIAL_SPEED_THRESHOLD = 2.0  # m/s, initial speed close to 0
    START_SPEED_INCREASE_THRESHOLD = 3.0  # m/s, significant speed increase
    LANE_KEEPING_SPEED_STD_THRESHOLD = 1.5  # m/s, low speed variance
    LANE_KEEPING_ANGLE_STD_THRESHOLD = 0.15  # rad, low heading change (~8.6 degrees)
    LANE_KEEPING_LATERAL_THRESHOLD = 2.0  # m, small lateral deviation

    for i, sample in enumerate(features):
        agent_type = agent_types[i]

        # Rule 0: Agent type-based classification (highest priority)
        if agent_type == 'Pedestrian':
            labels[i] = -5  # Pedestrian
            continue
        if agent_type == 'Cyclist':
            labels[i] = -6  # Cyclist/Motorcyclist
            continue

        # For every other agent type, apply trajectory-based rules
        sample = np.asarray(sample)
        velocities = sample[:, 2]
        angles = sample[:, 3]
        x_positions = sample[:, 0]

        # Windowed means make the start/end speed robust to single-sample noise
        initial_speed = np.mean(velocities[:WINDOW_SIZE])
        final_speed = np.mean(velocities[-WINDOW_SIZE:])
        speed_change = final_speed - initial_speed
        speed_std = np.std(velocities)
        # Unwrap first: a jump between +pi and -pi is the same heading, and
        # would otherwise inflate the standard deviation to ~pi
        angle_std = np.std(np.unwrap(angles))
        lateral_deviation = np.std(x_positions)  # Spread of the lateral (x) coordinate

        # Rule 1: Stop - average final speed approaches 0 and speed decreases
        if final_speed < STOP_FINAL_SPEED_THRESHOLD and speed_change < STOP_SPEED_DECREASE_THRESHOLD:
            labels[i] = -1  # Stop

        # Rule 2: Start from Stop - average initial speed near 0 and speed increases
        elif initial_speed < START_INITIAL_SPEED_THRESHOLD and speed_change > START_SPEED_INCREASE_THRESHOLD:
            labels[i] = -3  # Start from Stop

        # Rule 3: Lane Keeping - stable speed, stable heading, small lateral deviation
        elif (speed_std < LANE_KEEPING_SPEED_STD_THRESHOLD and
              angle_std < LANE_KEEPING_ANGLE_STD_THRESHOLD and
              lateral_deviation < LANE_KEEPING_LATERAL_THRESHOLD):
            labels[i] = -2  # Lane Keeping

        # Otherwise: remains as -4 (Other - needs clustering)

    return labels

# Run the rule-based pre-classification on every extracted scenario
heuristic_labels = classify_by_heuristics(all_features, all_agent_types)

# Tally how many scenarios received each heuristic label
n_pedestrian, n_cyclist, n_stop, n_lane_keeping, n_start, n_other = (
    np.sum(heuristic_labels == code) for code in (-5, -6, -1, -2, -3, -4)
)

# Report: one row per category, with the title padded to a fixed width so the
# count columns line up
_rule = "=" * 60
_n_scenarios = len(all_features)
_summary_rows = (
    ("Pedestrian scenarios:", n_pedestrian),
    ("Cyclist scenarios:", n_cyclist),
    ("Stop scenarios:", n_stop),
    ("Lane Keeping scenarios:", n_lane_keeping),
    ("Start from Stop scenarios:", n_start),
    ("Other scenarios:", n_other),
)

print(_rule)
print("🔍 Heuristic Rule-Based Pre-Classification Results:")
print(_rule)
for _title, _count in _summary_rows:
    print(f"  {_title:<27}{_count:3d} ({_count/_n_scenarios*100:5.1f}%)")
print(_rule)
print(f"\n💡 {n_other} scenarios will proceed to DTW clustering")

In [None]:
N_CLUSTER = 4
# DTW clustering for "Other" scenarios only

# Every sample starts from its heuristic label; only the 'Other' (-4) samples
# are overwritten with a DTW cluster id below. If clustering is skipped,
# ts_labels is simply a copy of the heuristic labels.
ts_labels = heuristic_labels.copy()

# Get indices of "Other" scenarios
other_indices = np.where(heuristic_labels == -4)[0]
other_features = [all_features[i] for i in other_indices]

# Only the optional import is guarded, so an ImportError raised from anywhere
# else is not mistaken for "tslearn is missing"
try:
    from tslearn.clustering import TimeSeriesKMeans
    from tslearn.preprocessing import TimeSeriesScalerMeanVariance
except ImportError:
    print("⚠️ tslearn not installed, skipping DTW clustering")
    print("   To use, run: pip install tslearn")
else:
    if len(other_features) > 0:
        # Standardize time series data for "Other" scenarios
        ts_scaler = TimeSeriesScalerMeanVariance()
        X_ts_scaled = ts_scaler.fit_transform(other_features)

        # Use DTW distance K-Means. The cluster count is capped at the number
        # of samples, since k-means cannot form more clusters than it has points.
        n_clusters_dtw = min(N_CLUSTER, len(other_features))
        ts_kmeans = TimeSeriesKMeans(n_clusters=n_clusters_dtw, metric="dtw", random_state=42)
        dtw_labels = ts_kmeans.fit_predict(X_ts_scaled)

        # Combine heuristic labels and DTW labels
        # Final labels: negative codes from the heuristics, 0..n_clusters_dtw-1 from DTW
        ts_labels[other_indices] = dtw_labels

        print("\n✅ DTW time series clustering completed for 'Other' scenarios")
        print(f"DTW clustering distribution: {np.bincount(dtw_labels)}")
        print(f"\n📊 Final Label Distribution:")
        for _code, _name in (
            (-1, 'Stop'),
            (-2, 'Lane Keeping'),
            (-3, 'Start from Stop'),
            (-5, 'Pedestrian'),
            (-6, 'Cyclist'),
        ):
            _title = f"Label {_code} ({_name}):"
            print(f"  {_title:<28}{np.sum(ts_labels == _code)} samples")
        for cluster_id in range(n_clusters_dtw):
            print(f"  Label {cluster_id}  (DTW Cluster):     {np.sum(ts_labels == cluster_id)} samples")
    else:
        print("⚠️ No 'Other' scenarios to cluster - all samples classified by heuristics")

## 2. Visualize clustering results

In [None]:
# 1. Cluster distribution statistics
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax_bar, ax_pie = axes

# Display name and colour of every heuristic label; any other label is shown
# as "Cluster <id>" with a fallback colour
_heuristic_names = {
    -6: 'Cyclist',
    -5: 'Pedestrian',
    -3: 'Start from Stop',
    -2: 'Lane Keeping',
    -1: 'Stop',
}
_heuristic_colors = {
    -6: '#9B59B6',
    -5: '#3498DB',
    -3: '#FFB84D',
    -2: '#4ECDC4',
    -1: '#FF6B6B',
}

# Labels present in the result (negative heuristic codes included), ascending
unique_labels = np.unique(ts_labels)
_ordered_labels = sorted(unique_labels)

cluster_counts_list = [np.sum(ts_labels == label) for label in _ordered_labels]
label_names = [
    _heuristic_names[label] if label in _heuristic_names else f'Cluster {label}'
    for label in _ordered_labels
]

# Bar chart: one bar per label
colors_bar = [_heuristic_colors.get(l, 'steelblue') for l in _ordered_labels]
_bar_positions = range(len(cluster_counts_list))
ax_bar.bar(_bar_positions, cluster_counts_list, color=colors_bar, alpha=0.7)
ax_bar.set_xticks(_bar_positions)
ax_bar.set_xticklabels(label_names, rotation=45, ha='right')
ax_bar.set_xlabel('Cluster ID', fontsize=12)
ax_bar.set_ylabel('Sample Count', fontsize=12)
ax_bar.set_title('Sample Distribution Across Clusters (with Heuristic Rules)', fontsize=14, fontweight='bold')
ax_bar.grid(axis='y', alpha=0.3)

# Pie chart: same data as proportions; non-heuristic labels take a Set3
# colour chosen by their position in the sorted label list
colors_pie = [
    _heuristic_colors[l] if l in _heuristic_colors else plt.cm.Set3(i)
    for i, l in enumerate(_ordered_labels)
]
ax_pie.pie(cluster_counts_list, labels=label_names,
           autopct='%1.1f%%', colors=colors_pie, startangle=90)
ax_pie.set_title('Cluster Sample Proportions', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()

print(f"\n📊 Cluster Distribution Details:")
for name, count in zip(label_names, cluster_counts_list):
    print(f"  {name:20s}: {count} samples ({count/len(ts_labels)*100:.1f}%)")

In [None]:
# 2. Typical trajectory visualization for each cluster
unique_labels = sorted(np.unique(ts_labels))
n_clusters = len(unique_labels)

# Title and panel background of each heuristic label; other labels are titled
# "Cluster <id>" and keep the default background
_panel_styles = {
    -6: ('Cyclist', '#F0E5F5'),
    -5: ('Pedestrian', '#E5EFF5'),
    -3: ('Start from Stop', '#FFF3E5'),
    -2: ('Lane Keeping', '#E5F5F4'),
    -1: ('Stop', '#FFE5E5'),
}

# 2x5 grid: room for at most 10 labels
fig, axes = plt.subplots(2, 5, figsize=(20, 8))
axes = axes.flatten()

cluster_counts = {label: np.sum(ts_labels == label) for label in unique_labels}

# zip() stops after the 10 available panels, so extra labels are not drawn
for ax, cluster_id in zip(axes, unique_labels):
    cluster_mask = ts_labels == cluster_id
    cluster_samples = [feat for feat, member in zip(all_features, cluster_mask) if member]

    # Every member trajectory, shifted so that it starts at (0, 0)
    for sample in cluster_samples:
        ax.plot(sample[:, 0] - sample[0, 0],
                sample[:, 1] - sample[0, 1],
                alpha=0.3, linewidth=1)

    # Point-wise mean of the members, shifted the same way
    mean_sample = np.mean(cluster_samples, axis=0)
    mean_x_rel = mean_sample[:, 0] - mean_sample[0, 0]
    mean_y_rel = mean_sample[:, 1] - mean_sample[0, 1]
    ax.plot(mean_x_rel, mean_y_rel, color='red', linewidth=3, label='Average Trajectory')

    ax.scatter(0, 0, color='green', s=100, marker='o', label='Start Point', zorder=5)
    ax.scatter(mean_x_rel[-1], mean_y_rel[-1], color='red', s=100, marker='*', label='End Point', zorder=5)

    _style = _panel_styles.get(cluster_id)
    if _style is None:
        title_name = f'Cluster {cluster_id}'
    else:
        title_name, _face = _style
        ax.set_facecolor(_face)

    ax.set_xlabel('Relative X Position (m)', fontsize=10)
    ax.set_ylabel('Relative Y Position (m)', fontsize=10)
    ax.set_title(f'{title_name}\n({cluster_counts[cluster_id]} samples)', fontsize=12, fontweight='bold')
    ax.legend(loc='best', fontsize=8)
    ax.grid(True, alpha=0.3)
    ax.axis('equal')

# Panels beyond the number of labels stay empty and are hidden
for _spare_ax in axes[n_clusters:]:
    _spare_ax.axis('off')

plt.tight_layout()
plt.show()

In [None]:
# 3. Speed pattern visualization
unique_labels = sorted(np.unique(ts_labels))
n_clusters = len(unique_labels)

# Title and panel background of each heuristic label; other labels are titled
# "Cluster <id>" and keep the default background
_speed_panel_styles = {
    -6: ('Cyclist', '#F0E5F5'),
    -5: ('Pedestrian', '#E5EFF5'),
    -3: ('Start from Stop', '#FFF3E5'),
    -2: ('Lane Keeping', '#E5F5F4'),
    -1: ('Stop', '#FFE5E5'),
}

# 2x5 grid: room for at most 10 labels
fig, axes = plt.subplots(2, 5, figsize=(20, 8))
axes = axes.flatten()

cluster_counts = {label: np.sum(ts_labels == label) for label in unique_labels}

# zip() stops after the 10 available panels, so extra labels are not drawn
for ax, cluster_id in zip(axes, unique_labels):
    cluster_mask = ts_labels == cluster_id
    cluster_samples = [feat for feat, member in zip(all_features, cluster_mask) if member]

    # Column 2 of every feature matrix is the velocity series
    velocity_series = [sample[:, 2] for sample in cluster_samples]
    for vel in velocity_series:
        ax.plot(vel, alpha=0.2, linewidth=1, color='blue')

    # Per-timestep mean over the members
    mean_vel = np.mean(velocity_series, axis=0)
    ax.plot(mean_vel, color='red', linewidth=2.5, label='Average')

    # Zero-speed reference line
    ax.axhline(y=0, color='black', linestyle='--', linewidth=1, alpha=0.5)

    _style = _speed_panel_styles.get(cluster_id)
    if _style is None:
        title_name = f'Cluster {cluster_id}'
    else:
        title_name, _face = _style
        ax.set_facecolor(_face)

    ax.set_xlabel('Time Step', fontsize=10)
    ax.set_ylabel('Velocity (m/s)', fontsize=10)
    ax.set_title(f'{title_name} Speed Pattern', fontsize=12, fontweight='bold')
    ax.legend(loc='best', fontsize=8)
    ax.grid(True, alpha=0.3)

# Panels beyond the number of labels stay empty and are hidden
for _spare_ax in axes[n_clusters:]:
    _spare_ax.axis('off')

plt.tight_layout()
plt.show()

In [None]:
# 4. Cluster feature statistics comparison
unique_labels = sorted(np.unique(ts_labels))

# Display names of the heuristic labels; other labels become "Cluster <id>"
_stat_names = {
    -6: 'Cyclist',
    -5: 'Pedestrian',
    -3: 'Start from Stop',
    -2: 'Lane Keeping',
    -1: 'Stop',
}

cluster_stats = []
for cluster_id in unique_labels:
    cluster_mask = ts_labels == cluster_id
    cluster_samples = [feat for feat, member in zip(all_features, cluster_mask) if member]

    # Stacked copy of the members, (n_samples, 60, 4); not read by the statistics below
    all_samples_array = np.array(cluster_samples)

    cluster_name = _stat_names[cluster_id] if cluster_id in _stat_names else f'Cluster {cluster_id}'

    # Per-sample quantities, one entry per member trajectory
    _x_shift = [s[-1, 0] - s[0, 0] for s in cluster_samples]  # final - initial X
    _y_shift = [s[-1, 1] - s[0, 1] for s in cluster_samples]  # final - initial Y
    _distance = [np.sqrt(dx**2 + dy**2) for dx, dy in zip(_x_shift, _y_shift)]
    _speed_delta = [np.sum(np.diff(s[:, 2])) for s in cluster_samples]  # summed step-to-step change
    _speed_spread = [np.std(s[:, 2]) for s in cluster_samples]

    stats = {
        'cluster': cluster_name,
        'label': cluster_id,
        'count': len(cluster_samples),
        'avg_x_displacement': np.mean(_x_shift),
        'avg_y_displacement': np.mean(_y_shift),
        'avg_total_displacement': np.mean(_distance),
        'avg_speed_change': np.mean(_speed_delta),
        'speed_volatility': np.mean(_speed_spread),
    }
    cluster_stats.append(stats)

# One row per label, rounded for display
import pandas as pd
stats_df = pd.DataFrame(cluster_stats).round(3)

_shown_columns = ['cluster', 'count', 'avg_x_displacement', 'avg_y_displacement',
                  'avg_total_displacement', 'avg_speed_change', 'speed_volatility']
print("\n📈 Statistical Feature Comparison Across Clusters:\n")
print(stats_df[_shown_columns].to_string(index=False))

# Visualize statistical features
fig, axes = plt.subplots(2, 4, figsize=(16, 8))

# Bar colour per row of stats_df: fixed colours for heuristic labels,
# 'steelblue' for everything else
_stat_bar_palette = {
    -6: '#9B59B6',
    -5: '#3498DB',
    -3: '#FFB84D',
    -2: '#4ECDC4',
    -1: '#FF6B6B',
}
bar_colors = [_stat_bar_palette.get(label, 'steelblue') for label in stats_df['label']]

# Member trajectories of every label, in the same order as unique_labels
_samples_per_label = [
    [feat for feat, lab in zip(all_features, ts_labels) if lab == label]
    for label in unique_labels
]

# Statistics that are not stored in stats_df
avg_speed = [np.mean([s[:, 2] for s in group]) for group in _samples_per_label]
max_speed = [np.max([np.max(s[:, 2]) for s in group]) for group in _samples_per_label]
angle_volatility = [np.mean([np.std(s[:, 3]) for s in group]) for group in _samples_per_label]

# (axes, bar heights, y-axis label, title) for each of the eight panels
_panels = [
    (axes[0, 0], stats_df['avg_x_displacement'],
     'Average X Displacement (m)', 'Average X Displacement Comparison'),
    (axes[0, 1], stats_df['avg_y_displacement'],
     'Average Y Displacement (m)', 'Average Y Displacement Comparison'),
    (axes[0, 2], stats_df['avg_total_displacement'],
     'Average Total Displacement (m)', 'Total Displacement Comparison'),
    (axes[0, 3], stats_df['avg_speed_change'],
     'Average Speed Change (m/s)', 'Average Speed Change Comparison'),
    (axes[1, 0], stats_df['speed_volatility'],
     'Speed Volatility (m/s)', 'Speed Volatility Comparison'),
    (axes[1, 1], avg_speed,
     'Average Speed (m/s)', 'Average Speed Comparison'),
    (axes[1, 2], max_speed,
     'Max Speed (m/s)', 'Max Speed Comparison'),
    (axes[1, 3], angle_volatility,
     'Angle Volatility (rad)', 'Angle Volatility Comparison'),
]

_bar_slots = range(len(stats_df))
for _ax, _heights, _ylabel, _title in _panels:
    _ax.bar(_bar_slots, _heights, color=bar_colors)
    _ax.set_xticks(_bar_slots)
    _ax.set_xticklabels(stats_df['cluster'], rotation=45, ha='right', fontsize=8)
    _ax.set_ylabel(_ylabel)
    _ax.set_title(_title)
    _ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# 5. Use t-SNE for dimensionality reduction visualization
from sklearn.manifold import TSNE
from matplotlib.patches import Patch

# Flatten each per-scenario feature matrix into one row for t-SNE
X_flat = np.array([feat.flatten() for feat in all_features])

# t-SNE reduce to 2D. scikit-learn requires perplexity < n_samples, so the
# usual value of 30 is lowered when only a few scenarios are available.
tsne_perplexity = min(30, max(1, len(X_flat) - 1))
tsne = TSNE(n_components=2, random_state=42, perplexity=tsne_perplexity)
X_tsne = tsne.fit_transform(X_flat)

unique_labels = sorted(np.unique(ts_labels))

# (colour, short centroid tag, legend entry) for every heuristic label
heuristic_styles = {
    -6: ('#9B59B6', 'Cyclist', 'Cyclist (Heuristic)'),                  # Purple
    -5: ('#3498DB', 'Pedestrian', 'Pedestrian (Heuristic)'),            # Blue
    -3: ('#FFB84D', 'Start', 'Start from Stop (Heuristic)'),            # Orange
    -2: ('#4ECDC4', 'Lane Keep', 'Lane Keeping (Heuristic)'),           # Teal
    -1: ('#FF6B6B', 'Stop', 'Stop (Heuristic)'),                        # Red
}

# One style per label actually present. The points, the centroid marker and
# the legend entry of a label all use this single colour, so the legend
# always matches what is drawn.
label_styles = {}
for label in unique_labels:
    if label in heuristic_styles:
        label_styles[label] = heuristic_styles[label]
    elif label == -4:
        # 'Other' samples that were never clustered (DTW step skipped)
        label_styles[label] = (plt.cm.tab10(label % 10), 'Other', 'Other (not clustered)')
    else:
        label_styles[label] = (plt.cm.tab10(label % 10), f'C{label}', f'DTW Cluster {label}')

plt.figure(figsize=(12, 9))

# Per-point colours
color_map = [label_styles[label][0] for label in ts_labels]

scatter = plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=color_map,
                      s=100, alpha=0.7, edgecolors='black', linewidth=0.5)

# Add cluster center annotations (mean of the members in the embedded space)
for cluster_id in unique_labels:
    cluster_mask = ts_labels == cluster_id
    cluster_center = X_tsne[cluster_mask].mean(axis=0)
    marker_color, label_text, _ = label_styles[cluster_id]

    # The colour is wrapped in a list so an RGBA tuple is read as one colour
    # and not as a sequence of values to be colour-mapped
    plt.scatter(cluster_center[0], cluster_center[1],
                marker='X', s=500, c=[marker_color], edgecolors='black', linewidth=2)
    plt.text(cluster_center[0], cluster_center[1], label_text,
             fontsize=12, fontweight='bold', ha='center', va='center',
             bbox=dict(boxstyle='round,pad=0.3', facecolor='white', alpha=0.8))

# Create custom legend: one entry per label present in the plot
legend_elements = [
    Patch(facecolor=label_styles[label][0], edgecolor='black', label=label_styles[label][2])
    for label in unique_labels
]
plt.legend(handles=legend_elements, loc='best', fontsize=11)

plt.xlabel('t-SNE Dimension 1', fontsize=12)
plt.ylabel('t-SNE Dimension 2', fontsize=12)
plt.title('Trajectory Feature t-SNE Visualization\n(Heuristic Rules + Agent Types + DTW Clustering)',
          fontsize=14, fontweight='bold')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

print("✅ t-SNE visualization completed")