# Data Loading and Preprocessing Pipeline Demo

This notebook demonstrates the complete data preprocessing pipeline for shuttle tracking data:
1. Load vehicle locations from database/CSV
2. Convert timestamps to epoch seconds
3. Add closest route information
4. Compute distance deltas
5. Compute speed
6. Segment into consecutive trips
7. Visualize speed over time for a single segment

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import sys
from pathlib import Path

# Add project root to Python path so the project imports below can resolve.
# NOTE(review): this assumes the kernel's working directory sits two levels
# below the project root — confirm if the notebook is moved or launched elsewhere.
project_root = Path.cwd().parent.parent
# Guard against inserting the same entry again when the cell is re-run.
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

# Import preprocessing functions
from ml.pipelines import segment_pipeline
from ml.data.preprocess import segment_by_consecutive

## Step 1: Load and Preprocess Data

The `segment_pipeline()` function runs all preprocessing steps and caches the result.

In [None]:
# Fetch the preprocessed location data, then report its size and schema
df = segment_pipeline()

# One print call; the newline separator reproduces the three-line report
print(
    f"Loaded {len(df)} location points",
    f"\nColumns: {list(df.columns)}",
    f"\nData shape: {df.shape}",
    sep="\n",
)

In [None]:
# Preview the first 20 rows of the loaded frame (rendered as the cell output)
df.head(20)

In [None]:
# Display the rows surrounding the largest speed reading.
#
# `idxmax()` returns an index *label*, whereas the window arithmetic and `iloc`
# below work on integer *positions*. The two only coincide for a default
# 0..n-1 index, so the index is dropped first and the maximum is located by
# position; this gives the same result as before for a default index and the
# correct window for any other index.
max_speed_idx = df['speed_kmh'].reset_index(drop=True).idxmax()
start_idx = max(0, max_speed_idx - 5)       # up to 5 rows before, clamped at the first row
end_idx = min(len(df), max_speed_idx + 6)   # slice end is exclusive, so +6 keeps 5 rows after
df.iloc[start_idx:end_idx]

## Step 2: Basic Data Summary

In [None]:
# Gather the route figures up front, then print the report in one pass
route_values = df['route'].dropna().unique()   # distinct routes, NaN excluded
nan_routes = df['route'].isna().sum()          # rows that carry no route

print("Summary Statistics:", "="*60, sep="\n")
print(f"Date range: {df['timestamp'].min()} to {df['timestamp'].max()}")
print(f"Number of vehicles: {df['vehicle_id'].nunique()}")

# Route overview
print(f"Number of routes: {len(route_values)}")
print(f"\nRoutes: {sorted(route_values)}")

# Share of rows without a route
print(f"Unmatched locations (no route): {nan_routes} ({nan_routes/len(df)*100:.2f}%)")

# Descriptive statistics of the speed column
print(f"\nSpeed statistics (km/h):")
print(df['speed_kmh'].describe())

## Step 3: Segment into Consecutive Trips

Segments are created based on:
- Vehicle ID changes
- Time gaps > max_timedelta seconds

In [None]:
# Segment the data with a 15-second maximum time gap.
# `max_timedelta` is reported below in seconds (and converted to minutes by /60).
# NOTE(review): if a 5-minute gap was intended, this value should be 300 — confirm.
max_timedelta = 15
# Writes the segment label into the 'segment_id' column of the returned frame.
segmented_df = segment_by_consecutive(df, max_timedelta=max_timedelta, segment_column='segment_id')

num_segments = segmented_df['segment_id'].nunique()
print(f"Created {num_segments} segments with max time gap of {max_timedelta}s ({max_timedelta/60:.1f} minutes)")

# Segment size distribution (number of rows per segment_id)
segment_sizes = segmented_df.groupby('segment_id').size()
print(f"\nSegment size statistics:")
print(segment_sizes.describe())

In [None]:
# Two-panel view of segment sizes: plain histogram (left) and
# cumulative, normalised histogram (right)
plt.figure(figsize=(12, 4))

# Left panel: counts per size bin
plt.subplot(1, 2, 1)
segment_sizes.hist(bins=50)
plt.title('Segment Size Distribution')
plt.xlabel('Segment Size (number of points)')
plt.ylabel('Frequency')
plt.grid(alpha=0.3)

# Right panel: share of segments at or below each size
plt.subplot(1, 2, 2)
segment_sizes.hist(bins=50, cumulative=True, density=True)
plt.title('Cumulative Distribution')
plt.xlabel('Segment Size (number of points)')
plt.ylabel('Cumulative Probability')
plt.grid(alpha=0.3)

plt.tight_layout()
plt.show()

# How many segments reach each size threshold; only the first line is
# preceded by a blank line
for min_points, lead in ((10, "\n"), (20, ""), (50, "")):
    n_hits = (segment_sizes >= min_points).sum()
    print(f"{lead}Segments with >= {min_points} points: {n_hits} ({n_hits / len(segment_sizes) * 100:.1f}%)")

## Step 4: Select and Visualize One Segment

Let's select a segment with a reasonable number of points and visualize its speed over time.

In [None]:
# Keep only rows that carry a route, then count the rows left in each segment
segmented_with_routes = segmented_df.loc[segmented_df['route'].notna()].copy()
valid_segment_sizes = segmented_with_routes.groupby('segment_id').size()

# Segments with at least 20 points make for a more readable plot
large_segments = valid_segment_sizes[valid_segment_sizes >= 20].index

if len(large_segments) > 0:
    # Take the entry in the middle of the qualifying segment ids
    selected_segment_id = large_segments[len(large_segments) // 2]
else:
    print("No segments with >= 20 points and valid routes found. Using largest valid segment.")
    selected_segment_id = valid_segment_sizes.idxmax()

# Pull out the chosen segment in chronological order with a fresh 0..n-1 index
segment = (
    segmented_with_routes[segmented_with_routes['segment_id'] == selected_segment_id]
    .copy()
    .sort_values('timestamp')
    .reset_index(drop=True)
)

# Short description of the chosen segment
print(f"Selected segment {selected_segment_id}")
print(f"Number of points: {len(segment)}")
print(f"Vehicle ID: {segment['vehicle_id'].iloc[0]}")
print(f"Route: {segment['route'].iloc[0]}")
print(f"Time range: {segment['timestamp'].iloc[0]} to {segment['timestamp'].iloc[-1]}")
print(f"Duration: {(segment['timestamp'].iloc[-1] - segment['timestamp'].iloc[0]).total_seconds() / 60:.1f} minutes")

In [None]:
# Display the first 10 rows of the selected segment, restricted to the
# identification, position, route, distance, speed and epoch columns
segment[['vehicle_id', 'timestamp', 'latitude', 'longitude', 'route', 'distance_km', 'speed_kmh', 'epoch_seconds']].head(10)

## Step 5: Visualize Speed Over Time

In [None]:
# Two stacked panels for the selected segment: speed (top), cumulative distance (bottom)
fig, axes = plt.subplots(2, 1, figsize=(14, 10))
ax1, ax2 = axes

# Only rows that actually carry a speed value are plotted
segment_clean = segment.loc[segment['speed_kmh'].notna()].copy()

# --- Top panel: speed trace with its mean as a dashed reference line ---
ax1.plot(segment_clean['timestamp'], segment_clean['speed_kmh'], marker='o', linestyle='-', linewidth=2, markersize=4)
mean_speed = segment_clean['speed_kmh'].mean()
ax1.axhline(y=mean_speed, color='r', linestyle='--', label=f'Mean: {mean_speed:.2f} km/h')
ax1.set_title(f'Speed Over Time - Segment {selected_segment_id} (Route: {segment["route"].iloc[0]})', fontsize=14, fontweight='bold')
ax1.set_xlabel('Time', fontsize=12)
ax1.set_ylabel('Speed (km/h)', fontsize=12)
ax1.grid(alpha=0.3)
ax1.legend()

# Slanted tick labels keep the timestamps readable
plt.setp(ax1.xaxis.get_majorticklabels(), rotation=45, ha='right')

# --- Bottom panel: running sum of the per-row distances ---
cumulative_distance = segment_clean['distance_km'].cumsum()
ax2.plot(segment_clean['timestamp'], cumulative_distance, marker='o', linestyle='-', linewidth=2, markersize=4, color='green')
ax2.set_title(f'Cumulative Distance Over Time - Segment {selected_segment_id}', fontsize=14, fontweight='bold')
ax2.set_xlabel('Time', fontsize=12)
ax2.set_ylabel('Cumulative Distance (km)', fontsize=12)
ax2.grid(alpha=0.3)

# The last value of the running sum is the segment total; show it in a corner box
total_distance = cumulative_distance.iloc[-1]
ax2.text(
    0.02, 0.98, f'Total Distance: {total_distance:.2f} km',
    transform=ax2.transAxes, fontsize=11, verticalalignment='top',
    bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5),
)

plt.setp(ax2.xaxis.get_majorticklabels(), rotation=45, ha='right')

plt.tight_layout()
plt.show()

# Numeric recap of the same segment
print(f"\nSpeed Statistics for Segment {selected_segment_id}:")
print(f"  Mean speed: {segment_clean['speed_kmh'].mean():.2f} km/h")
print(f"  Median speed: {segment_clean['speed_kmh'].median():.2f} km/h")
print(f"  Max speed: {segment_clean['speed_kmh'].max():.2f} km/h")
print(f"  Min speed: {segment_clean['speed_kmh'].min():.2f} km/h")
print(f"  Std dev: {segment_clean['speed_kmh'].std():.2f} km/h")
print(f"  Total distance: {total_distance:.2f} km")

## Step 6: Additional Visualization - Speed Distribution

In [None]:
# Histogram of the selected segment's speeds (rows without a speed are dropped)
segment_clean = segment.loc[segment['speed_kmh'].notna()].copy()

# Central-tendency markers drawn on top of the histogram
mean_speed = segment_clean['speed_kmh'].mean()
median_speed = segment_clean['speed_kmh'].median()

plt.figure(figsize=(10, 6))
plt.hist(segment_clean['speed_kmh'], bins=20, edgecolor='black', alpha=0.7)
plt.axvline(mean_speed, color='r', linestyle='--', linewidth=2, label=f'Mean: {mean_speed:.2f} km/h')
plt.axvline(median_speed, color='g', linestyle='--', linewidth=2, label=f'Median: {median_speed:.2f} km/h')

plt.title(f'Speed Distribution - Segment {selected_segment_id}', fontsize=14, fontweight='bold')
plt.xlabel('Speed (km/h)', fontsize=12)
plt.ylabel('Frequency', fontsize=12)
plt.legend()
plt.grid(alpha=0.3)

plt.tight_layout()
plt.show()

## Step 7: Geographic Visualization (Optional)

Plot the segment path on a simple coordinate plot.

In [None]:
# Longitude/latitude plot of the selected segment, coloured by speed
# (rows without a speed value are dropped first)
segment_clean = segment.loc[segment['speed_kmh'].notna()].copy()

plt.figure(figsize=(12, 10))

# Every point, coloured by its speed
scatter = plt.scatter(
    segment_clean['longitude'], segment_clean['latitude'],
    c=segment_clean['speed_kmh'], cmap='RdYlGn',
    s=100, edgecolors='black', linewidth=0.5,
)

# Faint dashed line joining the points in row order
plt.plot(
    segment_clean['longitude'], segment_clean['latitude'],
    color='blue', alpha=0.3, linewidth=1, linestyle='--',
)

# First and last row get larger markers drawn above everything else (zorder=5)
plt.scatter(
    segment_clean['longitude'].iloc[0], segment_clean['latitude'].iloc[0],
    color='green', s=200, marker='o', edgecolors='black', linewidth=2, label='Start', zorder=5,
)
plt.scatter(
    segment_clean['longitude'].iloc[-1], segment_clean['latitude'].iloc[-1],
    color='red', s=200, marker='s', edgecolors='black', linewidth=2, label='End', zorder=5,
)

plt.colorbar(scatter, label='Speed (km/h)')
plt.title(
    f'Geographic Path - Segment {selected_segment_id} (Route: {segment["route"].iloc[0]})',
    fontsize=14, fontweight='bold',
)
plt.xlabel('Longitude', fontsize=12)
plt.ylabel('Latitude', fontsize=12)
plt.legend(loc='best')
plt.grid(alpha=0.3)
plt.axis('equal')  # same scale on both axes
plt.tight_layout()
plt.show()

## Summary

This notebook demonstrated:
1. ✓ Loading preprocessed vehicle location data
2. ✓ Segmenting data into consecutive trips
3. ✓ Visualizing speed over time for a single segment
4. ✓ Analyzing speed distribution and geographic path
5. ✓ **ETA analysis with stop information** (see below)

The preprocessing pipeline handles:
- Timestamp conversion to epoch seconds
- Route matching using closest point algorithm
- Distance calculation between consecutive points
- Speed calculation from distance and time
- Segmentation based on vehicle ID and time gaps
- Stop detection and ETA calculation

In [None]:
# Run the ETA pipeline and give a first look at what it returns
from ml.pipelines import stops_pipeline

# Banner goes out before the pipeline call so it precedes any pipeline output
print("="*70, "RUNNING ETA PIPELINE", "="*70, sep="\n")
stops_df = stops_pipeline()

# Size of the result
print(f"\nFinal ETA data:")
print(f"  Total points: {len(stops_df):,}")
print(f"  Total segments: {stops_df['segment_id'].nunique():,}")

# Schema of the result
print(f"\nColumns in ETA data:")
print(f"  {list(stops_df.columns)}")

# First 20 rows of the stop-related columns, shown as the cell output
print(f"\nSample data with stop and ETA information:")
stops_df[['timestamp', 'vehicle_id', 'route', 'stop_name', 'stop_route', 'speed_kmh']].head(20)

In [None]:
# Text report on the ETA pipeline output: stops, ETAs, and segment filtering
print("="*70, "ETA PIPELINE STATISTICS", "="*70, sep="\n")

# --- Stop detection: rows with / without a stop label ---
points_with_stops = stops_df['stop_name'].notna().sum()
points_without_stops = stops_df['stop_name'].isna().sum()
print(f"\nStop Detection:")
print(f"  Points at stops: {points_with_stops:,} ({points_with_stops/len(stops_df)*100:.2f}%)")
print(f"  Points not at stops: {points_without_stops:,} ({points_without_stops/len(stops_df)*100:.2f}%)")
print(f"  Unique stops visited: {stops_df['stop_name'].nunique()}")

# Per-stop row counts, in alphabetical order of stop name
print(f"\nAll stops in dataset:")
for stop in sorted(stops_df['stop_name'].dropna().unique()):
    count = stops_df['stop_name'].eq(stop).sum()
    print(f"  - {stop}: {count:,} visits")

# --- ETA coverage: rows with / without an ETA value ---
eta_calculated = stops_df['eta_seconds'].notna().sum()
eta_null = stops_df['eta_seconds'].isna().sum()
print(f"\nETA Calculation:")
print(f"  ETAs calculated: {eta_calculated:,} ({eta_calculated/len(stops_df)*100:.2f}%)")
print(f"  NULL ETAs: {eta_null:,} ({eta_null/len(stops_df)*100:.2f}%)")
print(f"  (NULL ETAs occur after the last stop in each segment)")

# --- ETA distribution, in seconds and in minutes ---
eta_values = stops_df['eta_seconds'].dropna()
print(f"\nETA Distribution (seconds):")
print(eta_values.describe())
print(f"\nETA Distribution (minutes):")
print((eta_values / 60).describe())

# --- Segments present in the ETA data ---
segments_in_eta_data = stops_df['segment_id'].nunique()
print(f"\nSegments in ETA data: {segments_in_eta_data:,}")
print(f"  (These are segments that passed through at least one stop)")

# --- Comparison against the segment count of the segment pipeline ---
original_df = segment_pipeline()
original_segments = original_df['segment_id'].nunique()
filtered_segments = original_segments - segments_in_eta_data
print(f"\nSegment Filtering:")
print(f"  Original segments (from segment_pipeline): {original_segments:,}")
print(f"  Segments with stops (in ETA data): {segments_in_eta_data:,} ({segments_in_eta_data/original_segments*100:.2f}%)")
print(f"  Segments filtered out (no stops): {filtered_segments:,} ({filtered_segments/original_segments*100:.2f}%)")

## Step 9: Visualize ETA for a Segment with Stops

In [None]:
# Count, per segment, the rows that carry a stop label
segment_stop_counts = stops_df.groupby('segment_id')['stop_name'].apply(lambda names: names.notna().sum())
segments_with_multiple_stops = segment_stop_counts[segment_stop_counts >= 2].index

# Preference order: a segment with >= 2 stop rows, then one with >= 1,
# and as a last resort the segment of the very first row
if len(segments_with_multiple_stops) > 0:
    selected_eta_segment_id = segments_with_multiple_stops[len(segments_with_multiple_stops) // 2]
else:
    segments_with_stops = segment_stop_counts[segment_stop_counts >= 1].index
    if len(segments_with_stops) > 0:
        selected_eta_segment_id = segments_with_stops[len(segments_with_stops) // 2]
    else:
        selected_eta_segment_id = stops_df['segment_id'].iloc[0]

# Chosen segment in chronological order with a fresh 0..n-1 index
eta_segment = (
    stops_df[stops_df['segment_id'] == selected_eta_segment_id]
    .copy()
    .sort_values('timestamp')
    .reset_index(drop=True)
)

print(f"Selected segment {selected_eta_segment_id} for ETA visualization")
print(f"Number of points: {len(eta_segment)}")
print(f"Points at stops: {eta_segment['stop_name'].notna().sum()}")
print(f"Unique stops visited: {eta_segment['stop_name'].nunique()}")
print(f"ETAs calculated: {eta_segment['eta_seconds'].notna().sum()}")
print(f"Route: {eta_segment['route'].iloc[0]}")

# One line per distinct (timestamp, stop, stop route) row that has a stop label
print(f"\nStops in this segment:")
stops_in_segment = (
    eta_segment.loc[eta_segment['stop_name'].notna(), ['timestamp', 'stop_name', 'stop_route']]
    .drop_duplicates()
)
for idx, row in stops_in_segment.iterrows():
    print(f"  - {row['stop_name']} ({row['stop_route']}) at {row['timestamp']}")

# First 20 rows of the ETA-related columns, shown as the cell output
print(f"\nSegment data with ETAs:")
eta_segment[['timestamp', 'latitude', 'longitude', 'stop_name', 'eta_seconds', 'speed_kmh']].head(20)

In [None]:
# ETA pipeline summary: three bar charts side by side, then a text recap
fig, axes = plt.subplots(1, 3, figsize=(16, 5))
ax1, ax2, ax3 = axes

# Segment counts: segment pipeline output vs. what is present in the ETA data
original_df = segment_pipeline()
original_segments = original_df['segment_id'].nunique()
segments_in_eta = stops_df['segment_id'].nunique()
segments_filtered = original_segments - segments_in_eta

# Row counts: with / without a stop label, with / without an ETA
points_with_stops = stops_df['stop_name'].notna().sum()
points_without_stops = stops_df['stop_name'].isna().sum()
eta_calculated = stops_df['eta_seconds'].notna().sum()
eta_null = stops_df['eta_seconds'].isna().sum()

# --- Panel 1: segment filtering ---
segment_categories = ['Original\nSegments', 'Segments\nwith Stops', 'Segments\nFiltered Out']
segment_counts = [original_segments, segments_in_eta, segments_filtered]
colors = ['#3498db', '#2ecc71', '#e74c3c']
bars1 = ax1.bar(segment_categories, segment_counts, color=colors, edgecolor='black', linewidth=1.5)
ax1.set_title('Segment Filtering', fontsize=14, fontweight='bold')
ax1.set_ylabel('Number of Segments', fontsize=12)
ax1.grid(axis='y', alpha=0.3)

# Count printed just above each bar
for bar in bars1:
    height = bar.get_height()
    ax1.text(
        bar.get_x() + bar.get_width()/2., height,
        f'{int(height):,}',
        ha='center', va='bottom', fontsize=10, fontweight='bold',
    )

# --- Panel 2: rows at stops vs. not at stops ---
point_categories = ['Points\nat Stops', 'Points\nNot at Stops']
point_counts = [points_with_stops, points_without_stops]
colors2 = ['#2ecc71', '#95a5a6']
bars2 = ax2.bar(point_categories, point_counts, color=colors2, edgecolor='black', linewidth=1.5)
ax2.set_title('Stop Detection Results', fontsize=14, fontweight='bold')
ax2.set_ylabel('Number of Points', fontsize=12)
ax2.grid(axis='y', alpha=0.3)

# Count and share of all ETA rows above each bar
for bar, count in zip(bars2, point_counts):
    height = bar.get_height()
    percentage = count / len(stops_df) * 100
    ax2.text(
        bar.get_x() + bar.get_width()/2., height,
        f'{int(count):,}\n({percentage:.1f}%)',
        ha='center', va='bottom', fontsize=10, fontweight='bold',
    )

# --- Panel 3: rows with an ETA vs. without ---
eta_categories = ['ETAs\nCalculated', 'NULL\nETAs']
eta_counts = [eta_calculated, eta_null]
colors3 = ['#9b59b6', '#e67e22']
bars3 = ax3.bar(eta_categories, eta_counts, color=colors3, edgecolor='black', linewidth=1.5)
ax3.set_title('ETA Calculation Results', fontsize=14, fontweight='bold')
ax3.set_ylabel('Number of Points', fontsize=12)
ax3.grid(axis='y', alpha=0.3)

# Count and share of all ETA rows above each bar
for bar, count in zip(bars3, eta_counts):
    height = bar.get_height()
    percentage = count / len(stops_df) * 100
    ax3.text(
        bar.get_x() + bar.get_width()/2., height,
        f'{int(count):,}\n({percentage:.1f}%)',
        ha='center', va='bottom', fontsize=10, fontweight='bold',
    )

plt.tight_layout()
plt.show()

# Text recap of the same numbers
print("\n" + "="*70, "ETA PIPELINE SUMMARY", "="*70, sep="\n")
print(f"Original segments:              {original_segments:,}")
print(f"Segments with stops:            {segments_in_eta:,} ({segments_in_eta/original_segments*100:.1f}%)")
print(f"Segments filtered out:          {segments_filtered:,} ({segments_filtered/original_segments*100:.1f}%)")
print(f"\nTotal points in ETA data:       {len(stops_df):,}")
print(f"Points at stops:                {points_with_stops:,} ({points_with_stops/len(stops_df)*100:.1f}%)")
print(f"Points not at stops:            {points_without_stops:,} ({points_without_stops/len(stops_df)*100:.1f}%)")
print(f"\nETAs calculated:                {eta_calculated:,} ({eta_calculated/len(stops_df)*100:.1f}%)")
print(f"NULL ETAs:                      {eta_null:,} ({eta_null/len(stops_df)*100:.1f}%)")
print("="*70)

In [None]:
# Show, in chronological order, every row of the segment that contains the
# single largest ETA value
max_eta_idx = stops_df['eta_seconds'].idxmax()   # index label of the largest ETA
segment_with_max_eta = (
    stops_df.loc[stops_df['segment_id'] == stops_df.loc[max_eta_idx, 'segment_id']]
    .copy()
    .sort_values('timestamp')
)
segment_with_max_eta

In [None]:
# Visualize ETA over time for the segment selected earlier (`eta_segment`):
# top panel shows the ETA trace, bottom panel overlays ETA and speed on twin y-axes.
fig, axes = plt.subplots(2, 1, figsize=(14, 10))

# Filter to points with ETAs
eta_segment_clean = eta_segment[eta_segment['eta_seconds'].notna()].copy()

# Plot 1: ETA to next stop over time
ax1 = axes[0]
ax1.plot(eta_segment_clean['timestamp'], eta_segment_clean['eta_seconds'],
         marker='o', linestyle='-', linewidth=2, markersize=4, color='purple')
ax1.set_xlabel('Time', fontsize=12)
ax1.set_ylabel('ETA to Next Stop (seconds)', fontsize=12)
ax1.set_title(f'ETA Over Time - Segment {selected_eta_segment_id}', fontsize=14, fontweight='bold')
ax1.grid(alpha=0.3)

# Highlight points at stops (rows whose stop_name is set) with red stars;
# the legend is only added when at least one such row exists
stops_mask = eta_segment_clean['stop_name'].notna()
if stops_mask.any():
    ax1.scatter(eta_segment_clean[stops_mask]['timestamp'],
               eta_segment_clean[stops_mask]['eta_seconds'],
               color='red', s=150, marker='*', label='At Stop', zorder=5)
    ax1.legend()

# Rotate x-axis labels for readability
plt.setp(ax1.xaxis.get_majorticklabels(), rotation=45, ha='right')

# Plot 2: ETA vs Speed
ax2 = axes[1]
# Filter to points with both ETA and speed
plot_data = eta_segment[(eta_segment['eta_seconds'].notna()) & (eta_segment['speed_kmh'].notna())].copy()

# The bottom panel is left empty when no row has both values
if len(plot_data) > 0:
    # Second y-axis sharing the same x-axis, used for the speed trace
    ax2_twin = ax2.twinx()

    # Plot ETA on left y-axis
    line1 = ax2.plot(plot_data['timestamp'], plot_data['eta_seconds'],
                     marker='o', linestyle='-', linewidth=2, markersize=4,
                     color='purple', label='ETA (seconds)')
    ax2.set_ylabel('ETA to Next Stop (seconds)', fontsize=12, color='purple')
    ax2.tick_params(axis='y', labelcolor='purple')

    # Plot speed on right y-axis
    line2 = ax2_twin.plot(plot_data['timestamp'], plot_data['speed_kmh'],
                          marker='s', linestyle='-', linewidth=2, markersize=4,
                          color='blue', label='Speed (km/h)', alpha=0.7)
    ax2_twin.set_ylabel('Speed (km/h)', fontsize=12, color='blue')
    ax2_twin.tick_params(axis='y', labelcolor='blue')

    ax2.set_xlabel('Time', fontsize=12)
    ax2.set_title(f'ETA vs Speed - Segment {selected_eta_segment_id}', fontsize=14, fontweight='bold')
    ax2.grid(alpha=0.3)

    # Combine legends: the two lines live on different axes, so their handles
    # are gathered by hand and passed to a single legend on ax2
    lines = line1 + line2
    labels = [l.get_label() for l in lines]
    ax2.legend(lines, labels, loc='upper right')

    plt.setp(ax2.xaxis.get_majorticklabels(), rotation=45, ha='right')

plt.tight_layout()
plt.show()

# Statistics (skipped when the segment has no ETA values at all)
if len(eta_segment_clean) > 0:
    print(f"\nETA Statistics for Segment {selected_eta_segment_id}:")
    print(f"  Mean ETA: {eta_segment_clean['eta_seconds'].mean():.2f} seconds ({eta_segment_clean['eta_seconds'].mean()/60:.2f} minutes)")
    print(f"  Median ETA: {eta_segment_clean['eta_seconds'].median():.2f} seconds ({eta_segment_clean['eta_seconds'].median()/60:.2f} minutes)")
    print(f"  Max ETA: {eta_segment_clean['eta_seconds'].max():.2f} seconds ({eta_segment_clean['eta_seconds'].max()/60:.2f} minutes)")
    print(f"  Min ETA: {eta_segment_clean['eta_seconds'].min():.2f} seconds ({eta_segment_clean['eta_seconds'].min()/60:.2f} minutes)")

## Conclusions

The ETA pipeline analysis shows:
- **Stop Detection**: A significant portion of segments pass through known stops
- **Segment Filtering**: Segments without any stops are filtered out, focusing the dataset on meaningful trip segments
- **ETA Calculation**: Most points have valid ETAs to the next stop, with NULL values primarily occurring after the final stop in each segment
- **Data Quality**: The pipeline effectively identifies stops and calculates realistic ETAs that can be used for machine learning prediction tasks

The data is now ready for LSTM and ARIMA model training!

## Test Cases for clean_stops() function
- **Test Case 1**: Current/next point distance is closer to stop than previous point
- **Test Case 2**: Previous point distance is closer to stop than current/next point
- **Test Case 3**: Tie (current point always wins ties)

In [None]:
# Test for `clean_stops` using ROUTES data
# Test Case 1: curr_distance is CLOSER to the stop
# Builds a DataFrame composed of: first polyline minus its last two points,
# then the first point of the next polyline. Then runs `clean_stops`.
#
# The frame is named `case1_df` rather than `df` so that this cell does not
# overwrite the notebook-wide `df` of vehicle locations read by the analysis
# cells; with the old name, re-running any of those cells after this one
# silently operated on the small test frame instead.
import json
import pandas as pd
from ml.data.preprocess import clean_stops

routes_file = project_root / 'shared' / 'routes.json'
with open(routes_file, 'r', encoding='utf-8') as f:
    routes = json.load(f)

# Choose first route in the file
route_key = list(routes.keys())[0]
polylines = routes[route_key]['ROUTES']

# Defensive check: the test needs a boundary between two polylines
if len(polylines) < 2:
    raise RuntimeError('Expected at least two polylines in the chosen route for this test')

first_poly = polylines[0]
second_poly = polylines[1]

# Take first polyline minus its last two points, then append the first point of next polyline.
# Both branches produce a new list, so the append does not modify `routes`.
if len(first_poly) >= 3:
    coords = first_poly[:-2]
else:
    coords = first_poly.copy()
coords.append(second_poly[0])

# Build DataFrame
case1_df = pd.DataFrame(coords, columns=['lat', 'lon'])
# All rows have same route name
case1_df['route'] = route_key
# polyline idx: 0 for points from first polyline, 1 for the appended point
case1_df['polyline_idx'] = [0] * (len(case1_df) - 1) + [1]
# Initially no stops recorded
case1_df['stop'] = pd.NA
# Distances: earlier points set far, last point set very close (so the closer coordinate should be labeled)
case1_df['dist'] = [0.05] * (len(case1_df) - 1) + [0.001]

print('Before clean_stops:')
print(case1_df[['lat','lon','polyline_idx','dist','stop']])

# Run the function under test; its return value is not used, the same frame
# is printed again afterwards to inspect the 'stop' column
clean_stops(
    case1_df,
    route_column='route',
    polyline_index_column='polyline_idx',
    stop_column='stop',
    lat_column='lat',
    lon_column='lon',
    distance_column='dist'
)

print('\nAfter clean_stops:')
print(case1_df[['lat','lon','polyline_idx','dist','stop']])

In [10]:
# Test Case 2: prev_distance is CLOSER to the stop
# The stop should be assigned to the PREVIOUS row (shifted assignment)
import json
import pandas as pd
from ml.data.preprocess import clean_stops

# Load the route definitions and take the polylines of the first route
routes_file = project_root / 'shared' / 'routes.json'
with open(routes_file, 'r', encoding='utf-8') as f:
    routes = json.load(f)

route_key = list(routes.keys())[0]
polylines = routes[route_key]['ROUTES']

# The scenario needs a boundary between two polylines
if len(polylines) < 2:
    raise RuntimeError('Expected at least two polylines')

first_poly = polylines[0]
second_poly = polylines[1]

# First polyline without its last two points (or a copy of all of it when it
# has fewer than three), followed by the first point of the second polyline
coords = first_poly[:-2] if len(first_poly) >= 3 else first_poly.copy()
coords.append(second_poly[0])

# Same frame layout as the other cases: every row on polyline 0 except the last
df2 = pd.DataFrame(coords, columns=['lat', 'lon'])
df2['route'] = route_key
df2['polyline_idx'] = [0] * (len(df2) - 1) + [1]
df2['stop'] = pd.NA
# Key difference: prev distance is CLOSE (0.001), current is FAR (0.05)
df2['dist'] = [0.001] * (len(df2) - 1) + [0.05]

print(
    '\n' + '='*70,
    'TEST CASE 2: prev_distance is CLOSER (0.001 vs 0.05)',
    '='*70,
    'Before clean_stops:',
    sep='\n',
)
print(df2[['lat','lon','polyline_idx','dist','stop']])

# Function under test; the same frame is printed again afterwards
clean_stops(
    df2,
    route_column='route',
    polyline_index_column='polyline_idx',
    stop_column='stop',
    lat_column='lat',
    lon_column='lon',
    distance_column='dist',
)

print('\nAfter clean_stops:')
print(df2[['lat','lon','polyline_idx','dist','stop']])
print('\n✓ Expected: stop should be assigned to row with prev_distance (0.001), NOT the last row')



TEST CASE 2: prev_distance is CLOSER (0.001 vs 0.05)
Before clean_stops:
          lat        lon  polyline_idx   dist  stop
0   42.730711 -73.676737             0  0.001  <NA>
1   42.730711 -73.676737             0  0.001  <NA>
2   42.730676 -73.676552             0  0.001  <NA>
3   42.730606 -73.676315             0  0.001  <NA>
4   42.730531 -73.676165             0  0.001  <NA>
5   42.730398 -73.675924             0  0.001  <NA>
6   42.730352 -73.675786             0  0.001  <NA>
7   42.730189 -73.675337             0  0.001  <NA>
8   42.730112 -73.675185             0  0.001  <NA>
9   42.730085 -73.675131             0  0.001  <NA>
10  42.730085 -73.675131             0  0.001  <NA>
11  42.730231 -73.674884             0  0.001  <NA>
12  42.730353 -73.674630             0  0.001  <NA>
13  42.730414 -73.674369             0  0.001  <NA>
14  42.730460 -73.674251             0  0.001  <NA>
15  42.730831 -73.673825             0  0.001  <NA>
16  42.730942 -73.673636             0  0.

In [None]:
# Test Case 3: prev_distance EQUALS current_distance (tie scenario)
# According to the code: prev_closer_mask = (df['prev_distance'] < df[distance_column])
# When equal, prev_closer_mask is False, so next_closer_mask (>=) will be True
# Stop should be assigned to the CURRENT (last) row
#
# The printed banner is labelled "TEST CASE 3" so it matches this header and
# the list of test cases in the notebook (it previously printed "TEST CASE 4").
import json
import pandas as pd
from ml.data.preprocess import clean_stops

routes_file = project_root / 'shared' / 'routes.json'
with open(routes_file, 'r', encoding='utf-8') as f:
    routes = json.load(f)

# Use the polylines of the first route in the file
route_key = list(routes.keys())[0]
polylines = routes[route_key]['ROUTES']

# The scenario needs a boundary between two polylines
if len(polylines) < 2:
    raise RuntimeError('Expected at least two polylines')

first_poly = polylines[0]
second_poly = polylines[1]

# First polyline minus its last two points, then the first point of the next
# polyline. Both branches produce a new list, so the append does not modify `routes`.
if len(first_poly) >= 3:
    coords = first_poly[:-2]
else:
    coords = first_poly.copy()
coords.append(second_poly[0])

df4 = pd.DataFrame(coords, columns=['lat', 'lon'])
df4['route'] = route_key
# polyline idx: 0 for points from the first polyline, 1 for the appended point
df4['polyline_idx'] = [0] * (len(df4) - 1) + [1]
df4['stop'] = pd.NA
# Key: both distances are the SAME (0.025)
df4['dist'] = [0.025] * len(df4)

print('\n' + '='*70)
print('TEST CASE 3: prev_distance EQUALS current_distance (tie)')
print('='*70)
print('Before clean_stops:')
print(df4[['lat','lon','polyline_idx','dist','stop']])

# Run the function under test; its return value is not used, the same frame
# is printed again afterwards to inspect the 'stop' column
clean_stops(
    df4,
    route_column='route',
    polyline_index_column='polyline_idx',
    stop_column='stop',
    lat_column='lat',
    lon_column='lon',
    distance_column='dist'
)

print('\nAfter clean_stops:')
print(df4[['lat','lon','polyline_idx','dist','stop']])
print('\n✓ Expected: on tie (equal distances), current_distance wins (>= operator), so stop on last row')



TEST CASE 4: prev_distance EQUALS current_distance (tie)
Before clean_stops:
          lat        lon  polyline_idx   dist  stop
0   42.730711 -73.676737             0  0.025  <NA>
1   42.730711 -73.676737             0  0.025  <NA>
2   42.730676 -73.676552             0  0.025  <NA>
3   42.730606 -73.676315             0  0.025  <NA>
4   42.730531 -73.676165             0  0.025  <NA>
5   42.730398 -73.675924             0  0.025  <NA>
6   42.730352 -73.675786             0  0.025  <NA>
7   42.730189 -73.675337             0  0.025  <NA>
8   42.730112 -73.675185             0  0.025  <NA>
9   42.730085 -73.675131             0  0.025  <NA>
10  42.730085 -73.675131             0  0.025  <NA>
11  42.730231 -73.674884             0  0.025  <NA>
12  42.730353 -73.674630             0  0.025  <NA>
13  42.730414 -73.674369             0  0.025  <NA>
14  42.730460 -73.674251             0  0.025  <NA>
15  42.730831 -73.673825             0  0.025  <NA>
16  42.730942 -73.673636             0