# Traffic Data Visualization Dashboard

**Maintainer:** THAT Le Quang (Xiel)  
**Email:** fxlqthat@gmail.com  
**Version:** 1.0  
**Created:** October 27, 2025

---

Interactive dashboard for exploring collected traffic data from GCP.

## Features

- 📁 Auto-select latest data folder or choose manually
- 📊 Display comprehensive data statistics
- 🗺️ Visualize traffic on geographic maps
- 📈 Analyze temporal patterns and trends
- Validate data quality and completeness

## 1. Setup and Imports

In [1]:
# Standard library imports
import json
import os
from pathlib import Path
from datetime import datetime
from typing import Dict, List, Optional

# Data processing
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Set style
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')

# Configure display
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.width', None)

print("All imports successful!")

All imports successful!


## 2. Data Folder Selection

In [2]:
# Project root
PROJECT_ROOT = Path.cwd().parent if Path.cwd().name == 'notebooks' else Path.cwd()
DATA_DIR = PROJECT_ROOT / 'data' / 'downloads'

# Find all download folders
download_folders = sorted([f for f in DATA_DIR.iterdir() if f.is_dir() and f.name.startswith('download_')], 
                          reverse=True)

if not download_folders:
    print("❌ No download folders found in data/downloads/")
    print("Please run: bash scripts/data_management/download_data_compressed.sh")
    raise FileNotFoundError("No data folders available")

print(f"📁 Found {len(download_folders)} download folder(s):")
for i, folder in enumerate(download_folders[:5], 1):
    # Get folder size
    size_mb = sum(f.stat().st_size for f in folder.rglob('*') if f.is_file()) / 1024 / 1024
    print(f"  {i}. {folder.name} ({size_mb:.1f} MB)")

if len(download_folders) > 5:
    print(f"  ... and {len(download_folders) - 5} more")

📁 Found 3 download folder(s):
  1. download_20251027_185415 (4.6 MB)
  2. download_20251027_181522 (4.5 MB)
  2. download_20251027_181522 (4.5 MB)
  3. download_20251026_210927 (0.6 MB)
  3. download_20251026_210927 (0.6 MB)


In [3]:
# Select data folder (default to latest)
SELECTED_FOLDER_INDEX = 0  # Change this to select different folder (0 = latest)

selected_folder = download_folders[SELECTED_FOLDER_INDEX]
print(f"\nSelected: {selected_folder.name}")
print(f"   Path: {selected_folder}")

# Check folder structure
data_subdir = selected_folder / 'data'
if data_subdir.exists():
    print(f"\n📂 Data structure:")
    for item in sorted(data_subdir.iterdir())[:10]:
        if item.is_dir():
            print(f"   📁 {item.name}/")
        else:
            size_kb = item.stat().st_size / 1024
            print(f"   📄 {item.name} ({size_kb:.1f} KB)")


Selected: download_20251027_185415
   Path: d:\UNI\DSP391m\project\data\downloads\download_20251027_185415

📂 Data structure:
   📁 archive/
   📄 edges.json (0.0 KB)
   📁 images/
   📁 node/
   📄 nodes.json (21.5 KB)
   📁 processed/
   📄 quality_report.json (0.6 KB)
   📄 statistics.json (0.2 KB)
   📄 traffic_edges.json (33.6 KB)
   📄 weather_snapshot.json (8.9 KB)


## 3. Load and Validate Data

In [4]:
def load_json_file(filepath: Path) -> Optional[Dict]:
    """Load JSON file safely."""
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            return json.load(f)
    except Exception as e:
        print(f"⚠️ Error loading {filepath.name}: {e}")
        return None

def find_collection_runs(data_dir: Path) -> List[Path]:
    """Find all collection run directories."""
    node_dir = data_dir / 'node'
    if not node_dir.exists():
        return []
    return sorted([d for d in node_dir.iterdir() if d.is_dir()], reverse=True)

# Find all runs
runs = find_collection_runs(data_subdir)
print(f"\n🔍 Found {len(runs)} collection run(s)")

if runs:
    print(f"\nLatest 10 runs:")
    for i, run in enumerate(runs[:10], 1):
        # Parse timestamp from folder name
        try:
            ts = datetime.strptime(run.name, '%Y%m%d%H%M%S')
            print(f"  {i}. {run.name} - {ts.strftime('%Y-%m-%d %H:%M:%S')}")
        except:
            print(f"  {i}. {run.name}")


🔍 Found 100 collection run(s)

Latest 10 runs:
  1. 20251027113504 - 2025-10-27 11:35:04
  2. 20251027113420 - 2025-10-27 11:34:20
  3. 20251027110416 - 2025-10-27 11:04:16
  4. 20251027103413 - 2025-10-27 10:34:13
  5. 20251027100409 - 2025-10-27 10:04:09
  6. 20251027093406 - 2025-10-27 09:34:06
  7. 20251027090402 - 2025-10-27 09:04:02
  8. 20251027083359 - 2025-10-27 08:33:59
  9. 20251027080356 - 2025-10-27 08:03:56
  10. 20251027073352 - 2025-10-27 07:33:52


In [5]:
# Load data from latest run
if not runs:
    print("❌ No collection runs found!")
else:
    latest_run = runs[0]
    print(f"\n📥 Loading data from: {latest_run.name}")
    
    # Load manifest
    manifest = load_json_file(latest_run / 'manifest.json')
    
    # Load collectors data
    collectors_dir = latest_run / 'collectors'
    
    # Overpass data (nodes and edges)
    overpass_dir = collectors_dir / 'overpass'
    nodes = load_json_file(overpass_dir / 'nodes.json') if overpass_dir.exists() else None
    edges = load_json_file(overpass_dir / 'edges.json') if overpass_dir.exists() else None
    
    # Traffic data
    google_dir = collectors_dir / 'google'
    traffic = load_json_file(google_dir / 'traffic_edges.json') if google_dir.exists() else None
    
    # Mock traffic data
    mock_dir = collectors_dir / 'mock'
    mock_traffic = load_json_file(mock_dir / 'traffic_snapshot_normalized.json') if mock_dir.exists() else None
    
    # Weather data
    weather_dir = collectors_dir / 'open_meteo'
    weather = load_json_file(weather_dir / 'weather_snapshot.json') if weather_dir.exists() else None
    
    print("\nData loaded:")
    print(f"   Manifest: {'✓' if manifest else '✗'}")
    print(f"   Nodes: {len(nodes) if nodes else 0}")
    print(f"   Edges: {len(edges) if edges else 0}")
    print(f"   Traffic (Google): {len(traffic) if traffic else 0}")
    print(f"   Traffic (Mock): {len(mock_traffic) if mock_traffic else 0}")
    print(f"   Weather: {'✓' if weather else '✗'}")


📥 Loading data from: 20251027113504

Data loaded:
   Manifest: ✓
   Nodes: 40
   Edges: 0
   Traffic (Google): 120
   Traffic (Mock): 0
   Weather: ✓


## 4. Data Statistics and Summary

In [6]:
# Display manifest information
if manifest:
    print("📋 Collection Manifest:")
    print(f"   Run ID: {manifest.get('run_id', 'N/A')}")
    print(f"   Timestamp: {manifest.get('timestamp', 'N/A')}")
    print(f"   Status: {manifest.get('status', 'N/A')}")
    
    collectors = manifest.get('collectors', {})
    print(f"\n   Collectors:")
    for name, info in collectors.items():
        status = info.get('status', 'unknown')
        icon = '✅' if status == 'success' else '⚠️'
        print(f"     {icon} {name}: {status}")

📋 Collection Manifest:
   Run ID: N/A
   Timestamp: 20251027113504
   Status: N/A

   Collectors:


In [7]:
# Create nodes dataframe
if nodes:
    nodes_df = pd.DataFrame(nodes)
    print(f"\n📊 Nodes Summary ({len(nodes_df)} nodes):")
    print(f"\nColumns: {list(nodes_df.columns)}")
    print(f"\nSample data:")
    display(nodes_df.head())
    
    # Geographic bounds
    if 'lat' in nodes_df.columns and 'lon' in nodes_df.columns:
        print(f"\n🗺️ Geographic Bounds:")
        print(f"   Latitude:  {nodes_df['lat'].min():.6f} to {nodes_df['lat'].max():.6f}")
        print(f"   Longitude: {nodes_df['lon'].min():.6f} to {nodes_df['lon'].max():.6f}")
        print(f"   Center: ({nodes_df['lat'].mean():.6f}, {nodes_df['lon'].mean():.6f})")


📊 Nodes Summary (40 nodes):

Columns: ['node_id', 'lat', 'lon', 'degree', 'importance_score', 'road_type', 'connected_road_types', 'street_names', 'intersection_name', 'way_ids', 'is_major_intersection']

Sample data:


Unnamed: 0,node_id,lat,lon,degree,importance_score,road_type,connected_road_types,street_names,intersection_name,way_ids,is_major_intersection
0,node-10.771233-106.693127,10.771233,106.693127,3,19.0,primary,"[secondary, primary]","[Ngã sáu Phù Đổng, Nguyễn Trãi]",Ngã sáu Phù Đổng ∩ Nguyễn Trãi,"[466200149, 35115101, 1267138239]",True
1,node-10.764404-106.698972,10.764404,106.698972,4,24.0,trunk,"[trunk_link, trunk, primary]",[Đường Võ Văn Kiệt],Đường Võ Văn Kiệt,"[165228008, 165228001, 629904752, 724454974]",True
2,node-10.767345-106.706097,10.767345,106.706097,3,19.0,primary,"[secondary, primary]",[Nguyễn Tất Thành],Nguyễn Tất Thành,"[165367632, 211130577, 718458988]",True
3,node-10.765854-106.700628,10.765854,106.700628,3,19.0,primary,"[secondary, primary]","[Võ Văn Kiệt, Calmette]",Võ Văn Kiệt ∩ Calmette,"[724068483, 189027764, 1221898923]",True
4,node-10.766495-106.701049,10.766495,106.701049,3,21.0,trunk,"[trunk, primary]","[Đường Võ Văn Kiệt, Võ Văn Kiệt]",Đường Võ Văn Kiệt ∩ Võ Văn Kiệt,"[724064875, 189193717, 621432415]",True



🗺️ Geographic Bounds:
   Latitude:  10.763673 to 10.781534
   Longitude: 106.690539 to 106.707136
   Center: (10.770089, 106.698928)


In [8]:
# Traffic data summary
traffic_data = traffic or mock_traffic
if traffic_data:
    traffic_df = pd.DataFrame(traffic_data)
    print(f"\n🚗 Traffic Data Summary ({len(traffic_df)} edges):")
    print(f"\nColumns: {list(traffic_df.columns)}")
    print(f"\nSample data:")
    display(traffic_df.head())
    
    # Speed statistics
    if 'speed_kmh' in traffic_df.columns:
        print(f"\n📈 Speed Statistics:")
        print(traffic_df['speed_kmh'].describe())
        
    # Duration statistics
    if 'duration_minutes' in traffic_df.columns:
        print(f"\n⏱️ Duration Statistics:")
        print(traffic_df['duration_minutes'].describe())


🚗 Traffic Data Summary (120 edges):

Columns: ['node_a_id', 'node_b_id', 'distance_km', 'duration_sec', 'speed_kmh', 'timestamp', 'api_type']

Sample data:


Unnamed: 0,node_a_id,node_b_id,distance_km,duration_sec,speed_kmh,timestamp,api_type
0,node-10.766353-106.700978,node-10.765854-106.700628,0.067401,10.695214,22.687222,2025-10-27T11:35:10.297053,mock
1,node-10.766353-106.700978,node-10.765088-106.699763,0.193457,20.372977,34.18478,2025-10-27T11:35:10.297058,mock
2,node-10.763793-106.697804,node-10.763673-106.697915,0.018074,1.454975,44.719785,2025-10-27T11:35:10.297062,mock
3,node-10.763793-106.697804,node-10.763832-106.698113,0.033994,4.319797,28.329643,2025-10-27T11:35:10.297066,mock
4,node-10.763793-106.697804,node-10.764404-106.698972,0.14455,14.731567,35.324153,2025-10-27T11:35:10.297070,mock



📈 Speed Statistics:
count    120.000000
mean      28.985725
std        8.075075
min       15.228248
25%       22.674140
50%       27.952608
75%       36.723888
max       44.719785
Name: speed_kmh, dtype: float64


In [9]:
# Weather data summary
if weather:
    print(f"\n🌤️ Weather Data:")
    
    # Handle both dict and list formats
    if isinstance(weather, dict):
        for key, value in weather.items():
            if isinstance(value, (int, float, str)):
                print(f"   {key}: {value}")
    elif isinstance(weather, list):
        print(f"   Type: List with {len(weather)} items")
        if len(weather) > 0:
            print(f"\n   Sample item:")
            sample = weather[0]
            if isinstance(sample, dict):
                for key, value in sample.items():
                    if isinstance(value, (int, float, str)):
                        print(f"      {key}: {value}")
            else:
                print(f"      {sample}")
    else:
        print(f"   Type: {type(weather).__name__}")
        print(f"   Value: {weather}")



🌤️ Weather Data:
   Type: List with 40 items

   Sample item:
      node_id: node-10.771233-106.693127
      lat: 10.7712328
      lon: 106.6931266
      timestamp: 2025-10-27T11:35:09.059738
      temperature_c: 27.4
      precipitation_mm: 0.0
      wind_speed_kmh: 5.5


## 5. Geographic Visualization

In [10]:
# Plot nodes on map
if nodes and 'lat' in nodes_df.columns and 'lon' in nodes_df.columns:
    fig = px.scatter_mapbox(
        nodes_df,
        lat='lat',
        lon='lon',
        hover_name='id' if 'id' in nodes_df.columns else None,
        zoom=12,
        height=600,
        title='Traffic Monitoring Nodes'
    )
    
    fig.update_layout(
        mapbox_style='open-street-map',
        margin={'r': 0, 't': 40, 'l': 0, 'b': 0}
    )
    
    fig.show()
else:
    print("⚠️ No geographic data available for nodes")

  fig = px.scatter_mapbox(


## 6. Traffic Analysis

In [11]:
# Speed distribution
if traffic_data and 'speed_kmh' in traffic_df.columns:
    fig = px.histogram(
        traffic_df,
        x='speed_kmh',
        nbins=30,
        title='Traffic Speed Distribution',
        labels={'speed_kmh': 'Speed (km/h)', 'count': 'Number of Edges'}
    )
    fig.show()
    
    # Box plot
    fig = px.box(
        traffic_df,
        y='speed_kmh',
        title='Speed Box Plot',
        labels={'speed_kmh': 'Speed (km/h)'}
    )
    fig.show()
else:
    print("⚠️ No speed data available")

In [12]:
# Duration distribution
if traffic_data and 'duration_minutes' in traffic_df.columns:
    fig = px.histogram(
        traffic_df,
        x='duration_minutes',
        nbins=30,
        title='Travel Duration Distribution',
        labels={'duration_minutes': 'Duration (minutes)', 'count': 'Number of Edges'}
    )
    fig.show()
else:
    print("⚠️ No duration data available")

⚠️ No duration data available


## 7. Temporal Analysis (Multiple Runs)

In [13]:
# Load multiple runs for temporal analysis
num_runs_to_analyze = min(20, len(runs))  # Analyze last 20 runs

temporal_data = []

print(f"\n⏳ Loading {num_runs_to_analyze} runs for temporal analysis...")

for run in runs[:num_runs_to_analyze]:
    # Parse timestamp
    try:
        ts = datetime.strptime(run.name, '%Y%m%d%H%M%S')
    except:
        continue
    
    # Load traffic data
    google_file = run / 'collectors' / 'google' / 'traffic_edges.json'
    mock_file = run / 'collectors' / 'mock' / 'traffic_snapshot_normalized.json'
    
    traffic_file = google_file if google_file.exists() else mock_file
    
    if traffic_file.exists():
        data = load_json_file(traffic_file)
        if data:
            df = pd.DataFrame(data)
            if 'speed_kmh' in df.columns:
                temporal_data.append({
                    'timestamp': ts,
                    'avg_speed': df['speed_kmh'].mean(),
                    'min_speed': df['speed_kmh'].min(),
                    'max_speed': df['speed_kmh'].max(),
                    'num_edges': len(df)
                })

if temporal_data:
    temporal_df = pd.DataFrame(temporal_data)
    temporal_df = temporal_df.sort_values('timestamp')
    
    print(f"Loaded {len(temporal_df)} runs with traffic data")
    display(temporal_df.head())
else:
    print("⚠️ No temporal data available")


⏳ Loading 20 runs for temporal analysis...
Loaded 20 runs with traffic data
Loaded 20 runs with traffic data


Unnamed: 0,timestamp,avg_speed,min_speed,max_speed,num_edges
19,2025-10-27 02:33:19,28.518217,15.284437,44.292978,120
18,2025-10-27 03:03:22,32.212458,15.478228,44.849457,120
17,2025-10-27 03:33:25,29.998383,15.06759,44.837531,120
16,2025-10-27 04:03:28,31.434046,15.415523,44.939202,120
15,2025-10-27 04:33:32,29.763498,15.040446,44.672849,120


In [14]:
# Plot temporal trends
if temporal_data:
    fig = go.Figure()
    
    fig.add_trace(go.Scatter(
        x=temporal_df['timestamp'],
        y=temporal_df['avg_speed'],
        mode='lines+markers',
        name='Average Speed',
        line=dict(color='blue', width=2)
    ))
    
    fig.add_trace(go.Scatter(
        x=temporal_df['timestamp'],
        y=temporal_df['max_speed'],
        mode='lines',
        name='Max Speed',
        line=dict(color='green', width=1, dash='dash')
    ))
    
    fig.add_trace(go.Scatter(
        x=temporal_df['timestamp'],
        y=temporal_df['min_speed'],
        mode='lines',
        name='Min Speed',
        line=dict(color='red', width=1, dash='dash')
    ))
    
    fig.update_layout(
        title='Traffic Speed Over Time',
        xaxis_title='Timestamp',
        yaxis_title='Speed (km/h)',
        hovermode='x unified',
        height=500
    )
    
    fig.show()
else:
    print("⚠️ No temporal data to plot")

## 8. Data Quality Report

In [15]:
print("📋 Data Quality Report")
print("=" * 60)

# Check data completeness
quality_metrics = {
    'Total Runs': len(runs),
    'Runs with Traffic Data': len([r for r in runs[:num_runs_to_analyze] 
                                   if (r / 'collectors' / 'google' / 'traffic_edges.json').exists() or
                                      (r / 'collectors' / 'mock' / 'traffic_snapshot_normalized.json').exists()]),
    'Runs with Overpass Data': len([r for r in runs[:num_runs_to_analyze]
                                    if (r / 'collectors' / 'overpass' / 'nodes.json').exists()]),
    'Runs with Weather Data': len([r for r in runs[:num_runs_to_analyze]
                                   if (r / 'collectors' / 'open_meteo' / 'weather_snapshot.json').exists()]),
}

for metric, value in quality_metrics.items():
    print(f"   {metric}: {value}")

# Calculate completeness percentage
if len(runs) > 0:
    traffic_completeness = (quality_metrics['Runs with Traffic Data'] / num_runs_to_analyze) * 100
    overpass_completeness = (quality_metrics['Runs with Overpass Data'] / num_runs_to_analyze) * 100
    weather_completeness = (quality_metrics['Runs with Weather Data'] / num_runs_to_analyze) * 100
    
    print(f"\n📊 Completeness (last {num_runs_to_analyze} runs):")
    print(f"   Traffic Data: {traffic_completeness:.1f}%")
    print(f"   Overpass Data: {overpass_completeness:.1f}%")
    print(f"   Weather Data: {weather_completeness:.1f}%")

📋 Data Quality Report
   Total Runs: 100
   Runs with Traffic Data: 20
   Runs with Overpass Data: 20
   Runs with Weather Data: 20

📊 Completeness (last 20 runs):
   Traffic Data: 100.0%
   Overpass Data: 100.0%
   Weather Data: 100.0%


## 9. Summary and Recommendations

In [16]:
print("\n📝 Summary and Recommendations")
print("=" * 60)

print(f"\nData Folder: {selected_folder.name}")
print(f"Total Runs: {len(runs)}")
print(f"Latest Run: {runs[0].name if runs else 'N/A'}")

if nodes:
    print(f"Nodes: {len(nodes)}")
else:
    print("⚠️ No nodes data found")

if traffic_data:
    print(f"Traffic Edges: {len(traffic_data)}")
    if 'speed_kmh' in traffic_df.columns:
        print(f"   Average Speed: {traffic_df['speed_kmh'].mean():.2f} km/h")
else:
    print("⚠️ No traffic data found")

if temporal_data:
    print(f"Temporal Analysis: {len(temporal_data)} data points")
else:
    print("⚠️ Limited temporal data for trend analysis")

print("\n💡 Recommendations:")
if len(runs) < 10:
    print("   - Collect more data for better temporal analysis")
if quality_metrics.get('Runs with Traffic Data', 0) < len(runs) * 0.8:
    print("   - Some runs are missing traffic data - check collection process")
if quality_metrics.get('Runs with Overpass Data', 0) < len(runs) * 0.8:
    print("   - Some runs are missing Overpass data - may need backfill")

print("\n✨ Dashboard complete!")


📝 Summary and Recommendations

Data Folder: download_20251027_185415
Total Runs: 100
Latest Run: 20251027113504
Nodes: 40
Traffic Edges: 120
   Average Speed: 28.99 km/h
Temporal Analysis: 20 data points

💡 Recommendations:
   - Some runs are missing traffic data - check collection process
   - Some runs are missing Overpass data - may need backfill

✨ Dashboard complete!
