In [1]:
import pandas as pd
import geopandas as gpd
import movingpandas as mpd
import numpy as np
from datetime import timedelta, datetime
import folium
import warnings
import sys
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation and pandas chained-assignment
# warnings raised by later cells; consider narrowing the filter.
warnings.filterwarnings('ignore')

# Record the library versions this notebook was executed with.
print("Geopandas has version {}".format(gpd.__version__))
print("Movingpandas has version {}".format(mpd.__version__))

Geopandas has version 0.13.2
Movingpandas has version 0.17.1


In [2]:
# add paths for modules
# NOTE: the path is relative to the kernel's working directory, so the notebook
# has to be started from the directory it lives in for the import below to work.
visualization_path = '../visualization'
if visualization_path not in sys.path:  # re-running this cell must not add duplicates
    sys.path.append(visualization_path)
print(sys.path)

# import modules
import visualize

['/Users/janhendrikwebert/maritime_route_prediction/src/models', '/Users/janhendrikwebert/miniforge3/envs/env_geo/lib/python311.zip', '/Users/janhendrikwebert/miniforge3/envs/env_geo/lib/python3.11', '/Users/janhendrikwebert/miniforge3/envs/env_geo/lib/python3.11/lib-dynload', '', '/Users/janhendrikwebert/miniforge3/envs/env_geo/lib/python3.11/site-packages', '../visualization']


In [3]:
# read data from file
# The parquet file is loaded into a GeoDataFrame of AIS point messages.
# Swap the active line with the commented-out one below to load the "full"
# file instead of the 500k one.
filename = '../../data/processed/202204_points_stavanger_cleaned_500k.parquet'
# filename = '../../data/processed/202204_points_stavanger_cleaned_full.parquet'
gdf = gpd.read_parquet(filename)
# preview the first rows
gdf.head()

Unnamed: 0_level_0,mmsi,imo_nr,length,lon,lat,sog,cog,true_heading,nav_status,message_nr,geometry,speed
date_time_utc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2022-04-01 06:30:21,209989000_0,9235505,90,4.6236,59.5881,10.0,167.2,174,0,1,POINT (4.62360 59.58810),4.473722
2022-04-01 06:30:31,209989000_0,9235505,90,4.62367,59.5877,9.7,179.6,174,0,1,POINT (4.62367 59.58770),4.473722
2022-04-01 06:30:40,209989000_0,9235505,90,4.62375,59.5873,9.9,173.0,174,0,1,POINT (4.62375 59.58730),4.976744
2022-04-01 06:30:50,209989000_0,9235505,90,4.62384,59.5868,9.8,174.7,174,0,1,POINT (4.62384 59.58680),5.593419
2022-04-01 06:31:10,209989000_0,9235505,90,4.62402,59.5859,9.7,177.4,174,0,1,POINT (4.62402 59.58590),5.038954


In [4]:
# convert to Trajectory Collection
# The 'mmsi' column is used both as trajectory ID and as object ID.
# NOTE(review): the values in the preview look like '<mmsi>_<n>' (e.g. 209989000_0),
# presumably one ID per trip of a vessel — confirm against the preprocessing step.
trajectories = mpd.TrajectoryCollection(gdf, traj_id_col='mmsi', obj_id_col='mmsi')

# summary of what was loaded
print(f'Loaded dataset: {filename}')
print(f'AIS messages: {len(gdf)}')
print(f'Trajectories: {len(trajectories)}')

Loaded dataset: ../../data/processed/202204_points_stavanger_cleaned_500k.parquet
AIS messages: 483430
Trajectories: 617


In [5]:
# Thin out the trajectories with Douglas-Peucker so that plotting stays responsive.
# The tolerance is expressed in the units of the data's coordinates.
dp_generalizer = mpd.DouglasPeuckerGeneralizer(trajectories)
simplified_trajectories = dp_generalizer.generalize(tolerance=0.0005)

# report how many points survived the simplification
n_points = len(gdf)
n_DP_points = len(simplified_trajectories.to_point_gdf())
print(f'DP reduced {n_points} AIS messages to {n_DP_points} points ({n_DP_points/n_points*100:.2f}%)')

DP reduced 483430 AIS messages to 14950 points (3.09%)


In [6]:
# plot n random trajectories against the DP simplified trajectories
# Set plot_comparison to True to build the (optional) comparison map.
plot_comparison = False
comparison_map = None  # stays None (and renders nothing) when the plot is disabled
if plot_comparison:
    n_trajectories = 5  # -1 selects all trajectories
    columns = ['mmsi', 'geometry']  # columns to be plotted
    n_available = len(trajectories)
    if n_trajectories > 0:
        # sample without replacement so that no trajectory is drawn twice,
        # and never ask for more trajectories than exist
        selection = np.random.choice(n_available, size=min(n_trajectories, n_available), replace=False)
    else:
        selection = np.arange(n_available)
    # original trajectories first (thin, semi-transparent) ...
    comparison_map = trajectories.to_traj_gdf()[columns].iloc[selection].explore(
        cmap='jet', column='mmsi', name='Trajectories', style_kwds={'opacity':0.5, 'weight':1})
    # ... then the same selection of simplified trajectories on the same map
    comparison_map = simplified_trajectories.to_traj_gdf()[columns].iloc[selection].explore(
        m=comparison_map, cmap='jet', column='mmsi', name='Simplified trajectories')
    folium.LayerControl().add_to(comparison_map)
# Jupyter only renders the last *top-level* expression of a cell, so the map
# has to be referenced here rather than inside the if-block.
comparison_map

In [7]:
# Significant turning points are taken to be the points that survive a
# Douglas-Peucker generalization of each trajectory.
tolerance = 0.001 # the smaller the tolerance, the more detailed the trajectories
waypoint_generalizer = mpd.DouglasPeuckerGeneralizer(trajectories)
waypoints = waypoint_generalizer.generalize(tolerance=tolerance)
print(f'Number of waypoints detected: {len(waypoints.to_point_gdf())}')

Number of waypoints detected: 10576


In [14]:
df_waypoints = waypoints.to_point_gdf()
################
### Apply DBSCAN
################
from sklearn.cluster import DBSCAN
# DBSCAN parameters
eps = 0.0007
min_samples = 5
# NOTE(review): DBSCAN runs with its default (Euclidean) metric on raw lat/lon
# degrees, so eps is in degrees and east-west distances are weighted differently
# from north-south ones at this latitude. Left unchanged to keep the clustering
# result comparable; consider a haversine metric or projected coordinates.
clustering = DBSCAN(eps=eps, min_samples=min_samples).fit(df_waypoints[['lat', 'lon']])

df_waypoints['clusterID'] = clustering.labels_  # assign clusterID to each waypoint (-1 = noise)

# convert waypoints to a GeoDataFrame with point geometries built from lon/lat
df_waypoints = gpd.GeoDataFrame(df_waypoints, geometry=gpd.points_from_xy(df_waypoints.lon, df_waypoints.lat), crs="EPSG:4326")
df_waypoints.reset_index(inplace=True)

# compute centroid (mean lat/lon) and convex hull of each cluster in a single
# pass over the clustered waypoints; noise points (clusterID -1) are skipped
clustered_waypoints = df_waypoints[df_waypoints.clusterID >= 0]
cluster_records = [
    {
        'clusterID': cluster_id,
        'lat': members.lat.mean(),
        'lon': members.lon.mean(),
        'convex_hull': members.unary_union.convex_hull,
    }
    for cluster_id, members in clustered_waypoints.groupby('clusterID')
]
# Building the frame once from records gives a unique 0..k-1 index and numeric
# dtypes, and avoids both row-by-row concat and chained assignment.
cluster_centroids = pd.DataFrame(cluster_records, columns=['clusterID', 'lat', 'lon', 'convex_hull'])

# convert cluster centroid DataFrame to a GeoDataFrame (active geometry: centroid point)
cluster_centroids = gpd.GeoDataFrame(cluster_centroids, geometry=gpd.points_from_xy(cluster_centroids.lon, cluster_centroids.lat), crs="EPSG:4326")
print(f'{len(cluster_centroids)} clusters detected')

152 clusters detected


In [15]:
# plotting
# Builds one interactive map with four layers: simplified trajectories, all
# waypoints, cluster centroids and cluster convex hulls.
# NOTE(review): the variable name `map` shadows the Python builtin; it is kept
# because other cells may refer to it.
n_trajectories = -1  # -1 selects all trajectories
n_available = len(trajectories)
if n_trajectories > 0:
    # sample without replacement so that no trajectory is drawn twice,
    # and never ask for more trajectories than exist
    selection = np.random.choice(n_available, size=min(n_trajectories, n_available), replace=False)
else:
    selection = range(0, n_available)

columns = ['geometry', 'mmsi']  # columns to be plotted
map = simplified_trajectories.to_traj_gdf()[columns].iloc[selection].explore(column='mmsi', name='Simplified trajectories', 
                                                                             style_kwds={'weight':1, 'color':'black', 'opacity':0.5}, 
                                                                             legend=False)
map = df_waypoints[['clusterID', 'geometry']].explore(m=map, name='all waypoints with cluster ID', legend=False,
                                                      marker_kwds={'radius':2},
                                                      style_kwds={'opacity':0.2})
# cluster_centroids carries two geometry columns; switch the active one before
# drawing each layer (centroid points first, then the convex hull polygons)
cluster_centroids.set_geometry('geometry', inplace=True)
map = cluster_centroids[['clusterID', 'geometry']].explore(m=map, name='cluster centroids', legend=False,
                                                           marker_kwds={'radius':3},
                                                           style_kwds={'color':'red', 'fillColor':'red', 'fillOpacity':1})
# after this call the active geometry of cluster_centroids remains 'convex_hull'
cluster_centroids.set_geometry('convex_hull', inplace=True)
map = cluster_centroids[['clusterID', 'convex_hull']].explore(m=map, name='cluster convex hulls', legend=False,
                                                              style_kwds={'color':'red', 'fillColor':'red', 'fillOpacity':0.2})
folium.LayerControl().add_to(map)

# add a title
title = f'DBSCAN parameters: eps={eps}, min_samples={min_samples}'
title_html = '''
             <h3 align="center" style="font-size:16px"><b>{}</b></h3>
             '''.format(title)   

map.get_root().html.add_child(folium.Element(title_html))

map

In [16]:
# Preview the waypoints together with their assigned clusterID
# (DBSCAN labels points that belong to no cluster with -1).
df_waypoints.head()

Unnamed: 0,date_time_utc,mmsi,imo_nr,length,lon,lat,sog,cog,true_heading,nav_status,message_nr,geometry,speed,clusterID
0,2022-04-01 06:30:21,209989000_0,9235505,90,4.6236,59.5881,10.0,167.2,174,0,1,POINT (4.62360 59.58810),4.473722,-1
1,2022-04-01 07:23:40,209989000_0,9235505,90,4.66145,59.4478,10.0,168.1,175,0,1,POINT (4.66145 59.44780),4.507809,-1
2,2022-04-01 08:22:50,209989000_0,9235505,90,4.70173,59.2814,11.1,169.7,175,0,1,POINT (4.70173 59.28140),5.626883,-1
3,2022-04-01 08:47:21,209989000_0,9235505,90,4.71771,59.2073,10.5,155.7,149,0,3,POINT (4.71771 59.20730),5.397732,-1
4,2022-04-01 09:04:00,209989000_0,9235505,90,4.76252,59.1618,11.0,149.0,145,0,1,POINT (4.76252 59.16180),6.28771,-1
