In [None]:
'''
This notebook plots Maritime Traffic Network (MTN) evaluation data.
- Input: evaluation data loaded from file (an export of the experiments logged with neptune)
- Output: plots of the results that explore the dependence between different hyperparameters
'''

In [None]:
import pandas as pd
import geopandas as gpd
import movingpandas as mpd
import numpy as np
from datetime import timedelta, datetime
import time
import networkx as nx
import matplotlib.pyplot as plt
import folium
import warnings
import sys

# Suppress all warnings raised from here on
warnings.filterwarnings('ignore')

# Report the installed versions of the geospatial libraries
print(f"Geopandas has version {gpd.__version__}")
print(f"Movingpandas has version {mpd.__version__}")

In [None]:
# Load experiment result data from file (export from neptune)
eval_df = pd.read_csv('../reports/Maritime-Traffic-Network.csv')
# Missing values in the 'algorithm' column are replaced by 'V1.0'.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the column selection: that chained in-place form is
# deprecated in pandas 2.x and does not update eval_df under Copy-on-Write.
eval_df['algorithm'] = eval_df['algorithm'].fillna('V1.0')
eval_df.info()

In [None]:
# Overview of the distinct values each important hyperparameter takes in the experiments
n_points, min_samples, v34, v5 = (
    sorted(eval_df[column].unique())
    for column in ('n_points', 'min_samples', 'V 3,4', 'V 5')
)
for caption, values in (
    ('n_points', n_points),
    ('HDBSCAN min_samples', min_samples),
    ('v34', v34),
    ('v5', v5),
):
    print(f'{caption}: {values}')

In [None]:
# Network complexity (# of nodes and # of edges) as a function of min_samples
eval_df.sort_values(by='min_samples', inplace=True)

# Keep only the runs of one experiment series (fixed n_points, V 5, V 3,4 and test set size)
selection = (
    (eval_df['n_points'] == 5422129)
    & (eval_df['V 5'] == 1e-4)
    & (eval_df['V 3,4'] == 1e-2)
    & (eval_df['test_data/n_trajectories'] == 752)
)
runs = eval_df[selection]

# One curve per complexity measure, both against min_samples
for column, colour in (('n_nodes', 'red'), ('n_edges', 'blue')):
    plt.plot(runs['min_samples'], runs[column], color=colour)
plt.xlabel('min_samples')
plt.legend(['# of nodes', '# of edges'])

plt.title('Network complexity depending on min_samples')
plt.show()

In [None]:
# Evaluation metrics as a function of min_samples (full dataset Stavanger)
# Series parameters taken from the filter below: V 3,4 = 1e-2, V 5 = 5e-4, algorithm V7.0(SSPD)
eval_df.sort_values(by='min_samples', inplace=True)
selection = (
    (eval_df['n_points'] == 5422129)
    & (eval_df['V 5'] == 5e-4)
    & (eval_df['V 3,4'] == 1e-2)
    & (eval_df['algorithm'] == 'V7.0(SSPD)')
)
runs = eval_df[selection]
min_samples_values = runs['min_samples']

# Primary axis: 'Median' column, labelled MedAE, in red
fig, ax_error = plt.subplots(figsize=[8, 5])
ax_error.set_xlabel('min_samples')
ax_error.set_ylabel('MedAE (m)', color='red')
ax_error.tick_params(axis='y', labelcolor='red')
ax_error.set_xticks(min_samples_values)

# Two additional y-axes on the right-hand side; the second one is shifted
# outward by 50 so that its ticks do not overlap those of the first
ax_failure = ax_error.twinx()
ax_nodes = ax_error.twinx()
for axis, caption, colour, offset in (
    (ax_failure, 'failure rate', 'blue', 0),
    (ax_nodes, '# of nodes', 'green', 50),
):
    axis.set_ylabel(caption, color=colour)
    axis.tick_params(axis='y', labelcolor=colour)
    axis.spines['right'].set_position(('outward', offset))

# Draw the three curves; failure rate is the complement of the 'success' column
ax_error.plot(min_samples_values, runs['Median'], color='red')
ax_failure.plot(min_samples_values, 1 - runs['success'], color='blue')
ax_nodes.plot(min_samples_values, runs['n_nodes'], color='green')

plt.title('Stavanger')
fig.tight_layout()  # otherwise the right y-label is slightly clipped
plt.savefig('min_samples_Stavanger.pdf')
fig.show()

In [None]:
# DEPENDENCE of evaluation metrics on min_samples (medium dataset Stavanger)
# sort and filter data
eval_df.sort_values(by='min_samples', inplace=True)
# named `selection` rather than `filter` so the Python builtin is not shadowed
selection = ((eval_df['n_points']==1794132) & (eval_df['V 5']==5e-4) & (eval_df['V 3,4']==1e-2) & (eval_df['algorithm']=='V7.0(SSPD)')
             & (eval_df['pruning']==1) & (eval_df['DP_tol']==10))
runs = eval_df[selection]
x = runs['min_samples']

# prepare figure and axes
fig, ax1 = plt.subplots()

ax1.set_xlabel('min_samples')
# The curve on this axis is the 'Median' column, so it is labelled 'MedAE (m)'
# like the other min_samples figures (it was previously mislabelled 'MAE (m)').
ax1.set_ylabel('MedAE (m)', color='red')
ax1.tick_params(axis='y', labelcolor='red')
ax1.set_xticks(x)
# second y-axis (right-hand side) for the failure rate
ax2 = ax1.twinx()
ax2.set_ylabel('failure rate', color='blue')
ax2.tick_params(axis='y', labelcolor='blue')

# plot; failure rate is the complement of the 'success' column
y1 = runs['Median']
y2 = 1 - runs['success']
ax1.plot(x, y1, 'r')
ax2.plot(x, y2, 'b')

# Series parameters (from the filter above): V 3,4 = 1e-2, V 5 = 5e-4, pruning = 1, DP_tol = 10
plt.title('Stavanger')
fig.tight_layout()  # otherwise the right y-label is slightly clipped
fig.show()

In [None]:
# Evaluation metrics as a function of min_samples (Tromso)
# Runs with min_samples == 30 are excluded from this series by the filter below
eval_df.sort_values(by='min_samples', inplace=True)
selection = (
    (eval_df['n_points'] == 796177)
    & (eval_df['V 5'] == 1)
    & (eval_df['V 3,4'] == 1e-2)
    & (eval_df['algorithm'] == 'V7.0(SSPD,std)')
    & (eval_df['min_samples'] != 30)
)
runs = eval_df[selection]
min_samples_values = runs['min_samples']

# Primary axis: 'Median' column, labelled MedAE, in red
fig, ax_error = plt.subplots(figsize=[8, 5])
ax_error.set_xlabel('min_samples')
ax_error.set_ylabel('MedAE (m)', color='red')
ax_error.tick_params(axis='y', labelcolor='red')
ax_error.set_xticks(min_samples_values)

# Two additional y-axes on the right-hand side; the second one is shifted
# outward by 50 so that its ticks do not overlap those of the first
ax_failure = ax_error.twinx()
ax_nodes = ax_error.twinx()
for axis, caption, colour, offset in (
    (ax_failure, 'Failure Rate', 'blue', 0),
    (ax_nodes, '# of nodes', 'green', 50),
):
    axis.set_ylabel(caption, color=colour)
    axis.tick_params(axis='y', labelcolor=colour)
    axis.spines['right'].set_position(('outward', offset))

# Draw the three curves; failure rate is the complement of the 'success' column
ax_error.plot(min_samples_values, runs['Median'], color='red')
ax_failure.plot(min_samples_values, 1 - runs['success'], color='blue')
ax_nodes.plot(min_samples_values, runs['n_nodes'], color='green')

plt.title('Tromsø')
fig.tight_layout()  # otherwise the right y-label is slightly clipped
plt.savefig('min_samples_Tromso.pdf')
fig.show()

In [None]:
# DEPENDENCE of evaluation metrics on min_samples (Oslo), train vs. test trajectories
# sort and filter data
eval_df.sort_values(by='min_samples', inplace=True)
# conditions shared by both series
common = ((eval_df['n_points']==2173821) & (eval_df['V 5']==1) & (eval_df['V 3,4']==1e-2) &
          (eval_df['algorithm']=='V7.0(SSPD)'))
# the two series differ only in the number of evaluation trajectories (360 -> 'Train', 357 -> 'Test')
train_runs = eval_df[common & (eval_df['test_data/n_trajectories']==360)]
test_runs = eval_df[common & (eval_df['test_data/n_trajectories']==357)]
# Each series keeps its own x values. Plotting the test results against the
# x values of the train selection would misalign the curves (or raise) as soon
# as the two selections do not contain exactly the same min_samples values.
x_train = train_runs['min_samples']
x_test = test_runs['min_samples']

# prepare figure and configure axes
fig, ax1 = plt.subplots(figsize=[8, 5])
ax1.set_xlabel('min_samples')
ax1.set_ylabel('MedAE (m)', color='red')
ax1.tick_params(axis='y', labelcolor='red')
# tick at every min_samples value that occurs in either series
ax1.set_xticks(sorted(set(x_train).union(x_test)))
# second y-axis (right-hand side) for the failure rate
ax2 = ax1.twinx()
ax2.set_ylabel('failure rate', color='blue')
ax2.tick_params(axis='y', labelcolor='blue')

# plot: solid lines for the test series, dashed lines for the train series;
# failure rate is the complement of the 'success' column
ax1.plot(x_test, test_runs['Median'], color='red', label='Test')
ax2.plot(x_test, 1 - test_runs['success'], color='blue', label='Test')
ax1.plot(x_train, train_runs['Median'], color='red', linestyle='--', label='Train')
ax2.plot(x_train, 1 - train_runs['success'], color='blue', linestyle='--', label='Train')

# Add legend (one per y-axis)
ax1.legend(loc='upper left')
ax2.legend(loc='lower right')

plt.title('Oslo')
fig.tight_layout()  # otherwise the right y-label is slightly clipped
plt.savefig('min_samples_Oslo_train_test.pdf')
fig.show()

In [None]:
# DEPENDENCE of evaluation metrics on sigma_cog (Mahalanobis distance)
# sort data
eval_df.sort_values(by='V 3,4', inplace=True)

# prepare figure and axes
fig, axes = plt.subplots(1, 2, figsize=(10, 4))

# LaTeX labels are written as raw strings: '\s' is not a valid Python escape
# sequence and triggers a warning in non-raw string literals.
axes[0].set_xlabel(r'$\sigma_{cog}$')
axes[0].set_xscale('log')
axes[0].set_ylabel('MAE (m)')
axes[0].tick_params(axis='y')

axes[1].set_xlabel(r'$\sigma_{cog}$')
axes[1].set_xscale('log')
axes[1].set_ylabel('failure rate')
axes[1].tick_params(axis='y')

# One curve per value of 'V 5'. Each value is paired with its legend text so
# that the legend entries cannot drift out of sync with the plotted series.
v5_series = [
    (100, '$10^2$'), (10, '$10^1$'), (1, '$10^0$'), (1e-1, '$10^{-1}$'),
    (1e-2, '$10^{-2}$'), (1e-3, '$10^{-3}$'), (5e-4, '5*$10^{-4}$'), (1e-4, '$10^{-4}$'),
]
for val, val_text in v5_series:
    # filter data
    selection = ((eval_df['n_points'] == 1794132) & (eval_df['V 5'] == val) & (eval_df['min_samples'] == 13)
                 & (eval_df['DP_tol']==10) & (eval_df['algorithm']=='V7.0(SSPD)') & (eval_df['pruning']==1) & (eval_df['max_distance']==10))
    runs = eval_df[selection]
    x = runs['V 3,4']
    series_label = rf'$\sigma_s$: {val_text}'
    # plot mean error (left) and failure rate = 1 - success (right)
    axes[0].plot(x, runs['Mean'], label=series_label)
    axes[1].plot(x, 1 - runs['success'], label=series_label)

# add legend to the right-hand panel (the current axes after plt.subplots)
axes[1].legend(loc='upper left', ncol=2)

# Add title
fig.suptitle('Network performance depending on metric matrix V[3,4]\n(Stavanger, April 22 medium dataset, min_samples=13)')

fig.tight_layout()  # Adjust layout
plt.savefig('V34.pdf')
plt.show()

In [None]:
# DEPENDENCE of evaluation metrics on sigma_sog (Mahalanobis distance)
# sort data
eval_df.sort_values(by='V 5', inplace=True)

# prepare figure and axes
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

axes[0].set_xlabel('V5')
axes[0].set_xscale('log')
# The left panel plots the 'Median' column, so it is labelled 'MedAE (m)'
# (it was previously mislabelled 'MAE (m)').
# NOTE(review): if the mean error was intended here, plot 'Mean' and restore 'MAE (m)'.
axes[0].set_ylabel('MedAE (m)')
axes[0].tick_params(axis='y')

axes[1].set_xlabel('V5')
axes[1].set_xscale('log')
axes[1].set_ylabel('failure rate')
axes[1].tick_params(axis='y')

# One curve per value of 'V 3,4'. Each value is paired with its legend text so
# that the legend entries cannot drift out of sync with the plotted series.
v34_series = [(1, '1'), (1e-1, '1e-1'), (1e-2, '1e-2'), (1e-3, '1e-3'), (1e-4, '1e-4')]
for val, val_text in v34_series:
    # filter data
    selection = ((eval_df['n_points'] == 1794132) & (eval_df['V 3,4'] == val) & (eval_df['min_samples'] == 13)
                 & (eval_df['DP_tol']==10) & (eval_df['algorithm']=='V7.0(SSPD)') & (eval_df['pruning']==1) & (eval_df['max_distance']==10))
    runs = eval_df[selection]
    x = runs['V 5']
    series_label = f'V34: {val_text}'
    # plot median error (left) and failure rate = 1 - success (right)
    axes[0].plot(x, runs['Median'], label=series_label)
    axes[1].plot(x, 1 - runs['success'], label=series_label)

# add legend to the right-hand panel (the current axes after plt.subplots)
axes[1].legend()

# Add title
fig.suptitle('Network performance depending on metric matrix V[5]\n(Stavanger, April 22 medium dataset, min_samples=13)')

fig.tight_layout()  # Adjust layout
# NOTE(review): this figure shows the V[5] dependence but is written to 'V34.png';
# the name looks like a copy-paste leftover. It is kept unchanged here so that
# existing references to the file keep working; confirm and rename if appropriate.
plt.savefig('V34.png')
plt.show()

In [None]:
# DEPENDENCE of evaluation metrics on DP tolerance and min_samples
# sort data
eval_df.sort_values(by='DP_tol', inplace=True)

# prepare figure and axes: mean error, median error and failure rate side by side
fig, axes = plt.subplots(1, 3, figsize=(12, 4))

# The x-label is a raw string: '\e' is not a valid Python escape sequence and
# triggers a warning in non-raw string literals.
dp_label = r'$\epsilon_{DP}$ (m)'
for axis, y_label in zip(axes, ['MAE (m)', 'MedAE (m)', 'failure rate']):
    axis.set_xlabel(dp_label)
    axis.set_ylabel(y_label)
    axis.tick_params(axis='y')

# One curve per min_samples value; the legend text is attached to each line so
# that the legend entries cannot drift out of sync with the plotted series.
for val in [10, 13, 20]:
    # filter data
    selection = ((eval_df['n_points'] == 1794132) & (eval_df['V 3,4'] == 0.01) & (eval_df['min_samples'] == val) &
                 (eval_df['V 5'] == 5e-4) & (eval_df['pruning']==1) & (eval_df['algorithm']=='V7.0(SSPD)'))
    runs = eval_df[selection]
    x = runs['DP_tol']
    series_label = f'min_samples: {val}'
    # plot; failure rate is the complement of the 'success' column
    axes[0].plot(x, runs['Mean'], label=series_label)
    axes[1].plot(x, runs['Median'], label=series_label)
    axes[2].plot(x, 1 - runs['success'], label=series_label)

# add legend to the last panel (the current axes after plt.subplots)
axes[2].legend()

# Add title
fig.suptitle('Network performance depending on DP tolerance \n(Stavanger, April 22 medium dataset, V[3,4] = 0.01, V[5] = 5e-4)')
fig.tight_layout()  # Adjust layout
plt.savefig('DPtol_vs_min_samples.pdf')
plt.show()

In [None]:
# DEPENDENCE of evaluation metrics on pruning
# sort data
eval_df.sort_values(by='pruning', inplace=True)

# min_samples value of the plotted series; it is used both in the filter and in
# the title so the two cannot disagree (the title previously said 10 while the
# filter selected 13)
min_samples_value = 13

# prepare figure and axes
fig, ax1 = plt.subplots()

ax1.set_xlabel('pruning')
ax1.set_ylabel('MAE (m)', color='red')
ax1.set_ylim([200, 300])
ax1.tick_params(axis='y', labelcolor='red')

# second y-axis (right-hand side) for the failure rate
ax2 = ax1.twinx()
ax2.set_ylabel('failure rate', color='blue')
ax2.tick_params(axis='y', labelcolor='blue')

# filter data
selection = ((eval_df['n_points']==1794132) & (eval_df['V 5']==5e-4) & (eval_df['V 3,4']==1e-2) &
             (eval_df['min_samples']==min_samples_value) & (eval_df['DP_tol']==10) & (eval_df['algorithm']=='V7.0(SSPD)'))
runs = eval_df[selection]
x = runs['pruning']
# plot mean error and failure rate (= 1 - success)
ax1.plot(x, runs['Mean'], color='red')
ax2.plot(x, 1 - runs['success'], color='blue')

plt.title('Network performance depending on pruning \n (Stavanger, April 22 medium dataset, \n '
          f'min_samples = {min_samples_value}, V = [1, 1, 1e-2, 1e-2, 5e-4])')
fig.tight_layout()  # otherwise the right y-label is slightly clipped
fig.show()

In [None]:
# DEPENDENCE of evaluation metrics on max_distance and max_angle
# sort data
eval_df.sort_values(by='max_distance', inplace=True)

# prepare figure and axes: mean error (left) and failure rate (right)
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

axes[0].set_xlabel('max_distance')
axes[0].set_ylabel('MAE (m)')
axes[0].tick_params(axis='y')

axes[1].set_xlabel('max_distance')
axes[1].set_ylabel('failure rate')
axes[1].tick_params(axis='y')

# filter data
selection = ((eval_df['n_points'] == 1794132) & (eval_df['V 3,4'] == 0.01) & (eval_df['min_samples'] == 13) &
             (eval_df['DP_tol']==10) & (eval_df['V 5'] == 1) & (eval_df['algorithm']=='V7.0(SSPD)'))
runs = eval_df[selection]
x = runs['max_distance']

# plot; failure rate is the complement of the 'success' column
axes[0].plot(x, runs['Mean'])
axes[1].plot(x, 1 - runs['success'])

# Add title
fig.suptitle('Network performance depending max_distance / max_angle \n(Stavanger, April 22 medium dataset)')
# Adjust the layout BEFORE saving, so the saved file matches the displayed figure
# (previously the figure was written to disk before tight_layout was applied).
fig.tight_layout()
plt.savefig('dist_angle.png')
plt.show()

In [None]:
# Relationship between the number of network nodes and network quality / runtime
eval_df.sort_values(by='n_nodes', inplace=True)

# Three panels: mean error, failure rate and running time, all against # of nodes
fig, axes = plt.subplots(1, 3, figsize=(10, 4))
for panel, y_caption in zip(axes, ('MAE (m)', 'failure rate', 'runtime')):
    panel.set_xlabel('# of nodes')
    panel.set_ylabel(y_caption)
    panel.tick_params(axis='y')

# Select the runs on the three listed dataset sizes with either SSPD algorithm variant
selection = (
    eval_df['n_points'].isin([448382.0, 1794132.0, 5422129.0])
    & eval_df['algorithm'].isin(['V7.0(SSPD)', 'V7.0(SSPD,std)'])
)
runs = eval_df[selection]
node_counts = runs['n_nodes']
mean_error = runs['Mean']
failure_rate = 1 - runs['success']
runtime = runs['Running Time']

# Mean error and failure rate: scatter plus a fitted straight line
for panel, values in ((axes[0], mean_error), (axes[1], failure_rate)):
    slope, intercept = np.polyfit(node_counts, values, 1)
    panel.scatter(node_counts, values, color='b')
    panel.plot(node_counts, slope * node_counts + intercept, color='black')

# Running time: scatter plus a fitted degree-2 polynomial
runtime_fit = np.poly1d(np.polyfit(node_counts, runtime, 2))
axes[2].scatter(node_counts, runtime)
axes[2].plot(node_counts, runtime_fit(node_counts), color='black')

fig.suptitle('Network performance depending on number of nodes \n(Stavanger, April 22)')

fig.tight_layout()  # Adjust layout
plt.savefig('quality_vs_complexity_nodes.pdf')
plt.show()

In [None]:
# Relationship between the number of network edges and network quality
eval_df.sort_values(by='n_edges', inplace=True)

# Two panels: mean error and failure rate, both against # of edges
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
for panel, y_caption in zip(axes, ('MAE (m)', 'failure rate')):
    panel.set_xlabel('# of edges')
    panel.set_ylabel(y_caption)
    panel.tick_params(axis='y')

# Select the runs on the three listed dataset sizes with either SSPD algorithm variant
selection = (
    eval_df['n_points'].isin([448382.0, 1794132.0, 5422129.0])
    & eval_df['algorithm'].isin(['V7.0(SSPD)', 'V7.0(SSPD,std)'])
)
runs = eval_df[selection]
edge_counts = runs['n_edges']

# Scatter each metric and overlay a fitted straight line;
# failure rate is the complement of the 'success' column
for panel, values in ((axes[0], runs['Mean']), (axes[1], 1 - runs['success'])):
    slope, intercept = np.polyfit(edge_counts, values, 1)
    panel.scatter(edge_counts, values)
    panel.plot(edge_counts, slope * edge_counts + intercept, color='black')

fig.suptitle('Network performance depending on number of edges \n(Stavanger, April 22)')

fig.tight_layout()  # Adjust layout
plt.savefig('quality_vs_complexity_edges.pdf')
plt.show()

In [None]:
# Load file with additional experiments (including DBSCAN clustering and Euclidean distance)
# NOTE: this rebinds eval_df, so every cell below works on the new export.
eval_df = pd.read_csv('../reports/Maritime-Traffic-Network_new.csv')
# Missing values in the 'algorithm' column are replaced by 'V1.0'.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the column selection: that chained in-place form is
# deprecated in pandas 2.x and does not update eval_df under Copy-on-Write.
eval_df['algorithm'] = eval_df['algorithm'].fillna('V1.0')
eval_df.info()

In [None]:
# Network quality against number of nodes, compared between clustering algorithms
eval_df.sort_values(by='n_nodes', inplace=True)

# Two panels with fixed y-ranges: mean error (left) and failure rate (right)
fig, axes = plt.subplots(1, 2, figsize=(10, 4))

axes[0].set_xlabel('# of nodes')
axes[0].set_ylabel('MAE (m)')
axes[0].set_ylim([0, 700])
axes[0].tick_params(axis='y')

axes[1].set_xlabel('# of nodes')
axes[1].set_ylabel('failure rate')
axes[1].tick_params(axis='y')
axes[1].set_ylim([0.1, 0.8])

# Conditions shared by both groups: dataset size and SSPD algorithm variants
base_selection = (
    eval_df['n_points'].isin([796177])
    & eval_df['algorithm'].isin(['V7.0(SSPD)', 'V7.0(SSPD,std)'])
)

# HDBSCAN runs as blue default markers, DBSCAN runs as red crosses
for clustering_name, marker_style in (
    ('HDBSCAN', {'color': 'b'}),
    ('DBSCAN', {'color': 'r', 'marker': 'x'}),
):
    group = eval_df[base_selection & (eval_df['clustering_alg'] == clustering_name)]
    axes[0].scatter(group['n_nodes'], group['Mean'], **marker_style)
    axes[1].scatter(group['n_nodes'], 1 - group['success'], **marker_style)

fig.suptitle('Network performance depending on number of nodes \n(Tromso, April 22)')
# Legend entries follow the plotting order above
plt.legend(['HDBSCAN', 'DBSCAN'])

fig.tight_layout()  # Adjust layout
plt.savefig('HDBSCAN_vs_DBSCAN.pdf')
plt.show()

In [None]:
# Network quality against number of nodes, compared between distance metrics
eval_df.sort_values(by='n_nodes', inplace=True)

# Two panels: mean error (left) and failure rate (right)
fig, axes = plt.subplots(1, 2, figsize=(10, 4))
for panel, y_caption in zip(axes, ('MAE (m)', 'failure rate')):
    panel.set_xlabel('# of nodes')
    panel.set_ylabel(y_caption)
    panel.tick_params(axis='y')

# Conditions shared by both groups: dataset size, SSPD algorithm variants, HDBSCAN clustering
base_selection = (
    eval_df['n_points'].isin([796177])
    & eval_df['algorithm'].isin(['V7.0(SSPD)', 'V7.0(SSPD,std)'])
    & (eval_df['clustering_alg'] == 'HDBSCAN')
)

# Mahalanobis runs in blue, Euclidean runs in red
for metric_name, colour in (('mahalanobis', 'b'), ('euclidean', 'r')):
    group = eval_df[base_selection & (eval_df['metric'] == metric_name)]
    axes[0].scatter(group['n_nodes'], group['Mean'], color=colour)
    axes[1].scatter(group['n_nodes'], 1 - group['success'], color=colour)

fig.suptitle('Network performance depending on number of nodes \n(Tromso, April 22)')
# Legend entries follow the plotting order above
plt.legend(['Mahalanobis', 'Euclidean'])

fig.tight_layout()  # Adjust layout
plt.savefig('Euclidean_vs_Mahalanobis.pdf')
plt.show()