In [1]:
import pandas as pd
import geopandas as gpd
import movingpandas as mpd
import numpy as np
import matplotlib.pyplot as plt
import folium
import warnings
import sys
#warnings.filterwarnings('ignore')

# show which geopandas release is loaded in this kernel
geopandas_version = gpd.__version__
print("Geopandas has version {}".format(geopandas_version))

Geopandas has version 0.13.2


In [2]:
# Ship metadata: semicolon-separated CSV with decimal commas, Latin-1 encoded.
# The MMSI column is renamed to 'mmsi' and only the first row per MMSI is kept.
filename = '../../data/external/seilas-2022.csv'
df_meta = (
    pd.read_csv(filename, delimiter=';', decimal=',', encoding='ISO-8859-1')
    .rename(columns={'mmsi_nummer': 'mmsi'})
    .drop_duplicates(subset='mmsi')
)

# Ship trajectories: loaded from a parquet file via geopandas.
filename = '../../data/raw/routes_all.parquet'
df_trajectories = gpd.read_parquet(filename)

In [37]:
# Count the MMSIs that occur in both the metadata and the trajectory data.
meta_mmsis = set(df_meta['mmsi'])
trajectory_mmsis = set(df_trajectories['mmsi'])
n_matching = len(meta_mmsis & trajectory_mmsis)

# Report the unique MMSIs per source and the size of their overlap.
print(f'Ship metadata has   {df_meta.mmsi.nunique()} unique MMSIs')
print(f'Trajectory file has {df_trajectories.mmsi.nunique()} unique MMSIs')
print(f'Overlap:            {n_matching} MMSIs')

Ship metadata has   4142 unique MMSIs
Trajectory file has 916 unique MMSIs
Overlap:            450 MMSIs


In [3]:
# Attach ship metadata to every trajectory row with a left join on MMSI.
# Trajectories without a metadata match are kept; their metadata columns stay empty.
merge_columns = ['mmsi', 'lengde', 'bredde', 'dypgaaende', 'skipstype', 'skipsgruppe']
# validate='many_to_one' makes pandas raise a MergeError if an MMSI occurs more
# than once in the metadata, instead of silently duplicating trajectory rows.
df = df_trajectories.merge(
    df_meta[merge_columns],
    on='mmsi',
    how='left',
    validate='many_to_one',
)
# wrap the merge result in a GeoDataFrame
df = gpd.GeoDataFrame(df)

In [4]:
# summarise the merged frame: column names, non-null counts and dtypes
df.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 58546 entries, 0 to 58545
Data columns (total 29 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   mmsi             58546 non-null  int64         
 1   imo              58546 non-null  float64       
 2   length           58546 non-null  float64       
 3   start_lon        58546 non-null  float64       
 4   start_lat        58546 non-null  float64       
 5   stop_lon         58546 non-null  float64       
 6   stop_lat         58546 non-null  float64       
 7   start_loc        58546 non-null  object        
 8   stop_loc         58546 non-null  object        
 9   start_geom       58546 non-null  object        
 10  stop_geom        58546 non-null  object        
 11  start_time       58546 non-null  datetime64[us]
 12  stop_time        58546 non-null  datetime64[us]
 13  cog              58546 non-null  object        
 14  avg_cog          58546 non-nul

In [31]:
# inspect one merged record: the row at integer position 3
df.iloc[3]

mmsi                                                       258373000
imo                                                        5071729.0
length                                                          63.0
start_lon                                                    10.7517
start_lat                                                     59.905
stop_lon                                                     10.7221
stop_lat                                                     59.9062
start_loc                                     POINT (10.7517 59.905)
stop_loc                                     POINT (10.7221 59.9062)
start_geom         0101000020E610000070CE88D2DE802540A4703D0AD7F3...
stop_geom          0101000020E610000019E25817B77125404ED1915CFEF3...
start_time                                       2019-05-16 15:30:33
stop_time                                        2019-05-16 16:04:33
cog                [210.1, 201.0, 198.4, 290.7125, 295.9, 233.65,...
avg_cog                           