In [1]:
import pandas as pd
import geopandas as gpd
import movingpandas as mpd
from hvplot import pandas
from datetime import timedelta, datetime
import folium
import warnings
import sys
# Silence all warnings so they do not clutter the notebook output.
warnings.filterwarnings('ignore')

# Report the library versions this notebook is executed with.
print(f"Geopandas has version {gpd.__version__}")
print(f"Movingpandas has version {mpd.__version__}")

Geopandas has version 0.13.2
Movingpandas has version 0.17.1


In [2]:
'''
This notebook takes raw AIS data as input, cleans and enriches the data and saves it to a parquet file.

Cleaning steps:
* Drop duplicates (AIS messages can be recorded multiple times by different stations, e.g. satellite, coastal station etc)
  Only the first registered message at a certain location is retained
* The data is split into trajectories, where each trajectory receives a unique ID.
  A trajectory is split into sub-trajectories when the observation gap between AIS messages exceeds 10 min, and again
  wherever a stop is detected (the vessel stays within a diameter of 25 m for at least 1 min).
  Only sub-trajectories longer than 500 m are retained.
* Drop trajectories with 'hops' in the AIS messages (sometimes the GPS location jumps inexplicably between two consecutive timesteps)

Enrichment with metadata:
* Ship metadata (width, draught, shiptype, shipgroup, name) is added to the raw AIS data
'''

"\nThis notebook takes raw AIS data as input, cleans and enriches the data and saves it to a parquet file.\n\nCleaning steps:\n* Drop duplicates (AIS messages can be recorded multiple times by different stations, e.g. satellite, coastal station etc)\n  Only the first registered message at a certain location is retained\n* The data is split into trajectories, where each trajectory receives a unique ID. \n  A trajectory is split into sub-trajectories, when the observation gap between AIS messages exceeds 10min and if the resulting\n  sub trajectory is longer than 100m\n* Drop trajectories with 'hops' in the AIS messages (Sometimes the GPS location jumps inexplainably between two consecutive timesteps)\n\nEnrichment with metadata:\n* Ship metadata (width, draught, shiptype, shipgroup, name) is added to the raw AIS data\n"

In [3]:
# add paths for modules
# The visualization helpers live in a sibling directory, so that (relative)
# directory is put on the module search path before they are imported.
visualization_path = '../visualization'
if visualization_path not in sys.path:  # re-running the cell must not add duplicate entries
    sys.path.append(visualization_path)
print(sys.path)

# import modules
import visualize

['/Users/janhendrikwebert/maritime_route_prediction/src/datawrangling', '/Users/janhendrikwebert/miniforge3/envs/env_geo/lib/python311.zip', '/Users/janhendrikwebert/miniforge3/envs/env_geo/lib/python3.11', '/Users/janhendrikwebert/miniforge3/envs/env_geo/lib/python3.11/lib-dynload', '', '/Users/janhendrikwebert/miniforge3/envs/env_geo/lib/python3.11/site-packages', '../visualization']


In [4]:
# ---------------------------------------------------------------
# Parameters
# ---------------------------------------------------------------
# Raw AIS messages are read from this CSV file.
filename = '../../data/raw/AIS_04-09_2022_stavanger/ais_202204.csv'

# Number of AIS messages to process; len(gdf) processes all.
# NOTE(review): the trajectory-conversion cell further down reassigns
# size = len(gdf), so the value set here is currently not used - confirm
# whether that override is intended.
size = 200_000

# Index of the first message to process.
start = 0

# The cleaned and enriched points are written to this parquet file.
save_to = '../../data/processed/202204_points_stavanger_cleaned_meta_full_dualSplit.parquet'

In [5]:
# Load the raw AIS messages (semicolon-separated CSV, '.' as decimal mark).
df = pd.read_csv(filename, sep=';', decimal='.')

# Keep the raw message count; the cleaning report later compares against it.
n_messages = df.shape[0]
print(f'{n_messages} raw AIS messages loaded from file {filename}')

14052976 raw AIS messages loaded from file ../../data/raw/AIS_04-09_2022_stavanger/ais_202204.csv


In [6]:
# join ship metadata from external source
# The metadata path gets its own variable so that the raw-data path stored in
# `filename` is not overwritten; otherwise re-running the loading cell after
# this one would read the metadata file instead of the AIS data.
meta_filename = '../../data/external/seilas-2022.csv'
df_meta = (
    pd.read_csv(meta_filename, delimiter=';', decimal=',', encoding='ISO-8859-1')
    .rename(columns={'mmsi_nummer': 'mmsi'})  # rename MMSI column
    .drop_duplicates(subset='mmsi')  # keep only the first row per MMSI
)

# merge dataframes on mmsi
# A left join keeps every AIS message; messages without a metadata match get
# missing values in the added columns.
merge_columns = ['mmsi', 'bredde', 'dypgaaende', 'skipstype', 'skipsgruppe', 'fartoynavn']
df = df.merge(df_meta[merge_columns], on='mmsi', how='left')

# output report about join
# Number of MMSIs that occur in both the metadata and the AIS data.
n_matching = len(set(df_meta['mmsi']) & set(df['mmsi']))
print(f'Ship metadata has   {df_meta.mmsi.nunique()} unique MMSIs')
print(f'AIS raw data has    {df.mmsi.nunique()} unique MMSIs')
print(f'Overlap:            {n_matching} MMSIs')

Ship metadata has   4142 unique MMSIs
AIS raw data has    658 unique MMSIs
Overlap:            595 MMSIs


In [7]:
# Turn the table into a GeoDataFrame with one point geometry per AIS message,
# built from the lon/lat columns and tagged as EPSG:4326.
gdf = gpd.GeoDataFrame(
    df,
    geometry=gpd.points_from_xy(df.lon, df.lat),
    crs="EPSG:4326",
)
df = []  # rebind the name so the plain DataFrame is no longer referenced (free memory)

# Drop duplicate AIS data (the same message reported by multiple stations):
# per vessel and position only the first row in table order is kept.
before = gdf.shape[0]
gdf = gdf.drop_duplicates(subset=['mmsi', 'lat', 'lon'], keep='first')
after = gdf.shape[0]
print(f'{before-after} superfluous AIS messages dropped')

8036239 superfluous AIS messages dropped


In [8]:
# Convert the points into one trajectory per vessel: the MMSI serves both as
# trajectory id and as object id, timestamps come from 'date_time_utc'.
# NOTE(review): size is reassigned here, so the value from the parameter cell
# is ignored and all rows from `start` onwards are processed - confirm intended.
size = gdf.shape[0]
trajectories = mpd.TrajectoryCollection(
    gdf.iloc[slice(start, start + size)],
    traj_id_col='mmsi',
    obj_id_col='mmsi',
    t='date_time_utc',
)

In [9]:
# Two-stage trajectory splitting.
# Stage 1: cut a trajectory wherever consecutive messages are more than 10 minutes apart.
gap_splitter = mpd.ObservationGapSplitter(trajectories)
obs_split_trajectories = gap_splitter.split(gap=timedelta(minutes=10), min_length=500)
print(f'Observation Gap splitter split {len(trajectories)} trajectories into {len(obs_split_trajectories)} sub-trajectories')

# Stage 2: cut the resulting sub-trajectories again at detected stops
# (max_diameter=25, minimum duration of 1 minute).
stop_splitter = mpd.StopSplitter(obs_split_trajectories)
split_trajectories = stop_splitter.split(
    max_diameter=25,
    min_duration=timedelta(minutes=1),
    min_length=500,
)
print(f'Stop splitter split {len(obs_split_trajectories)} trajectories into {len(split_trajectories)} sub-trajectories')

# An earlier variant of this cell used only the observation gap splitter with min_length=100.

Observation Gap splitter split 658 trajectories into 10068 sub-trajectories
Stop splitter split 10068 trajectories into 12573 sub-trajectories


In [10]:
# drop trajectories with 'hops' due to corrupted AIS data
# We measure the speed of a vessel between consecutive points. If the speed exceeds a certain threshold we discard the trajectory
# overwrite=True keeps the cell re-runnable: without it add_speed() fails once the speed column already exists.
split_trajectories.add_speed(overwrite=True)  # calculate speed
speed_thresh = 500 / 3.6  # 500 km/h expressed in m/s
split_gdf = split_trajectories.to_point_gdf()

# NOTE(review): the filter below works on the 'mmsi' column, which was passed as
# traj_id_col. This presumably holds the sub-trajectory id after splitting; if it
# still held the vessel MMSI, every track of an affected vessel would be dropped
# - confirm.
all_track_ids = split_gdf['mmsi'].unique()
bad_track_ids = split_gdf.loc[split_gdf.speed > speed_thresh, 'mmsi'].unique()  # IDs that violate the threshold
valid_track_ids = list(set(all_track_ids) - set(bad_track_ids))  # IDs that satisfy the threshold
split_trajectories = split_trajectories.filter('mmsi', valid_track_ids)  # retain valid trajectories
print(f'{len(bad_track_ids)} trajectories were found that exceed the speed limit and dropped from the list of trajectories')

416 trajectories were found that exceed the speed limit and dropped from the list of trajectories


In [11]:
# Cleaning summary: number of points left in the retained trajectories,
# compared with the number of raw AIS messages originally loaded.
n_retained = split_trajectories.to_point_gdf().shape[0]
print(f'Cleaning reduced {n_messages} AIS messages to {n_retained} points ({n_retained/n_messages*100:.2f}%)')

Cleaning reduced 14052976 AIS messages to 5565999 points (39.61%)


In [12]:
# Write the cleaned points of all retained trajectories to the parquet file
# configured in `save_to`.
# (An earlier variant exported to_traj_gdf() to
# '../../data/processed/202204_trajectories_stavanger_cleaned_meta_200k.parquet' instead.)
cleaned_points = split_trajectories.to_point_gdf()
cleaned_points.to_parquet(save_to)