In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import geohash2

In [None]:
# Requires rerunning code for every individual AIS dataset as there is 1 dataset per day
# Use combine_datasets.ipynb to combine them all for final preprocesssing
i = 26
file_path = os.path.join(f'ais/raw_data/aisdk-2025-07-{i}/aisdk-2025-07-{i}.csv')
ais = pd.read_csv(file_path)

In [3]:
ais = ais[['# Timestamp', 'MMSI', 'Latitude', 'Longitude']].dropna()
ais.rename(columns={'Longitude': 'lon', 'Latitude': 'lat'}, inplace=True)

In [4]:
ais.head()

Unnamed: 0,# Timestamp,MMSI,lat,lon
0,26/07/2025 00:00:00,2190064,56.716565,11.519018
1,26/07/2025 00:00:00,219026197,91.0,0.0
2,26/07/2025 00:00:00,219026197,91.0,0.0
3,26/07/2025 00:00:00,2194005,56.34425,4.272
4,26/07/2025 00:00:00,2194006,55.53887,5.0332


In [5]:
# Calculate the time difference between consecutive timestamps
ais['timestamp_datetime'] = pd.to_datetime(ais['# Timestamp'], format="%d/%m/%Y %H:%M:%S").dt.floor('min')
# Sort by MMSI and then timestamp_datetime
ais = ais.sort_values(by=['MMSI', 'timestamp_datetime']).reset_index(drop=True)

# Then calculate time difference per MMSI group
ais['time_diff'] = ais.groupby('MMSI')['timestamp_datetime'].diff()
ais

Unnamed: 0,# Timestamp,MMSI,lat,lon,timestamp_datetime,time_diff
0,26/07/2025 00:00:05,3638,55.332238,10.964883,2025-07-26 00:00:00,NaT
1,26/07/2025 00:00:05,3638,55.332238,10.964883,2025-07-26 00:00:00,0 days 00:00:00
2,26/07/2025 00:00:05,3638,55.332238,10.964883,2025-07-26 00:00:00,0 days 00:00:00
3,26/07/2025 00:00:05,3638,55.332238,10.964883,2025-07-26 00:00:00,0 days 00:00:00
4,26/07/2025 00:00:05,3638,55.332238,10.964883,2025-07-26 00:00:00,0 days 00:00:00
...,...,...,...,...,...,...
19248383,26/07/2025 23:58:47,992651067,57.690097,11.901852,2025-07-26 23:58:00,0 days 00:00:00
19248384,26/07/2025 23:59:41,992651067,57.690097,11.901852,2025-07-26 23:59:00,0 days 00:01:00
19248385,26/07/2025 23:59:52,992651067,57.690097,11.901852,2025-07-26 23:59:00,0 days 00:00:00
19248386,26/07/2025 20:54:23,1014066455,-30.020000,122.470000,2025-07-26 20:54:00,NaT


In [6]:
def preprocess_tracks(df, blackout_threshold=pd.Timedelta(hours=1), min_duration=pd.Timedelta(hours=4)):
    """
    Preprocesses `ais` by:
    1. Splitting tracks with blackout periods longer than `blackout_threshold`.
    2. Removing tracks shorter than `min_duration`.

    Parameters:
    - df (pd.DataFrame): Data with ['id', 'latitude', 'longitude', 'timestamp_datetime', 'time_diff'].
    - blackout_threshold (pd.Timedelta): Time gap threshold for blackout detection.
    - min_duration (pd.Timedelta): Minimum track duration for valid tracks.

    Returns:
    - pd.DataFrame: Preprocessed track data.
    """

    # Split tracks by creating a new track ID whenever a blackout occurs
    df['sub_track'] = (df['time_diff'] > blackout_threshold).cumsum()

    # Compute track durations
    track_durations = df.groupby(['MMSI', 'sub_track'])['timestamp_datetime'].agg(['min', 'max']).reset_index()
    track_durations['duration'] = track_durations['max'] - track_durations['min']

    # Filter out short tracks
    valid_tracks = track_durations[track_durations['duration'] >= min_duration]

    # Retain only valid tracks
    result = df.merge(valid_tracks[['MMSI', 'sub_track']], on=['MMSI', 'sub_track'])

    return result
# Run Preprocessing
ais_preprocessed = preprocess_tracks(ais)
ais_preprocessed

Unnamed: 0,# Timestamp,MMSI,lat,lon,timestamp_datetime,time_diff,sub_track
0,26/07/2025 00:00:05,3638,55.332238,10.964883,2025-07-26 00:00:00,NaT,0
1,26/07/2025 00:00:05,3638,55.332238,10.964883,2025-07-26 00:00:00,0 days 00:00:00,0
2,26/07/2025 00:00:05,3638,55.332238,10.964883,2025-07-26 00:00:00,0 days 00:00:00,0
3,26/07/2025 00:00:05,3638,55.332238,10.964883,2025-07-26 00:00:00,0 days 00:00:00,0
4,26/07/2025 00:00:05,3638,55.332238,10.964883,2025-07-26 00:00:00,0 days 00:00:00,0
...,...,...,...,...,...,...,...
18839935,26/07/2025 23:58:17,992651067,57.690097,11.901852,2025-07-26 23:58:00,0 days 00:00:00,3568
18839936,26/07/2025 23:58:47,992651067,57.690097,11.901852,2025-07-26 23:58:00,0 days 00:00:00,3568
18839937,26/07/2025 23:58:47,992651067,57.690097,11.901852,2025-07-26 23:58:00,0 days 00:00:00,3568
18839938,26/07/2025 23:59:41,992651067,57.690097,11.901852,2025-07-26 23:59:00,0 days 00:01:00,3568


In [7]:
ais_preprocessed.set_index("timestamp_datetime")

Unnamed: 0_level_0,# Timestamp,MMSI,lat,lon,time_diff,sub_track
timestamp_datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2025-07-26 00:00:00,26/07/2025 00:00:05,3638,55.332238,10.964883,NaT,0
2025-07-26 00:00:00,26/07/2025 00:00:05,3638,55.332238,10.964883,0 days 00:00:00,0
2025-07-26 00:00:00,26/07/2025 00:00:05,3638,55.332238,10.964883,0 days 00:00:00,0
2025-07-26 00:00:00,26/07/2025 00:00:05,3638,55.332238,10.964883,0 days 00:00:00,0
2025-07-26 00:00:00,26/07/2025 00:00:05,3638,55.332238,10.964883,0 days 00:00:00,0
...,...,...,...,...,...,...
2025-07-26 23:58:00,26/07/2025 23:58:17,992651067,57.690097,11.901852,0 days 00:00:00,3568
2025-07-26 23:58:00,26/07/2025 23:58:47,992651067,57.690097,11.901852,0 days 00:00:00,3568
2025-07-26 23:58:00,26/07/2025 23:58:47,992651067,57.690097,11.901852,0 days 00:00:00,3568
2025-07-26 23:59:00,26/07/2025 23:59:41,992651067,57.690097,11.901852,0 days 00:01:00,3568


In [8]:
def interpolate_and_resample(df, resample_interval='10min'):
    """
    Interpolates ship tracks at a fixed interval while maintaining the original trajectory.

    Steps:
    1. Resample to create timestamps every 10 minutes.
    2. Interpolate each new timestamp by using the closest before and after points.
    3. Maintain the original trajectory instead of creating a single straight line.

    Parameters:
    - df (pd.DataFrame): Must contain ['MMSI', 'lat', 'lon', 'timestamp_datetime'].
    - resample_interval (str): Interval for resampling (e.g., '10T' for 10 minutes).

    Returns:
    - pd.DataFrame: Resampled and interpolated track data.
    """
    
    # Store processed tracks
    interpolated_data = []
    
    for track_MMSI, track_data in df.groupby(['MMSI', 'sub_track']):
        # Set timestamp as index and sort
        track_data = track_data.set_index('timestamp_datetime')
        track_data = track_data[~track_data.index.duplicated(keep='first')]  # Drop duplicate timestamps
        
        # Create resample timestamps based on the first row's timestamp and resample interval
        start_time = track_data.index[0]
        end_time = track_data.index[-1]
        resample_times = pd.date_range(start=start_time, end=end_time, freq=resample_interval)
        
        # Reindex the track data to the resampled times, introducing NaNs where necessary
        track_data_resampled = track_data.reindex(resample_times)

        # Merge with original data, keeping existing values
        track_data_resampled[['lat', 'lon']] = track_data[['lat', 'lon']]

        # Get timestamps that need interpolation
        missing_mask = track_data_resampled['lat'].isna()
        missing_timestamps = track_data_resampled.loc[missing_mask].index

        # Initialize the first `before` and `after` values from the original data
        timestamps = track_data.index
        before = timestamps[0]
        after = timestamps[1]

        # Perform interpolation using surrounding points
        for ts in missing_timestamps:
            # Check if the current timestamp lies between the existing `before` and `after`
            while after < ts:  # Move to the next pair of timestamps if `ts` is after `after`
                before = after
                after = timestamps[timestamps.searchsorted(after) + 1]  # Move to the next "after"

            # Now `ts` is between `before` and `after`
            lat_before, lon_before = track_data.loc[before, ['lat', 'lon']]
            lat_after, lon_after = track_data.loc[after, ['lat', 'lon']]

            # Calculate the time difference ratios for interpolation
            total_time_diff = (after - before).total_seconds()
            if total_time_diff == 0:
                continue  # Avoid division by zero

            weight_before = (after - ts).total_seconds() / total_time_diff
            weight_after = (ts - before).total_seconds() / total_time_diff

            # Perform linear interpolation
            track_data_resampled.at[ts, 'lat'] = lat_before * weight_before + lat_after * weight_after
            track_data_resampled.at[ts, 'lon'] = lon_before * weight_before + lon_after * weight_after

        # Add back 'MMSI' and 'sub_track' columns
        track_data_resampled['MMSI'] = track_MMSI[0]
        track_data_resampled['sub_track'] = track_MMSI[1]

        # Append result
        interpolated_data.append(track_data_resampled)

    # Combine all processed tracks
    result = pd.concat(interpolated_data).reset_index()

    return result

# Run Interpolation and Resampling
ais_preprocessed1 = interpolate_and_resample(ais_preprocessed)
ais_preprocessed1 = ais_preprocessed1.drop(
    columns=['distance_from_shore', 'distance_from_port', 'speed', 'course', 'is_fishing', 'source', 'time_diff'],
    errors='ignore'  # Avoid errors if columns are missing
)

# Display the first few rows
ais_preprocessed1

Unnamed: 0,index,# Timestamp,MMSI,lat,lon,sub_track
0,2025-07-26 00:00:00,26/07/2025 00:00:05,3638,55.332238,10.964883,0
1,2025-07-26 00:10:00,26/07/2025 00:10:06,3638,55.332238,10.964888,0
2,2025-07-26 00:20:00,26/07/2025 00:20:05,3638,55.332243,10.964892,0
3,2025-07-26 00:30:00,26/07/2025 00:30:06,3638,55.332243,10.964897,0
4,2025-07-26 00:40:00,26/07/2025 00:40:06,3638,55.332232,10.964893,0
...,...,...,...,...,...,...
592451,2025-07-26 23:11:00,26/07/2025 23:11:52,992651067,57.690097,11.901852,3568
592452,2025-07-26 23:21:00,,992651067,57.690097,11.901852,3568
592453,2025-07-26 23:31:00,26/07/2025 23:31:10,992651067,57.690097,11.901852,3568
592454,2025-07-26 23:41:00,26/07/2025 23:41:41,992651067,57.690097,11.901852,3568


In [9]:
ais_preprocessed1 = ais_preprocessed1.rename(columns = {'index':'timestamp_datetime'})

In [10]:
ais_preprocessed1['time_diff'] = ais_preprocessed1['timestamp_datetime'].diff()

precision = 5
ais_preprocessed1['geohash'] = ais_preprocessed1.apply(lambda row: geohash2.encode(row['lat'], row['lon'], precision=precision), axis=1)
ais_preprocessed1 = ais_preprocessed1[ais_preprocessed1['geohash'].str[0] == 'u'].copy()
# ais_preprocessed1['geohash_trimmed'] = ais_preprocessed1['geohash'].str[1:]
ais_preprocessed1


Unnamed: 0,timestamp_datetime,# Timestamp,MMSI,lat,lon,sub_track,time_diff,geohash
0,2025-07-26 00:00:00,26/07/2025 00:00:05,3638,55.332238,10.964883,0,NaT,u1zfc
1,2025-07-26 00:10:00,26/07/2025 00:10:06,3638,55.332238,10.964888,0,0 days 00:10:00,u1zfc
2,2025-07-26 00:20:00,26/07/2025 00:20:05,3638,55.332243,10.964892,0,0 days 00:10:00,u1zfc
3,2025-07-26 00:30:00,26/07/2025 00:30:06,3638,55.332243,10.964897,0,0 days 00:10:00,u1zfc
4,2025-07-26 00:40:00,26/07/2025 00:40:06,3638,55.332232,10.964893,0,0 days 00:10:00,u1zfc
...,...,...,...,...,...,...,...,...
592451,2025-07-26 23:11:00,26/07/2025 23:11:52,992651067,57.690097,11.901852,3568,0 days 00:10:00,u622n
592452,2025-07-26 23:21:00,,992651067,57.690097,11.901852,3568,0 days 00:10:00,u622n
592453,2025-07-26 23:31:00,26/07/2025 23:31:10,992651067,57.690097,11.901852,3568,0 days 00:10:00,u622n
592454,2025-07-26 23:41:00,26/07/2025 23:41:41,992651067,57.690097,11.901852,3568,0 days 00:10:00,u622n


In [11]:
def remove_stationary_tracks(df):
    """
    Removes tracks where all positions have the same geohash (stationary tracks).
    
    Parameters:
    - df (pd.DataFrame): Data with ['MMSI', 'sub_track', 'geohash'] columns.
    
    Returns:
    - pd.DataFrame: Data with stationary tracks removed.
    """
    
    # Count unique geohashes per track
    unique_geohashes = df.groupby(['MMSI', 'sub_track'])['geohash'].nunique().reset_index()
    unique_geohashes.columns = ['MMSI', 'sub_track', 'unique_geohash_count']
    
    # Keep only tracks with more than 1 unique geohash
    moving_tracks = unique_geohashes[unique_geohashes['unique_geohash_count'] > 1]
    
    # Filter original dataframe to keep only moving tracks
    result = df.merge(moving_tracks[['MMSI', 'sub_track']], on=['MMSI', 'sub_track'])
    
    return result

# Apply the function
ais_preprocessed1 = remove_stationary_tracks(ais_preprocessed1)
ais_preprocessed1

Unnamed: 0,timestamp_datetime,# Timestamp,MMSI,lat,lon,sub_track,time_diff,geohash
0,2025-07-26 00:00:00,26/07/2025 00:00:00,2579999,58.459430,12.727182,6,-1 days +00:10:00,u63h9
1,2025-07-26 00:10:00,26/07/2025 00:10:00,2579999,54.148615,15.636853,6,0 days 00:10:00,u3eh5
2,2025-07-26 00:20:00,26/07/2025 00:20:00,2579999,58.459428,12.727210,6,0 days 00:10:00,u63h9
3,2025-07-26 00:30:00,26/07/2025 00:30:00,2579999,54.148673,15.636843,6,0 days 00:10:00,u3eh5
4,2025-07-26 00:40:00,26/07/2025 00:40:00,2579999,54.148643,15.636828,6,0 days 00:10:00,u3eh5
...,...,...,...,...,...,...,...,...
292453,2025-07-26 23:00:00,26/07/2025 23:00:09,992111799,54.599822,11.337852,3498,0 days 00:10:00,u38n9
292454,2025-07-26 23:10:00,26/07/2025 23:10:09,992111799,54.599812,11.337847,3498,0 days 00:10:00,u38n9
292455,2025-07-26 23:20:00,26/07/2025 23:20:10,992111799,54.599818,11.337842,3498,0 days 00:10:00,u38n9
292456,2025-07-26 23:30:00,26/07/2025 23:30:10,992111799,54.599818,11.337843,3498,0 days 00:10:00,u38n9


In [None]:
ais_preprocessed1.to_csv(f'ais_preprocessed_{i}.csv', index=False)

# Visualize Interpolated vs. Original Track

In [22]:
ais_preprocessed1['sub_track'].value_counts().tail()

sub_track
2638    25
2322    24
887     24
1066    24
193     22
Name: count, dtype: int64

In [23]:
sub_track_num = 2638

## Interpolated & Resampled Track

Recall that we resampled in 10-minute intervals

In [24]:
ais_preprocessed1_filtered = ais_preprocessed1[ais_preprocessed1["sub_track"] == sub_track_num]
print(ais_preprocessed1_filtered.shape)
ais_preprocessed1_filtered.tail()

(25, 8)


Unnamed: 0,timestamp_datetime,# Timestamp,MMSI,lat,lon,sub_track,time_diff,geohash
236382,2025-07-26 23:08:00,,257941000,57.667909,11.798873,2638,0 days 00:10:00,u622h
236383,2025-07-26 23:18:00,,257941000,57.66788,11.798887,2638,0 days 00:10:00,u622h
236384,2025-07-26 23:28:00,,257941000,57.667864,11.798992,2638,0 days 00:10:00,u622h
236385,2025-07-26 23:38:00,,257941000,57.667858,11.799004,2638,0 days 00:10:00,u622h
236386,2025-07-26 23:48:00,26/07/2025 23:48:13,257941000,57.66788,11.798998,2638,0 days 00:10:00,u622h


In [25]:
import folium
# Initialize the map centered around the first coordinate
start_lat, start_lon = ais_preprocessed1_filtered.iloc[0]['lat'], ais_preprocessed1_filtered.iloc[0]['lon']
mymap = folium.Map(location=[start_lat, start_lon], zoom_start=14)

# Convert the dataframe into a list of tuples of coordinates (lat, lon)
coordinates = ais_preprocessed1_filtered[['lat', 'lon']].values.tolist()

# Add the PolyLine for the trajectory (path)
folium.PolyLine(locations=coordinates, color='blue', weight=2.5, opacity=1).add_to(mymap)

# Add markers for each point in the trajectory
for lat, lon, timestamp in zip(ais_preprocessed1_filtered['lat'], ais_preprocessed1_filtered['lon'], ais_preprocessed1_filtered['# Timestamp']):
    print(lat, lon, timestamp)  # Check if this prints for each row
    folium.Marker(
        location=[lat, lon],
        popup=f"Time: {timestamp}",
        icon=folium.Icon(color="blue", icon="info-sign")
    ).add_to(mymap)

# Display the map
mymap


57.691247 11.870773 26/07/2025 19:48:47
57.686293 11.858108 26/07/2025 19:58:18
57.681367 11.82078 26/07/2025 20:08:09
57.673387 11.801088 26/07/2025 20:18:07
57.666973 11.796748 26/07/2025 20:28:08
57.667415250000005 11.7993755 nan
57.667567 11.799373 26/07/2025 20:48:16
57.667533666666664 11.799362166666667 nan
57.667558 11.799333 nan
57.667513 11.799408 26/07/2025 21:18:19
57.66754041666667 11.799382416666667 nan
57.6675825 11.799311166666667 nan
57.667687 11.799238 26/07/2025 21:48:20
57.66769883333333 11.799231666666667 nan
57.66769825 11.79917375 nan
57.667756999999995 11.799089 nan
57.66779913333334 11.798977333333335 nan
57.66779046666666 11.798980666666667 nan
57.667848 11.79891175 nan
57.667895333333334 11.798887500000001 nan
57.66790866666666 11.798873 nan
57.66788 11.798887 nan
57.66786433333333 11.79899233333333 nan
57.667857999999995 11.799003666666666 nan
57.66788 11.798998 26/07/2025 23:48:13


## Original Track

In [26]:
import folium
original = ais_preprocessed[ais_preprocessed["sub_track"] == sub_track_num]
# Initialize the map centered around the first coordinate
start_lat, start_lon = original.iloc[0]['lat'], original.iloc[0]['lon']
mymap = folium.Map(location=[start_lat, start_lon], zoom_start=14)

# Convert the dataframe into a list of tuples of coordinates (lat, lon)
coordinates = original[['lat', 'lon']].values.tolist()

# Add the PolyLine for the trajectory (path)
folium.PolyLine(locations=coordinates, color='blue', weight=2.5, opacity=1).add_to(mymap)

# Add markers for each point in the trajectory
for lat, lon, timestamp in zip(original['lat'], original['lon'], original['# Timestamp']):
    print(lat, lon, timestamp)  # Check if this prints for each row
    folium.Marker(
        location=[lat, lon],
        popup=f"Time: {timestamp}",
        icon=folium.Icon(color="blue", icon="info-sign")
    ).add_to(mymap)

# Display the map
mymap

57.691247 11.870773 26/07/2025 19:48:47
57.691187 11.870658 26/07/2025 19:50:08
57.691073 11.870583 26/07/2025 19:50:27
57.691073 11.870583 26/07/2025 19:50:27
57.691007 11.870565 26/07/2025 19:50:37
57.690773 11.870428 26/07/2025 19:51:08
57.690773 11.870428 26/07/2025 19:51:08
57.689887 11.869627 26/07/2025 19:52:27
57.689487 11.869063 26/07/2025 19:52:57
57.689193 11.868707 26/07/2025 19:53:17
57.6888 11.868097 26/07/2025 19:53:47
57.687827 11.866365 26/07/2025 19:55:05
57.68768 11.865928 26/07/2025 19:55:21
57.687607 11.865712 26/07/2025 19:55:28
57.68758 11.865612 26/07/2025 19:55:31
57.68758 11.865612 26/07/2025 19:55:31
57.687547 11.865508 26/07/2025 19:55:34
57.687547 11.865508 26/07/2025 19:55:34
57.687467 11.865145 26/07/2025 19:55:44
57.687467 11.865145 26/07/2025 19:55:44
57.68744 11.865035 26/07/2025 19:55:47
57.687367 11.864668 26/07/2025 19:55:57
57.687293 11.864287 26/07/2025 19:56:07
57.68722 11.863847 26/07/2025 19:56:18
57.686987 11.862587 26/07/2025 19:56:47
57.6869