In [1]:
import os
from dataclasses import dataclass
from typing import List, Dict
from urllib.parse import quote_plus

import pandas as pd
import numpy as np
import geopandas as gpd
from shapely.geometry import LineString
import matplotlib.pyplot as plt
import contextily as ctx
import geoviews as gv
import cartopy.crs as ccrs

import plotly.express as px

from sqlalchemy import create_engine, text
import psycopg2

# -------------------------------------
# Database Configuration (Postgres)
# -------------------------------------
# Every setting can be overridden through an AIS_DB_* environment variable so
# that real credentials do not have to be stored in the notebook. The
# fallbacks are the values this notebook has always used.
db_config = {
    "host": os.environ.get("AIS_DB_HOST", "143.248.230.55"),
    "port": os.environ.get("AIS_DB_PORT", "5432"),
    "dbname": os.environ.get("AIS_DB_NAME", "AIS_DB_Prototype"),
    "user": os.environ.get("AIS_DB_USER", "root"),
    "password": os.environ.get("AIS_DB_PASSWORD", "password"),
}

# Build SQLAlchemy engine
# User and password are percent-encoded: characters such as "@", ":" or "/"
# inside them would otherwise be parsed as URL delimiters.
database_url = (
    f"postgresql://{quote_plus(db_config['user'])}:{quote_plus(db_config['password'])}"
    f"@{db_config['host']}:{db_config['port']}/{db_config['dbname']}"
)
engine = create_engine(database_url)

## Functions

In [5]:
def load_fishing_vessels_voyages_with_loitering(sample_n: int = 5) -> pd.DataFrame:
    """
    Select a sample of voyages from ``public.loitering_new_v2``.

    Rows are grouped by ``(mmsi, eta cast to TIMESTAMP)``; only groups with
    more than 10 rows are kept. Groups are ordered by row count (ascending),
    then by ``mmsi`` and ``eta_time``, and at most ``sample_n`` are returned.

    Returns a DataFrame with the columns ``mmsi``, ``eta_time`` (parsed as
    datetime) and ``entry_count``.
    """
    voyage_query = text(
        """
        SELECT
            mmsi,
            CAST(eta AS TIMESTAMP)
            AS eta_time,
            COUNT(*)
            AS entry_count
            FROM public.loitering_new_v2
            GROUP BY
            mmsi,
            CAST(eta AS TIMESTAMP)
            HAVING
            COUNT(*) > 10
            ORDER BY
            entry_count,
            mmsi,
            eta_time
        LIMIT :n;
    """
    )
    bind_values = {"n": sample_n}
    voyages = pd.read_sql(
        voyage_query,
        engine,
        params=bind_values,
        parse_dates=["eta_time"],
    )
    return voyages

def _read_trajectory_points(sql, mmsi: int, eta_time: pd.Timestamp) -> pd.DataFrame:
    """
    Run a trajectory query and return its cleaned result.

    ``sql`` must select ``ts_string``, ``latitude``, ``longitude``, ``sog``,
    ``cog`` and ``heading`` and use the bind parameters ``:m`` (mmsi) and
    ``:e`` (eta). The returned frame has the five numeric columns followed by
    a parsed ``timestamp`` column; rows where the timestamp or any numeric
    value could not be parsed are dropped.
    """
    numeric_columns = ["latitude", "longitude", "sog", "cog", "heading"]

    points = pd.read_sql(sql, engine, params={"m": mmsi, "e": eta_time})
    points["timestamp"] = pd.to_datetime(points["ts_string"], errors="coerce")

    # Convert columns to numeric, coercing invalid values to NaN
    for col in numeric_columns:
        points[col] = pd.to_numeric(points[col], errors="coerce")

    # Drop rows where any of the specified columns or timestamp is NaN
    return points.drop(columns=["ts_string"]).dropna(
        subset=["timestamp"] + numeric_columns
    )


def load_loitering_part_of_trajectory(mmsi: int, eta_time: pd.Timestamp) -> pd.DataFrame:
    """
    Load the AIS points (posutc, latitude, longitude, sog, cog, heading)
    stored in loitering_new_v2 for a given (mmsi, eta).
    Returns a DataFrame with parsed timestamps.
    """
    sql = text(
        """
        SELECT 
            posutc AS ts_string,
            latitude,
            longitude,
            sog,
            cog,
            heading
        FROM public.loitering_new_v2
        WHERE mmsi = :m
          AND CAST(eta AS TIMESTAMP) = :e
        ORDER BY posutc::timestamp;
    """
    )
    return _read_trajectory_points(sql, mmsi, eta_time)


def load_full_trajectory(mmsi: int, eta_time: pd.Timestamp) -> pd.DataFrame:
    """
    Load ALL AIS points (posutc, latitude, longitude, sog, cog, heading)
    for a given (mmsi, eta) from ais_korea.
    Returns a DataFrame with parsed timestamps.
    """
    sql = text(
        """
        SELECT 
            posutc AS ts_string,
            latitude,
            longitude,
            sog,
            cog,
            heading
        FROM public.ais_korea
        WHERE mmsi = :m
          AND CAST(eta AS TIMESTAMP) = :e
        ORDER BY posutc::timestamp;
    """
    )
    return _read_trajectory_points(sql, mmsi, eta_time)


# Columns of the trajectory frames returned by the loaders above:
# latitude, longitude, sog, cog, heading, timestamp
@dataclass
class AISColumnNames:
    """
    Registry of the DataFrame column names used for AIS trajectories.

    Every field is a string with a snake_case default, so ``AISColumnNames()``
    yields the standard names while single names can be overridden per
    instance.
    """

    # Time columns: original and resampled timestamps
    Date: str = "timestamp"
    Sampled_Date: str = "sampled_timestamp"

    # Position columns
    Latitude: str = "latitude"
    Longitude: str = "longitude"
    Pseudo_Longitude: str = "pseudo_longitude"

    # Motion columns
    SOG: str = "sog"
    COG: str = "cog"
    Heading: str = "heading"

    # "norm_" counterparts (presumably normalised values; verify against users)
    n_Latitude: str = "norm_latitude"
    n_Longitude: str = "norm_longitude"
    n_SOG: str = "norm_sog"
    n_COG: str = "norm_cog"
    n_Heading: str = "norm_heading"

    # Flag-style columns (semantics not visible in this notebook section)
    is_synthetic: str = "is_synthetic"
    to_predict: str = "to_predict"


# Sampling grid configuration: one period expressed as minutes, as a pandas
# frequency string and as a Timedelta.
target_freq_in_minutes = 10
sample_T: pd.Timedelta = pd.Timedelta(minutes=target_freq_in_minutes)
target_freq: str = f"{target_freq_in_minutes}min"

# Shared column-name registry used by the helper functions.
cols: AISColumnNames = AISColumnNames()


def get_sampled_trajectory(trajectory: pd.DataFrame) -> pd.DataFrame:
    """
    Resample a trajectory onto the fixed ``target_freq`` time grid.

    The frame is indexed by its ``cols.Date`` column, padded with a copy of
    the first/last observation at the surrounding grid instants, and
    interpolated with pandas' time-weighted linear interpolation. Only grid
    instants that have at least one observation within ``sample_T`` of them
    are kept.

    Parameters
    ----------
    trajectory : DataFrame with a ``cols.Date`` column convertible by
        ``pd.to_datetime``. The caller's frame is not modified.

    Returns
    -------
    DataFrame with one row per kept grid instant; the instant is stored in
    the ``cols.Sampled_Date`` column and rows that still contain NaN after
    interpolation are dropped.

    Raises
    ------
    ValueError
        If ``trajectory`` has no rows.
    """
    if len(trajectory) == 0:
        raise ValueError("Cannot resample an empty trajectory")

    # Build a new frame instead of assigning into the argument, so the
    # caller's DataFrame stays untouched.
    trajectory = (
        trajectory.assign(**{cols.Date: pd.to_datetime(trajectory[cols.Date])})
        .set_index(cols.Date)
        .sort_index()
    )

    # add first and last steps of trajectory which are divisible by 10 minutes
    first = trajectory.iloc[:1].copy()
    first.index = [trajectory.index.min().floor(target_freq)]
    last = trajectory.iloc[-1:].copy()
    last.index = [trajectory.index.max().ceil(target_freq)]
    trajectory = pd.concat([first, trajectory, last])

    # Reindexing needs unique timestamps; the first occurrence wins.
    trajectory = trajectory[~trajectory.index.duplicated(keep="first")]

    # Define exact 10-minute sampling times
    start_time = trajectory.index.min().floor("h")  # Round down to the nearest hour
    end_time = trajectory.index.max().ceil("h")  # Round up to the nearest hour
    sampling_times = pd.date_range(start_time, end_time, freq=target_freq)

    # Keep only grid instants t with at least one record in [t - T, t + T].
    # The index is sorted, so two binary searches per instant replace a full
    # scan of the index: the window is non-empty iff the right insertion
    # point of t + T lies beyond the left insertion point of t - T.
    observed_times = trajectory.index
    window_start = observed_times.searchsorted(sampling_times - sample_T, side="left")
    window_end = observed_times.searchsorted(sampling_times + sample_T, side="right")
    valid_sampling_times = sampling_times[window_end > window_start]

    trajectory_interpolated = trajectory.reindex(
        observed_times.union(valid_sampling_times)
    ).sort_index()

    # Perform linear interpolation
    # NOTE(review): every column is interpolated linearly, so angular columns
    # (e.g. course/heading in degrees) are not wrapped at 0/360.
    trajectory_interpolated = trajectory_interpolated.interpolate(method="time")

    # Keep only the sampled timestamps and drop any remaining NaNs. The index
    # is named explicitly so the output column does not depend on whatever
    # name the index happened to carry.
    trajectory_sampled = (
        trajectory_interpolated.loc[valid_sampling_times]
        .dropna()
        .rename_axis(cols.Sampled_Date)
        .reset_index()
    )
    return trajectory_sampled


def get_trajectory_sequences(
    trajectory_sampled: pd.DataFrame, time_column_name=None
) -> List[pd.DataFrame]:
    """
    Split a sampled trajectory into gap-free sequences.

    Consecutive rows belong to the same sequence when their timestamps differ
    by exactly ``sample_T``. Every returned sequence has at least two rows and
    a fresh 0..n-1 index; rows without a neighbour exactly one sampling period
    away are not part of any sequence.

    Parameters
    ----------
    trajectory_sampled : DataFrame whose rows are in time order.
    time_column_name : name of the timestamp column; defaults to
        ``cols.Sampled_Date``.

    Returns
    -------
    List of DataFrames in order of appearance (empty when no two consecutive
    rows are one sampling period apart).
    """
    if time_column_name is None:
        time_column_name = cols.Sampled_Date

    if len(trajectory_sampled) < 2:
        return []

    # step_is_regular[k] is True when rows k and k + 1 (by position) are
    # exactly one sampling period apart. Working by position keeps this
    # correct for frames that do not carry a default RangeIndex.
    time_steps = trajectory_sampled[time_column_name].diff().iloc[1:]
    step_is_regular = (time_steps == sample_T).to_numpy()

    trajectory_sequences: List[pd.DataFrame] = []
    run_start = None  # position of the first row of the currently open run

    for position, is_regular in enumerate(step_is_regular):
        if is_regular:
            if run_start is None:
                run_start = position
        elif run_start is not None:
            # The run ends at row `position`, the target of the last regular step.
            trajectory_sequences.append(
                trajectory_sampled.iloc[run_start : position + 1].reset_index(drop=True)
            )
            run_start = None

    # Handle the last sequence if it ends at the last timestamp
    if run_start is not None:
        trajectory_sequences.append(
            trajectory_sampled.iloc[run_start:].reset_index(drop=True)
        )

    return trajectory_sequences


def plot_plotly_trajectory_groups(
    df_groups: List[List[pd.DataFrame]],
    group_names,
    color_sequence=None,
    line_width=2,
    marker_size=4,
):
    """
    Draw groups of trajectory segments on an OpenStreetMap-styled map.

    Parameters
    ----------
    df_groups : list of groups; each group is a list of DataFrames (segments)
        with the columns ``cols.Latitude``, ``cols.Longitude``,
        ``cols.Sampled_Date``, ``cols.SOG`` and ``cols.COG``. Empty frames
        are skipped.
    group_names : legend label per group, indexed like ``df_groups``.
    color_sequence : colours cycled over the groups; defaults to plotly's
        qualitative ``Plotly`` palette.
    line_width, marker_size : styling of the line+marker traces.

    Returns
    -------
    The plotly figure, centred on the middle of the plotted bounding box when
    at least one point was drawn.

    Raises
    ------
    ValueError
        If ``df_groups`` is empty.
    """
    if not df_groups:
        raise ValueError("Empty list of DataFrame groups provided")

    if color_sequence is None:
        color_sequence = px.colors.qualitative.Plotly

    # Create empty figure with proper mapbox setup
    fig = px.scatter_mapbox(lat=[None], lon=[None]).update_layout(
        mapbox_style="open-street-map", mapbox_zoom=8, height=600
    )

    # Bounding box of everything plotted. Starting from +/-inf (rather than
    # 360 / 0) keeps the box correct for negative latitudes and longitudes.
    min_lat = min_lon = float("inf")
    max_lat = max_lon = float("-inf")

    for group_id, df_group in enumerate(df_groups):
        group_color = color_sequence[group_id % len(color_sequence)]

        for segment_id, df in enumerate(df_group):
            if len(df) == 0:
                continue  # Skip empty dataframes

            # Add line trace for this segment
            fig.add_trace(
                px.line_mapbox(
                    df,
                    lat=cols.Latitude,
                    lon=cols.Longitude,
                    color_discrete_sequence=[group_color],
                )
                .data[0]
                .update(
                    mode="lines+markers",
                    line=dict(width=line_width),
                    marker=dict(size=marker_size),
                    name=f"{group_names[group_id]}",
                    showlegend=(segment_id == 0),  # Only show legend for first segment
                    legendgroup=f"{group_names[group_id]}",
                    hoverinfo="text",
                    customdata=df[[cols.Sampled_Date, cols.SOG, cols.COG]],
                    hovertemplate=(
                        "Latitude: %{lat}<br>"
                        "Longitude: %{lon}<br>"
                        "Date: %{customdata[0]}<br>"
                        "SOG: %{customdata[1]}<br>"
                        "COG: %{customdata[2]}<br>"
                    ),
                )
            )

            # Grow the bounding box with this segment's extent
            min_lat = min(min_lat, df[cols.Latitude].min())
            max_lat = max(max_lat, df[cols.Latitude].max())
            min_lon = min(min_lon, df[cols.Longitude].min())
            max_lon = max(max_lon, df[cols.Longitude].max())

    fig.update_layout(
        margin={"r": 0, "t": 40, "l": 0, "b": 0},
        showlegend=True,
        legend_title_text="Trajectory Groups",
        title="Vessel Trajectory",
    )

    # Auto-centre on the data. The comparisons are only true once at least one
    # finite coordinate was seen in any group, so a figure without data keeps
    # the default centre instead of receiving a meaningless one.
    if min_lat <= max_lat and min_lon <= max_lon:
        fig.update_mapboxes(
            center=dict(lat=(min_lat + max_lat) / 2, lon=(min_lon + max_lon) / 2)
        )

    return fig

## Code

In [3]:
# Query a small sample (3 rows) of (mmsi, eta_time) voyages; the loop in the
# next cell iterates over this frame.
voyages_df = load_fishing_vessels_voyages_with_loitering(sample_n=3)
voyages_df  # display the sample selection

Unnamed: 0,mmsi,eta_time,entry_count
0,36968098,2023-01-01 00:00:00,11
1,100900256,2023-12-10 12:00:00,11
2,310696000,2023-11-20 06:00:00,11


In [6]:
# Output locations (relative to the notebook's working directory).
DATA_DIR = "../../data/loitering_sampled"
RESULTS_DIR = "../../results/loitering_sampled"


def _export_sampled_trajectory(raw_points: pd.DataFrame, m: int, eta_val, suffix: str = ""):
    """
    Resample one trajectory and write its CSV, PNG and HTML artefacts.

    ``suffix`` is appended to the file stem (before the extension) to keep
    different variants of the same voyage apart. Returns the sampled frame,
    its gap-free sequences and the plotly figure.
    """
    sampled = get_sampled_trajectory(raw_points)
    # NOTE: eta_val is formatted with str(), so file names contain the
    # timestamp's spaces and colons.
    stem = f"len_{len(sampled)}_mmsi_{m}_eta_val_{eta_val}{suffix}"
    sampled.to_csv(f"{DATA_DIR}/{stem}.csv")

    sequences = get_trajectory_sequences(sampled)
    figure = plot_plotly_trajectory_groups([sequences], group_names=["Initial trajectory"])
    figure.write_image(f"{RESULTS_DIR}/png_{stem}.png")
    figure.write_html(f"{RESULTS_DIR}/html_{stem}.html")
    return sampled, sequences, figure


# Make sure the output folders exist so that a missing directory does not
# make every voyage fail.
os.makedirs(DATA_DIR, exist_ok=True)
os.makedirs(RESULTS_DIR, exist_ok=True)

for idx, row in voyages_df.iterrows():
    print(idx, row)
    try:
        m = int(row["mmsi"])
        eta_val = row["eta_time"]

        # Full trajectory of the voyage
        df = load_full_trajectory(m, eta_val)
        sampled_boat_trajectory, sample_traj_sequences, fig = _export_sampled_trajectory(
            df, m, eta_val
        )

        # Loitering part of the same voyage
        df_loitering = load_loitering_part_of_trajectory(m, eta_val)
        (
            sampled_boat_trajectory_loitering,
            sample_traj_sequences_loitering,
            fig,
        ) = _export_sampled_trajectory(df_loitering, m, eta_val, suffix="_loitering")
    except Exception as e:
        # Best effort: report the actual error and continue with the next voyage.
        print(f"Couldn't process {row}: {e}")

0 mmsi                      36968098
eta_time       2023-01-01 00:00:00
entry_count                     11
Name: 0, dtype: object
1 mmsi                     100900256
eta_time       2023-12-10 12:00:00
entry_count                     11
Name: 1, dtype: object
2 mmsi                     310696000
eta_time       2023-11-20 06:00:00
entry_count                     11
Name: 2, dtype: object
