In [30]:
from typing import List, Dict
import numpy as np
from numpy.linalg import inv
import pandas as pd
import glob
from dataclasses import dataclass
import plotly.express as px


In [None]:
# Container for per-vessel trajectories; it starts out empty.
all_boats_trajectories = dict()

# Glob pattern selecting every sampled AIS CSV file.
dataset_path = "../../data/FishingKoreaAIS_sampled/*.csv"

# File names embed "len_<N>_mmsi_"; list the files ordered by <N>, largest first.
dynamic_data_files = sorted(
    glob.glob(dataset_path),
    key=lambda csv_path: int(csv_path.split("len_")[1].split("_mmsi_")[0]),
    reverse=True,
)


# Functions

In [47]:
@dataclass
class AISColumnNames:
    """Single place that holds the DataFrame column labels used for AIS data.

    Every field is a string whose default value is the literal column label,
    so code can write ``cols.Latitude`` instead of repeating ``"Latitude"``.
    """
    # Timestamp and kinematic columns.
    Date: str = "Date"
    Sampled_Date: str = "Sampled_Date"
    Latitude: str = "Latitude"
    Longitude: str = "Longitude"
    # NOTE(review): the meaning of "Pseudo_Longitude" is not defined in this section.
    Pseudo_Longitude: str = "Pseudo_Longitude"
    SOG: str = "SOG"
    COG: str = "COG"
    Heading: str = "Heading"

    # "norm ..." counterparts of the columns above
    # (presumably normalized values; confirm where they are produced).
    n_Latitude: str = "norm Latitude"
    n_Longitude: str = "norm Longitude"
    n_SOG: str = "norm SOG"
    n_COG: str = "norm COG"
    n_Heading: str = "norm Heading"

    # Flag columns; nothing in this section reads or writes them.
    is_synthetic: str = "is_synthetic"
    to_predict: str = "to_predict"

# Shared column-name registry used by the functions defined below.
cols: AISColumnNames = AISColumnNames()
# Spacing between two consecutive samples, in minutes.
target_freq_in_minutes = 10
# The same spacing written as a pandas frequency string ("10min").
target_freq: str = f"{target_freq_in_minutes}min"
# The same spacing as a Timedelta; get_trajectory_sequences compares timestamp gaps to it.
sample_T: pd.Timedelta = pd.Timedelta(minutes=target_freq_in_minutes)


def get_trajectory_sequences(trajectory_sampled: pd.DataFrame, time_column_name=None,
    sample_period=None) -> List[pd.DataFrame]:
    """Split a trajectory into runs of consecutive, evenly spaced samples.

    Two neighbouring rows belong to the same sequence when the difference of
    their timestamps is exactly ``sample_period``. Rows are compared in their
    stored order and by position, so the DataFrame index may hold any labels.

    Parameters:
    - trajectory_sampled: DataFrame holding one trajectory, ordered by time.
    - time_column_name: name of the timestamp column; defaults to
      ``cols.Sampled_Date``.
    - sample_period: expected gap between neighbouring rows; defaults to the
      module-level ``sample_T``.

    Returns:
    - List of DataFrames, one per run, each with a fresh 0..k-1 index. Every
      run has at least two rows; a row with no neighbour exactly one period
      away is not part of any returned sequence.
    """
    if time_column_name is None:
        time_column_name = cols.Sampled_Date
    if sample_period is None:
        sample_period = sample_T

    # Fewer than two rows cannot form a pair, hence no sequence.
    if len(trajectory_sampled) < 2:
        return []

    # is_step[p] is True when rows p and p + 1 (by position) are one period apart.
    # diff() works positionally; its first element is always missing and is dropped.
    timestamps = trajectory_sampled[time_column_name]
    is_step = (timestamps.diff().iloc[1:] == sample_period).to_numpy()

    trajectory_sequences: List[pd.DataFrame] = []
    run_start = None  # position of the first row of the run being built
    for position, linked in enumerate(is_step):
        if linked:
            if run_start is None:
                run_start = position
        elif run_start is not None:
            # The run covers rows run_start..position inclusive; slice it out once
            # instead of growing a DataFrame row by row.
            trajectory_sequences.append(
                trajectory_sampled.iloc[run_start:position + 1].reset_index(drop=True)
            )
            run_start = None

    # A run that is still open extends to the last row.
    if run_start is not None:
        trajectory_sequences.append(
            trajectory_sampled.iloc[run_start:].reset_index(drop=True)
        )

    return trajectory_sequences
    
class AdaptiveExtendedKalmanFilter:
    """Kalman-style filter over the state [lat, lon, sog, cog] with adaptive R.

    The measurement noise covariance R is re-estimated on every update from a
    sliding window of recent innovations and blended with the previous R.
    """

    def __init__(self, initial_state, initial_covariance, process_noise, measurement_noise, alpha=0.7,
                 window_size=5):
        """
        Initialize the Adaptive Extended Kalman Filter.
        
        Parameters:
        - initial_state: Initial state vector [lat, lon, sog, cog]
        - initial_covariance: Initial covariance matrix
        - process_noise: Process noise covariance matrix
        - measurement_noise: Measurement noise covariance matrix
        - alpha: Weight given to the windowed innovation covariance when R is
          re-estimated; (1 - alpha) is the weight of the previous R
        - window_size: Number of most recent innovations kept for that estimate
        """
        # Copy the inputs so the filter never shares arrays with its caller.
        self.state = np.array(initial_state, dtype=float)
        self.covariance = np.array(initial_covariance, dtype=float)
        self.Q = np.array(process_noise, dtype=float)  # Process noise covariance
        self.R = np.array(measurement_noise, dtype=float)  # Measurement noise covariance
        self.innovation_history = []
        self.window_size = window_size  # Window size for adaptive estimation
        self.alpha = alpha  # Forgetting factor

    @staticmethod
    def _wrap_degrees(angle):
        """Map an angle difference in degrees onto the interval [-180, 180)."""
        return (angle + 180.0) % 360.0 - 180.0

    def predict(self, dt):
        """
        Prediction step of the AEKF.
        
        Parameters:
        - dt: Time step in hours (since coordinates are in degrees)

        Returns:
        - The predicted state vector
        """
        # State transition matrix (simple constant velocity model).
        # The /60 presumably converts nautical miles (SOG in knots * hours) to
        # degrees -- TODO confirm the SOG unit.
        # NOTE(review): the longitude term has no cos(latitude) correction.
        F = np.eye(4)
        F[0, 2] = dt * np.cos(np.radians(self.state[3])) / 60  # latitude change from SOG/COG
        F[1, 2] = dt * np.sin(np.radians(self.state[3])) / 60  # longitude change from SOG/COG

        # Predict state
        self.state = F @ self.state

        # Predict covariance
        self.covariance = F @ self.covariance @ F.T + self.Q

        return self.state

    def update(self, measurement):
        """
        Update step of the AEKF with adaptive noise estimation.
        
        Parameters:
        - measurement: [lat, lon, sog, cog]

        Returns:
        - The updated state vector, with the course component in [0, 360)
        """
        # Measurement matrix (we directly observe all states)
        H = np.eye(4)

        # Calculate innovation
        innovation = np.asarray(measurement, dtype=float) - H @ self.state
        # Course is an angle: 359 deg measured against 1 deg predicted is a
        # -2 deg error, not +358 deg. Without the wrap the estimate is dragged
        # through 180 deg every time the course crosses north.
        innovation[3] = self._wrap_degrees(innovation[3])
        self.innovation_history.append(innovation)

        # Keep only the most recent innovations
        if len(self.innovation_history) > self.window_size:
            self.innovation_history.pop(0)

        # Adaptive estimation of measurement noise (needs at least two samples
        # for a covariance estimate)
        if len(self.innovation_history) >= 2:
            innovation_cov = np.cov(np.array(self.innovation_history).T)
            self.R = self.alpha * innovation_cov + (1 - self.alpha) * self.R

        # Kalman gain
        S = H @ self.covariance @ H.T + self.R
        K = self.covariance @ H.T @ inv(S)

        # Update state and bring the course back into [0, 360)
        self.state = self.state + K @ innovation
        self.state[3] %= 360.0

        # Update covariance
        self.covariance = (np.eye(4) - K @ H) @ self.covariance

        return self.state

    def get_state(self):
        """Return the current state estimate."""
        return self.state


def AEKF_traj(df, alpha = 0.7):
    """
    Apply Adaptive Extended Kalman Filter to the vessel tracking dataframe.
    
    Parameters:
    - df: Pandas DataFrame with columns ['Sampled_Date', 'MMSI', 'Latitude', 'Longitude', 'SOG', 'COG'].
      It is not modified.
    - alpha: Forgetting factor forwarded to AdaptiveExtendedKalmanFilter
    
    Returns:
    - A copy of df sorted by MMSI and timestamp in which
      * Latitude / Longitude / SOG / COG hold the filtered values for every
        vessel that has at least two rows (the first row of a vessel keeps
        its measurement, which is the filter's initial state),
      * 'filtered_lat' / 'filtered_lon' / 'filtered_sog' / 'filtered_cog'
        hold the same filtered values and stay NaN for vessels that were
        skipped,
      * 'time_seconds' holds the timestamp as whole seconds since the epoch.
    """
    state_columns = [cols.Latitude, cols.Longitude, cols.SOG, cols.COG]
    filtered_columns = ['filtered_lat', 'filtered_lon', 'filtered_sog', 'filtered_cog']

    # Sort by MMSI and timestamp; sort_values returns a new frame, so the
    # caller's DataFrame is left untouched by the assignments below.
    df = df.sort_values(['MMSI', cols.Sampled_Date])

    # Whole seconds since the epoch, computed with Timedelta arithmetic so the
    # result does not depend on the datetime resolution (ns/us/ms/s) of the column.
    timestamps = pd.to_datetime(df[cols.Sampled_Date])
    epoch = pd.Timestamp(0, tz=timestamps.dt.tz)
    df['time_seconds'] = (timestamps - epoch) // pd.Timedelta(seconds=1)

    # Work on plain arrays addressed by row position. Writing cell by cell
    # through df.loc[label] is slow and hits several rows when labels repeat.
    measurements = df[state_columns].to_numpy(dtype=float)
    seconds = df['time_seconds'].to_numpy()
    filtered = np.full(measurements.shape, np.nan)
    processed = np.zeros(len(df), dtype=bool)

    # Group row positions by vessel (MMSI); rows with a missing MMSI are dropped
    # by groupby, exactly as when grouping the frame itself.
    row_positions = pd.Series(np.arange(len(df)))
    for _, vessel_rows in row_positions.groupby(df['MMSI'].to_numpy()):
        positions = vessel_rows.to_numpy()
        if len(positions) < 2:
            continue  # Need at least 2 points for filtering

        # Initial covariance (tune these based on your application)
        initial_covariance = np.diag([1e-4, 1e-4, 0.1, 1.0])

        # Process noise covariance (tune these)
        process_noise = np.diag([1e-6, 1e-6, 0.01, 0.1])

        # Measurement noise covariance (tune these)
        measurement_noise = np.diag([1e-5, 1e-5, 0.1, 1.0])

        # Initialize AEKF with first measurement
        first = positions[0]
        aekf = AdaptiveExtendedKalmanFilter(
            measurements[first].copy(), initial_covariance, process_noise, measurement_noise, alpha
        )
        filtered[first] = measurements[first]
        processed[first] = True

        # Iterate through remaining points
        for previous, current in zip(positions[:-1], positions[1:]):
            dt = (seconds[current] - seconds[previous]) / 3600  # hours

            # Prediction step, then update step with the current measurement
            aekf.predict(dt)
            filtered[current] = aekf.update(measurements[current])
            processed[current] = True

    # Initialize / fill the dedicated output columns
    for column_position, name in enumerate(filtered_columns):
        df[name] = filtered[:, column_position]

    # Overwrite the state columns with the filtered values, as downstream code
    # reads the trajectory from them; rows of skipped vessels keep their values.
    if processed.any():
        for column_position, name in enumerate(state_columns):
            df[name] = np.where(
                processed, filtered[:, column_position], measurements[:, column_position]
            )

    return df


def restore_missing_timestamps(df, freq='10min', interpolation_method='linear', noise_level=0.0):
    """Put a trajectory on a regular time grid and interpolate the gaps.

    The frame is reindexed on an evenly spaced range from its first to its last
    timestamp. Every numeric column is then interpolated, which includes
    identifier-like numeric columns. Non-numeric columns stay missing on the
    restored rows, and rows whose timestamp is not on the grid are dropped by
    the reindex.

    Parameters:
    - df: DataFrame with a ``cols.Sampled_Date`` column. It is not modified.
    - freq: pandas frequency string of the grid. The default is '10min', the
      non-deprecated spelling of the former default '10T'.
    - interpolation_method: forwarded to ``DataFrame.interpolate``.
    - noise_level: standard deviation of the random-walk steps added to the
      interpolated Longitude/Latitude values; 0 disables the noise.

    Returns:
    - New DataFrame with a 0..n-1 index and ``cols.Sampled_Date`` as first column.
    """
    if len(df) == 0:
        return df.copy()

    # Private generator with a fixed seed. It yields the same numbers as
    # np.random.seed(42) followed by np.random.normal, without resetting the
    # global NumPy random state for the rest of the notebook.
    rng = np.random.RandomState(42)

    # Make sure the time column is in datetime format. assign() builds a new
    # frame so the caller's DataFrame keeps its original column.
    df = df.assign(**{cols.Sampled_Date: pd.to_datetime(df[cols.Sampled_Date])})

    # Set the timestamp as index
    df = df.set_index(cols.Sampled_Date)

    full_range = pd.date_range(
        start=df.index.min(),
        end=df.index.max(),
        freq=freq
    )

    # Reindex to the complete time range
    df = df.reindex(full_range)

    # Name the index before resetting it so that Sampled_Date becomes a column
    # again, independent of whether the frame already has an 'index' column.
    df = df.rename_axis(cols.Sampled_Date).reset_index()

    # Interpolate numeric columns
    numeric_cols = df.select_dtypes(include=[np.number]).columns

    # Store original non-null values to only add noise to interpolated points
    original_mask = df[numeric_cols].notna()

    # Perform interpolation
    df[numeric_cols] = df[numeric_cols].interpolate(method=interpolation_method)

    if noise_level > 0:
        # Apply random walk noise between known points
        for col in [cols.Longitude, cols.Latitude]:
            # Positions where original data exists (anchor points)
            anchor_indices = np.where(original_mask[col])[0]

            # Iterate through each segment between anchors
            for start_idx, end_idx in zip(anchor_indices[:-1], anchor_indices[1:]):
                segment_length = end_idx - start_idx - 1
                if segment_length <= 0:
                    continue

                # Random steps accumulated into a random walk
                steps = rng.normal(scale=noise_level, size=segment_length)
                noise = np.cumsum(steps)
                # .loc slices are inclusive and the index is 0..n-1 here, so this
                # selects exactly the segment_length rows strictly between the anchors.
                df.loc[start_idx + 1 : end_idx - 1, col] += noise

    return df

# Example usage:
# df = pd.read_csv('your_data.csv')  # Load your data
# restored_df = restore_missing_timestamps(df)
# filtered_df = AEKF_traj(restored_df)


def plot_plotly_trajectory(dfs: List[pd.DataFrame], 
                                   color_sequence=None,
                                   line_width=2,
                                   marker_size=4):
    """Draw trajectory segments as lines with markers on a map.

    Parameters:
    - dfs: non-empty list of DataFrames with the columns Latitude, Longitude,
      Sampled_Date, SOG, COG and MMSI; each DataFrame is one segment.
    - color_sequence: list of colours cycled over the segments. When omitted,
      every segment is drawn in blue.
    - line_width: width of the segment lines.
    - marker_size: size of the point markers.

    Returns:
    - The plotly figure.

    Raises:
    - ValueError: if dfs is empty.
    """
    if not dfs:
        raise ValueError("Empty list of DataFrames provided")
    
    # Combine all segments with a segment ID
    combined_df = pd.concat(
        [df.assign(segment_id=i) for i, df in enumerate(dfs)],
        ignore_index=True
    )
    
    # The argument used to be replaced by a hard-coded ["blue"]; blue stays the
    # default, but a sequence passed by the caller is now honoured.
    if color_sequence is None:
        color_sequence = ["blue"]
    
    fig = px.line_mapbox(
        combined_df,
        lat="Latitude",
        lon="Longitude",
        color="segment_id",
        color_discrete_sequence=color_sequence,
        hover_name="Sampled_Date",
        hover_data=["SOG", "COG", "MMSI"],
        zoom=10,
        height=600,
        title="Vessel Trajectory Segments"
    )
    
    # Update marker appearance
    fig.update_traces(
        mode="lines+markers",
        line=dict(width=line_width),
        marker=dict(size=marker_size)
    )
    
    fig.update_layout(
        mapbox_style="carto-positron",
        margin={"r":0,"t":40,"l":0,"b":0},
        showlegend=False,
        legend_title_text="Trajectory Segment"
    )
    
    return fig

def plot_plotly_trajectory_groups(df_groups: List[List[pd.DataFrame]],
                         group_names, 
                         color_sequence=None,
                         line_width=2,
                         marker_size=4):
    """Draw several groups of trajectory segments on one map, one colour per group.

    Parameters:
    - df_groups: non-empty list of groups; each group is a list of DataFrames
      with the columns Latitude, Longitude, Sampled_Date, SOG, COG and MMSI.
      Empty DataFrames are skipped.
    - group_names: legend label per group, by position. A group without a
      label (None or a shorter list) is labelled "Group <position>".
    - color_sequence: colours cycled over the groups; defaults to the plotly
      qualitative palette.
    - line_width: width of the segment lines.
    - marker_size: size of the point markers.

    Returns:
    - The plotly figure, centred on the first non-empty segment.

    Raises:
    - ValueError: if df_groups is empty.
    """
    if not df_groups:
        raise ValueError("Empty list of DataFrame groups provided")
    
    if color_sequence is None:
        color_sequence = px.colors.qualitative.Plotly
    
    # Create empty figure with proper mapbox setup
    fig = px.scatter_mapbox(lat=[None], lon=[None]).update_layout(
        mapbox_style="open-street-map",
        mapbox_zoom=8,
        height=600
    )
    
    first_plotted = None  # first non-empty segment, used to centre the map
    for group_id, df_group in enumerate(df_groups):
        group_color = color_sequence[group_id % len(color_sequence)]
        if group_names is not None and group_id < len(group_names):
            group_label = f"{group_names[group_id]}"
        else:
            group_label = f"Group {group_id}"
        
        # One legend entry per group, attached to the first segment that is
        # actually drawn; a skipped empty first segment no longer hides the group.
        legend_shown = False
        for df in df_group:
            if len(df) == 0:
                continue  # Skip empty dataframes
            if first_plotted is None:
                first_plotted = df
                
            # Add line trace for this segment
            fig.add_trace(
                px.line_mapbox(
                    df,
                    lat="Latitude",
                    lon="Longitude",
                    color_discrete_sequence=[group_color]
                ).data[0].update(
                    mode="lines+markers",
                    line=dict(width=line_width),
                    marker=dict(size=marker_size),
                    name=group_label,
                    showlegend=not legend_shown,
                    legendgroup=group_label,
                    hoverinfo="text",
                    customdata=df[["Sampled_Date", "SOG", "COG", "MMSI"]],
                    hovertemplate=(
                        "Latitude: %{lat}<br>"
                        "Longitude: %{lon}<br>"
                        "Date: %{customdata[0]}<br>"
                        "SOG: %{customdata[1]}<br>"
                        "COG: %{customdata[2]}<br>"
                        "MMSI: %{customdata[3]}<extra></extra>"
                    )
                )
            )
            legend_shown = True
    
    fig.update_layout(
        margin={"r":0,"t":40,"l":0,"b":0},
        showlegend=True,
        legend_title_text="Trajectory Groups",
        title="Vessel Trajectory"
    )
    
    # Centre the map on data that is really there: an empty frame would give a
    # NaN centre, and the first group may contain no segment at all.
    if first_plotted is not None:
        fig.update_mapboxes(
            center=dict(
                lat=first_plotted["Latitude"].mean(),
                lon=first_plotted["Longitude"].mean()
            )
        )
    
    return fig

# Code

In [64]:
# Integer found between "len_" and "_mmsi_" in the name of the first file,
# i.e. the largest such value, because the file list is sorted in descending order.
int(dynamic_data_files[0].split("len_")[1].split("_mmsi_")[0])

4454

In [None]:
# Plot and export every trajectory whose file-name length token lies in
# [2000, 2096]. The file list is sorted by that token in descending order, so
# larger files are skipped and the loop stops at the first smaller one.
# NOTE: `sample_traj` and `sampled_data_file` keep their last values after the
# loop, and later cells read them.
for sampled_data_file in dynamic_data_files:
    # Parse the "len_<N>_mmsi_" token once per file.
    trajectory_length = int(sampled_data_file.split("len_")[1].split("_mmsi_")[0])
    if trajectory_length > 2096:
        continue
    if trajectory_length < 2000:
        break
    sample_traj = pd.read_csv(sampled_data_file, index_col=0)
    sample_traj['Sampled_Date'] = pd.to_datetime(sample_traj['Sampled_Date'])
    sample_traj_sequences = get_trajectory_sequences(sample_traj)
    fig = plot_plotly_trajectory_groups([sample_traj_sequences], group_names=["Initial trajectory"])
    # First row by position, so the lookup does not depend on the index labels.
    # Computing it outside the f-strings also avoids nesting the same quote type
    # inside them, which is a SyntaxError before Python 3.12.
    mmsi = int(sample_traj['MMSI'].iloc[0])
    fig.write_image(f"results/png_{mmsi}.png")
    fig.write_html(f"results/html_{mmsi}.html")

In [68]:
# Length token (text between "len_" and "_mmsi_") of the file that
# `sampled_data_file` currently refers to, i.e. whatever the loop above left in it.
sampled_data_file.split("len_")[1].split("_mmsi_")[0]

'1991'

In [63]:
# Interpolate the last loaded trajectory onto the regular grid, filter it with
# the AEKF and export a comparison with the initial trajectory.
# NOTE: `sample_traj` is whatever the export loop above loaded last.
alpha = 0.89
sample_traj_sequences = get_trajectory_sequences(sample_traj)
### JUST AEKF
# Pass the grid spacing explicitly: it keeps the grid consistent with
# `sample_T` and avoids the deprecated '10T' frequency alias.
interpolated_sample_traj = restore_missing_timestamps(sample_traj, freq=target_freq, noise_level=0.0000)
interpolated_sample_traj_sequences = get_trajectory_sequences(interpolated_sample_traj)
AEKF_interpolated_sample_traj= AEKF_traj(interpolated_sample_traj, alpha)
AEKF_interpolated_sample_traj_sequences = get_trajectory_sequences(AEKF_interpolated_sample_traj)

fig = plot_plotly_trajectory_groups([AEKF_interpolated_sample_traj_sequences, sample_traj_sequences], group_names=["Interpolated + AEKF", "Initial trajectory"])
# First row by position; computed outside the f-strings so they do not nest the
# same quote type (a SyntaxError before Python 3.12).
mmsi = int(sample_traj['MMSI'].iloc[0])
fig.write_image(f"results/png_{mmsi}_interpolated_traj_alpha_{alpha}.png")
fig.write_html(f"results/html_{mmsi}_interpolated_traj_alpha_{alpha}.html")


'T' is deprecated and will be removed in a future version, please use 'min' instead.

