In [371]:
import glob
import os
from dataclasses import dataclass
from typing import List, Dict, Tuple, Any

import numpy as np
import pandas as pd
import plotly.express as px
from geopy.distance import geodesic
from pykalman import KalmanFilter
from scipy.interpolate import CubicSpline
from scipy.ndimage import gaussian_filter1d
from scipy.signal import savgol_filter
from shapely.geometry import LineString, Point
from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from statsmodels.tsa.stattools import acf


In [6]:
# One CSV per vessel; each file name carries an integer after "len_"
# (presumably the track length - confirm) and an identifier after "_mmsi_".
dataset_path = "../../data/FishingKoreaAIS_sampled/*.csv"

# Largest "len_" value first.
dynamic_data_files = sorted(
    glob.glob(dataset_path),
    key=lambda path: int(path.split("len_")[1].split("_mmsi_")[0]),
    reverse=True,
)

# Look-up from the text between "_mmsi_" and ".csv" to the matching file path.
# When two files share that text, the one later in the sorted list wins.
dynamic_data_files_dict = dict(
    (path.split("_mmsi_")[1].split(".csv")[0], path)
    for path in dynamic_data_files
)

# Functions

In [95]:
@dataclass
class AISColumnNames:
    """Registry of the DataFrame column labels used for AIS trajectory data.

    Every field defaults to the literal column label, so one shared instance
    can stand in for string constants scattered through the notebook.
    """

    # Timestamp, position and kinematic columns.
    Date: str = "Date"
    Sampled_Date: str = "Sampled_Date"
    Latitude: str = "Latitude"
    Longitude: str = "Longitude"
    Pseudo_Longitude: str = "Pseudo_Longitude"
    SOG: str = "SOG"
    COG: str = "COG"
    Heading: str = "Heading"

    # "norm "-prefixed counterparts (presumably normalised values - confirm
    # against the preprocessing code, which is not part of this notebook).
    n_Latitude: str = "norm Latitude"
    n_Longitude: str = "norm Longitude"
    n_SOG: str = "norm SOG"
    n_COG: str = "norm COG"
    n_Heading: str = "norm Heading"

    # Flag columns; their exact semantics are not shown in this notebook.
    is_synthetic: str = "is_synthetic"
    to_predict: str = "to_predict"

# Shared instance used wherever a column label is needed.
cols: AISColumnNames = AISColumnNames()
# Expected spacing between consecutive samples, in minutes.
target_freq_in_minutes: int = 10
# The same spacing as a pandas-style frequency string ("10min").
target_freq: str = f"{target_freq_in_minutes}min"
# The same spacing as a Timedelta; get_trajectory_sequences compares timestamp gaps against it.
sample_T: pd.Timedelta = pd.Timedelta(minutes=target_freq_in_minutes)


def prepare_trajectory(df):
    """
    Return a new DataFrame that uses the column names the detectors expect.

    ``Sampled_Date`` becomes ``timestamp``, ``SOG`` becomes ``sog`` and ``COG``
    becomes ``cog``. All other columns keep their names.
    """
    detector_names = {
        'Sampled_Date': 'timestamp',
        'SOG': 'sog',
        'COG': 'cog',
    }
    return df.rename(columns=detector_names)

def haversine(lat1: float, lon1: float, lat2: float, lon2: float) -> float:
    """
    Haversine great-circle distance in kilometres between two points given
    as latitude/longitude in degrees, on a sphere of radius 6371 km.
    """
    earth_radius_km = 6371.0
    lat1_rad = np.radians(lat1)
    lat2_rad = np.radians(lat2)
    half_dlat = np.radians(lat2 - lat1) / 2
    half_dlon = np.radians(lon2 - lon1) / 2

    # Haversine of the central angle between the two points.
    hav = (np.sin(half_dlat)**2
           + np.cos(lat1_rad) * np.cos(lat2_rad) * np.sin(half_dlon)**2)
    half_angle = np.arctan2(np.sqrt(hav), np.sqrt(1 - hav))
    return earth_radius_km * 2 * half_angle


def initial_bearing(lat1: float, lon1: float, lat2: float, lon2: float) -> float:
    """
    Initial compass bearing in degrees, within [0, 360), when travelling from
    the first latitude/longitude point (degrees) towards the second.
    """
    lat1_rad = np.radians(lat1)
    lat2_rad = np.radians(lat2)
    dlon_rad = np.radians(lon2 - lon1)

    # East and north components of the direction towards the second point.
    east = np.sin(dlon_rad) * np.cos(lat2_rad)
    north = (np.cos(lat1_rad) * np.sin(lat2_rad)
             - np.sin(lat1_rad) * np.cos(lat2_rad) * np.cos(dlon_rad))

    signed_bearing = np.degrees(np.arctan2(east, north))
    # Fold the (-180, 180] result of arctan2 into [0, 360).
    return (signed_bearing + 360) % 360


def sliding_windows(df: pd.DataFrame, window_size: int = 10, step: int = 5) -> List[pd.DataFrame]:
    """
    Cut ``df`` into consecutive positional slices of ``window_size`` rows whose
    start positions are ``step`` rows apart.

    Only full windows are returned, so trailing rows that do not fill a window
    are left out. A frame shorter than ``window_size`` yields an empty list.
    """
    last_start = len(df) - window_size
    if last_start < 0:
        return []

    windows = []
    for start in range(0, last_start + 1, step):
        windows.append(df.iloc[start:start + window_size])
    return windows


def extract_features(seg: pd.DataFrame, arrival_center: Tuple[float, float]) -> Dict[str, float]:
    """
    Compute shape and motion features for one trajectory segment.

    Args:
        seg: Segment with 'Latitude', 'Longitude', 'cog' and 'sog' columns
            (the column names produced by prepare_trajectory). Must contain
            at least one row.
        arrival_center: (latitude, longitude) the vessel is assumed to head for.

    Returns:
        Dict with
        - 'detour_factor': travelled path length / straight start-to-end
          distance (both haversine km); np.inf when the straight distance is
          at most 1e-6 km.
        - 'drift_angle': absolute angle (0-180 degrees) between the segment's
          start-to-end bearing and the bearing from its start to arrival_center.
        - 'delta_cog': absolute wrapped difference (0-180 degrees) between the
          last and the first course value.
        - 'accum_cog': sum of absolute wrapped course changes between
          consecutive rows, in degrees.
        - 'spd_std': sample standard deviation of 'sog' (NaN for one row).
        - 'lat_dist': largest planar distance from any point to the straight
          start-to-end line, measured in raw longitude/latitude degrees.
    """
    lats = seg['Latitude'].to_numpy(dtype=float)
    lons = seg['Longitude'].to_numpy(dtype=float)
    lat0, lon0 = lats[0], lons[0]
    lat1, lon1 = lats[-1], lons[-1]

    # detour factor: haversine is built from numpy ufuncs, so all consecutive
    # legs can be measured in a single call instead of a Python-level loop.
    dist = float(np.sum(haversine(lats[:-1], lons[:-1], lats[1:], lons[1:])))
    straight = haversine(lat0, lon0, lat1, lon1)
    detour = dist / straight if straight > 1e-6 else np.inf

    # drift angle
    seg_bear = initial_bearing(lat0, lon0, lat1, lon1)
    dest_bear = initial_bearing(lat0, lon0, *arrival_center)
    drift = abs((seg_bear - dest_bear + 180) % 360 - 180)

    # course change
    cogs = seg['cog'].to_numpy()
    delta_cog = abs((cogs[-1] - cogs[0] + 180) % 360 - 180)
    # Wrap every step into [-180, 180) before summing; otherwise a small turn
    # across north (e.g. 359 -> 1) would be counted as a 358 degree change.
    accum_cog = np.sum(np.abs((np.diff(cogs) + 180) % 360 - 180))

    # speed variability
    spd_std = seg['sog'].std()

    # lateral deviation (shapely works in plain x/y, i.e. degrees here)
    line = LineString([(lon0, lat0), (lon1, lat1)])
    lat_dist = max(
        Point(lon, lat).distance(line)
        for lon, lat in zip(lons, lats)
    )
    return {
        'detour_factor': detour,
        'drift_angle': drift,
        'delta_cog': delta_cog,
        'accum_cog': accum_cog,
        'spd_std': spd_std,
        'lat_dist': lat_dist
    }


def find_outlier_windows(
    features: pd.DataFrame,
    eps: float = 0.8,
    min_samples: int = 3,
    standardize: bool = True,
) -> List[int]:
    """
    Identify window indices flagged as outliers (DBSCAN noise points).

    Args:
        features: One row of numeric features per window.
        eps: DBSCAN neighbourhood radius, in the (standardised) feature space.
        min_samples: DBSCAN minimum neighbourhood size for a core point.
        standardize: Z-score every feature column before clustering. The
            features mix very different scales (ratios, degrees, speeds), so
            without scaling one radius cannot be meaningful for all of them.
            Pass False to cluster the raw values.

    Returns:
        Positional row indices of ``features`` that DBSCAN labelled -1,
        or an empty list when ``features`` is empty.
    """
    if features.empty:
        return []

    values = features.to_numpy(dtype=float, copy=True)

    # DBSCAN's input validation rejects NaN/inf, and extract_features emits
    # np.inf for the detour factor of a closed or stationary window. Make
    # every column finite: +/-inf -> largest/smallest finite value of the
    # column, NaN -> median of the finite values, all-invalid column -> 0.
    for column in values.T:  # rows of the transpose are views into `values`
        finite = np.isfinite(column)
        if finite.all():
            continue
        if not finite.any():
            column[:] = 0.0
            continue
        finite_values = column[finite]
        column[np.isposinf(column)] = finite_values.max()
        column[np.isneginf(column)] = finite_values.min()
        column[np.isnan(column)] = np.median(finite_values)

    if standardize:
        spread = values.std(axis=0)
        spread[spread == 0] = 1.0  # constant columns stay at zero after centring
        values = (values - values.mean(axis=0)) / spread

    labels = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(values)
    return np.flatnonzero(labels == -1).tolist()


def detect_anomalous_segments(
    trajectory: pd.DataFrame,
    window_size: int = 10,
    step: int = 5
) -> List[Dict[str, Any]]:
    """
    Main entry: returns list of suspicious segment dicts or empty if none.

    The trajectory is cut into sliding windows, features are extracted per
    window relative to the trajectory's final position, and windows that
    find_outlier_windows reports as outliers are returned.

    Args:
        trajectory: Frame with 'Sampled_Date', 'Latitude', 'Longitude', 'SOG'
            and 'COG' columns and a strictly increasing integer-convertible
            index (required by the spline step). The caller's frame is not
            modified.
        window_size: Rows per window. Honoured as passed; it used to be
            silently replaced by 10 inside the function.
        step: Rows between consecutive window starts. Honoured as passed; it
            used to be silently replaced by 5.

    Returns:
        One dict per outlier window holding 'segment_start', 'segment_end',
        every feature from extract_features, and 'anomaly_types'. Empty when
        the trajectory is too short to form a single window.
    """
    # Too short for a window (or for the spline, which needs two points).
    if len(trajectory) < 2 or len(trajectory) < window_size:
        return []

    trajectory = prepare_trajectory(trajectory)

    # smooth
    # NOTE(review): an interpolating cubic spline evaluated at its own knots
    # reproduces the input up to rounding, so this step does not actually
    # smooth anything. Kept as-is; swap in a real smoother if one is intended.
    t = trajectory.index.astype(np.int64).values
    for col in ['Latitude', 'Longitude']:
        cs = CubicSpline(t, trajectory[col].values)
        trajectory[col] = cs(t)

    # The last recorded position is taken as the destination.
    arrival_center = (trajectory['Latitude'].iloc[-1], trajectory['Longitude'].iloc[-1])

    windows = sliding_windows(trajectory, window_size, step)
    if not windows:
        return []

    # feature computation
    feats = [extract_features(w, arrival_center) for w in windows]
    # outlier detection
    outlier_idx = find_outlier_windows(pd.DataFrame(feats))

    # build anomalies
    return [
        {
            'segment_start': windows[idx].iloc[0]['timestamp'],
            'segment_end': windows[idx].iloc[-1]['timestamp'],
            **feats[idx],
            'anomaly_types': 'outlier'
        }
        for idx in outlier_idx
    ]


def get_trajectory_sequences(
    trajectory_sampled: pd.DataFrame, time_column_name=None, sample_period=None
) -> List[pd.DataFrame]:
    """
    Split a trajectory into runs of rows whose timestamps are evenly spaced.

    Consecutive rows belong to the same run when their timestamps differ by
    exactly ``sample_period``. Rows are compared by position, so the frame may
    carry any index.

    Args:
        trajectory_sampled: Trajectory rows in chronological order.
        time_column_name: Name of the timestamp column; defaults to
            ``cols.Sampled_Date``.
        sample_period: Required gap between neighbouring rows; defaults to the
            module-level ``sample_T``.

    Returns:
        List of DataFrames, one per run, each with a fresh 0..n-1 index and at
        least two rows. Rows with no neighbour exactly one period away are
        dropped. Empty list when the input has fewer than two rows.
    """
    if time_column_name is None:
        time_column_name = cols.Sampled_Date
    if sample_period is None:
        sample_period = sample_T

    n_rows = len(trajectory_sampled)
    if n_rows < 2:
        return []

    # linked[k] is True when row k follows row k-1 by exactly one period.
    # Position 0 has no predecessor: its diff is NaT, which compares False.
    linked = (trajectory_sampled[time_column_name].diff() == sample_period).to_numpy()

    trajectory_sequences: List[pd.DataFrame] = []
    run_start = None  # position of the first row of the run being collected
    for pos in range(1, n_rows):
        if linked[pos]:
            if run_start is None:
                run_start = pos - 1
        elif run_start is not None:
            # Gap found: close the run with one slice instead of growing a
            # frame row by row, which made the original quadratic.
            trajectory_sequences.append(
                trajectory_sampled.iloc[run_start:pos].reset_index(drop=True)
            )
            run_start = None

    # Handle a run that extends to the last row.
    if run_start is not None:
        trajectory_sequences.append(
            trajectory_sampled.iloc[run_start:].reset_index(drop=True)
        )

    return trajectory_sequences

In [96]:
def detect_anomalous_segments_2(
    trajectory: pd.DataFrame,
    window_size: int = 10,
    step: int = 5,
    smoothing_factor: float = 0.2,
    arrival_window: int = 20
) -> List[Dict[str, Any]]:
    """
    Sliding-window outlier detection with a per-window arrival centre.

    Each window gets an arrival centre computed as an exponentially weighted
    mean of the trajectory's closing points; features are then extracted
    against that centre and passed to the outlier detector.

    Args:
        trajectory: Input trajectory data
        window_size: Number of rows per analysed window
        step: Number of rows between consecutive window starts
        smoothing_factor: Decay (0-1) of the exponential weights used for the
            arrival centre; later points weigh more
        arrival_window: Maximum number of closing points that enter the
            arrival centre

    Returns:
        List of suspicious segment dicts or empty if none
    """
    track = prepare_trajectory(trajectory)

    # Pass both coordinates through a cubic spline evaluated at its own knots.
    knots = track.index.astype(np.int64).values
    for coord in ('Latitude', 'Longitude'):
        spline = CubicSpline(knots, track[coord].values)
        track[coord] = spline(knots)

    def arrival_center_from(start_pos):
        """Weighted mean of the last points at or after ``start_pos``."""
        closing_points = track.iloc[start_pos:].tail(arrival_window)
        n_closing = len(closing_points)
        if n_closing < 3:
            # Too few points to average: fall back to the final position.
            return (track['Latitude'].iloc[-1], track['Longitude'].iloc[-1])

        # Weights decay going backwards in time, then are normalised to sum to one.
        weights = np.array(
            [(1 - smoothing_factor) ** age for age in reversed(range(n_closing))]
        )
        weights /= weights.sum()

        mean_lat = np.sum(closing_points['Latitude'].values * weights)
        mean_lon = np.sum(closing_points['Longitude'].values * weights)
        return (mean_lat, mean_lon)

    windows = sliding_windows(track, window_size, step)
    if not windows:
        return []

    # One feature dict per window, each against its own arrival centre.
    feats = [
        extract_features(w, arrival_center_from(track.index.get_loc(w.index[0])))
        for w in windows
    ]

    outlier_idx = find_outlier_windows(pd.DataFrame(feats))

    return [
        {
            'segment_start': windows[idx].iloc[0]['timestamp'],
            'segment_end': windows[idx].iloc[-1]['timestamp'],
            **feats[idx],
            'anomaly_types': 'outlier'
        }
        for idx in outlier_idx
    ]

In [112]:

def plot_plotly_trajectory_groups(df_groups: List[List[pd.DataFrame]],
                         group_names, 
                         color_sequence=None,
                         line_width=2,
                         marker_size=4):
    """
    Draw groups of trajectory segments on an OpenStreetMap-backed Plotly map.

    Args:
        df_groups: One list of DataFrames per group. Every frame needs
            'Latitude', 'Longitude', 'Sampled_Date', 'SOG', 'COG' and 'MMSI'
            columns. Empty frames are skipped.
        group_names: Legend label per group, indexed like ``df_groups``.
        color_sequence: Colours cycled over the groups; defaults to
            ``px.colors.qualitative.Plotly``.
        line_width: Width of the trajectory lines.
        marker_size: Size of the point markers.

    Returns:
        The Plotly figure, centred on the mean position of the first frame of
        the first group.

    Raises:
        ValueError: If ``df_groups`` is empty.
    """
    if not df_groups:
        raise ValueError("Empty list of DataFrame groups provided")

    palette = px.colors.qualitative.Plotly if color_sequence is None else color_sequence

    hover_template = (
        "Latitude: %{lat}<br>"
        "Longitude: %{lon}<br>"
        "Date: %{customdata[0]}<br>"
        "SOG: %{customdata[1]}<br>"
        "COG: %{customdata[2]}<br>"
        "MMSI: %{customdata[3]}<extra></extra>"
    )

    # Start from an empty mapbox figure so every segment can be added as a trace.
    fig = px.scatter_mapbox(lat=[None], lon=[None])
    fig.update_layout(
        mapbox_style="open-street-map",
        mapbox_zoom=8,
        height=600
    )

    for group_id, df_group in enumerate(df_groups):
        group_color = palette[group_id % len(palette)]

        for segment_id, df in enumerate(df_group):
            if len(df) == 0:
                continue  # nothing to draw for an empty segment

            trace = px.line_mapbox(
                df,
                lat="Latitude",
                lon="Longitude",
                color_discrete_sequence=[group_color]
            ).data[0]
            trace.update(
                mode="lines+markers",
                line=dict(width=line_width),
                marker=dict(size=marker_size),
                name=f"{group_names[group_id]}",
                showlegend=(segment_id == 0),  # one legend entry per group
                legendgroup=f"{group_names[group_id]}",
                hoverinfo="text",
                customdata=df[["Sampled_Date", "SOG", "COG", "MMSI"]],
                hovertemplate=hover_template
            )
            fig.add_trace(trace)

    fig.update_layout(
        margin={"r":0,"t":40,"l":0,"b":0},
        showlegend=True,
        legend_title_text="Trajectory Groups",
        title="Vessel Trajectory"
    )

    # Centre the map on the first frame of the first group, when there is one.
    has_first_frame = len(df_groups) > 0 and len(df_groups[0]) > 0
    if has_first_frame:
        first_df = df_groups[0][0]
        map_center = dict(
            lat=first_df["Latitude"].mean(),
            lon=first_df["Longitude"].mean()
        )
        fig.update_mapboxes(center=map_center)

    return fig

# Code

In [236]:
# Load one vessel's track, parse its timestamps, and split it into runs of
# evenly spaced samples.
sample_traj = pd.read_csv(dynamic_data_files_dict["440045160"], index_col=0).assign(
    Sampled_Date=lambda frame: pd.to_datetime(frame['Sampled_Date'])
)
sample_traj_sequences = get_trajectory_sequences(sample_traj)
print(list(map(len, sample_traj_sequences)))

[1282, 1463, 201]


In [237]:
smooth_cols = ['Latitude', 'Longitude']
window_size = 20  # Savitzky-Golay window length in samples; adjust based on your needs

# Smooth and plot every sequence with at least 900 rows. `sample_traj_smooth`
# stays bound to the last such sequence after the loop, and later cells read it.
# Other smoothers tried in this spot: centred rolling mean, exponentially
# weighted mean (span=window_size), Gaussian filter (sigma=window_size/3),
# centred rolling median, and a pykalman KalmanFilter fitted with EM.
for sample_traj_sequence_i in sample_traj_sequences:
    if len(sample_traj_sequence_i) >= 900:
        sample_traj_smooth = (
            sample_traj_sequence_i[smooth_cols]
            .apply(lambda series: savgol_filter(series, window_length=window_size, polyorder=3))
            # Carry over the columns the plotting helper needs for its hover text.
            .assign(
                SOG=sample_traj_sequence_i['SOG'],
                COG=sample_traj_sequence_i['COG'],
                Sampled_Date=sample_traj_sequence_i['Sampled_Date'],
                MMSI=sample_traj_sequence_i['MMSI'],
            )
        )

        fig = plot_plotly_trajectory_groups(
            [[sample_traj_sequence_i], [sample_traj_smooth]],
            ["initial trajectory", "smooth"],
        )
        fig.show()

In [189]:
# Row count of the smoothed frame left behind by the last qualifying iteration of the smoothing loop.
len(sample_traj_smooth)

1463

In [190]:
# Row count of the second-to-last sequence; the residual cell below subtracts sample_traj_smooth from it row by row.
len(sample_traj_sequences[-2])

1463

In [191]:
# Residual-based anomaly flagging on the second-to-last sequence.
# NOTE(review): this assumes sample_traj_smooth is the smoothed version of
# sample_traj_sequences[-2] (left over from the smoothing loop) - confirm after
# a fresh Restart & Run All. The cell adds columns to that sequence in place.
sample_traj_sequences[-2]['combined_residual'] = 0

for col in smooth_cols:
    # Per-coordinate residual: raw value minus smoothed value (same units as the column)
    sample_traj_sequences[-2][f'{col}_residual'] = sample_traj_sequences[-2][col] - sample_traj_smooth[col]
    # Accumulate the squared residuals of all smoothed columns
    sample_traj_sequences[-2]['combined_residual'] += sample_traj_sequences[-2][f'{col}_residual']**2

# Take square root to get Euclidean norm of residuals
sample_traj_sequences[-2]['combined_residual'] = np.sqrt(sample_traj_sequences[-2]['combined_residual'])

# Combined threshold: mean + 2 * standard deviation of the combined residuals
combined_threshold = sample_traj_sequences[-2]['combined_residual'].mean() + 2 * sample_traj_sequences[-2]['combined_residual'].std()

# Flag as anomaly if combined residual exceeds threshold
sample_traj_sequences[-2]['combined_anomaly'] = sample_traj_sequences[-2]['combined_residual'] > combined_threshold
# Keep only the flagged rows; reset_index gives them a fresh 0..n-1 index (old labels move to an 'index' column)
sample_traj_sequence_anomaly = sample_traj_sequences[-2][sample_traj_sequences[-2]["combined_anomaly"]].reset_index()

# Group the flagged rows into runs whose timestamps are exactly sample_T apart
sample_traj_sequence_anomaly_sequences = get_trajectory_sequences(sample_traj_sequence_anomaly)

In [192]:
# Overlay the raw sequence, its smoothed version and the flagged runs.
# The [1:-2] slice leaves out the first and the last two anomaly runs.
# To compare raw vs. anomaly only, drop the [sample_traj_smooth] group and its "smooth" label.
fig = plot_plotly_trajectory_groups(
    df_groups=[
        [sample_traj_sequences[-2]],
        [sample_traj_smooth],
        sample_traj_sequence_anomaly_sequences[1:-2],
    ],
    group_names=["initial trajectory", "smooth", "anomaly"],
)

fig.show()


In [None]:
# anomalies = detect_anomalous_segments_2(
#         trajectory=sample_traj_sequences[0],
#         window_size=8,  # Smaller window for short trajectory
#         step=4,
#         arrival_window=15
#     )
# print(len(sample_traj_sequences[0]))
# print(len(anomalies) * 4)



177
172


In [84]:
# sample_traj = pd.read_csv(dynamic_data_files_dict["440045160"], index_col=0)
# for dynamic_data_file in dynamic_data_files:
#     sample_traj = pd.read_csv(dynamic_data_file, index_col=0)
#     sample_traj['Sampled_Date'] = pd.to_datetime(sample_traj['Sampled_Date'])
#     # Convert your DataFrame to the expected format
#     trajectory_data = prepare_trajectory(sample_traj)


#     # Define arrival center (using last point)
#     arrival_center = (sample_traj['Latitude'].iloc[-1], sample_traj['Longitude'].iloc[-1])

#     # Run anomaly detection with smaller window size
#     anomalies = detect_anomalous_segments(
#         trajectory=trajectory_data,
#         arrival_center=arrival_center,
#         window_size=10,  # Smaller window for short trajectory
#         step=5
#     )
#     # if len(anomalies) == 0:
#     #     continue
#     # Print results
#     # print(dynamic_data_file)
#     print("Detected anomalies:")
#     for anomaly in anomalies:
#         print(f"From {anomaly['segment_start']} to {anomaly['segment_end']}")
#         print(f"Detour factor: {anomaly['detour_factor']:.2f}")
#         print(f"Drift angle: {anomaly['drift_angle']:.2f}°")
#         print(f"Speed std: {anomaly['spd_std']:.2f}")
#         print("-" * 40)
#     break

In [404]:
# def detect_random_walk_anomalies(
#     df,
#     window_size=3,      # Number of points per segment (e.g., 3 for a 3-point sliding window)
#     step=1,             # Step size (e.g., 1 to slide by 1 point at a time)
#     K=2,                # Min number of segments where a point is flagged as anomalous
#     speed_threshold=0.5 # Ignore segments with avg speed < threshold (m/s)
# ):
#     """
#     Label trajectory points as anomalies if they belong to random walk segments.
    
#     Args:
#         df: DataFrame with columns ['Sampled_Date', 'Latitude', 'Longitude', 'SOG', 'COG'].
#         window_size: Number of points per segment (integer).
#         step: Sliding window step (integer, points).
#         K: Points must appear in >= K segments classified as random walks to be flagged.
#         speed_threshold: Ignore segments with avg speed below this (m/s).
        
#     Returns:
#         DataFrame with new column 'anomaly' (True if point is in a random walk segment).
#     """
#     # Preprocess
#     df = df.copy()
#     df['Sampled_Date'] = pd.to_datetime(df['Sampled_Date'])
#     df = df.sort_values('Sampled_Date').reset_index(drop=True)
    
#     # Initialize anomaly counter per point
#     df['anomaly_count'] = 0
    
#     # Slide window over the trajectory
#     for i in range(0, len(df) - window_size + 1, step):
#         segment = df.iloc[i:i + window_size]
        
#         if len(segment) < 3:  # Skip tiny segments
#             continue
            
#         # --- Feature Extraction ---
#         # 1. Mean Squared Displacement (MSD) slope
#         lats = segment['Latitude'].values
#         lons = segment['Longitude'].values
#         times = (segment['Sampled_Date'] - segment['Sampled_Date'].iloc[0]).dt.total_seconds().values
#         displacements = [geodesic((lats[0], lons[0]), (lat, lon)).meters for lat, lon in zip(lats, lons)]
#         msd = np.array(displacements) ** 2
        
#         # Fit MSD(t) = slope * t + intercept
#         if len(times) > 1:
#             X = times.reshape(-1, 1)
#             y = msd
#             model = LinearRegression().fit(X, y)
#             msd_slope = model.coef_[0]
#         else:
#             msd_slope = 0
        
#         # # 2. Speed autocorrelation
#         # speeds = segment['SOG'].values
#         # autocorr = acf(speeds, nlags=1, fft=True)[1] if len(speeds) > 1 else 0
        
#         # 3. Turning angle std (using COG)
#         cog_diff = np.abs(segment['COG'].diff().dropna())
#         turning_std = np.std(cog_diff) if len(cog_diff) > 0 else 0
        
#         # --- Classification Rules ---
#         is_random_walk = (
#             # ((msd_slope < 1.5) or              # MSD grows near-linearly
#             # (autocorr < 0.3) or                # Low speed autocorrelation
#             (turning_std > 30) and              # High turning angle variability
#             (segment['SOG'].mean() > speed_threshold)  # Ignore stationary points
#         )
        
#         # Increment anomaly count for points in this segment
#         if is_random_walk:
#             df.loc[segment.index, 'anomaly_count'] += 1
    
#     # Final anomaly label (True if >= K counts)
#     df['anomaly'] = df['anomaly_count'] >= K
#     return df.drop(columns=['anomaly_count'])

# def detect_turning_anomalies(
#     df,
#     window_size=5,      # Number of points per segment
#     step=1,             # Step size (points)
#     turning_std_threshold=30.0  # Degrees (adjust based on your data)
# ):
#     """
#     Label points as anomalies if their local turning angle variability is high.
    
#     Args:
#         df: DataFrame with columns ['Sampled_Date', 'Latitude', 'Longitude'].
#         window_size: Segment length in points.
#         step: Sliding window step.
#         K: Points must appear in >= K high-std segments to be flagged.
#         turning_std_threshold: Max allowed std of turning angles (degrees).
        
#     Returns:
#         DataFrame with new column 'anomaly' (True if erratic turning).
#     """
#     K = int(window_size/step)
#     # Preprocess
#     df = df.copy()
#     df['Sampled_Date'] = pd.to_datetime(df['Sampled_Date'])
#     df = df.sort_values('Sampled_Date').reset_index(drop=True)
    
#     # Calculate bearings between consecutive points
#     def calculate_bearing(lat1, lon1, lat2, lon2):
#         dlon = np.radians(lon2 - lon1)
#         lat1, lat2 = np.radians(lat1), np.radians(lat2)
#         y = np.sin(dlon) * np.cos(lat2)
#         x = np.cos(lat1) * np.sin(lat2) - np.sin(lat1) * np.cos(lat2) * np.cos(dlon)
#         bearing = np.degrees(np.arctan2(y, x))
#         return (bearing + 360) % 360  # Normalize to [0, 360)
    
#     df['bearing'] = df.apply(
#         lambda row: calculate_bearing(
#             df['Latitude'].shift(1).loc[row.name], df['Longitude'].shift(1).loc[row.name],
#             row['Latitude'], row['Longitude']
#         ) if not pd.isna(df['Latitude'].shift(1).loc[row.name]) else np.nan,
#         axis=1
#     )
    
#     # Calculate turning angles (absolute difference between bearings)
#     df['turning_angle'] = np.abs(df['bearing'].diff())
#     df['turning_angle'] = df['turning_angle'].apply(
#         lambda x: min(x, 360 - x) if not np.isnan(x) else np.nan  # Handle wraparound (e.g., 350° to 10° = 20°)
#     )
    
#     # Initialize anomaly counter
#     df['anomaly_std'] = 0.0
    
#     # Slide window and compute turning_std
#     for i in range(0, len(df) - window_size + 1, step):
#         segment = df.iloc[i:i + window_size]
#         turning_std = segment['turning_angle'].std()
        
#         segment_change = df.iloc[max(0, i - int(step)):i + window_size - int(step)]
        
#         # if turning_std > turning_std_threshold:
#         df.loc[segment_change.index, 'anomaly_std'] += (turning_std - turning_std_threshold) * abs(turning_std - turning_std_threshold)
    
#     # Label anomalies
#     df['anomaly'] = df['anomaly_std'] >= (turning_std_threshold/2)**2*K
#     df['suspecious'] = df['anomaly_std'] >= (turning_std_threshold/2)**2*K*3/5
#     return df.drop(columns=['anomaly_std', 'bearing', 'turning_angle'])
# def detect_motion_anomalies(df, threshold=0.9, window_size=3, step=1):
#     """
#     Classifies trajectory segments as 'normal' or 'random_walk' using PCA on displacements in rolling windows.
    
#     Parameters:
#     - df: DataFrame with columns ['Sampled_Date', 'Latitude', 'Longitude'].
#     - threshold: Base PCA variance ratio threshold (default=0.9).
#     - window_size: Number of points in each rolling window (default=3).
#     - step: Number of points to advance between windows (default=1).
    
#     Returns:
#     - DataFrame with 'anomaly' column added.
#     """
#     K = window_size/step
#     df['anomaly_score'] = 0.0  # Initialize anomaly scores
    
#     for i in range(0, len(df) - window_size + 1, step):
#         segment = df.iloc[i:i + window_size]
        
#         data = np.column_stack([
#             segment['Latitude'].values,
#             segment['Longitude'].values
#         ])
        
#         if len(data) < 2:
#             continue  # Skip if insufficient data for PCA
            
#         # Standardize and PCA
#         X = StandardScaler().fit_transform(data)
#         pca = PCA(n_components=2)
#         pca.fit(X)
#         variance_ratio = pca.explained_variance_ratio_[0]  # First component
        
#         # Apply this contribution to all points in the overlapping segment
#         segment_start = max(0, i - int(step))
#         segment_end = i + window_size - int(step)
#         overlap_segment = df.iloc[segment_start:segment_end]
        
#         df.loc[overlap_segment.index, 'anomaly_score'] += 1 - variance_ratio
    
#     # Determine anomalies based on accumulated scores

#     df['anomaly'] = df['anomaly_score'] >=  threshold * K
#     df['suspecious'] = df['anomaly_score'] >=  threshold * 2/3 * K
    
#     return df

def detect_anomalies(df, threshold_percent=50):
    """
    Flag points that deviate from a constant-displacement extrapolation.

    For every consecutive triple (x0, x1, x2) the latitude/longitude step
    x0 -> x1 is repeated from x1 to predict x2. The geodesic distance between
    that prediction and the real x2 is expressed as a percentage of the
    geodesic length of the x0 -> x1 step, and x2 is flagged when the
    percentage is too large.

    Args:
        df: DataFrame with trajectory data (must contain Latitude and
            Longitude). Rows are processed in their existing order and by
            position, so any index is accepted. The input is not modified.
        threshold_percent: Error percentage above which a point is marked
            'anomaly'. 'suspecious' uses half of this value.

    Returns:
        Copy of ``df`` with two boolean columns, 'anomaly' and 'suspecious'.
        The first two rows, and any point whose preceding step has zero
        length, are always False. No prediction column is added.
    """
    df = df.copy()
    n_rows = len(df)

    # Flags are collected by position and written once at the end. Writing
    # them through index labels would hit every row sharing a label when the
    # index is not unique.
    anomaly_flags = np.zeros(n_rows, dtype=bool)
    suspicious_flags = np.zeros(n_rows, dtype=bool)

    if n_rows >= 3:
        lats = df['Latitude'].to_numpy()
        lons = df['Longitude'].to_numpy()
        suspicious_percent = threshold_percent / 2

        for k in range(2, n_rows):
            lat0, lon0 = lats[k - 2], lons[k - 2]
            lat1, lon1 = lats[k - 1], lons[k - 1]

            # Length of the reference step x0 -> x1
            displacement = geodesic((lat0, lon0), (lat1, lon1)).meters
            if not displacement > 0:
                continue  # no reference step: a relative error is undefined

            # Predict x2 by applying the same step to x1
            pred_lat = lat1 + (lat1 - lat0)
            pred_lon = lon1 + (lon1 - lon0)

            # Distance between predicted and actual x2
            error_distance = geodesic((pred_lat, pred_lon), (lats[k], lons[k])).meters

            error_percent = (error_distance / displacement) * 100
            anomaly_flags[k] = error_percent > threshold_percent
            suspicious_flags[k] = error_percent > suspicious_percent

    df['anomaly'] = anomaly_flags
    df['suspecious'] = suspicious_flags
    return df

In [406]:
# Batch run: for a fixed list of vessels, annotate every mid-length sequence
# with detect_anomalies and export the resulting map as HTML and PNG.
# Other detectors tried in this spot: detect_turning_anomalies(window_size=12,
# step=3, turning_std_threshold=40) and detect_motion_anomalies(threshold=0.25,
# window_size=12, step=3).

# Create the output folder up front so the export does not fail on a fresh checkout.
os.makedirs("results", exist_ok=True)

for mmsi in ["440045160", "440004930", "440013420", "440015360", "440017320", "440065010", "440065580", "440070250", "440125230"]:
    sample_traj = pd.read_csv(dynamic_data_files_dict[mmsi], index_col=0)
    sample_traj['Sampled_Date'] = pd.to_datetime(sample_traj['Sampled_Date'])
    sample_traj_sequences = get_trajectory_sequences(sample_traj)

    for i, sample_traj_sequence_i in enumerate(sample_traj_sequences):
        # Only sequences of 400 to 900 rows are processed.
        if len(sample_traj_sequence_i) > 900 or len(sample_traj_sequence_i) < 400:
            continue

        df_annotated = detect_anomalies(sample_traj_sequence_i, threshold_percent=150)

        # Group flagged rows into runs of consecutive samples for plotting.
        sample_traj_sequence_anomaly = df_annotated[df_annotated["anomaly"]].reset_index()
        sample_traj_sequence_anomaly_sequences = get_trajectory_sequences(sample_traj_sequence_anomaly)
        sample_traj_sequence_suspecious = df_annotated[df_annotated["suspecious"]].reset_index()
        sample_traj_sequence_suspecious_sequences = get_trajectory_sequences(sample_traj_sequence_suspecious)

        fig = plot_plotly_trajectory_groups(
            [[sample_traj_sequence_i], sample_traj_sequence_suspecious_sequences, sample_traj_sequence_anomaly_sequences],
            ["initial trajectory", "suspecious", "anomaly"],
        )

        # Take the first row by position, not by the label 0, so the lookup
        # does not depend on how the sequence happens to be indexed.
        output_stem = f"results/{int(sample_traj_sequence_i['MMSI'].iloc[0])}_{i}"
        fig.write_html(f"{output_stem}.html")
        fig.write_image(f"{output_stem}.png")