## Imports

In [212]:
from typing import List
import numpy as np
import pandas as pd
import re

from dataclasses import dataclass

from scipy.stats import entropy
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.metrics import davies_bouldin_score

import hdbscan

import os
from glob import glob

import plotly.express as px
import plotly.graph_objects as go

import xgboost as xgb

import warnings
warnings.filterwarnings('ignore')


## AIS DATA

In [87]:
@dataclass
class AISColumnNames:
    Date: str = "timestamp"
    Sampled_Date: str = "sampled_timestamp"
    Latitude: str = "latitude"
    Longitude: str = "longitude"
    Pseudo_Longitude: str = "pseudo_longitude"
    SOG: str = "sog"
    COG: str = "cog"
    Heading: str = "heading"

    n_Latitude: str = "norm_latitude"
    n_Longitude: str = "norm_longitude"
    n_SOG: str = "norm_sog"
    n_COG: str = "norm_cog"
    n_Heading: str = "norm_heading"

    is_synthetic: str = "is_synthetic"
    to_predict: str = "to_predict"

cols: AISColumnNames = AISColumnNames()
target_freq_in_minutes = 10
target_freq: str = f"{target_freq_in_minutes}min"
sample_T: pd.Timedelta = pd.Timedelta(minutes=target_freq_in_minutes)

def get_file_pairs(folder_path: str = "../../data/loitering_sampled/"):
    # Get all CSV files in the folder
    csv_files = glob(os.path.join(folder_path, "*.csv"))

    # Function to extract the base name (mmsi and eta_val) for pairing
    def get_base_name(filename):
        # Remove folder path and extension
        name = os.path.basename(filename).replace('.csv', '')
        # Remove '_loitering' if present
        return name.replace('_loitering', '')

    # Group files by their base name
    file_pairs = {}
    for file in csv_files:
        base_name = get_base_name(file)
        if base_name not in file_pairs:
            file_pairs[base_name] = {'origial': None, 'loitering': None}
        if file.endswith('_loitering.csv'):
            file_pairs[base_name]['loitering'] = file
        else:
            file_pairs[base_name]['origial'] = file
            
    return file_pairs
            
def get_data(file):
    mmsi_match = re.search(r'mmsi_(\d+)', file)
    if mmsi_match:
        mmsi = mmsi_match.group(1)
    df_original = pd.read_csv(file, index_col=0)
    df_original[cols.Sampled_Date] = pd.to_datetime(df_original[cols.Sampled_Date], errors="coerce")

    df = df_original.copy()
    df['target_id'] = mmsi  
    df = df.set_index(cols.Sampled_Date)
    return df

def get_trajectory_sequences(trajectory_sampled: pd.DataFrame, time_column_name=None
    ) -> List[pd.DataFrame]:
        trajectory_sequences: List[pd.DataFrame] = []  # To store the sequences
        current_sequence = pd.DataFrame(
            columns=trajectory_sampled.columns
        )  # DF To track the current sequence

        # Iterate through the timestamps
        for i in range(len(trajectory_sampled) - 1):
            if (
                trajectory_sampled.index[i + 1]
                - trajectory_sampled.index[i]
                == sample_T
            ):
                # If the difference is 10 minutes, add the current timestamp to the sequence
                if len(current_sequence) == 0:
                    current_sequence = trajectory_sampled.iloc[
                        [i]
                    ]  # Add the first timestamp of the sequence
                current_sequence = pd.concat(
                    [current_sequence, trajectory_sampled.iloc[[i + 1]]],
                    # ignore_index=True,
                )  # Add the next timestamp
            else:
                # If the difference is not 10 minutes, end the current sequence
                if len(current_sequence) != 0:
                    trajectory_sequences.append(
                        current_sequence
                    )  # Store the completed sequence
                    current_sequence = pd.DataFrame(
                        columns=trajectory_sampled.columns
                    )  # Reset the current sequence

        # Handle the last sequence if it ends at the last timestamp
        if len(current_sequence) != 0:
            trajectory_sequences.append(current_sequence)
            
        # handle one last point
        if trajectory_sequences[-1].index[-1] != trajectory_sampled.index[-1]:
            trajectory_sequences.append(trajectory_sampled.iloc[[-1]])
            
        # handle one first point
        if trajectory_sequences[0].index[0] != trajectory_sampled.index[0]:
            trajectory_sequences.append(trajectory_sampled.iloc[[0]])

        return trajectory_sequences
    
def plot_plotly_trajectory_groups(df_groups: List[List[pd.DataFrame]],
                         group_names, 
                         color_sequence=None,
                         line_width=2,
                         marker_size=4):
    if not df_groups:
        raise ValueError("Empty list of DataFrame groups provided")
    
    if color_sequence is None:
        color_sequence = px.colors.qualitative.Plotly
    
    # Create empty figure with proper mapbox setup
    fig = px.scatter_mapbox(lat=[None], lon=[None]).update_layout(
        mapbox_style="open-street-map",
        mapbox_zoom=8,
        height=600
    )
    
    for group_id, df_group in enumerate(df_groups):
        group_color = color_sequence[group_id % len(color_sequence)]
        
        for segment_id, df in enumerate(df_group):
            if len(df) == 0:
                continue  # Skip empty dataframes
                
            customdata = pd.concat([
                pd.Series(df.index, name=cols.Sampled_Date, index=df.index),
                df[cols.SOG],
                df[cols.COG],
                
            ], axis=1)
                        
            # Add line trace for this segment
            fig.add_trace(
                px.line_mapbox(
                    df,
                    lat=cols.Latitude,
                    lon=cols.Longitude,
                    color_discrete_sequence=[group_color]
                ).data[0].update(
                    mode="lines+markers",
                    line=dict(width=line_width),
                    marker=dict(size=marker_size),
                    name=f"{group_names[group_id]}",
                    showlegend=(segment_id == 0),  # Only show legend for first segment
                    legendgroup=f"{group_names[group_id]}",
                    hoverinfo="text",
                    customdata=customdata,
                    hovertemplate=(
                        "Latitude: %{lat}<br>"
                        "Longitude: %{lon}<br>"
                        "Date: %{customdata[0]}<br>"
                        "SOG: %{customdata[1]}<br>"
                        "COG: %{customdata[2]}<br>"
                    )
                )
            )
    
    fig.update_layout(
        margin={"r":0,"t":40,"l":0,"b":0},
        showlegend=True,
        legend_title_text="Trajectory Groups",
        title="Vessel Trajectory"
    )
    
    # Auto-zoom to the data
    if len(df_groups) > 0 and len(df_groups[0]) > 0:
        first_df = df_groups[0][0]
        fig.update_mapboxes(
            center=dict(
                lat=first_df[cols.Latitude].mean(),
                lon=first_df[cols.Longitude].mean()
            )
        )
    
    return fig

## Interpolation

In [12]:
def interpolate_traj(df, n_to_predict=20, interpolation_method='linear'):
    timestamps = df.index
    time_steps = np.array([(timestamps[i] - timestamps[i-1]).total_seconds() 
                          for i in range(1, len(timestamps))])
    
    freq = f'{time_steps.min()/60}min'
    
    # Find gaps larger than threshold
    gap_mask = time_steps > n_to_predict*time_steps.min()
    if gap_mask.any():
        # Split the index into segments at large gaps
        gap_indices = np.where(gap_mask)[0]
        segments = []
        start_idx = 0
        
        for gap_idx in gap_indices:
            # Create segment up to gap
            segment = pd.date_range(
                start=timestamps[start_idx],
                end=timestamps[gap_idx],
                freq=freq
            )
            segments.append(segment)
            start_idx = gap_idx + 1
        
        # Add final segment
        segment = pd.date_range(
            start=timestamps[start_idx],
            end=timestamps[-1],
            freq=freq
        )
        segments.append(segment)
        
        # Combine all segments
        full_range = pd.DatetimeIndex(np.concatenate(segments))
    else:
        # If no large gaps, use original logic
        full_range = pd.date_range(
            start=df.index.min(),
            end=df.index.max(),
            freq=freq
        )
    
    # Reindex to the complete time range
    df = df.reindex(full_range)
    
    # Reset index to make Sampled_Date a column again
    df = df.reset_index().rename(columns={'index': cols.Sampled_Date})
    
    # Interpolate numeric columns
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    df[numeric_cols] = df[numeric_cols].interpolate(method=interpolation_method)
    
    df = df.set_index(cols.Sampled_Date)
    
    return df

## Regular reciprocating

In [58]:
def spectral_concentrations_fig(spec_concentrations, name='Spectral Concentration'):
    
    max_idx = np.argmax(spec_concentrations)

    fig = go.Figure()

    # Add line trace for Spectral Concentrations
    fig.add_trace(
        go.Scatter(
            x=list(range(len(spec_concentrations))),
            y=spec_concentrations,
            mode='lines+markers',
            name=name,
            line=dict(color='#1f77b4'),  # Blue color for line
            marker=dict(size=8)
        )
    )

    # Add a single point for the maximum Spectral Concentration
    fig.add_trace(
        go.Scatter(
            x=[max_idx],
            y=[spec_concentrations[max_idx]],
            mode='markers',
            name=f'Max {name}',
            marker=dict(
                size=12,
                color='#ff0000',  # Red color for max point
                symbol='star'
            )
        )
    )

    # Update layout for better visualization
    fig.update_layout(
        title=f'{name} Across Segments',
        xaxis_title='Window Start Index',
        yaxis_title=name,
        yaxis=dict(range=[0, max(1, max(spec_concentrations) * 1.1)]),  # Start y-axis at 0, add padding
        showlegend=True,
        template='plotly_white',  # Light theme for visibility
        hovermode='closest'
    )
    
    return fig

def find_high_spec_concentration_segments(interpolated_df_sequences, col_name=cols.COG, window_size=40, step_size=10, sc_threshold=0.4):
    """
    Find segments in a list of DataFrames with Spectral Concentration above a threshold.
    
    Parameters:
    - interpolated_df_sequences: List of pandas DataFrames containing the data.
    - col_name: Column name to analyze (default: 'COG').
    - window_size: Size of the sliding window (default: 40).
    - step_size: Step size for sliding window (default: 10).
    - sc_threshold: Spectral Concentration threshold (default: 0.4).
    
    Returns:
    - results: List of dictionaries containing segment details (sequence index, start index, 
      Spectral Concentration, and segment data) for segments above the threshold.
    """
    results = []
    
    spec_concentrations = []
    # Iterate through each DataFrame in the sequence
    for seq_idx, df in enumerate(interpolated_df_sequences):
        # Ensure the column exists
        if col_name not in df.columns:
            print(f"Warning: Column '{col_name}' not found in DataFrame {seq_idx}. Skipping.")
            continue
            
        # Slide through the DataFrame
        for start in range(0, len(df) - window_size + 1, step_size):
            # Extract the window
            segment = df[col_name].iloc[start:start + window_size].values
            
            # Compute FFT
            fft_result = np.fft.fft(segment)
            
            # Compute power spectrum (squared magnitude)
            power_spectrum = np.abs(fft_result) ** 2
            
            # Exclude DC component (index 0)
            power_spectrum_no_dc = power_spectrum[1:]
            
            if len(power_spectrum_no_dc) == 0:
                continue
                
            # Compute Spectral Concentration: max power / total power (excluding DC)
            max_power = np.max(power_spectrum_no_dc)
            total_power = np.sum(power_spectrum_no_dc)
            
            if total_power > 0:  # Avoid division by zero
                spec_concentration = max_power / total_power
            else:
                spec_concentration = 0
            
            spec_concentrations.append(spec_concentration)
            
            # Check if Spectral Concentration exceeds the threshold
            if spec_concentration > sc_threshold:
                results.append({
                    'sequence_index': seq_idx,
                    'start_index': start,
                    'spectral_concentration': spec_concentration,
                    'segment': segment
                })
    
    # If no segments meet the threshold, print a message
    if not results:
        print(f"No segments found with Spectral Concentration > {sc_threshold}.")
    
    return results, spec_concentrations

def calculate_column_entropy(df, column_name, num_bins=36):
    """
    Calculate the Shannon entropy of a specified column in a DataFrame with binning for continuous data.
    
    Parameters:
    df (pandas.DataFrame): Input DataFrame
    column_name (str): Name of the column to calculate entropy for
    num_bins (int): Number of bins for histogram discretization (default: 36)
    
    Returns:
    float: Shannon entropy value in bits (using log base 2)
    """
    # Check if column exists
    if column_name not in df.columns:
        raise ValueError(f"Column '{column_name}' not found in DataFrame")
    
    # Get histogram of values to discretize continuous data
    hist, bin_edges = np.histogram(df[column_name].dropna(), bins=num_bins, density=True)
    
    # Convert histogram to probabilities (normalize to sum to 1)
    probabilities = hist / np.sum(hist)
    
    # Filter out zero probabilities to avoid log(0)
    probabilities = probabilities[probabilities > 0]
    
    # Calculate entropy (in bits, using log base 2)
    entropy_value = entropy(probabilities, base=2)
    
    return entropy_value

## Classification

In [179]:
def is_lasso_shape(segment):
    segment_COG = segment[cols.COG].values
    
    segment_SOG_mean = segment[cols.SOG].mean()
    if segment_SOG_mean < 0.5:
        return False
    
    X_cog = np.arange(len(segment)).reshape(-1, 1)  # Index as independent variable
    y_cog = segment_COG  # Column values

    cog_std = segment_COG.std()
    
    # Fit linear regression
    model = LinearRegression()
    model.fit(X_cog, y_cog)
    y_sog_pred = model.predict(X_cog)

    # Calculate R² score
    r2 = r2_score(y_cog, y_sog_pred)
        
    if r2 > 0.9 and cog_std > 40:
        return True
    
def is_loitering(segment):
    segment_COG = segment[cols.COG].values
            
    # Compute FFT
    fft_result = np.fft.fft(segment_COG)
            
    # Compute power spectrum (squared magnitude)
    power_spectrum = np.abs(fft_result) ** 2
            
    # Exclude DC component (index 0)
    power_spectrum_no_dc = power_spectrum[1:]
                
    # Compute Spectral Concentration: max power / total power (excluding DC)
    max_power = np.max(power_spectrum_no_dc)
    total_power = np.sum(power_spectrum_no_dc)
            
    if total_power > 0:  # Avoid division by zero
        spec_concentration = max_power / total_power
    else:
        spec_concentration = 0
        
    if spec_concentration > 0.4:
        return True
            
    
def classifier_results(X: List[List[float]], y: List[float]):
    X = np.array(X)
    y = np.array(y)

    # Split data into training and test sets (80% train, 20% test)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Initialize and train the Random Forest Classifier
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = model.predict(X_test)

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy:.2f}")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

def find_lasso_shape(origial_file, n_to_predict=20, window_size=40, step_size=10):
    original_df = get_data(origial_file)
    interpolated_original_df = interpolate_traj(original_df, n_to_predict)
    interpolated_origial_df_sequences = get_trajectory_sequences(interpolated_original_df)

    done = False
    segments = []
    for i, df in enumerate(interpolated_origial_df_sequences):
        for start in range(0, len(df) - window_size + 1, step_size):
            # Extract the window
            segment = df.iloc[start:start + window_size]
            
            if is_lasso_shape(segment):
                segments.append(segment)
                fig = plot_plotly_trajectory_groups([[segment]], group_names=["Lasso shape"])
                fig.show()
                done = True
                break
            
        if done:
            break
    return segments

def find_HDBSCAN_anomlies(origial_file, n_to_predict=20, window_size=40, step_size=10):
    original_df = get_data(origial_file)
    interpolated_original_df = interpolate_traj(original_df, n_to_predict)
    interpolated_origial_df_sequences = get_trajectory_sequences(interpolated_original_df)

    done = False
    segments = []
    for i, df in enumerate(interpolated_origial_df_sequences):
        for start in range(0, len(df) - window_size + 1, step_size):
            # Extract the window
            segment = df.iloc[start:start + window_size]
            
            if is_lasso_shape(segment):
                segments.append(segment)
                fig = plot_plotly_trajectory_groups([[segment]], group_names=["Lasso shape"])
                fig.show()
                done = True
                break
            
        if done:
            break
    return segments

def get_training_data(origial_file, loitering_file, n_to_predict=20, window_size=40, step_size=10):
    original_df = get_data(origial_file)
    loitering_df = get_data(loitering_file)
    
    interpolated_original_df = interpolate_traj(original_df, n_to_predict)
    interpolated_loitering_df = interpolate_traj(loitering_df, n_to_predict)
    
    # set loitering points to one df
    interpolated_original_df["loitering"] = interpolated_original_df.index.isin(interpolated_loitering_df.index)
    
    interpolated_origial_df_sequences = get_trajectory_sequences(interpolated_original_df)

    X = []
    y = []
    for i, df in enumerate(interpolated_origial_df_sequences):
        for start in range(0, len(df) - window_size + 1, step_size):
            # Extract the window
            segment = df.iloc[start:start + window_size]
            
            segment_std_lan = segment[cols.Latitude].std()
            segment_std_lon = segment[cols.Longitude].std()
            segment_avg_sog = segment[cols.SOG].mean()
            segment_std_sog = segment[cols.SOG].std()
            segment_avg_cog = segment[cols.COG].mean()
            segment_entropy_cog = calculate_column_entropy(segment, cols.COG)
                
            X_i = [segment_std_lan, segment_std_lon, segment_avg_sog, segment_std_sog, segment_avg_cog, segment_entropy_cog]
            
            y_i = False
            # current dataset
            if len(segment[segment["loitering"]])/window_size > 0.5 or is_loitering(segment): # 
                y_i = True
                
            X.append(X_i)
            y.append(y_i)
            
    return X, y



def get_training_data_hdbscan(origial_file, n_to_predict=20, window_size=40, step_size=10):
    original_df = get_data(origial_file)

    interpolated_original_df = interpolate_traj(original_df, n_to_predict)
    interpolated_origial_df_sequences = get_trajectory_sequences(interpolated_original_df)

    X = []
    for i, df in enumerate(interpolated_origial_df_sequences):
        for start in range(0, len(df) - window_size + 1, step_size):
            # Extract the window
            segment = df.iloc[start:start + window_size]
            
            segment_avg_sog = segment[cols.SOG].mean()
            segment_std_sog = segment[cols.SOG].std()
            segment_avg_cog = segment[cols.COG].mean()
            segment_entropy_cog = calculate_column_entropy(segment, cols.COG)
            X_i = {"mean SOG": segment_avg_sog, 
                                        "std SOG": segment_std_sog, 
                                        "mean COG": segment_avg_cog, 
                                        "entropy COG": segment_entropy_cog}

                
            X.append(X_i)
            
    return X


## Start Code

In [None]:
origial_file = "../../data/loitering_sampled/len_113_mmsi_36968098_eta_val_2023-01-01 00:00:00.csv"
loitering_file = "../../data/loitering_sampled/len_22_mmsi_36968098_eta_val_2023-01-01 00:00:00_loitering.csv"

In [None]:
n_to_predict = 20
window_size = 40
step_size = 10

original_df = get_data(origial_file)
loitering_df = get_data(loitering_file)

interpolated_original_df = interpolate_traj(original_df, n_to_predict)
interpolated_loitering_df = interpolate_traj(loitering_df, n_to_predict)

init_origial_df_sequences = get_trajectory_sequences(original_df)
origial_df_sequences = get_trajectory_sequences(interpolated_original_df)
loitering_df_sequences = get_trajectory_sequences(interpolated_loitering_df)
pure_loitering_df_sequences = get_trajectory_sequences(loitering_df)

results, spectrum = find_high_spec_concentration_segments(origial_df_sequences)
loitering_segmennts = [origial_df_sequences[res['sequence_index']].iloc[res['start_index']:res['start_index'] + window_size] for res in results]
    
fig = spectral_concentrations_fig(spectrum)
fig.show()

fig = plot_plotly_trajectory_groups([init_origial_df_sequences, origial_df_sequences, loitering_segmennts, loitering_df_sequences, pure_loitering_df_sequences], group_names=["Initial", "Original", "Detected", "Loitering", "Loitering initial"])
fig.show()


## New code

In [None]:
n_to_predict = 20
window_size = 80
step_size = 10
N_data = 10000
file_pairs = get_file_pairs(folder_path = "../../data/loitering_sampled/")

all_segments = []
i = 3
for _, pair in file_pairs.items():
    origial_file = pair['origial']
    segments = find_lasso_shape(origial_file, window_size=window_size)
    all_segments = all_segments + segments
    # if len(segments) > 0:  
    #     i -= 1
    #     if i == 0:
    #         break

In [151]:
n_to_predict = 20
window_size = 40
step_size = 10
N_data = 10000
file_pairs = get_file_pairs(folder_path = "../../data/loitering_sampled/")

X_new_2 = []
y_new_2 = []    
for _, pair in file_pairs.items():
    origial_file = pair['origial']
    loitering_file = pair['loitering']
    X_of_cur_pair, y_of_cur_pair = get_training_data(origial_file, loitering_file, n_to_predict=n_to_predict, window_size=window_size, step_size=step_size)

    X_new_2 = X_new_2 + X_of_cur_pair
    y_new_2 = y_new_2 + y_of_cur_pair
    
    if len(y_new_2) > N_data:
        break


In [152]:
classifier_results(X_new_2, y_new_2)

Accuracy: 0.85
Classification Report:
              precision    recall  f1-score   support

       False       0.85      0.95      0.90      1371
        True       0.85      0.65      0.73       640

    accuracy                           0.85      2011
   macro avg       0.85      0.80      0.82      2011
weighted avg       0.85      0.85      0.84      2011



In [131]:
len(y_new)

10051

In [None]:
# numpy_array_X = np.array(X)
# # Save the NumPy array as a pickle file
# np.save('X_data.npy', numpy_array_X)
# numpy_array_y = np.array(y)
# # Save the NumPy array as a pickle file
# np.save('y_data.npy', numpy_array_y)

In [244]:
N = 100000
X_np = np.array(X[:N])
y_np = np.array(y[:N])

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_np)


# Split data into training and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_np, test_size=0.2, random_state=42)

scale_pos_weight_value = (len(y_np) - sum(y_np))/sum(y_np)
print(f"Calculated scale_pos_weight: {scale_pos_weight_value}")

# Initialize and train the Random Forest Classifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
# model = xgb.XGBClassifier(
#     objective='binary:logistic',  # For binary classification with probability output
#     n_estimators=100,             # Number of boosting rounds
#     learning_rate=0.1,            # Step size shrinkage
#     max_depth=30,                  # Maximum depth of a tree
#     subsample=0.8,                # Subsample ratio of the training instance
#     colsample_bytree=0.8,         # Subsample ratio of columns when constructing each tree
#     gamma=0.1,                    # Minimum loss reduction required to make a further partition
#     reg_alpha=0.005,              # L1 regularization term on weights
#     scale_pos_weight=scale_pos_weight_value, # Handle imbalanced dataset
#     random_state=42,              # For reproducibility
#     use_label_encoder=False,      # Suppress warning (newer versions handle this automatically)
#     eval_metric='logloss'         # Evaluation metric for monitoring training
# )

model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))

Calculated scale_pos_weight: 5.220452848967405
Accuracy: 0.89
Classification Report:
              precision    recall  f1-score   support

       False       0.90      0.98      0.94     16744
        True       0.79      0.44      0.56      3256

    accuracy                           0.89     20000
   macro avg       0.84      0.71      0.75     20000
weighted avg       0.88      0.89      0.88     20000



Calculated scale_pos_weight: 2.6429872495446265
Accuracy: 0.87
Classification Report:
              precision    recall  f1-score   support

       False       0.91      0.91      0.91      1484
        True       0.74      0.75      0.74       516

    accuracy                           0.87      2000
   macro avg       0.82      0.83      0.83      2000
weighted avg       0.87      0.87      0.87      2000

In [231]:
n_to_predict = 20
window_size = 30
step_size = 10
N_data = 10000
file_pairs = get_file_pairs(folder_path = "../../data/loitering_sampled/")

X_hdbscan = []
for _, pair in file_pairs.items():
    origial_file = pair['origial']
    X_hdbscan_cur = get_training_data_hdbscan(origial_file, n_to_predict=n_to_predict, window_size=window_size, step_size=step_size)

    X_hdbscan = X_hdbscan + X_hdbscan_cur
    
    if len(X_hdbscan) > N_data:
        break

X_hdbscan = pd.DataFrame(X_hdbscan)
scaler = StandardScaler()
X_hdbscan_scaled = scaler.fit_transform(X_hdbscan)


In [182]:
X_hdbscan

Unnamed: 0,mean SOG,std SOG,mean COG,entropy COG
0,3.659449,1.302689,163.045364,2.887972
1,3.745081,1.301206,184.039856,3.250159
2,3.684348,1.304718,183.688801,3.492897
3,4.021296,1.647447,184.637818,3.574024
4,4.184013,1.584327,183.768164,3.581287
...,...,...,...,...
10046,1.717975,0.284671,341.401360,2.402078
10047,1.765879,0.265567,342.137891,1.787907
10048,1.762064,0.258114,342.079229,1.749892
10049,1.759755,0.256881,342.043731,1.856780


In [232]:

clusterer = hdbscan.HDBSCAN(min_cluster_size=2, min_samples=1, gen_min_span_tree=True)
labels = clusterer.fit_predict(X_hdbscan_scaled)

# Step 3: Identify outliers
# Points with label -1 are outliers
X_hdbscan['Cluster'] = labels
outliers = X_hdbscan[labels == -1]

print("\nOutliers (points labeled as -1):")
print(outliers if not outliers.empty else "No outliers detected.")

# Optional: Print number of clusters and outliers
n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
n_outliers = sum(labels == -1)
print(f"\nNumber of clusters: {n_clusters}")
print(f"Number of outliers: {n_outliers}")


Outliers (points labeled as -1):
       mean SOG   std SOG    mean COG  entropy COG  Cluster
6      4.912598  2.244273  178.415809     3.105587       -1
8      5.003432  1.978681  251.913480     2.811537       -1
9      9.201945  0.671576  139.481437     4.389898       -1
10     8.769742  0.339482  145.412597     4.615061       -1
11     4.117376  1.852534  267.477397     2.828934       -1
...         ...       ...         ...          ...      ...
9974   0.270132  0.372520  154.656812     4.323231       -1
9975   0.730851  0.381680  155.809984     3.586569       -1
9988   0.256063  0.053221  221.602070     4.481728       -1
10001  6.369612  2.611143  191.514661     4.256565       -1
10002  3.055008  2.411169  241.235232     4.306891       -1

[1967 rows x 5 columns]

Number of clusters: 2595
Number of outliers: 1967


In [233]:
X_hdbscan["Suspecious"]=X_hdbscan["Cluster"] < 0

In [239]:
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_hdbscan_scaled[:8000])

# Plot
fig = px.scatter(
    x=X_pca[:, 0], y=X_pca[:, 1], color=X_hdbscan["Suspecious"][:8000],
    title='HDBSCAN Clusters (PCA)',
    labels={'color': 'Suspecious'}
)
fig.show()

In [235]:
clustered_samples_indices = X_hdbscan['Cluster'] != -1
X_hdbscan_clustered = X_hdbscan[clustered_samples_indices]
labels_clustered = X_hdbscan['Cluster'][clustered_samples_indices]
silhouette_avg = silhouette_score(X_hdbscan_clustered, labels_clustered)
davies_bouldin_avg = davies_bouldin_score(X_hdbscan_clustered, labels_clustered)
    

In [236]:
silhouette_avg

0.44079945020911543

In [237]:
davies_bouldin_avg

0.9457894349487985

In [230]:
len(X_hdbscan)

10096