In [None]:
import os
import pandas as pd
import plotly.express as px
# from tqdm.notebook import tqdm  # for progress bars
import plotly.io as pio


In [54]:
# Start from an empty frame; the loading cell further down fills `all_data`
# with the AIS position records read from `folder_path`.
all_data = pd.DataFrame()

In [73]:
# Set the folder path (a relative path, resolved against the notebook's working directory)
folder_path = "../../data/FishingKoreaAISFull"

# Configuration
sample_fraction = 1.00  # Fraction of rows kept per file via DataFrame.sample; 1.00 keeps every row (use e.g. 0.01 for a 1% sample)
min_points_per_vessel = 100000  # Only show vessels with at least this many points (a later cell re-sets this to 900)
output_file = "ais_trajectories_full.html"  # Path the interactive map is written to by pio.write_html

In [64]:
# Read every "Dynamic" AIS file in `folder_path` and combine them into `all_data`.
#
# The per-file frames are collected in a list and concatenated once at the end:
#   * growing a DataFrame with pd.concat inside the loop copies all previous
#     rows on every iteration (quadratic in the number of files);
#   * rebuilding `all_data` from scratch makes the cell idempotent, so running
#     it twice no longer duplicates every record.
frames = []

# sorted(): os.listdir returns entries in arbitrary order; sorting keeps the
# row order of `all_data` reproducible across runs and machines.
for filename in sorted(os.listdir(folder_path)):
    if 'Dynamic' not in filename:
        continue

    file_path = os.path.join(folder_path, filename)
    print(file_path)

    # Read only the columns needed downstream
    df = pd.read_csv(file_path, usecols=['MMSI', 'Date', 'Latitude', 'Longitude'])

    # Sample the data. Note that sample() also shuffles the rows, even with
    # frac=1.0; the fixed seed makes that shuffle repeatable.
    df = df.sample(frac=sample_fraction, random_state=0)
    frames.append(df)

if frames:
    all_data = pd.concat(frames, ignore_index=True)
else:
    # No matching file: keep an empty frame that still has the expected columns
    # so that downstream cells fail on "no data" rather than on a KeyError.
    all_data = pd.DataFrame(columns=['MMSI', 'Date', 'Latitude', 'Longitude'])


../../data/FishingKoreaAISFull/Dynamic_20230514_fishing_boats.csv
../../data/FishingKoreaAISFull/Dynamic_20230508_fishing_boats.csv
../../data/FishingKoreaAISFull/Dynamic_20230520_fishing_boats.csv
../../data/FishingKoreaAISFull/Dynamic_20230511_fishing_boats.csv
../../data/FishingKoreaAISFull/Dynamic_20230525_fishing_boats.csv
../../data/FishingKoreaAISFull/Dynamic_20230515_fishing_boats.csv
../../data/FishingKoreaAISFull/Dynamic_20230509_fishing_boats.csv
../../data/FishingKoreaAISFull/Dynamic_20230521_fishing_boats.csv
../../data/FishingKoreaAISFull/Dynamic_20230510_fishing_boats.csv
../../data/FishingKoreaAISFull/Dynamic_20230524_fishing_boats.csv
../../data/FishingKoreaAISFull/Dynamic_20230516_fishing_boats.csv
../../data/FishingKoreaAISFull/Dynamic_20230522_fishing_boats.csv
../../data/FishingKoreaAISFull/Dynamic_20230513_fishing_boats.csv
../../data/FishingKoreaAISFull/Dynamic_20230527_fishing_boats.csv
../../data/FishingKoreaAISFull/Dynamic_20230517_fishing_boats.csv
../../data

In [74]:
# Drop sparsely-reported vessels: keep only the MMSIs that contributed at
# least `min_points_per_vessel` rows to `all_data`, then report how many
# rows survive.
vessel_counts = all_data['MMSI'].value_counts()
has_enough_points = vessel_counts.ge(min_points_per_vessel)
valid_vessels = vessel_counts.loc[has_enough_points].index

is_valid_row = all_data['MMSI'].isin(valid_vessels)
filtered_data = all_data.loc[is_valid_row]
print(filtered_data.shape[0])

17184438


In [75]:

# Draw one line per vessel on an interactive map and save it as HTML.
#
# Only the first `max_plot_rows` rows of `filtered_data` are plotted to keep
# the figure size manageable; they are taken *before* sorting, so which points
# end up on the map depends on the row order of `filtered_data`.
max_plot_rows = 1_000_000

# Sort by vessel then time so that each line connects consecutive fixes.
# NOTE(review): 'Date' is still whatever dtype read_csv produced here; if it is
# text, this ordering is only chronological for an ISO-like format - confirm.
plot_data = filtered_data[:max_plot_rows].sort_values(['MMSI', 'Date'])

# px.line_map draws on the MapLibre-based `map` subplot, so the base map has to
# be chosen through `map_style`. The previous
# `fig.update_layout(mapbox_style="stamen-terrain")` configured the unused
# `mapbox` subplot and therefore never changed the rendered map.
fig = px.line_map(plot_data,
                  lat="Latitude",
                  lon="Longitude",
                  color="MMSI",
                  line_group="MMSI",
                  hover_name="MMSI",
                  zoom=6,
                  height=800,
                  map_style="open-street-map",
                  title="Fishing Vessel Trajectories with Lines")

# Save as interactive HTML (auto_open=True also opens it in the default browser)
pio.write_html(fig, file=output_file, auto_open=True)

In [76]:
# Number of rows left in `filtered_data` after the per-vessel point-count filter
len(filtered_data)

17184438

In [78]:
import pandas as pd
import numpy as np
from sklearn.cluster import DBSCAN
from tslearn.metrics import dtw
from scipy.spatial.distance import squareform
import plotly.express as px
import plotly.graph_objects as go



h5py not installed, hdf5 features will not be supported.
Install h5py to use hdf5 features: http://docs.h5py.org/



In [82]:
# Work on a small slice of the filtered data: the pairwise DTW computed in the
# next cell grows quadratically with the number of trajectories.
df = filtered_data[:10000].copy()

# Convert 'Date' to datetime and sort so each vessel's points are in time order
df['Date'] = pd.to_datetime(df['Date'])
df = df.sort_values(['MMSI', 'Date'])

# One trajectory (list of [lat, lon] pairs) per vessel, in sorted-MMSI order
# (groupby sorts its keys by default). A plain comprehension replaces
# groupby(...).apply(...), which is deprecated when the applied frame still
# contains the grouping column; the resulting list is the same.
trajectories = [
    group[['Latitude', 'Longitude']].values.tolist()
    for _mmsi, group in df.groupby('MMSI')
]





In [83]:
# Symmetric matrix of pairwise DTW distances between trajectories.
# Only the strict upper triangle is computed; each value is mirrored into the
# lower triangle and the diagonal stays at zero.
n_traj = len(trajectories)
dist_matrix = np.zeros((n_traj, n_traj))

upper_rows, upper_cols = np.triu_indices(n_traj, k=1)
for i, j in zip(upper_rows.tolist(), upper_cols.tolist()):
    dist = dtw(trajectories[i], trajectories[j])
    dist_matrix[i, j] = dist_matrix[j, i] = dist

# Condensed (1-D) form of the same distances
condensed_dist = squareform(dist_matrix)

In [84]:
# Cluster trajectories with DBSCAN on the precomputed DTW distance matrix.
# Adjust parameters based on your data density.
db = DBSCAN(metric="precomputed", eps=0.1, min_samples=2)  # Tune eps/min_samples
clusters = db.fit_predict(dist_matrix)

# Assign cluster labels back to the DataFrame.
# `clusters[k]` belongs to the k-th trajectory, i.e. the k-th MMSI in sorted
# order (the groupby key order). Mapping by MMSI, rather than repeating labels
# positionally with np.repeat, stays correct even if the rows of `df` are not
# sorted by MMSI. pd.Series raises if the number of labels and vessels differ.
group_keys = df.groupby('MMSI').size().index
cluster_by_mmsi = pd.Series(clusters, index=group_keys)
df['Cluster'] = df['MMSI'].map(cluster_by_mmsi)

In [85]:
# Map of the clustered points, one colour per cluster.
#
# Uses the MapLibre-based `scatter_map` / `Scattermap` API, consistent with the
# `px.line_map` call earlier in this notebook; the `*_mapbox` variants are
# deprecated in the plotly versions that provide `line_map`.
#
# Cluster ids are labels, not quantities: casting them to strings makes plotly
# use a discrete legend instead of a continuous colour bar.
plot_df = df.assign(Cluster=df['Cluster'].astype(str))

fig = px.scatter_map(
    plot_df,
    lat="Latitude",
    lon="Longitude",
    color="Cluster",
    hover_data=["MMSI", "Date"],
    map_style="carto-positron",
    title="Fishing Ship Trajectory Clusters",
    zoom=5
)

# Add lines between consecutive points for each ship
for mmsi, group in df.groupby('MMSI'):
    fig.add_trace(
        go.Scattermap(
            lon=group['Longitude'],
            lat=group['Latitude'],
            mode='lines',
            line=dict(width=1, color='gray'),
            showlegend=False,
            hoverinfo='none'
        )
    )

fig.update_layout(margin={"r": 0, "t": 30, "l": 0, "b": 0})
fig.show()

In [99]:
# Sanity check: number of rows currently in `filtered_data`
len(filtered_data)

17184438

In [129]:
# Re-filter with a much lower threshold so that more vessels enter the
# feature-based clustering below. This overrides the value of
# `min_points_per_vessel` set in the configuration cell.
min_points_per_vessel = 900
vessel_counts = all_data['MMSI'].value_counts()
valid_vessels = vessel_counts[vessel_counts >= min_points_per_vessel].index
filtered_data = all_data[all_data['MMSI'].isin(valid_vessels)]

df = filtered_data #[:1000000].copy()

# One (n_points, 2) array of [lat, lon] per vessel, in sorted-MMSI order
# (groupby sorts its keys by default). A plain comprehension replaces
# groupby(...).apply(...), which is deprecated when the applied frame still
# contains the grouping column; the resulting list is the same.
trajectories = [
    group[['Latitude', 'Longitude']].values
    for _mmsi, group in df.groupby('MMSI')
]





In [135]:
def extract_features(traj):
    """Summarise one trajectory as a flat feature vector.

    Parameters
    ----------
    traj : array-like
        Sequence of points; statistics are taken along axis 0, i.e. per
        column across all points.

    Returns
    -------
    numpy.ndarray
        Concatenation of, in this order: per-column mean, per-column standard
        deviation, the number of points, and the per-column maximum.
    """
    points = np.array(traj)

    centre = points.mean(axis=0)     # average position
    spread = points.std(axis=0)      # variability of the positions
    n_points = [len(points)]         # how many fixes the trajectory has
    upper = points.max(axis=0)       # largest value seen in each column

    return np.concatenate((centre, spread, n_points, upper))

# Feature matrix: one row per trajectory, in the same order as `trajectories`
X = np.array(list(map(extract_features, trajectories)))

In [140]:
from sklearn.preprocessing import StandardScaler
import hdbscan

# Standardize the features before clustering with a euclidean metric
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

clusterer = hdbscan.HDBSCAN(
    min_cluster_size=6,      # Smallest group to consider a cluster
    min_samples=4,           # Controls noise sensitivity
    metric='euclidean',      # For scaled features
    cluster_selection_method='eom'  # "Excess of Mass" (balanced clusters)
)
clusters = clusterer.fit_predict(X_scaled)

# Assign labels back to DataFrame.
# Row k of X (and so clusters[k]) belongs to the k-th vessel of
# `df.groupby('MMSI')`, i.e. the k-th MMSI of `df` in *sorted* order.
# The labels were previously zipped with `all_data['MMSI'].unique()`, which is
# in order of first appearance and also contains vessels that were filtered
# out, so labels ended up attached to the wrong vessels.
vessel_ids = np.sort(df['MMSI'].dropna().unique())
if len(vessel_ids) != len(clusters):
    raise ValueError(
        f"{len(clusters)} cluster labels for {len(vessel_ids)} vessels: "
        "`df` is not the frame the trajectories were built from"
    )

# Vessels that were not clustered (filtered out earlier) get NaN
all_data['Cluster'] = all_data['MMSI'].map(dict(zip(vessel_ids, clusters)))


'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.


'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.



In [141]:
from sklearn.decomposition import PCA
import plotly.express as px

# Reduce the scaled feature vectors to 2D for plotting
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Row k of X_pca belongs to the k-th vessel of `df.groupby('MMSI')`, i.e. the
# k-th MMSI in sorted order. `df['MMSI'].unique()` is in order of first
# appearance, which attached the wrong vessel id to each point on hover.
vessel_ids = np.sort(df['MMSI'].dropna().unique())

# Plot. Cluster ids are labels, not quantities: casting them to strings gives
# a discrete legend instead of a continuous colour bar.
fig = px.scatter(
    x=X_pca[:, 0], y=X_pca[:, 1], color=clusters.astype(str),
    hover_name=vessel_ids,
    title='HDBSCAN Clusters (PCA)',
    labels={'color': 'Cluster'}
)
fig.show()

In [None]:
# import plotly.express as px

# # Create a DataFrame for the PCA results
# pca_df = pd.DataFrame({
#     'PC1': X_pca[:, 0],
#     'PC2': X_pca[:, 1],
#     'Cluster': clusters,
#     'MMSI': df['MMSI'].unique()
# })

# fig = px.scatter(
#     pca_df, x='PC1', y='PC2', color='Cluster',
#     hover_data=['MMSI'], title='Trajectory Embeddings (PCA)'
# )
# fig.show()

In [146]:
# How many entries of `clusters` carry the label -1 (shown as a float)
(clusters == -1).astype(float).sum()

319.0

In [147]:
# Total number of cluster labels (one per row of the feature matrix)
len(clusters)

626