In [1]:
import glob
import os
import re
from dataclasses import dataclass
from typing import Dict, List

import numpy as np
import pandas as pd

In [16]:
def load_boat_trajectories(data_files: List[str]) -> Dict[int, pd.DataFrame]:
    """Read dynamic AIS CSV files and gather the rows of each boat.

    Every file is read with ``pd.read_csv`` and split on its ``"MMSI"``
    column. The pieces belonging to one MMSI are collected in a list and
    concatenated once at the end, instead of re-concatenating the growing
    frame for every file (which copies the accumulated rows each time).

    Args:
        data_files: Paths of the CSV files to read, in the order in which
            their rows should appear in the per-boat frames.

    Returns:
        Mapping ``MMSI -> DataFrame`` holding all rows of that boat. Each
        frame is a fresh object with a 0..n-1 index (``ignore_index=True``).
    """
    frames_by_mmsi: Dict[int, List[pd.DataFrame]] = {}
    for data_file in data_files:
        print(f"Reading {data_file}...")
        df_dynamic = pd.read_csv(data_file)
        for mmsi, boat_rows in df_dynamic.groupby("MMSI"):
            frames_by_mmsi.setdefault(mmsi, []).append(boat_rows)
        print("Done!")

    # pd.concat returns a new frame, so the results are not views of the
    # per-file frames (no SettingWithCopyWarning when they are modified later).
    return {
        mmsi: pd.concat(frames, ignore_index=True)
        for mmsi, frames in frames_by_mmsi.items()
    }


dataset_path = "../../data/FishingKoreaAISFull/Dynamic_*.csv"
# glob.glob returns files in arbitrary, filesystem-dependent order; sorting the
# names makes the read order (and thus the row order per boat) reproducible.
dynamic_data_files = sorted(glob.glob(dataset_path))
all_boats_trajectories = load_boat_trajectories(dynamic_data_files)

Reading ../../data/FishingKoreaAISFull/Dynamic_20230514_fishing_boats.csv...
Done!
Reading ../../data/FishingKoreaAISFull/Dynamic_20230508_fishing_boats.csv...
Done!
Reading ../../data/FishingKoreaAISFull/Dynamic_20230520_fishing_boats.csv...
Done!
Reading ../../data/FishingKoreaAISFull/Dynamic_20230511_fishing_boats.csv...
Done!
Reading ../../data/FishingKoreaAISFull/Dynamic_20230525_fishing_boats.csv...
Done!
Reading ../../data/FishingKoreaAISFull/Dynamic_20230515_fishing_boats.csv...
Done!
Reading ../../data/FishingKoreaAISFull/Dynamic_20230509_fishing_boats.csv...
Done!
Reading ../../data/FishingKoreaAISFull/Dynamic_20230521_fishing_boats.csv...
Done!
Reading ../../data/FishingKoreaAISFull/Dynamic_20230510_fishing_boats.csv...
Done!
Reading ../../data/FishingKoreaAISFull/Dynamic_20230524_fishing_boats.csv...
Done!
Reading ../../data/FishingKoreaAISFull/Dynamic_20230516_fishing_boats.csv...
Done!
Reading ../../data/FishingKoreaAISFull/Dynamic_20230522_fishing_boats.csv...
Done!
Read

In [3]:
@dataclass
class AISColumnNames:
    """Central registry of the DataFrame column labels used in this notebook.

    Each field's default value is the literal column label; code refers to
    columns through an instance of this class instead of repeating the
    strings.
    """

    # Timestamp columns and raw kinematic columns.
    Date: str = "Date"
    Sampled_Date: str = "Sampled_Date"
    Latitude: str = "Latitude"
    Longitude: str = "Longitude"
    Pseudo_Longitude: str = "Pseudo_Longitude"
    SOG: str = "SOG"
    COG: str = "COG"
    Heading: str = "Heading"

    # Labels carrying the "norm " prefix, one per kinematic column above.
    n_Latitude: str = "norm Latitude"
    n_Longitude: str = "norm Longitude"
    n_SOG: str = "norm SOG"
    n_COG: str = "norm COG"
    n_Heading: str = "norm Heading"

    # Marker columns; how they are filled is not visible in this notebook.
    is_synthetic: str = "is_synthetic"
    to_predict: str = "to_predict"


In [4]:
# Shared column-name registry and sampling configuration.
cols: AISColumnNames = AISColumnNames()
target_freq_in_minutes = 10
# Pandas frequency string and the equivalent Timedelta for the sampling grid.
target_freq: str = f"{target_freq_in_minutes}min"
sample_T: pd.Timedelta = pd.Timedelta(minutes=target_freq_in_minutes)


def get_sampled_trajectory(trajectory: pd.DataFrame) -> pd.DataFrame:
    """Resample one boat's records onto the fixed ``target_freq`` time grid.

    Steps:
      1. Parse ``cols.Date``, index by it and sort chronologically.
      2. Add a copy of the first record at the first timestamp floored to
         ``target_freq`` and a copy of the last record at the last timestamp
         ceiled to ``target_freq``.
      3. Build the grid from the floored start hour to the ceiled end hour
         and keep only grid times having a record within ``sample_T``.
      4. Drop duplicate timestamps (first kept), insert the kept grid times,
         interpolate with ``method="time"`` and keep only the grid rows.
      5. Drop rows that still contain NaN.

    The input frame is not modified.

    Args:
        trajectory: Records of a single boat; must contain ``cols.Date``
            with values ``pd.to_datetime`` can parse.

    Returns:
        A new frame with one row per retained grid time. The grid timestamp
        is in ``cols.Sampled_Date``; the remaining columns are the input's
        columns other than ``cols.Date``. An empty input yields an empty
        frame with that same layout.
    """
    # Build a new frame instead of assigning into `trajectory`: writing the
    # parsed dates back would silently change the caller's DataFrame.
    trajectory = trajectory.assign(
        **{cols.Date: pd.to_datetime(trajectory[cols.Date])}
    )
    trajectory = trajectory.set_index(cols.Date).sort_index()

    if trajectory.empty:
        # min()/max() of an empty index are NaT and date_range would raise.
        return trajectory.reset_index().rename(
            columns={cols.Date: cols.Sampled_Date}
        )

    # Anchor both ends on grid-aligned timestamps so that the grid points
    # around the first and last real record can be interpolated.
    first = trajectory.iloc[:1].copy()
    first.index = [trajectory.index.min().floor(target_freq)]
    last = trajectory.iloc[-1:].copy()
    last.index = [trajectory.index.max().ceil(target_freq)]
    trajectory = pd.concat([first, trajectory, last])

    # Candidate sampling grid, spanning whole hours around the data.
    start_time = trajectory.index.min().floor("h")
    end_time = trajectory.index.max().ceil("h")
    sampling_times = pd.date_range(start_time, end_time, freq=target_freq)

    # Keep a grid time t only if some record lies in [t - sample_T, t + sample_T].
    # The index is sorted ascending here, so two binary searches per grid time
    # replace a full scan of the index for every t.
    record_times = trajectory.index
    n_before_window = record_times.searchsorted(
        sampling_times - sample_T, side="left"
    )
    n_through_window = record_times.searchsorted(
        sampling_times + sample_T, side="right"
    )
    valid_sampling_times = sampling_times[n_through_window > n_before_window]

    # reindex requires unique labels; keep the first row of each timestamp.
    trajectory = trajectory[~trajectory.index.duplicated(keep="first")]
    trajectory_interpolated = trajectory.reindex(
        trajectory.index.union(valid_sampling_times)
    ).sort_index()

    # Interpolation weighted by the actual time gaps between rows.
    trajectory_interpolated = trajectory_interpolated.interpolate(method="time")

    # NOTE(review): dropna() removes every row with any NaN. If the input has
    # columns interpolate() cannot fill, grid rows that do not coincide with a
    # real record would all be dropped -- confirm against the real schema.
    trajectory_sampled = (
        trajectory_interpolated.loc[valid_sampling_times]
        .dropna()
        # Name the index explicitly rather than relying on reset_index()
        # emitting a column called "index".
        .rename_axis(cols.Sampled_Date)
        .reset_index()
    )
    return trajectory_sampled

In [18]:
# Flatten the MMSI -> DataFrame mapping into a plain list of per-boat frames.
all_boats_trajectories_list = [*all_boats_trajectories.values()]
     


In [20]:
# Number of per-boat DataFrames (one per MMSI key of all_boats_trajectories).
len(all_boats_trajectories_list)

1251

In [37]:
directory = "../../data/FishingKoreaAIS_sampled"
# File names look like "len_<row count>_mmsi_<MMSI>.csv". The dot is escaped
# so it only matches a literal ".", not any character.
pattern = r"len_(\d+)_mmsi_(\d+)\.csv"


def find_sampled_mmsis(sampled_dir: str, filename_pattern: str) -> List[int]:
    """Return the MMSI encoded in each matching file name of a directory.

    Args:
        sampled_dir: Directory to scan (not recursive).
        filename_pattern: Regular expression whose second capture group is
            the MMSI. It must match the whole file name, so names with extra
            trailing text (e.g. "….csv.bak") are ignored.

    Returns:
        One integer MMSI per matching file, in ``os.listdir`` order. An empty
        list if ``sampled_dir`` does not exist, so a first run that has not
        written any output yet does not raise.
    """
    if not os.path.isdir(sampled_dir):
        return []

    found: List[int] = []
    for filename in os.listdir(sampled_dir):
        match = re.fullmatch(filename_pattern, filename)
        if match:
            found.append(int(match.group(2)))
    return found


# MMSIs of the boats that already have a sampled CSV on disk.
mmsi_values = find_sampled_mmsis(directory, pattern)

In [38]:
# Number of MMSIs collected from the sampled-CSV file names.
len(mmsi_values)

1251

In [36]:
# Sample every boat that does not yet have an output file and write one CSV
# per boat, named "len_<number of sampled rows>_mmsi_<MMSI>.csv".
already_sampled = set(mmsi_values)  # set: constant-time membership test
for boat_trajectory in all_boats_trajectories_list:
    # Each frame holds a single boat, so its first MMSI value identifies it.
    mmsi = int(boat_trajectory["MMSI"].iloc[0])
    if mmsi in already_sampled:
        continue
    sampled_boat_trajectory = get_sampled_trajectory(boat_trajectory)
    if sampled_boat_trajectory.empty:
        # Nothing survived the sampling; there is no row to write.
        print(f"No sampled rows for MMSI {mmsi}; skipping.")
        continue
    # The MMSI is computed above rather than inside the f-string: reusing
    # double quotes inside a double-quoted f-string is a SyntaxError before
    # Python 3.12.
    sampled_boat_trajectory.to_csv(
        f"../../data/FishingKoreaAIS_sampled/len_{len(sampled_boat_trajectory)}_mmsi_{mmsi}.csv"
    )

In [11]:
# Inspects `sampled_boat_trajectory` as left behind by the sampling loop
# (the variable is only assigned inside that loop).
int(sampled_boat_trajectory["MMSI"][0])

41215086

In [12]:
# Row count of the `sampled_boat_trajectory` left behind by the sampling loop.
len(sampled_boat_trajectory)

3