In [8]:
import numpy as np
# %matplotlib inline
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
from scipy.stats import norm
from tqdm import tqdm
import pandas as pd
import os
import geopandas as gpd
import movingpandas as mpd
from fiona.crs import from_epsg
import warnings
from sklearn.metrics.pairwise import haversine_distances
# NOTE(review): EPSG:4326 is the geographic WGS84 CRS (coordinates in degrees), not a
# metric/projected CRS, despite the constant's name. It is passed as `crs=` when the
# lon/lat point GeoDataFrames are built in later cells.
CRS_METRIC = from_epsg(4326)

In [26]:
# --- Boundary polygon: rows of the EEZ layer whose TERRITORY1 is 'South Korea' ---------
countries_gdf = gpd.read_file("../../../../World_EEZ_v12_20231025_gpkg/eez_v12.gpkg")
data = countries_gdf[countries_gdf['TERRITORY1'] == 'South Korea']
path = '../../data/FishingKoreaAIS'
dataset = list()
original_columns = ['MMSI', 'Date', 'Latitude', 'Longitude', 'SOG', 'COG', 'Heading']  # Original CSV columns
desired_columns = ['message_time', 'target_id', 'lat', 'lon', 'sog', 'cog']  # Your desired columns

# --- Read every CSV under `path` into a flat list of string rows ------------------------
for (root, dirs, file) in os.walk(path):
    # Characters 8..15 of the file name are parsed as an integer sort key
    # (presumably a YYYYMMDD date stamp -- confirm against the actual file names).
    files = sorted((f for f in file if f.endswith('.csv')), key=lambda x: int(x[8:16]))

    for f in tqdm(files, desc='AIS processing'):
        data_path = os.path.abspath(os.path.join(root, f))
        with open(data_path, 'r') as df_file:
            next(df_file, None)  # Skip header
            # Stream the file instead of materialising it with readlines()
            for line in df_file:
                dataset.append(line.strip().split(','))

# Create DataFrame with original columns
df_ais = pd.DataFrame(dataset, columns=original_columns)

# Rename columns to your desired names
column_mapping = {
    'Date': 'message_time',
    'MMSI': 'target_id',
    'Latitude': 'lat',
    'Longitude': 'lon',
    'SOG': 'sog',
    'COG': 'cog'
}
df_ais = df_ais.rename(columns=column_mapping)

# Reorder columns to match your desired order (this also drops 'Heading')
df_ais = df_ais[desired_columns]

# --- Filter and clean data ----------------------------------------------------------------
# Rows shorter than the header are padded with None by the DataFrame constructor; drop them.
dataset_ais_clean = df_ais.dropna().copy()

# Convert the numeric columns once, up front. Values that cannot be parsed become NaN and
# are rejected by the comparisons below, instead of raising from an astype(float) cast.
for numeric_col in ('lat', 'lon', 'sog', 'cog'):
    dataset_ais_clean[numeric_col] = pd.to_numeric(dataset_ais_clean[numeric_col], errors='coerce')

# Keep speeds in [1, 22) (same units as the source SOG field)
speed_ok = (dataset_ais_clean['sog'] >= 1) & (dataset_ais_clean['sog'] < 22.0)

# Filter for South Korea area (lon/lat bounding box, open interval)
in_bbox = (
    (dataset_ais_clean['lon'] > 122.8963) & (dataset_ais_clean['lon'] < 133.8061)
    & (dataset_ais_clean['lat'] > 30.77) & (dataset_ais_clean['lat'] < 39.8397)
)
dataset_ais_clean = dataset_ais_clean.loc[speed_ok & in_bbox].copy()

# Sort by the timestamp string (still text at this point)
dataset_ais_clean.sort_values(['message_time'], inplace=True)

# --- Create GeoDataFrame ------------------------------------------------------------------
boundary1 = gpd.GeoDataFrame(geometry=data.geometry)
poly = gpd.GeoDataFrame(dataset_ais_clean,
                        geometry=gpd.points_from_xy(dataset_ais_clean.lon, dataset_ais_clean.lat),
                        crs=CRS_METRIC)
poly['time'] = pd.to_datetime(poly['message_time'], format='%Y-%m-%d %H:%M:%S')
poly = poly.set_index('time')

# --- Spatial join and create trajectory collection ---------------------------------------
# Keep only points that fall within the boundary polygon, then group them by vessel id.
points_within = gpd.sjoin(poly, boundary1, predicate='within')
collection = mpd.TrajectoryCollection(points_within, 'target_id', min_length=100)
# Discard trajectories with 30 or fewer points
collection.trajectories = [traj for traj in collection.trajectories if len(traj.df) > 30]

AIS processing: 100%|██████████| 2/2 [00:05<00:00,  2.96s/it]


In [27]:
# Show the cleaned, time-sorted AIS records (the notebook renders a truncated view).
dataset_ais_clean

Unnamed: 0,message_time,target_id,lat,lon,sog,cog
270162,2023-05-01 00:00:00,440098120,33.097910,125.271562,4.2,94.6
224632,2023-05-01 00:00:01,440079270,34.918082,125.230035,7.4,331.1
620423,2023-05-01 00:00:01,440303640,35.132247,125.211447,10.3,182.1
430326,2023-05-01 00:00:01,440146860,34.376253,125.351012,8.7,178.6
13485,2023-05-01 00:00:02,412410125,32.122843,125.177045,10.3,145.7
...,...,...,...,...,...,...
1677397,2023-05-02 23:59:58,440307950,35.043268,125.671370,8.0,227.6
1050297,2023-05-02 23:59:58,440006950,34.510952,125.864033,4.9,348.4
1498967,2023-05-02 23:59:59,440185330,32.554887,125.717708,4.1,306.3
1977051,2023-05-02 23:59:59,440403790,35.013595,125.228810,8.8,89.0


In [28]:
import os
import pandas as pd
import geopandas as gpd
from tqdm import tqdm
import matplotlib.pyplot as plt
def process_ais_data(
    path,
    boundary_geometry,
    colunm_name=["message_time", "target_id", "lat", "lon", "sog", "cog"],
):
    """
    Build a movingpandas TrajectoryCollection from tab-separated AIS ``.txt`` files.

    Every ``.txt`` file found under ``path`` is read (first line of each file is treated
    as a header and skipped), the rows are filtered by speed and by a fixed lon/lat
    bounding box, converted to point geometries in EPSG:4326, spatially joined against
    ``boundary_geometry`` and grouped into trajectories by ``target_id``.

    Args:
        path: Directory that is walked recursively for ``.txt`` files. The first six
            characters of each file name must parse as an integer (used as sort key).
        boundary_geometry: Right-hand frame for ``gpd.sjoin(..., predicate='within')``;
            only points within it are kept.
        colunm_name: Column names assigned to the parsed fields, in file order. Must
            contain ``message_time``, ``target_id``, ``lat``, ``lon``, ``sog`` and
            ``cog`` because the function refers to them by name.

    Returns:
        mpd.TrajectoryCollection keyed by ``target_id`` (built with ``min_length=100``),
        whose ``trajectories`` list is further restricted to more than 30 points.

    Raises:
        ValueError: if a file name does not start with six digits, or if
            ``message_time`` does not match ``'%Y-%m-%d %H:%M:%S'``.
    """
    records = []

    # Walk through the files in the directory and collect the rows of every .txt file
    for root, _dirs, names in os.walk(path):
        txt_files = sorted(
            (name for name in names if name.endswith('.txt')),
            key=lambda x: int(x[:6]),  # first 6 characters (assumed to be a timestamp)
        )

        for name in tqdm(txt_files, desc='AIS processing'):
            data_path = os.path.abspath(os.path.join(root, name))
            with open(data_path, 'r') as ais_file:
                next(ais_file, None)  # Skip the header row
                for line in ais_file:
                    # Strip only the line terminator; slicing off the last character
                    # would truncate the final field of a file without trailing newline.
                    records.append(line.rstrip('\r\n').split('\t'))

    # Every parsed row is data (headers were skipped per file), so none is dropped here.
    df_ais = pd.DataFrame(records, columns=colunm_name)

    # Short rows are padded with None by the constructor; remove them.
    dataset_ais_clean = df_ais.dropna().copy()

    # Convert numeric columns once. Unparseable values become NaN and fail the
    # comparisons below instead of raising from an astype(float) cast.
    for numeric_col in ('lat', 'lon', 'sog', 'cog'):
        dataset_ais_clean[numeric_col] = pd.to_numeric(dataset_ais_clean[numeric_col], errors='coerce')

    # Speed window [1, 22) and fixed lon/lat bounding box (open interval)
    speed_ok = (dataset_ais_clean['sog'] >= 1) & (dataset_ais_clean['sog'] < 22.0)
    in_bbox = (
        (dataset_ais_clean['lon'] > 122.8963) & (dataset_ais_clean['lon'] < 133.8061)
        & (dataset_ais_clean['lat'] > 30.77) & (dataset_ais_clean['lat'] < 39.8397)
    )
    dataset_ais_clean = dataset_ais_clean.loc[speed_ok & in_bbox].copy()

    # Sort by the timestamp string
    dataset_ais_clean.sort_values(['message_time'], inplace=True)

    # Create geospatial data frame indexed by parsed timestamp
    poly = gpd.GeoDataFrame(
        dataset_ais_clean,
        geometry=gpd.points_from_xy(dataset_ais_clean.lon, dataset_ais_clean.lat),
        crs="EPSG:4326",
    )
    poly['time'] = pd.to_datetime(poly['message_time'], format='%Y-%m-%d %H:%M:%S')
    poly = poly.set_index('time')

    # Perform spatial join with the provided boundary geometry
    points_within = gpd.sjoin(poly, boundary_geometry, predicate='within')

    collection = mpd.TrajectoryCollection(points_within, 'target_id', min_length=100)
    # Discard trajectories with 30 or fewer points
    collection.trajectories = [traj for traj in collection.trajectories if len(traj.df) > 30]

    return collection


# Example of usage
# Assuming 'boundary1' is a pre-defined GeoDataFrame with boundary geometry
# path = '/data/datasets/AIS/2024_02_15_03_14_ais_txt/0215'
# collection = process_ais_data(path, boundary1)

In [None]:
import geoviews as gv
import hvplot.pandas
import geoviews.feature as gf
from scipy.interpolate import LSQUnivariateSpline
from cartopy import crs
from shapely.geometry import Point, LineString, Polygon
from scipy.interpolate import LSQUnivariateSpline, UnivariateSpline
from holoviews import opts
# Load the bokeh and matplotlib plotting backends for GeoViews.
gv.extension('bokeh', 'matplotlib')

In [None]:
# FIXME(review): incomplete stub -- the parameter list is never closed and ends in a stray
# non-ASCII character, so this cell is a SyntaxError as written. Finish it or delete the cell.
def ssa(epsы

    


In [None]:
# Run AEKF_traj with alpha = 0.9 on (at most) the first 3000 trajectories.
# `mask_traj` records, per input trajectory, whether the filter returned an
# mpd.Trajectory; only those results are kept in `new_collections_aekf_alpha0_9`.
new_collections_aekf_alpha0_9 = list()
mask_traj = list()
# Iterate the list directly (no enumerate) so tqdm knows the total and can draw a bar,
# and so the builtin `id` is not shadowed by an unused loop index.
for traj in tqdm(collection.trajectories[:3000], desc='AEKF 0.9'):
    traj = AEKF_traj(traj, 0.9, mmsi_colName="target_id")
    if isinstance(traj, mpd.Trajectory):
        new_collections_aekf_alpha0_9.append(traj)
        mask_traj.append(True)
    else:
        mask_traj.append(False)

# Compare the original trajectories with the filtered ones; the mask is passed along so
# RMSE_error can tell which inputs produced a result.
RMSE_aekf_alpha0_9 = RMSE_error(collection.trajectories[:3000], new_collections_aekf_alpha0_9, mask_traj)

AEKF 0.9: 432it [04:09,  1.73it/s]
RMSE: 432it [07:51,  1.09s/it]


In [35]:
# Baseline run: AEKF_traj with alpha = 0 on (at most) the first 3000 trajectories.
# One mask entry is recorded per input; results that are not an mpd.Trajectory are
# flagged False and left out of `new_collections_ekf1`.
mask_traj_ekf1 = []
new_collections_ekf1 = []
for traj in tqdm(collection.trajectories[:3000], desc='EKF'):
    traj = AEKF_traj(traj, 0, mmsi_colName="target_id")
    if not isinstance(traj, mpd.Trajectory):
        mask_traj_ekf1.append(False)
        continue
    mask_traj_ekf1.append(True)
    new_collections_ekf1.append(traj)

RMSE_ekf1 = RMSE_error(collection.trajectories[:3000], new_collections_ekf1, mask_traj_ekf1)

EKF: 100%|██████████| 432/432 [03:24<00:00,  2.11it/s]
RMSE: 0it [00:00, ?it/s]


In [36]:
def plot_trajectory_with_alpha(collection, k, alpha=0.2):
    """
    Render one trajectory three ways and write the side-by-side layout to an HTML file.

    The three panels are: the trajectory as stored in the collection (blue), the result
    of ``AEKF_traj`` with the requested ``alpha`` (red), and the result of ``AEKF_traj``
    with ``alpha=0`` (green, labelled "EKF").

    Args:
        collection (object): Object exposing a ``trajectories`` sequence.
        k (int): Position of the trajectory to plot within ``collection.trajectories``.
        alpha (float): Alpha passed to ``AEKF_traj`` for the adaptive panel.

    Returns:
        None: the layout is saved as ``AEKF_andEKF_{k}_alpha_{alpha}.html``.
    """
    source_traj = collection.trajectories[k]

    # Filter the same input twice: requested alpha first, then the alpha=0 baseline.
    adaptive_traj = AEKF_traj(source_traj, alpha=alpha)
    baseline_traj = AEKF_traj(source_traj, alpha=0)

    # Options shared by all three panels
    panel_kwargs = dict(geo=True, tiles='OSM', width=500, height=500)

    original_panel = source_traj.hvplot(
        title="Original Vessel Trajectory", color='blue', **panel_kwargs
    )
    adaptive_panel = adaptive_traj.hvplot(
        title=f"Vessel Trajectory with Adaptive EKF (alpha={alpha})", color='red', **panel_kwargs
    )
    baseline_panel = baseline_traj.hvplot(
        title="Vessel Trajectory with EKF", color='green', **panel_kwargs
    )

    layout = original_panel + adaptive_panel + baseline_panel

    # Persist the layout next to the notebook
    hvplot.save(layout, f'AEKF_andEKF_{k}_alpha_{alpha}.html')

In [37]:
import pickle

# NOTE(review): absolute, machine-specific path -- the recorded run of this cell ended in
# FileNotFoundError. Point it at the local copy of the pickle before re-running.
# SECURITY: pickle.load can execute arbitrary code; only load files from a trusted source.
with open('/home/ncl/quanh/EE817/ct_dma_train.pkl', 'rb') as f:
    data_denmark = pickle.load(f)

# Each record carries a 'traj' array; its columns are labelled with `columns` below.
values = [i['traj'].tolist() for i in data_denmark]
columns=['lat', 'lon', 'sog', 'cog', 'unix_timestamp', 'mmsi']

# Build one frame per trajectory and concatenate once (concat inside the loop is
# quadratic and starts from an empty frame). The row index is discarded because it is
# replaced by the timestamp index just below.
frames = [pd.DataFrame(rows, columns=columns) for rows in values]
dataframe = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame(columns=columns)

dataframe['time'] = pd.to_datetime(dataframe['unix_timestamp'], unit='s')
dataframe = dataframe.set_index('time')

# Map lat/lon back to degrees with a linear rescale onto the box below
# (assumes the stored lat/lon are normalised to that box -- TODO confirm with the data source).
lat_min = 55.5
lat_max = 58.0
lon_min = 10.3
lon_max = 13
detal_lon = lon_max - lon_min
detal_lat = lat_max - lat_min 
dataframe['lat'] = dataframe['lat']*detal_lat + lat_min
dataframe['lon'] = dataframe['lon']*detal_lon + lon_min

# Point geometries -> trajectories per 'mmsi', split per day, keep those with > 30 points
dinish = gpd.GeoDataFrame(dataframe, geometry=gpd.points_from_xy(dataframe.lon, dataframe.lat), crs=CRS_METRIC)
collection_ = mpd.TrajectoryCollection(dinish, 'mmsi', min_length=100)
collection_ = mpd.TemporalSplitter(collection_).split(mode="day")
collection_.trajectories = [traj for traj in collection_.trajectories if len(traj.df) > 30]

FileNotFoundError: [Errno 2] No such file or directory: '/home/ncl/quanh/EE817/ct_dma_train.pkl'

In [22]:
# Rescale lat/lon onto the box below and rebuild the day-split trajectory collection.
lat_min = 55.5
lat_max = 58.0
lon_min = 10.3
lon_max = 13
detal_lon = lon_max - lon_min
detal_lat = lat_max - lat_min 

# This cell rewrites `dataframe` in place, so running it twice (or after another cell
# that already rescaled the columns) would apply the transform twice. Rescale only while
# both columns are still in the unit range; values already inside the box above
# (lat >= 55.5, lon >= 10.3) can never satisfy this check.
if dataframe['lat'].max() <= 1 and dataframe['lon'].max() <= 1:
    dataframe['lat'] = dataframe['lat']*detal_lat + lat_min
    dataframe['lon'] = dataframe['lon']*detal_lon + lon_min

# Point geometries -> trajectories per 'mmsi', split per day, keep those with > 30 points
dinish = gpd.GeoDataFrame(dataframe, geometry=gpd.points_from_xy(dataframe.lon, dataframe.lat), crs=CRS_METRIC)
collection_ = mpd.TrajectoryCollection(dinish, 'mmsi', min_length=100)
collection_ = mpd.TemporalSplitter(collection_).split(mode="day")
collection_.trajectories = [traj for traj in collection_.trajectories if len(traj.df) > 30]

In [27]:
# Inspect the per-point table of a single trajectory (position 280) of the day-split collection.
collection_.trajectories[280].df

Unnamed: 0_level_0,lat,lon,sog,cog,unix_timestamp,mmsi,geometry
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2019-01-30 08:36:14,56.142928,10.30125,0.46,0.315833,1548837000.0,209857000.0_2019-01-30 00:00:00,POINT (10.30125 56.14293)
2019-01-30 08:46:14,56.126655,10.365982,0.483333,0.318333,1548838000.0,209857000.0_2019-01-30 00:00:00,POINT (10.36598 56.12666)
2019-01-30 08:56:14,56.110385,10.431147,0.476667,0.316944,1548839000.0,209857000.0_2019-01-30 00:00:00,POINT (10.43115 56.11039)
2019-01-30 09:06:14,56.093919,10.495474,0.474332,0.320735,1548839000.0,209857000.0_2019-01-30 00:00:00,POINT (10.49547 56.09392)
2019-01-30 09:16:14,56.076381,10.559485,0.473333,0.323056,1548840000.0,209857000.0_2019-01-30 00:00:00,POINT (10.55948 56.07638)
2019-01-30 09:26:14,56.060627,10.624596,0.48,0.31,1548840000.0,209857000.0_2019-01-30 00:00:00,POINT (10.62460 56.06063)
2019-01-30 09:36:14,56.050337,10.691412,0.466667,0.241389,1548841000.0,209857000.0_2019-01-30 00:00:00,POINT (10.69141 56.05034)
2019-01-30 09:46:14,56.05102,10.762401,0.476667,0.246759,1548842000.0,209857000.0_2019-01-30 00:00:00,POINT (10.76240 56.05102)
2019-01-30 09:56:14,56.051123,10.833696,0.48,0.249444,1548842000.0,209857000.0_2019-01-30 00:00:00,POINT (10.83370 56.05112)
2019-01-30 10:06:14,56.06139,10.895066,0.474333,0.103556,1548843000.0,209857000.0_2019-01-30 00:00:00,POINT (10.89507 56.06139)


In [50]:
# AEKF_traj with alpha = 1 over every trajectory of the day-split collection. The vessel
# id column handed to the filter is the last entry of `columns`.
# One mask entry is recorded per input; results that are not an mpd.Trajectory are
# flagged False and left out of `new_collections_aekf_alpha1_`.
mask_traj = []
new_collections_aekf_alpha1_ = []
for traj in tqdm(collection_.trajectories, desc='AEKF 1'):
    traj = AEKF_traj(traj, 1, mmsi_colName=columns[-1])
    if not isinstance(traj, mpd.Trajectory):
        mask_traj.append(False)
        continue
    mask_traj.append(True)
    new_collections_aekf_alpha1_.append(traj)

RMSE_aekf_alpha1 = RMSE_error(collection_.trajectories, new_collections_aekf_alpha1_, mask_traj)

AEKF 1: 100%|██████████| 10476/10476 [03:20<00:00, 52.26it/s]
RMSE: 10474it [00:22, 468.18it/s]


In [53]:
# Sum of the values returned by RMSE_error for the alpha = 1 run
# (presumably one RMSE per trajectory -- confirm against RMSE_error).
np.sum(RMSE_aekf_alpha1)

6949.141927557351

In [None]:
def RMSE_alpha(collection, new_collections_aekf):
    """
    Per-trajectory RMSE between reference and estimated positions.

    Trajectories are paired in iteration order. Within a pair, row i of the reference is
    compared with row i of the estimate using the haversine great-circle distance on a
    sphere of radius 6371 (so distances are in the same unit as that radius, i.e. km).

    Only the point-to-point (row-wise) distances are computed; every point contributes
    to the mean, including points whose error is exactly zero.

    Args:
        collection: Iterable of reference trajectories; each must expose a ``df`` with
            ``lat`` and ``lon`` columns in degrees.
        new_collections_aekf: Iterable of estimated trajectories with the same layout.

    Returns:
        list: one RMSE value per trajectory pair (NaN for a pair with no points).

    Raises:
        ValueError: if a reference/estimate pair does not have the same number of points.
    """
    earth_radius = 6371
    RMSE_aekf = list()
    for true_traj, est_traj in tqdm(zip(collection, new_collections_aekf), desc="RMSE"):
        true_coord = np.deg2rad(true_traj.df[['lat', 'lon']].to_numpy(dtype=float))
        pred_coord = np.deg2rad(est_traj.df[['lat', 'lon']].to_numpy(dtype=float))

        if true_coord.shape != pred_coord.shape:
            raise ValueError(
                f"trajectory length mismatch: {len(true_coord)} reference points "
                f"vs {len(pred_coord)} estimated points"
            )
        if len(true_coord) == 0:
            # Nothing to average; report NaN without triggering a numpy warning.
            RMSE_aekf.append(np.nan)
            continue

        # Row-wise haversine: O(n) instead of building the full n x n distance matrix.
        true_lat, true_lon = true_coord[:, 0], true_coord[:, 1]
        pred_lat, pred_lon = pred_coord[:, 0], pred_coord[:, 1]
        half_chord_sq = (
            np.sin((pred_lat - true_lat) / 2) ** 2
            + np.cos(true_lat) * np.cos(pred_lat) * np.sin((pred_lon - true_lon) / 2) ** 2
        )
        # Clip guards arcsin against values a hair outside [0, 1] from rounding.
        traj_dist = 2 * earth_radius * np.arcsin(np.sqrt(np.clip(half_chord_sq, 0.0, 1.0)))

        rmse = np.sqrt((traj_dist**2).mean())
        RMSE_aekf.append(rmse)
    return RMSE_aekf

In [None]:
def _aekf_filter_all(trajectories, alpha):
    """Apply AEKF_traj with the given alpha to every trajectory and return the results as a list."""
    return [AEKF_traj(traj, alpha) for traj in tqdm(trajectories, desc=f'AEKF {alpha}')]


# Same filter, four alpha values. Results are stored unfiltered: whatever AEKF_traj
# returns is appended as-is.
# NOTE(review): the cell that computes RMSE_aekf_alpha1 keeps only results that are
# mpd.Trajectory instances and passes mmsi_colName explicitly; neither happens here --
# confirm that is intended for collection_.
new_collections_aekf_alpha1_ = _aekf_filter_all(collection_.trajectories, 1)
new_collections_aekf_alpha0_9_ = _aekf_filter_all(collection_.trajectories, 0.9)
new_collections_aekf_alpha0_8_ = _aekf_filter_all(collection_.trajectories, 0.8)
new_collections_aekf_alpha0_7_ = _aekf_filter_all(collection_.trajectories, 0.7)




# AEKF 0.2: 100%|██████████| 6277/6277 [22:53<00:00,  4.57it/s]  
# AEKF 0.2: 100%|██████████| 6277/6277 [22:52<00:00,  4.57it/s]  
# AEKF 0.3: 100%|██████████| 3000/3000 [11:48<00:00,  4.23it/s] 
# AEKF 0.4: 100%|██████████| 3000/3000 [11:41<00:00,  4.28it/s]