In [49]:
import os
import re
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from scipy import stats
from scipy.optimize import curve_fit
import warnings


In [50]:
# Folder with the sampled FishingKoreaAIS files. Each data file is named
# "len_<len>_mmsi_<mmsi>.csv"; only the <len> part is used here.
directory = "../../data/FishingKoreaAIS_sampled"

# Capture group 1 is the len value embedded in the file name
pattern = r"len_(\d+)_mmsi_\d+\.csv"

# Collect the len value of every data file in the folder.
# `re.fullmatch` (instead of `re.match`) requires the WHOLE name to match, so
# stray files such as "len_3_mmsi_1.csv.bak" or "len_3_mmsi_1.csv~" are not
# counted as data files. Listing order is kept as returned by os.listdir.
filename_matches = (
    re.fullmatch(pattern, filename) for filename in os.listdir(directory)
)
len_values = [int(found.group(1)) for found in filename_matches if found]

# One row per matching file; this frame feeds the plots and fits below
df = pd.DataFrame({'Length': len_values})

In [51]:
# Show the table of extracted length values (one row per matching file name)
df

Unnamed: 0,Length
0,3
1,2
2,4055
3,465
4,128
...,...
1246,486
1247,3974
1248,93
1249,3


In [52]:
# Histogram of the extracted length values.
# To look only at the longer entries, pass df[df["Length"] > 500] instead of df.
fig = px.histogram(
    df,
    x='Length',
    nbins=50,
    title='Distribution of Length Values in FishingKoreaAIS Dataset',
    labels={'Length': 'Length Value', 'count': 'Frequency'},
    opacity=0.7,
)

# Layout tweaks: bar spacing, explicit axis titles, centred title, base font size
layout_options = {
    'bargap': 0.1,
    'xaxis_title': "Length",
    'yaxis_title': "Count",
    'title_x': 0.5,
    'font': {'size': 12},
}
fig.update_layout(**layout_options)

# Render the figure
fig.show()

In [53]:
# Length values as a NumPy array; the distribution fits further down read `data`
data = np.array(df["Length"])

# Density-normalised histogram of the raw values (nbinsx=50).
# The figure is only built here; it is not displayed by this cell.
length_histogram = go.Histogram(
    x=data,
    name='Data',
    opacity=0.7,
    histnorm='probability density',
    nbinsx=50,
)
fig = go.Figure(data=[length_histogram])

In [56]:
# Candidate continuous distributions to compare against the length data
distributions = [
    stats.norm,         # Normal
    stats.lognorm,      # Log-Normal
    stats.expon,        # Exponential
    stats.weibull_min,  # Weibull
    stats.gamma,        # Gamma
    stats.beta,         # Beta (needs bounded data)
    stats.pareto,       # Pareto
]

# Fit every candidate to `data` and score it with a one-sample
# Kolmogorov-Smirnov test. Rows are gathered in a plain list and turned into a
# DataFrame once, instead of growing the frame one row at a time.
# NOTE: each candidate is tested on the same sample its parameters were
# estimated from, so the reported p-values are optimistic; treat the KS
# statistic as a ranking score rather than a formal test result.
fit_records = []
for dist in distributions:
    try:
        # Ignore warnings from fitting
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')

            # scipy returns the parameters as (shape..., loc, scale)
            params = dist.fit(data)

            # kstest forwards `params` positionally to dist.cdf
            D, p = stats.kstest(data, dist.cdf, args=params)
    except Exception as e:
        # Best effort: report the failure and carry on with the next candidate
        print(f"Could not fit {dist.name}: {str(e)}")
        continue

    fit_records.append({
        'Distribution': dist.name,
        'Parameters': params,
        'KS Statistic': D,
        'p-value': p,
    })

results = pd.DataFrame(
    fit_records,
    columns=['Distribution', 'Parameters', 'KS Statistic', 'p-value'],
)

# Fail with a clear message instead of an IndexError from `.iloc[0]` below
if results.empty:
    raise RuntimeError("None of the candidate distributions could be fitted to the data")

# Rank once by KS statistic (lower is better) and reuse the ranking below
ranked_results = results.sort_values('KS Statistic')
best_dist = ranked_results.iloc[0]
print("\nBest fitting distribution:")
print(best_dist)

# Plot histogram with best fit distribution
fig = go.Figure()

# Add histogram
fig.add_trace(go.Histogram(
    x=data,
    name='Data',
    opacity=0.7,
    histnorm='probability density',
    nbinsx=50
))

# Get best distribution info
best_dist_name = best_dist['Distribution']
best_params = best_dist['Parameters']

# Evaluate the fitted PDF on an even grid across the observed range.
# Because `best_params` is ordered (shape..., loc, scale), it can be passed
# positionally to any scipy distribution; no per-distribution branch is needed.
x = np.linspace(np.min(data), np.max(data), 1000)
pdf = getattr(stats, best_dist_name).pdf(x, *best_params)

# Add best fit line
fig.add_trace(go.Scatter(
    x=x,
    y=pdf,
    mode='lines',
    name=f'Best Fit: {best_dist_name}',
    line=dict(color='red', width=2)
))

# Fixed y-axis window kept from the original analysis.
# NOTE(review): presumably chosen so a tall spike at the low end does not
# flatten the rest of the plot - revisit if the data changes.
density_axis_range = [0, 0.005]

# Update layout
fig.update_layout(
    title=f'Length Distribution with Best Fit: {best_dist_name}',
    xaxis_title='Length',
    yaxis_title='Probability Density',
    bargap=0.1,
    showlegend=True,
    yaxis=dict(range=density_axis_range)
)

fig.show()

# Print all results for comparison
print("\nAll tested distributions:")
print(ranked_results)


Best fitting distribution:
Distribution                                              lognorm
Parameters      (2.5830721685008213, 1.9476923430647204, 114.4...
KS Statistic                                             0.078205
p-value                                                       0.0
Name: 1, dtype: object



All tested distributions:
  Distribution                                         Parameters  \
1      lognorm  (2.5830721685008213, 1.9476923430647204, 114.4...   
6       pareto  (0.47798514029498085, -27.411188045174622, 29....   
4        gamma  (0.4370582439531763, 1.9999999999999996, 891.1...   
3  weibull_min  (0.4491353360841618, 1.9999999999999998, 1382....   
5         beta  (0.2971058057916406, 0.8545036176664949, 1.999...   
0         norm            (774.4916067146283, 1150.4749586258522)   
2        expon                           (2.0, 772.4916067146283)   

   KS Statistic        p-value  
1      0.078205   4.215534e-07  
6      0.100104   2.295127e-11  
4      0.167965   2.551386e-31  
3      0.198597   1.024294e-43  
5      0.223517   2.184059e-55  
0      0.261589   5.107282e-76  
2      0.334940  1.221580e-125  


In [None]:
# NOTE: everything in this cell is commented out, but the stored output below
# and the following `len(all_mmsi)` cell come from a run in which it was
# active. On a fresh kernel `all_mmsi` is therefore undefined until this code
# is re-enabled.
# import glob

# all_boats_trajectories = {}
# dataset_path = "../../data/FishingKoreaAISFull/Dynamic_*.csv"
# dynamic_data_files = glob.glob(dataset_path)

# all_mmsi = set()
# for dynamic_data_file in dynamic_data_files:
#     print(f"Reading {dynamic_data_file}...")
#     df_dynamic = pd.read_csv(dynamic_data_file)
#     all_mmsi = all_mmsi | set(df_dynamic["MMSI"])
    # data_grouped = df_dynamic.groupby("MMSI")
    # for mmsi, data in data_grouped:
    #     if mmsi not in all_boats_trajectories:
    #         all_boats_trajectories[mmsi] = (
    #             data.copy()
    #         )  # Create a copy to avoid SettingWithCopyWarning
    #     else:
    #         all_boats_trajectories[mmsi] = pd.concat(
    #             [all_boats_trajectories[mmsi], data], ignore_index=True
    #         )
    # print("Done!")
    
    # break

Reading ../../data/FishingKoreaAISFull/Dynamic_20230514_fishing_boats.csv...
Reading ../../data/FishingKoreaAISFull/Dynamic_20230508_fishing_boats.csv...
Reading ../../data/FishingKoreaAISFull/Dynamic_20230520_fishing_boats.csv...
Reading ../../data/FishingKoreaAISFull/Dynamic_20230511_fishing_boats.csv...


KeyboardInterrupt: 

In [33]:
# Number of distinct MMSI values gathered by the dataset scan in the cell above.
# NOTE(review): that scan is currently commented out, so `all_mmsi` is not
# defined on a fresh kernel - confirm before relying on this cell.
len(all_mmsi)

1251

In [57]:
# Share of rows whose Length is greater than 450
int((df["Length"] > 450).sum()) / len(df)

0.3517186250999201

In [58]:
# Share of rows whose Length is greater than 2000
int((df["Length"] > 2000).sum()) / len(df)

0.16866506794564348