In [None]:
# Load a sample of the AIS data and preview the first rows.
from dataloader import load_parquet
# NOTE(review): leftover from a script version of this code; `__file__` is not
# defined inside a notebook kernel and `os` is not imported here.
#THIS_PATH = os.path.dirname(os.path.realpath(__file__))
# Name of the data file handed to load_parquet (passed without an extension).
MY_FILE = "aisdk-2025-02-27"
# presumably k caps how many MMSIs load_parquet samples — TODO confirm in dataloader
df = load_parquet(MY_FILE, k=10000)
# Plain-text preview of the first five rows.
print(df.head())

            Timestamp   Latitude  Longitude       SOG   COG  Segment  \
0 2025-02-27 00:00:05  56.123522  11.591552  6.739216  90.1        0   
1 2025-02-27 00:00:06  56.123522  11.591770  6.739216  90.0        0   
2 2025-02-27 00:00:07  56.123522  11.591770  6.739216  90.0        0   
3 2025-02-27 00:00:13  56.123518  11.592530  6.739216  89.8        0   
4 2025-02-27 00:00:14  56.123518  11.592637  6.739216  89.7        0   

             MMSI             x             y  
0  MMSI=200000000  1.290366e+06  7.583045e+06  
1  MMSI=200000000  1.290390e+06  7.583045e+06  
2  MMSI=200000000  1.290390e+06  7.583045e+06  
3  MMSI=200000000  1.290475e+06  7.583044e+06  
4  MMSI=200000000  1.290486e+06  7.583044e+06  


In [None]:
# List the column names of the loaded frame.
print(df.columns)

Index(['Timestamp', 'Latitude', 'Longitude', 'SOG', 'COG', 'Segment', 'MMSI',
       'x', 'y'],
      dtype='object')


http://aisdata.ais.dk/!_README_information_CSV_files.txt

SOG = Speed over ground from the AIS message, if available

COG = Course over ground from AIS message if available

Segment = Index of the sub-track a point belongs to; each track is divided into segments based on time gaps (this column was created in a script)
Copilot explanation: "Segment splits a vessel's time-ordered AIS track into contiguous sub-tracks whenever there's a large time gap between consecutive messages. This prevents treating long gaps (e.g., device off, no reception) as part of the same continuous trajectory."

MMSI = Maritime Mobile Service Identity (unique 9-digit identifier)

In [None]:
# Imports, plotting defaults and data load used by the exploration cells
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from dataloader import load_parquet

# Default size applied to every figure created after this point
plt.rcParams.update({'figure.figsize': (10, 5)})

MY_FILE = "aisdk-2025-02-27"
# Rebinds `df` using k=1000 (the first cell loaded with k=10000).
# presumably k is the number of MMSIs load_parquet samples — TODO confirm in dataloader
df = load_parquet(MY_FILE, k=1000)
print('Loaded rows,cols:', df.shape)
# First five rows, shown as the cell's rich output
df.head(5)

In [None]:
# Quick dataset overview: dtypes, missing values, and vessel/segment counts

# df.info() writes its report straight to stdout
print('\n--- dtypes and non-null counts ---')
df.info()

# Number of NaN entries in each column
missing_per_column = df.isna().sum()
print('\n--- missing values per column ---')
print(missing_per_column)

# Distinct vessels, and distinct (MMSI, Segment) pairs
n_vessels = df['MMSI'].nunique()
segment_keys = df[['MMSI','Segment']].drop_duplicates()
print('\nUnique MMSIs:', n_vessels)
print('Unique Segments:', len(segment_keys))

In [None]:
# How many records each MMSI contributes, and which vessels dominate
mmsi_counts = df['MMSI'].value_counts()
n_mmsi = len(mmsi_counts)
print('Total MMSIs:', n_mmsi)

# value_counts() is sorted descending, so the first 20 entries are the busiest
top_mmsis = mmsi_counts.iloc[:20]
print('\nTop 20 MMSIs by number of records:')
print(top_mmsis)

# Histogram of the per-MMSI record counts
counts_ax = sns.histplot(mmsi_counts, bins=50)
counts_ax.set_title('Records per MMSI (distribution)')
counts_ax.set_xlabel('Records')
plt.show()

In [None]:
# Segment length distribution (points per segment)
# Count the rows in every (MMSI, Segment) group; `n` is the number of points.
seg_len = df.groupby(['MMSI','Segment']).size().reset_index(name='n')
print('Segments total:', len(seg_len))
# Print the summary explicitly: a bare expression is only displayed when it is
# the last statement of a cell, and here the histogram code follows it.
print(seg_len['n'].describe())

fig, ax = plt.subplots()
sns.histplot(seg_len['n'], bins=60, ax=ax)
ax.set_title('Segment lengths (number of points per segment)')
ax.set_xlabel('Points per segment')
plt.show()

In [None]:
# Time coverage and records per day
# Ensure Timestamp is datetime so min/max and resample work; converting an
# already-parsed column leaves the values unchanged, so re-running is safe.
df['Timestamp'] = pd.to_datetime(df['Timestamp'])
print('Time range:', df['Timestamp'].min(), 'to', df['Timestamp'].max())

# records per day
daily = df.set_index('Timestamp').resample('D').size()

fig, ax = plt.subplots()
# The file name suggests a single-day extract, which yields a single daily
# bucket. A line through one point draws nothing, so add a marker to keep the
# value visible.
daily.plot(ax=ax, marker='o')
ax.set_title('Records per day')
ax.set_ylabel('Records')
plt.show()

In [None]:
# Distributions of speed over ground (SOG) and course over ground (COG)

# SOG: histogram with a KDE overlay, missing values removed first
speed = df['SOG'].dropna()
speed_ax = sns.histplot(speed, bins=100, kde=True)
speed_ax.set_title('SOG distribution (m/s)')
speed_ax.set_xlabel('SOG (m/s)')
plt.show()

# COG: 72 bins (5 degrees wide if COG covers 0-360 — TODO confirm range)
course = df['COG'].dropna()
course_ax = sns.histplot(course, bins=72)
course_ax.set_title('COG distribution (degrees)')
course_ax.set_xlabel('Course over ground')
plt.show()

In [None]:
# Map: sample some MMSIs and plot tracks
# Pick the five MMSIs with the most records.
sample_mmsis = df['MMSI'].value_counts().head(5).index.tolist()
print('Sample MMSIs:', sample_mmsis)

fig, ax = plt.subplots()
for m in sample_mmsis:
    sub = df[df['MMSI']==m].sort_values('Timestamp')
    # Draw every segment as its own line. Plotting the whole vessel as one line
    # would join the last point before a time gap to the first point after it,
    # i.e. draw a straight stroke where no positions were recorded.
    vessel_colour = None
    for _, seg in sub.groupby('Segment', sort=False, dropna=False):
        if vessel_colour is None:
            # First segment: let matplotlib pick the next cycle colour and
            # register the vessel's single legend entry.
            (line,) = ax.plot(seg['Longitude'], seg['Latitude'],
                              marker='.', linewidth=1, label=str(m))
            vessel_colour = line.get_color()
        else:
            # Later segments reuse the vessel's colour and stay out of the legend.
            ax.plot(seg['Longitude'], seg['Latitude'],
                    marker='.', linewidth=1, color=vessel_colour,
                    label='_nolegend_')
ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')
ax.set_title('Sample vessel tracks')
ax.legend()
plt.show()

In [None]:
# Save cleaned sample for modelling (optional)
# Keep only the columns in keep_cols, and only segments with at least
# MIN_SEGMENT_POINTS points.
MIN_SEGMENT_POINTS = 100
keep_cols = ['MMSI','Segment','Timestamp','Longitude','Latitude','SOG','COG']

# Number of points in every (MMSI, Segment) group
seg_len = df.groupby(['MMSI','Segment']).size().reset_index(name='n')
keep_segments = seg_len.loc[seg_len['n'] >= MIN_SEGMENT_POINTS, ['MMSI','Segment']]

# Apply the column selection before the join. keep_cols was previously defined
# but never used, so every column of df ended up in the output file.
# NOTE: the x and y columns are not in keep_cols and are therefore not written;
# add them to keep_cols if the modelling step needs them.
keep_df = df[keep_cols].merge(keep_segments, on=['MMSI','Segment'], how='inner')
print('Kept rows for modelling:', keep_df.shape)

# write to a local parquet sample
keep_df.to_parquet('sample_for_modeling.parquet', index=False)
print('Wrote sample_for_modeling.parquet')