In [1]:
import pandas as pd

def load_and_clean(path="chunk_0.csv"):
    df = pd.read_csv(path, parse_dates=["BaseDateTime"] )
    df = df.dropna(subset=["LAT","LON"])
    df = df[(df.SOG >= 0) & (df.SOG <= 50)]
    print(f"Loaded {len(df)} records")
    return df

df = load_and_clean()

Loaded 99993 records


  df = pd.read_csv(path, parse_dates=["BaseDateTime"] )


In [2]:
import pandas as pd
import geopandas as gpd

# 1. Convert to GeoDataFrame (if not already)
gdf = gpd.GeoDataFrame(
    df,
    geometry=gpd.points_from_xy(df.LON, df.LAT),
    crs="EPSG:4326"
)

# 2. Load EEZ and label inside_eez
eez = gpd.read_file("World_EEZ_v12_20231025/eez_v12.shp").to_crs("EPSG:4326")
gdf["inside_eez"] = gdf.within(eez.unary_union)

# 3. Compute previous inside flag per vessel
gdf = gdf.sort_values(["MMSI","BaseDateTime"])
gdf["prev_inside"] = gdf.groupby("MMSI")["inside_eez"].shift(fill_value=True)

# 4. Identify departures and returns
dep = gdf[(gdf.prev_inside) & (~gdf.inside_eez)][["MMSI","BaseDateTime"]] 
dep = dep.rename(columns={"BaseDateTime":"departure_time"})
ret = gdf[(~gdf.prev_inside) & (gdf.inside_eez)][["MMSI","BaseDateTime"]]
ret = ret.rename(columns={"BaseDateTime":"return_time"})

# 5. Pair departures with next returns using merge_asof
dep = dep.sort_values(["MMSI","departure_time"])
ret = ret.sort_values(["MMSI","return_time"])
trips_df = pd.merge_asof(
    dep, ret, by="MMSI",
    left_on="departure_time", right_on="return_time",
    direction="forward", tolerance=pd.Timedelta("7 days")
)

print(f"Found {len(trips_df)} voyages")

  gdf["inside_eez"] = gdf.within(eez.unary_union)


In [2]:
from pyproj import Geod
from shapely.geometry import LineString

geod = Geod(ellps="WGS84")
records = []
for _, trip in trips_df.iterrows():
    seg = gdf[
        (gdf.MMSI==trip.MMSI) &
        (gdf.BaseDateTime.between(trip.departure_time, trip.return_time))
    ]
    if len(seg) < 2:
        continue
    line = LineString(seg.geometry.tolist())
    dist_m = geod.line_length(seg.LON.values, seg.LAT.values)
    records.append({
        **trip.to_dict(),
        "duration_h": (trip.return_time - trip.departure_time).total_seconds()/3600,
        "distance_km": dist_m/1000,
        "mean_sog": seg.SOG.mean(),
        "max_sog": seg.SOG.max()
    })
voyage_feats = pd.DataFrame(records)
print(f"Engineered features for {len(voyage_feats)} voyages")

NameError: name 'trips_df' is not defined

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest

# Prepare features
cols = ["duration_h","distance_km","mean_sog","max_sog"]
X = voyage_feats[cols].dropna()
X_scaled = StandardScaler().fit_transform(X)

# Fit model
iso = IsolationForest(n_estimators=100, contamination=0.02, random_state=42)
voyage_feats["anomaly_flag"] = iso.fit_predict(X_scaled)

# Summary
print(voyage_feats["anomaly_flag"].value_counts())

In [None]:
from arcgis.gis import GIS
from arcgis.features import GeoAccessor
import geopandas as gpd
from shapely.geometry import LineString

# Authenticate
gis = GIS("https://www.arcgis.com", "vgaza1_newhaven", "Kavita@08")

# Build trips_gdf with line geometries
records = []
for _, trip in voyage_feats.iterrows():
    seg = gdf[(gdf.MMSI==trip.MMSI) &
              (gdf.BaseDateTime.between(trip.departure_time, trip.return_time))]
    if len(seg)<2: continue
    line = LineString(seg.geometry.tolist())
    rec = trip.to_dict()
    rec["geometry"] = line
    records.append(rec)
trips_gdf = gpd.GeoDataFrame(records, crs="EPSG:4326")
anoms_gdf = trips_gdf[trips_gdf.anomaly_flag==-1]

# Publish
trips_sdf = GeoAccessor.from_geodataframe(trips_gdf, sr=4326)
anoms_sdf = GeoAccessor.from_geodataframe(anoms_gdf, sr=4326)
trips_item = trips_sdf.spatial.to_featurelayer(title="Voyages", tags="Fishing")
anoms_item = anoms_sdf.spatial.to_featurelayer(title="Anomalies", tags="Fishing")
print(trips_item.url, anoms_item.url)