In [5]:
import pandas as pd
import geopandas as gpd
from shapely.wkt import loads

### Load data

In [6]:
#### LOAD DATA DIRECTLY #####
# Query every column of the BigQuery table into a DataFrame, then write a
# local CSV snapshot of the result (without the index column).
from google.cloud import bigquery

client = bigquery.Client()
table_id = "wsdemo-457314.ais.shadowtankers"
query = f"SELECT * FROM `{table_id}`"

query_job = client.query(query)
df = query_job.to_dataframe()

df.to_csv("../data/shadowtankers.csv", index=False)

In [7]:
##### LOAD DATA FROM CSV #####
# Read the CSV file from the data directory into `df`.
df = pd.read_csv(filepath_or_buffer="../data/shadowtankers.csv")

### Initial data exploration

In [9]:
# Print a summary of the frame: row count, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 539456 entries, 0 to 539455
Data columns (total 27 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   mmsi               539456 non-null  int64  
 1   imo                539456 non-null  int64  
 2   vessel_name        539456 non-null  object 
 3   callsign           539456 non-null  object 
 4   vessel_type        539456 non-null  object 
 5   vessel_class       535283 non-null  object 
 6   length             539456 non-null  int64  
 7   width              539456 non-null  int64  
 8   flag_country       539456 non-null  object 
 9   destination        539256 non-null  object 
 10  lat                539456 non-null  float64
 11  lon                539456 non-null  float64
 12  sog                539456 non-null  int64  
 13  cog                539456 non-null  float64
 14  rot                539456 non-null  int64  
 15  heading            539456 non-null  int64  
 16  na

#### Is each IMO associated with just one MMSI (and vice versa)?

In [10]:
# For each identifier on one side, count how many distinct identifiers it is
# paired with on the other side (imo -> mmsi, then mmsi -> imo).
imo_to_mmsi_unique, mmsi_to_imo_unique = (
    df.groupby(key)[other].nunique()
    for key, other in (('imo', 'mmsi'), ('mmsi', 'imo'))
)

# A direction is one-to-one when the largest distinct count is exactly 1.
is_imo_to_mmsi_one_to_one = imo_to_mmsi_unique.max() == 1
is_mmsi_to_imo_one_to_one = mmsi_to_imo_unique.max() == 1

# Report the combined verdict: both directions must hold.
message = (
    "Yes, there is a 1:1 match between imo and mmsi."
    if is_imo_to_mmsi_one_to_one and is_mmsi_to_imo_one_to_one
    else "No, there is not a 1:1 match between imo and mmsi."
)
print(message)

No, there is not a 1:1 match between imo and mmsi.


In [6]:
###### SCRATCH ########

In [7]:
# Keep only the rows whose `sog` value is non-zero; take a copy so that later
# column assignments on `df_moving` do not affect `df`.
is_moving = df['sog'].ne(0)
df_moving = df.loc[is_moving].copy()

In [8]:
# Parse the timestamp column; format="mixed" lets pandas infer the format of
# each value individually.
df_moving['dt'] = pd.to_datetime(df_moving['dt'], format="mixed")

# Parse WKT text into shapely geometries. Values that are no longer strings
# (i.e. already parsed by a previous run of this cell) are passed through
# unchanged, so the cell can be re-run without `loads` raising.
df_moving['geometry'] = df_moving['geometry'].apply(
    lambda value: loads(value) if isinstance(value, str) else value
)

# Build the GeoDataFrame and declare its coordinates as EPSG:4326 in the
# constructor instead of mutating the frame afterwards with inplace=True.
gdf_moving = gpd.GeoDataFrame(df_moving, geometry='geometry', crs="EPSG:4326")
gdf_moving.head(2)

Unnamed: 0,mmsi,imo,vessel_name,callsign,vessel_type,vessel_class,length,width,flag_country,destination,...,lon,sog,cog,rot,heading,nav_status,source,position_accuracy,dt,geometry
222,304868000,9236353,BLUE,V2YM7,TANKER_CRUDE,A,274,50,Antigua and Barbuda,25TURK.3SECGUARDS,...,52.84034,13,73.1,0,76,Under Way Using Engine,V-AIS,HIGH,2024-04-29 17:11:38+00:00,POINT (52.84034 14.28468)
224,304868000,9236353,BLUE,V2YM7,TANKER_CRUDE,A,274,50,Antigua and Barbuda,25TURK.3SECGUARDS,...,49.62179,12,70.2,0,73,Under Way Using Engine,V-AIS,HIGH,2024-04-29 01:15:33+00:00,POINT (49.62179 13.28817)


In [None]:
# Step 1: Define parameters for proximity
# Distances are planar distances between lon/lat points, i.e. expressed in
# degrees. One degree of latitude is roughly 111 km, so 0.01 degrees is on the
# order of 1 km (adjust as needed).
MAX_DISTANCE = 0.01
MAX_TIME_DIFF = pd.Timedelta(minutes=10)


# Step 2: Find pings from different vessels that are close in space and time
def find_close_instances(positions, max_distance=MAX_DISTANCE, max_time_diff=MAX_TIME_DIFF):
    """Return every pair of pings from two different vessels that are close.

    Two pings form a match when their geometries are at most ``max_distance``
    apart and their timestamps differ by at most ``max_time_diff``. Pings are
    sorted by time and compared only against the pings inside a sliding time
    window, instead of comparing every row against every other row.

    Args:
        positions: DataFrame with the columns 'mmsi', 'dt' (timestamps) and
            'geometry' (objects exposing a ``distance`` method).
        max_distance: Largest allowed distance, in the units of the geometry
            coordinates.
        max_time_diff: Largest allowed absolute time difference.

    Returns:
        DataFrame with the columns 'vessel_1', 'vessel_2', 'time_1', 'time_2',
        'distance' and 'time_diff'. Within a match, the "_1" values belong to
        the ping with the smaller index label. Rows are ordered by the index
        labels of the two pings. The frame has these columns even when no
        match is found.
    """
    columns = ['vessel_1', 'vessel_2', 'time_1', 'time_2', 'distance', 'time_diff']

    # A ping without a timestamp can never pass the time test, so drop it
    # before sorting rather than letting NaT leak into the window logic.
    ordered = positions.dropna(subset=['dt']).sort_values('dt', kind='stable')
    labels = ordered.index.tolist()
    vessels = ordered['mmsi'].tolist()
    times = ordered['dt'].tolist()
    points = ordered['geometry'].tolist()

    matches = []
    window_start = 0
    for current in range(len(labels)):
        # Times are ascending, so the window start only ever moves forward.
        while window_start < current and times[current] - times[window_start] > max_time_diff:
            window_start += 1

        for earlier in range(window_start, current):
            # Consecutive pings of one vessel are trivially close to each
            # other; only pairs of distinct vessels are of interest.
            if vessels[earlier] == vessels[current]:
                continue

            distance = points[earlier].distance(points[current])
            # Written as "not <=" so that a NaN distance is rejected.
            if not distance <= max_distance:
                continue

            # Report the ping with the smaller index label first.
            if labels[earlier] < labels[current]:
                first, second = earlier, current
            else:
                first, second = current, earlier

            matches.append((
                labels[first],
                labels[second],
                {
                    'vessel_1': vessels[first],
                    'vessel_2': vessels[second],
                    'time_1': times[first],
                    'time_2': times[second],
                    'distance': distance,
                    'time_diff': times[current] - times[earlier],
                },
            ))

    matches.sort(key=lambda match: (match[0], match[1]))
    return pd.DataFrame([record for _, _, record in matches], columns=columns)


# Step 3: Collect the matches in a DataFrame
close_df = find_close_instances(df_moving)

# Step 4: Filter for extended periods
# Identify each match by its unordered pair of vessels, then keep the pairs
# that were observed close to each other more than once.
close_df['pair'] = [
    frozenset(pair) for pair in zip(close_df['vessel_1'], close_df['vessel_2'])
]
if close_df.empty:
    # Nothing matched: keep an empty frame with the expected columns.
    extended_periods = close_df.copy()
else:
    extended_periods = close_df.groupby('pair').filter(lambda x: len(x) > 1)  # Adjust 'len(x)' as needed