In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
import plotly_express as px
import warnings

from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from catboost import CatBoostClassifier, Pool
from xgboost import XGBClassifier
from vincenty import vincenty

from helper_funcs import wrangle, vincenty_distance, nearest_port

# NOTE(review): this silences every warning category for the rest of the
# session, which also hides any pandas deprecation / chained-assignment
# warnings the cells below may raise — consider narrowing the filter.
warnings.filterwarnings("ignore")

In [2]:
# relative paths of the two input CSVs handed to wrangle()
port_file, tracking_file = "data/ports.csv", "data/tracking.csv"

In [3]:
# wrangle() hands back two frames: df1 (ports) and df2 (tracking)
df1, df2 = wrangle(port_file=port_file, tracking_file=tracking_file)

for label, frame in (("ports", df1), ("tracking", df2)):
    print(f"the shape of the {label} dataframe is:", frame.shape)

df1.head()

the shape of the ports dataframe is: (122, 6)
the shape of the tracking dataframe is: (591402, 16)


Unnamed: 0,port,lat,long,lat_rad,long_rad,lat_long
0,6,42.387051,-71.057504,0.739794,-1.240187,"[42.387051, -71.0575042]"
1,7,22.81233,120.193849,0.39815,2.097778,"[22.8123304, 120.1938488]"
2,10,10.200166,-61.701978,0.178026,-1.076903,"[10.2001663, -61.701978]"
3,11,36.4,126.483333,0.6353,2.207551,"[36.4, 126.4833333]"
4,13,29.897303,122.089303,0.521806,2.13086,"[29.8973026, 122.0893035]"


In [4]:
# preview the first five tracking rows (5 is head()'s default)
df2.head(n=5)

Unnamed: 0,vessel,datetime,lat,long,heading,speed,draft,vessel_1back,lat_rad,long_rad,lat_long,lat_long_1back,direction,time_delta,hour_delta,quad
0,1,2019-01-01 02:47:00,29.743,-93.8695,214.0,0.0,9.6,1.0,0.519113,-1.638332,"[29.743, -93.8695]","[29.743, -93.8695]",SW,0 days 00:54:00,0.9,quad2
1,1,2019-01-01 05:47:00,29.743,-93.8695,214.0,0.0,9.6,1.0,0.519113,-1.638332,"[29.743, -93.8695]","[29.743, -93.8695]",SW,0 days 03:00:00,3.0,quad2
2,1,2019-01-01 06:59:00,29.743,-93.8695,214.0,0.0,9.6,1.0,0.519113,-1.638332,"[29.743, -93.8695]","[29.743, -93.8695]",SW,0 days 01:12:00,1.2,quad2
3,1,2019-01-01 08:53:00,29.743,-93.8695,214.0,0.0,9.6,1.0,0.519113,-1.638332,"[29.743, -93.8695]","[29.743, -93.8695]",SW,0 days 01:54:00,1.9,quad2
4,1,2019-01-01 10:53:00,29.743,-93.8695,214.0,0.0,9.6,1.0,0.519113,-1.638332,"[29.743, -93.8695]","[29.743, -93.8695]",SW,0 days 02:00:00,2.0,quad2


In [5]:
# lookup: port id -> (lat, long) tuple, taken from the ports frame
ports = dict(zip(df1['port'], zip(df1['lat'], df1['long'])))

# lookup: ports-frame index label -> port id
idx_ports = dict(zip(df1.index, df1['port']))

In [6]:
# Vincenty distance between every ordered pair of ports in the ports frame,
# used to find the ports that lie within 100km of another port

port_coords = list(df1['lat_long'])
pairwise_dists = [[vincenty(origin, other) for other in port_coords] for origin in port_coords]

# one entry per port: (port id, distance) for the 5 smallest distances in its
# row. the first tuple is expected to be the port itself, followed by its 4
# nearest ports
disters = []
for row in pairwise_dists:
    nearest_idx = np.argsort(row)[:5]
    nearest_dist = sorted(row)[:5]
    disters.append([(df1['port'].iloc[pos], dist) for pos, dist in zip(nearest_idx, nearest_dist)])

# port id -> {neighbour port id: distance}, keeping only neighbours under 100
neighbours_by_port = {
    entry[0][0]: {other_port: dist for other_port, dist in entry[1:] if dist < 100}
    for entry in disters
}

# ports with no neighbour under the threshold are dropped
close_ones = {port_id: near for port_id, near in neighbours_by_port.items() if len(near) >= 1}

# hard-coded ids of ports that are very close to other ports
really_close = [30, 109, 42, 51, 65, 71, 108, 139, 63, 152, 102]

In [15]:
# nearest-neighbour model over the ports' radian coordinates, used to look up
# the closest port once the conditions for an extended stop are met. the
# haversine metric gives the 'great circle' distance. reminder: kneighbors
# returns the *position* of each port in the training frame, not its port id,
# hence the idx_ports lookup below

ports_train = df1[['lat_rad', 'long_rad']]

neigh_ports = NearestNeighbors(
    metric='haversine',
    algorithm='ball_tree',
    n_neighbors=3,
).fit(ports_train)

# example query: a single (lat, long) coordinate in radians, shaped (1, 2)
sample_coord = np.array([[0.677565, 0.469731]])
dist, n = neigh_ports.kneighbors(sample_coord)

# the haversine distance is scaled by 6370 (presumably the earth radius in km)
for i, d in zip(n[0], dist[0]):
    print("port, distance for the coordinate:", (idx_ports[i], d * 6370))

port, distance for the coordinate: (82, 3.3093102570558104)
port, distance for the coordinate: (113, 46.44757564974505)
port, distance for the coordinate: (44, 255.13837716039743)


In [None]:
# store the labels returned by train_dbscan() on the ports frame, then mirror
# them into a port id -> cluster label dictionary

df1['labels'] = train_dbscan()

db_labels = dict(zip(df1['port'], df1['labels']))

In [None]:
# distance between consecutive rows, computed row by row with the
# vincenty_distance helper imported from helper_funcs (per the original note it
# only compares rows within the same vessel — see the helper). distance over
# elapsed hours gives an implied speed, and that speed is a simple filter:
# rows above 50 per hour are dropped (rows with a NaN or infinite speed fail
# the comparison and are dropped as well)

# fix: the call previously used `vincent_distance`, a name that is never
# imported or defined in the visible cells
df2['vin_diff'] = df2.apply(vincenty_distance, axis=1)
df2['vin_per_hour'] = df2['vin_diff'] / df2['hour_delta']
df2 = (
    df2
    .query('vin_per_hour <= 50')
    .sort_values(by=['vessel', 'datetime'])
)

# time deltas to be used later to filter voyages and a new lat/long calc.
# time_delta is the current timestamp minus the NEXT one within the vessel, so
# hour_delta takes the absolute value

vessel_groups = df2.groupby('vessel')
df2['time_delta'] = df2['datetime'] - vessel_groups['datetime'].shift(-1)
df2['hour_delta'] = df2['time_delta'].dt.total_seconds().abs() / 3600
df2['lat_long_1back'] = vessel_groups['lat_long'].shift()

In [None]:
# draft change against the previous and the next row of the same vessel. a
# jump of at least 0.69 in either direction sets the matching flag, and
# draft_change is 1 when either flag is set

draft_by_vessel = df2.groupby('vessel')['draft']
df2['draft_raw'] = draft_by_vessel.diff()
df2['draft_delta_back'] = df2['draft_raw'].abs().ge(0.69).astype(int)
df2['draft_delta_ahead'] = draft_by_vessel.diff(-1).abs().ge(0.69).astype(int)
df2['draft_change'] = ((df2['draft_delta_back'] + df2['draft_delta_ahead']) >= 1).astype(int)

# heading change between consecutive rows of a vessel, folded onto 0-180 so
# that crossing north (e.g. 359 -> 1) counts as a 2 degree turn, not 358
raw_turn = df2.groupby('vessel')['heading'].diff().abs()
df2['heading_change'] = raw_turn.where(raw_turn <= 180, 360 - raw_turn)

# heading_seq numbers the runs of rows between turns of more than 3 degrees.
# fix: it is now derived from the folded heading_change; the raw difference
# used before started a new sequence on every small turn across north
df2['heading_seq'] = df2['heading_change'].gt(3).groupby(df2['vessel']).cumsum() + 1

In [None]:
# predict a port for every row with the nearest_port helper and keep an
# untouched copy of that prediction in pred_port_backup

df2['pred_port'] = df2.apply(nearest_port, axis=1)
df2['pred_port_backup'] = df2['pred_port']

# number the runs of consecutive rows, within a vessel, that share the same
# predicted port: a new run starts whenever the prediction differs from the
# previous row (the first row of each vessel always starts one)
port_changed = df2.groupby('vessel')['pred_port'].diff().ne(0)
df2['port_sequence'] = port_changed.groupby(df2['vessel']).cumsum()

In [None]:
# number the runs of consecutive rows, within a vessel, that either all have a
# predicted port (> 0) or all have none. a run with a port may span several
# different ports, which identifies sequences w/ambiguous intended port

has_port = df2['pred_port_backup'].gt(0).astype(int)
run_boundary = has_port.groupby(df2['vessel']).diff().ne(0)
df2['consec_port_sequence'] = run_boundary.groupby(df2['vessel']).cumsum()

In [None]:
# calculating the distance from the predicted port using the more precise
# vincenty distance. a mask restricts the row-wise apply to rows that actually
# have a predicted port, which speeds it up.

# coordinates of the predicted port as a [lat, long] list; -99 is the sentinel
# for predictions that are not a key of `ports`
df2['port_coords'] = [list(ports[k]) if k in ports else -99 for k in df2['pred_port']]

vin_mask = df2['pred_port'] > 0
temp_vin = df2[vin_mask]

# fix: start from a float column (0.0, not int 0) so that writing the distances
# into it does not rely on pandas silently upcasting an integer column
df2['pred_port_dist'] = 0.0
df2.loc[vin_mask, 'pred_port_dist'] = temp_vin.apply(vincenty_port, axis=1)

In [None]:
# per-sequence aggregates, broadcast back onto every row of the sequence
by_port_seq = df2.groupby(['vessel', 'port_sequence'])
by_heading_seq = df2.groupby(['vessel', 'heading_seq'])
by_consec_seq = df2.groupby(['vessel', 'consec_port_sequence'])

# smallest distance to the predicted port within each port sequence
df2['port_sequence_min_dist'] = by_port_seq['pred_port_dist'].transform('min')

# total absolute hours covered by each port sequence / heading sequence
df2['port_sequence_time'] = by_port_seq['hour_delta'].transform(lambda hours: hours.abs().sum())
df2['heading_sequence_time'] = by_heading_seq['hour_delta'].transform(lambda hours: hours.abs().sum())

# smallest distance within each consecutive-port sequence, rounded to 2 places
df2['consec_port_min_dist'] = by_consec_seq['pred_port_dist'].transform('min').round(2)

In [None]:
# get_prior_port is applied once per vessel and returns a list for each one;
# the lists are flattened into a single list and attached to the dataframe.
# the list is assigned by position, so this relies on df2 being ordered by
# vessel in the same order groupby yields the groups

last_ports = df2.groupby("vessel").apply(get_prior_port)
last_ports_col = [lp for lps in last_ports for lp in lps]

df2['prior_port'] = last_ports_col

# vincenty distance from the row's position to its prior port; 0 when the row
# has no prior port (prior_port <= 0)
df2['dist_last_port'] = [
    vincenty(position, ports[prior]) if prior > 0 else 0
    for position, prior in zip(df2['lat_long'], df2['prior_port'])
]

In [None]:
# number of distinct predicted ports (ignoring 0 = no port) inside a centered
# 8-row rolling window, computed separately for each vessel.
# fix: transform() is used instead of groupby.apply(); apply() can prepend the
# group key to the result's index (pandas >= 2.0), which no longer lines up
# with df2 on assignment. transform() always keeps the original row index
df2['rolling_unique_vals'] = (
    df2
    .groupby('vessel')['pred_port_backup']
    .transform(
        lambda preds: preds.rolling(8, center=True).apply(
            lambda window: window[window > 0].nunique()
        )
    )
)

# smallest positive distance from a predicted port over the same centered
# 8-row window, again per vessel (zeros, i.e. rows without a port, are ignored)

df2['window_min_dist'] = (
    df2
    .groupby('vessel')['pred_port_dist']
    .transform(
        lambda dists: dists.rolling(8, center=True).apply(
            lambda window: window[window > 0].min()
        )
    )
)

In [None]:
# per-vessel post-processing of the predicted ports. each vessel's rows are
# pulled into their own dataframe (indexed by datetime), a fixed sequence of
# filters and helper functions is applied to pred_port, and the result is
# stored in processed_dfs keyed by vessel id. it is set up this way to allow
# for fast iteration; it is hacky, but it improved port identification by
# making idiosyncratic anomalies easy to spot.
# NOTE: the steps are order-dependent — later steps read the pred_port values
# written by earlier ones — so do not reorder them.
# thresholds on pred_port_dist are in that column's units (presumably km) and
# thresholds on the *_time columns are in hours of hour_delta.

processed_dfs = {}

# groupby(sort=False) yields every vessel once, in order of first appearance,
# instead of re-filtering the whole frame with query() for each vessel
for vessel_id, vessel_rows in df2.groupby('vessel', sort=False):
  df_ = vessel_rows.set_index('datetime')

  # no port on rows without a flagged draft change
  df_['pred_port'] = np.where(df_['draft_change'] < 1, 0, df_['pred_port'])

  # ports listed in really_close only count within a distance of 13
  far_from_close_port = ((df_['pred_port'].isin(really_close)) & (df_['pred_port_dist'] >13))
  df_['pred_port'] = np.where(far_from_close_port, 0, df_['pred_port'])

  # port sequences shorter than 16 hours are discarded
  df_['pred_port'] = np.where(df_['port_sequence_time'] < 16, 0, df_['pred_port'])

  # keep the original (backup) prediction only where fix_close_ports agrees
  # with it; every other row is reset to 0
  df_['fixed_ports'] = fix_close_ports(df_)
  df_['pred_port'] = [x if x == y else 0 for x, y in zip(df_['pred_port_backup'], df_['fixed_ports'])]

  # ports 115 and 54 only count within a distance of 16
  far_from_115_54 = (df_['pred_port'].isin([115, 54]) & (df_['pred_port_dist'] >16))
  df_['pred_port'] = np.where(far_from_115_54, 0, df_['pred_port'])

  df_['pred_port'] = df_.apply(fix_really_close_cluster, axis=1)

  # prior ports are computed from pred_port as it stands HERE, before the
  # remaining filters, and only attached to the frame at the end
  last_ports = df_.groupby('vessel').apply(get_prior_port)
  last_ports_col = [lp for lps in last_ports for lp in lps]

  # drop ports on short heading sequences (< 6 hours) without a draft change
  short_heading_no_draft = ((df_['heading_sequence_time'] < 6) & (df_['pred_port'] >0) & (df_['draft_change'] <1))
  df_['pred_port'] = np.where(short_heading_no_draft, 0, df_['pred_port'])

  # drop rows more than 10 further out than the closest approach of their
  # consecutive-port sequence
  beyond_closest_approach = ((df_['pred_port_dist'] - df_['consec_port_min_dist']) > 10)
  df_['pred_port'] = np.where(beyond_closest_approach, 0, df_['pred_port'])

  # hard cap: nothing further than 80 from its predicted port
  beyond_hard_cap = (df_['pred_port_dist'] > 80)
  df_['pred_port'] = np.where(beyond_hard_cap, 0, df_['pred_port'])

  df_['prior_port'] = last_ports_col
  df_['dist_last_port'] = [vincenty(l, ports[p]) if p>0 else 0 for l, p in zip(df_['lat_long'], df_['prior_port'])]

  processed_dfs[vessel_id] = df_

# one frame for all vessels; the dict keys (vessel ids) become the outer index level
alldf = pd.concat(processed_dfs)

In [None]:
# build the voyages dataframe: get_voyages() is run on every processed
# per-vessel frame and the results are stacked
voyages_df = pd.concat([get_voyages(vessel_df) for vessel_df in processed_dfs.values()])

# timestamps are truncated to calendar dates, so len_voyage is a difference of dates
for date_col in ('begin_date', 'end_date'):
    voyages_df[date_col] = voyages_df[date_col].dt.date
voyages_df['len_voyage'] = voyages_df['end_date'] - voyages_df['begin_date']

# (lat, long) of the begin / end ports and the vincenty distance between them
voyages_df['begin_coords'] = [ports[port_id] for port_id in voyages_df['begin_port_id']]
voyages_df['end_coords'] = [ports[port_id] for port_id in voyages_df['end_port_id']]
voyages_df['voyage_dist'] = list(map(vincenty, voyages_df['begin_coords'], voyages_df['end_coords']))
voyages_df.shape