<a href="https://colab.research.google.com/github/yaobviously/sym-cargo/blob/main/sym_cargo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install plotly-express --quiet
!pip install vincenty --quiet

  Building wheel for vincenty (setup.py) ... [?25l[?25hdone


In [257]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from vincenty import vincenty
from sklearn.neighbors import NearestNeighbors
import plotly_express as px

In [94]:
# locations of the two input csv files; wrangle() reads both of them.
# presumably these live on a mounted google drive - adjust for other setups
port_file = '/content/drive/MyDrive/Ports/ports.csv'
tracking_file = '/content/drive/MyDrive/Ports/tracking.csv'

In [118]:
def wrangle(port_path=None, tracking_path=None):
  """
  load the port and vessel-tracking csv files and add the derived columns the
  rest of the notebook works with.

  port_path : csv with 'port', 'lat' and 'long' columns. defaults to the
    module-level `port_file`
  tracking_path : csv with 'vessel', 'datetime', 'lat', 'long', 'speed' and
    'heading' columns. defaults to the module-level `tracking_file`

  returns (df1, df2):
    df1 - ports, with 'lat_rad'/'long_rad' taken from the unrounded
          coordinates and 'lat'/'long' rounded to one decimal
    df2 - de-duplicated tracking rows sorted by vessel and datetime with a
          fresh 0..n-1 index, plus radians, [lat, long] pairs, hours since the
          vessel's previous row and per-vessel speed/heading differences
  """
  if port_path is None:
    port_path = port_file
  if tracking_path is None:
    tracking_path = tracking_file

  df1 = pd.read_csv(port_path)

  # converting lat and long to radians to compute haversine distance. this is
  # done before rounding so the radians keep the full precision
  df1['lat_rad'] = np.radians(df1['lat'])
  df1['long_rad'] = np.radians(df1['long'])

  # rounding lat and long in port df
  df1['lat'] = df1['lat'].round(1)
  df1['long'] = df1['long'].round(1)

  df2 = pd.read_csv(tracking_path, parse_dates=['datetime'])
  df2 = df2.drop_duplicates()
  df2 = df2.sort_values(['vessel', 'datetime'])

  # converting lat and long to radians to compute haversine distance
  df2['lat_rad'] = np.radians(df2['lat'])
  df2['long_rad'] = np.radians(df2['long'])

  # adding lat/long column and lat/long 1 back to later compute delta. the
  # shift is done per vessel, so a vessel's first row has no previous position
  df2['lat_long'] = [[x, y] for x, y in zip(df2['lat'], df2['long'])]
  df2['lat_long_1back'] = df2.groupby('vessel')['lat_long'].shift()

  # hours elapsed since the same vessel's previous row (NaN for its first row)
  time_delta = df2.groupby('vessel')['datetime'].diff(1)
  df2['delta_hours'] = time_delta.dt.seconds / 3600 + time_delta.dt.days * 24

  # speed and heading differences against rows 2 and 1 back and 1 and 2 ahead,
  # always within the same vessel
  offsets = ((2, '2back'), (1, '1back'), (-1, '1ahead'), (-2, '2ahead'))
  for col in ('speed', 'heading'):
    for periods, label in offsets:
      df2[f'{col}_diff_{label}'] = df2.groupby('vessel')[col].diff(periods)

  # vessel id of the previous row in the sorted frame (not grouped), used to
  # spot the row where one vessel's data ends and the next one's begins
  df2['vessel_1back'] = df2['vessel'].shift(1)
  df2 = df2.reset_index(drop=True)

  return df1, df2

In [119]:
# df1 is the port frame and df2 the tracking frame returned by wrangle()
df1, df2 = wrangle()

In [120]:
# lookup tables built from the port frame. they are used repeatedly further
# down to explore the data and assign values

# port id -> (lat, long) as stored in df1
ports = dict(zip(df1['port'], zip(df1['lat'], df1['long'])))

# df1 index -> port id, to turn an index back into the port's identifier
idx_ports = dict(zip(df1.index, df1['port']))

In [121]:
# training a nearest neighbor model to find the closest port when the
# conditions indicating an extended stop have occurred. the metric is haversine
# in order to compute the 'great circle' distance. so i don't forget, the model
# returns the *index* of the port, not the port's identifying label

ports_train = df1[['lat_rad', 'long_rad']]

# the model is fitted on a plain array because every later query is a plain
# array too; fitting on the dataframe makes newer scikit-learn versions warn
# about missing feature names on each query
neigh_ports = NearestNeighbors(n_neighbors=3, algorithm='ball_tree', metric='haversine')
neigh_ports.fit(ports_train.to_numpy())

# haversine distances come back in radians; multiplying by the earth's radius
# (roughly 6370 km) converts them to kilometres
earth_radius_km = 6370

# sanity check with one hand-picked position (lat, long in radians)
dist, n = neigh_ports.kneighbors(np.array([[0.677565, 0.469731]]))

print(dist[0] * earth_radius_km)
print([idx_ports[idx] for idx in n[0]])

[  3.30931026  46.44757565 255.13837716]
[82, 113, 44]


In [122]:
# set differences showed that two ports (42, 113) were never being assigned,
# so vincenty distance is used here to see what is going on. in both cases the
# lat/long where the boats are docked is much closer to the port assigned by
# the model, so i cannot justify any change

coord_first_stop = (38.743843, 26.895678)
coord_second_stop = (38.821700, 26.913800)

viable_ports = [82, 113]

# one dictionary per stop: 'port<id>' -> [vincenty distance from that port]
vin_one, vin_two = (
    {f'port{port_id}': [vincenty(ports[port_id], stop)] for port_id in viable_ports}
    for stop in (coord_first_stop, coord_second_stop)
)

In [101]:
# functions to quickly inspect dataframe to figure out what's up

def get_dates(vessel=111, start='2019-01-01', end='2019-12-25'):
  """
  return the rows of df2 for one vessel whose datetime lies between `start`
  and `end`, both ends included
  """
  in_window = df2['datetime'].between(start, end)
  is_vessel = df2['vessel'] == vessel

  return df2[in_window & is_vessel]

def where_zero(vessel = 75):
  """
  return the rows of df2 for one vessel where its speed is at most 0.5
  """
  barely_moving = df2['speed'].le(0.5)
  is_vessel = df2['vessel'].eq(vessel)

  return df2[is_vessel & barely_moving]

In [126]:
def nearest_port(df):
  """
  look up the port closest to a single position.

  df provides 'lat_rad' and 'long_rad' (one row of a frame). the neighbor
  model is searched within a radius of 0.008 and the identifier of the closest
  port is returned, or -1 when no port lies within that radius.
  """
  query = np.reshape([df['lat_rad'], df['long_rad']], (1, -1))
  dist, pred = neigh_ports.radius_neighbors(query, radius=0.008, sort_results=True)

  # results are sorted by distance, so the first hit is the closest port
  if len(dist[0]) > 0:
    return idx_ports[pred[0][0]]

  return -1

In [127]:
def nearest_distance(df, radius=0.008):
  """
  a simple function to return the distance of the nearest port in the dataset

  df : one row (or any mapping) providing 'lat_rad' and 'long_rad'
  radius : search radius handed to the neighbor model. the default is the same
    0.008 that nearest_port uses, so a distance is reported for exactly the
    positions nearest_port finds a port for (the previous 0.0085 could return
    a distance for a row whose port is -1)

  returns the distance to the closest port as reported by the neighbor model
  (haversine metric, so in radians), or -1 when no port is within `radius`
  """

  data = np.array([df['lat_rad'], df['long_rad']]).reshape(1, -1)
  dist, pred = neigh_ports.radius_neighbors(data, radius=radius, sort_results=True)

  # no port within the search radius
  if len(dist[0]) == 0:
    return -1

  # results are sorted by distance, so the first entry is the smallest
  return dist[0][0]


In [104]:
def vincent_distance(row):
  """
  a simple function that returns the vincenty distance between a row's
  position ('lat_long') and the previous position ('lat_long_1back').

  returns -99 when the previous row belongs to a different vessel (which also
  covers the very first row, where 'vessel_1back' is missing) or when the
  distance cannot be computed.
  """
  if row['vessel'] != row['vessel_1back']:
    return -99

  loc1 = row['lat_long']
  loc2 = row['lat_long_1back']

  try:
    return vincenty(loc1, loc2)
  except Exception:
    # e.g. a missing previous position. only ordinary errors are turned into
    # the sentinel; KeyboardInterrupt and SystemExit still propagate
    return -99

In [128]:
# using my simple neighbor model with haversine distance to assign the closest
# port along with its distance to the dataframe. i used a mask to speed up
# the process: only rows where the vessel is (almost) stopped and its heading
# was steady over the two previous rows or over the two following rows

slow = df2['speed'] <= 0.2

# abs() has to wrap the difference itself, not the comparison - otherwise any
# large negative heading change would count as 'steady'
# NOTE(review): if headings are compass degrees, a 359 -> 1 change still shows
# up as a difference of 358 here; wrap-around is not handled
steady_before = (df2['heading_diff_1back'].abs() <= 2) & (df2['heading_diff_2back'].abs() <= 2)
steady_after = (df2['heading_diff_1ahead'].abs() <= 2) & (df2['heading_diff_2ahead'].abs() <= 2)

mask = slow & (steady_before | steady_after)

df_valid = df2[mask]

# -1 means 'no port assigned'. pred_dist starts out as a float column because
# the distances written into it are floats
df2['pred_port'] = -1
df2['pred_dist'] = -1.0
df2.loc[mask, 'pred_port'] = df_valid.apply(nearest_port, axis=1)
df2.loc[mask, 'pred_dist'] = df_valid.apply(nearest_distance, axis=1)

In [15]:
# vincenty distance between each row and the previous one, as a way to detect
# movement. this might make it easier to detect when a ship is in japanese
# (and other?) ports where they appear to unload/load without zero movement.
# only rows with speed <= 1 are computed; every other row keeps -99

vin_mask = df2['speed'].le(1)

df_vin = df2.loc[vin_mask]

df2['vin_delta'] = -99
df2.loc[vin_mask, 'vin_delta'] = df_vin.apply(vincent_distance, axis=1)

In [131]:
# coordinates of each row's predicted port, or -99 where the id is not a port
df2['pred_port_coords'] = [ports.get(port_id, -99) for port_id in df2['pred_port']]

In [260]:
def vincenty_port(row):
  """
  return the vincenty distance between a row's predicted port
  ('port_coords') and the row's own position ('lat_long').

  returns -99 when the distance cannot be computed, for example when one of
  the two entries is missing from the row.
  """
  try:
    return vincenty(row['port_coords'], row['lat_long'])
  except Exception:
    # only ordinary errors become the sentinel; KeyboardInterrupt and
    # SystemExit are no longer swallowed
    return -99

In [253]:
def assign_ports(df):
  """
  a function that prepares a dataframe for predictions

  df : one vessel's tracking rows indexed by datetime, with at least 'vessel',
    'lat', 'long', 'lat_rad', 'long_rad' and 'speed'

  returns a new frame on a 4-hour grid with these columns added:
    'pred_port'      - nearest port for rows with speed <= 3 (-1 if none is
                       in range), 0 for all other rows
    'port_coords'    - [lat, long] of that port, or -99
    'dist_from_port' - vincenty distance to that port, 0 where no port
    'lat_long'       - [lat, long] of the resampled position
  """
  raw = df

  # resampling the data to standardize time intervals. only numeric columns
  # can be averaged, so list/tuple helper columns are left out up front
  df = raw.select_dtypes(include='number').resample('4H').mean().ffill()

  # the medians are taken from the raw rows; taking them from the frame that
  # was already averaged would just return the averages again
  df['lat_rad'] = raw['lat_rad'].resample('4H').median().ffill()
  df['long_rad'] = raw['long_rad'].resample('4H').median().ffill()

  # applying a mask to limit the rows the predict function is applied to
  mask = df['speed'] <= 3

  df['pred_port'] = 0
  if mask.any():
    df.loc[mask, 'pred_port'] = df.loc[mask].apply(nearest_port, axis=1)
  df['port_coords'] = [list(ports[k]) if k in ports else -99 for k in df['pred_port']]

  # applying a mask to limit the rows the vincenty distance func is applied to
  maskb = df['pred_port'] > 0

  df['dist_from_port'] = 0.0
  df['lat_long'] = [[x, y] for x, y in zip(df['lat'], df['long'])]

  # the rows are sliced only now that 'lat_long' exists. slicing earlier left
  # vincenty_port without that column, so every distance came back as -99
  if maskb.any():
    df.loc[maskb, 'dist_from_port'] = df.loc[maskb].apply(vincenty_port, axis=1)

  # recasting vessel and pred_port columns as integers
  df[['vessel', 'pred_port']] = df[['vessel', 'pred_port']].astype(int)

  return df

In [None]:
# NOTE: vessel_dfs is only created in a later cell ("creating a dictionary of
# dataframes for each vessel"), so that cell has to be run before this one
p = assign_ports(vessel_dfs[4])
p

In [150]:
# one dataframe per vessel, indexed by datetime and keyed by the vessel id

vessel_dfs = {
    vessel: df2[df2['vessel'] == vessel].set_index('datetime')
    for vessel in df2['vessel'].unique()
}

In [177]:
# creating a dictionary of dataframes for each port, indexed by datetime.
# every frame gets the port's coordinates in a '<port>_coord' column and the
# vincenty distance from the port to each row's position in 'vdist_port'

port_ds = {}

for port in df2.pred_port.unique():
  # -1 marks rows without a predicted port. skipping it here (instead of
  # deleting the key afterwards) also works when every row has a port
  if port == -1:
    continue

  df_ = df2[df2['pred_port'] == port].set_index('datetime')
  df_[f'{port}_coord'] = [ports[port]] * len(df_)
  df_['vdist_port'] = [vincenty(ports[port], loc) for loc in df_['lat_long']]
  port_ds[port] = df_


In [178]:
# largest and smallest vincenty distance seen for every port, stored as
# port -> [max, min]. it may prove useful when distinguishing between
# 'in port' and 'waiting around to enter the port'

port_dist = {
    port: [frame['vdist_port'].max(), frame['vdist_port'].min()]
    for port, frame in port_ds.items()
}

In [172]:
def hanging_around(row):
  """
  flag a row whose distance to its predicted port is at least 20 above the
  smallest distance recorded for that port in `port_dist`.

  row : provides 'pred_port' and 'vindist_pred_port'
    NOTE(review): no cell visible here creates a 'vindist_pred_port' column
    (the per-port frames call it 'vdist_port') - confirm where it comes from

  returns 1 when flagged, 0 when not flagged or when no port was predicted
  (negative 'pred_port'), and -1 if the comparison itself fails. a missing
  column or a port absent from `port_dist` still raises.
  """
  key = int(row['pred_port'])

  if key < 0:
    return 0

  # port_dist[key] is [max, min], so index 1 is the minimum distance
  from_min = row['vindist_pred_port'] - port_dist[key][1]

  try:
    return 1 if from_min >= 20 else 0
  except Exception:
    # only ordinary errors become -1; KeyboardInterrupt and SystemExit are no
    # longer swallowed
    return -1

In [179]:
def get_voyages(df):
  """
  turn one vessel's port predictions into a table of voyages.

  df : one vessel's rows indexed by 'datetime', with 'vessel' and 'pred_port'
    columns. rows whose 'pred_port' is not greater than 0 are ignored.

  a voyage is recorded whenever two consecutive port rows name different
  ports: it begins at the datetime of the last row at the first port and ends
  at the datetime of the first row at the next port.

  returns a dataframe with the columns 'vessel', 'begin_date', 'end_date',
  'begin_port_id' and 'end_port_id'. it is empty when the vessel has no port
  rows or never changes port.
  """
  columns = ['vessel', 'begin_date', 'end_date', 'begin_port_id', 'end_port_id']

  nz = df[df['pred_port'] > 0].reset_index()

  # a vessel that never matched a port has no voyages. looking up its first
  # row would raise, so it is answered with an empty table instead
  if nz.empty:
    return pd.DataFrame(columns=columns)

  vessel = nz['vessel'].iloc[0]
  dt = nz['datetime']
  pred = nz['pred_port']

  records = [
      [vessel, dt[i], dt[i + 1], pred[i], pred[i + 1]]
      for i in range(len(nz) - 1)
      if pred[i] != pred[i + 1]
  ]

  return pd.DataFrame.from_records(records, columns=columns)

In [44]:
# voyages of every vessel stacked into one dataframe (despite the name)
voyages_dict = pd.concat([get_voyages(frame) for frame in vessel_dfs.values()])

In [198]:
# vessel 172 on a 4-hour grid: numeric columns are averaged, the position in
# radians is the median of the raw rows, and the nearest port is looked up for
# every resampled row. only numeric columns are averaged because the
# list/tuple helper columns cannot be, which newer pandas reports as an error
resample_52 = vessel_dfs[172].select_dtypes(include='number').resample('4H').mean().ffill()
resample_52['lat_rad'] = vessel_dfs[172]['lat_rad'].resample('4H').median().ffill()
resample_52['long_rad'] = vessel_dfs[172]['long_rad'].resample('4H').median().ffill()
resample_52['pred_port_resample'] = resample_52.apply(nearest_port, axis = 1)

In [46]:
# df3 is a second name for the voyages_dict frame (not a copy), so the new
# 'len_voyage' column - end date minus begin date - shows up on both
df3 = voyages_dict
df3['len_voyage'] = df3['end_date'] - df3['begin_date']

In [None]:
# simple mapping tool to inspect routes
# NOTE(review): `f` is not defined in any cell visible here - presumably a
# frame with 'lat', 'long' and 'datetime' columns; confirm before running

fig = px.scatter_geo(f,lat='lat',lon='long', hover_name="datetime")

fig.update_layout(title = 'Vessel Locations', title_x=0.5)
fig.show()

In [None]:
# empty frames, with the expected columns, for the two tables to submit

voyage_columns = ['vessel', 'begin_date', 'end_date', 'begin_port_id', 'end_port_id']
predict_columns = ['vessel', 'begin_port_id', 'end_port_id', 'voyage']

voyages = pd.DataFrame(columns=voyage_columns)
predict = pd.DataFrame(columns=predict_columns)