<a href="https://colab.research.google.com/github/yaobviously/sym-cargo/blob/main/sym_cargo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install plotly-express --quiet
!pip install vincenty --quiet

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from vincenty import vincenty
from sklearn.neighbors import NearestNeighbors
import plotly_express as px

In [3]:
# locations of the two input CSVs read by wrangle(); both live in one folder
DATA_DIR = '/content/drive/MyDrive/Ports'

port_file = f'{DATA_DIR}/ports.csv'
tracking_file = f'{DATA_DIR}/tracking.csv'

In [4]:
def wrangle(port_path=None, tracking_path=None):
  """
  Load the port and vessel-tracking CSVs and add the derived columns used by
  the rest of the notebook.

  Parameters
  ----------
  port_path : str or path-like, optional
      CSV of ports; the columns `lat` and `long` are read here. Defaults to
      the notebook-level `port_file`.
  tracking_path : str or path-like, optional
      CSV of vessel positions; the columns `vessel`, `datetime`, `lat`,
      `long`, `speed` and `heading` are read here. Defaults to the
      notebook-level `tracking_file`.

  Returns
  -------
  (df1, df2)
      df1: ports, with `lat_rad`/`long_rad` added and `lat`/`long` rounded to
      one decimal.
      df2: de-duplicated tracking rows sorted by vessel and datetime, with a
      fresh 0..n-1 index and the derived columns `lat_rad`, `long_rad`,
      `lat_long`, `lat_long_1back`, `month`, `day`, `time_delta`, `hours`,
      `{speed,heading}_diff_{2back,1back,1ahead,2ahead}` and `vessel_1back`.
  """
  # fall back to the notebook-level paths so a bare `wrangle()` keeps working
  if port_path is None:
    port_path = port_file
  if tracking_path is None:
    tracking_path = tracking_file

  df1 = pd.read_csv(port_path)

  # radians are taken from the unrounded coordinates (needed for haversine)
  df1['lat_rad'] = np.radians(df1['lat'])
  df1['long_rad'] = np.radians(df1['long'])

  # rounding lat and long in port df
  # NOTE(review): after this, df1['lat']/df1['long'] only carry one decimal;
  # confirm that is precise enough for every later use of these columns
  df1['lat'] = df1['lat'].round(1)
  df1['long'] = df1['long'].round(1)

  df2 = pd.read_csv(tracking_path, parse_dates=['datetime'])
  df2 = df2.drop_duplicates().sort_values(['vessel', 'datetime'])

  # converting lat and long to radians to compute haversine distance
  df2['lat_rad'] = np.radians(df2['lat'])
  df2['long_rad'] = np.radians(df2['long'])

  # current position and the same vessel's previous position, kept as
  # [lat, long] pairs so the movement between rows can be computed later
  df2['lat_long'] = [[x, y] for x, y in zip(df2['lat'], df2['long'])]
  df2['lat_long_1back'] = df2.groupby('vessel')['lat_long'].shift()

  # calendar parts and the time elapsed since the vessel's previous row
  df2['month'] = df2['datetime'].dt.month
  df2['day'] = df2['datetime'].dt.day
  df2['time_delta'] = df2.groupby('vessel')['datetime'].diff()
  # total_seconds() keeps sub-second precision and avoids a Python-level loop
  df2['hours'] = df2['time_delta'].dt.total_seconds() / 3600

  # per-vessel differences against the rows 2/1 behind and 1/2 ahead
  # NOTE(review): heading differences are plain subtractions; if heading is a
  # compass bearing, values either side of north will not wrap - confirm
  for column in ('speed', 'heading'):
    for suffix, periods in (('2back', 2), ('1back', 1), ('1ahead', -1), ('2ahead', -2)):
      df2[f'{column}_diff_{suffix}'] = df2.groupby('vessel')[column].diff(periods)

  # vessel of the previous row (not grouped): differs from `vessel` on the
  # first row of each vessel, which later cells use as a boundary marker
  df2['vessel_1back'] = df2['vessel'].shift(1)
  df2 = df2.reset_index(drop=True)

  return df1, df2

In [5]:
df1, df2 = wrangle()

In [6]:
# lookup tables built from the port dataframe; they are used repeatedly
# below to explore the data and assign values

# port id -> (lat, long), using the lat/long values as stored in df1
ports = dict(zip(df1['port'], zip(df1['lat'], df1['long'])))

# df1 row index -> port id
idx_ports = dict(zip(df1.index, df1['port']))

In [7]:
# training a nearest neighbor model to find the closest port when the
# conditions indicating an extended stop have occurred. the metric is haversine
# in order to compute the 'great circle' distance. so i don't forget, the model
# returns the *index* of the port, not the port's identifying label

ports_train = df1[['lat_rad', 'long_rad']]

neigh_ports = NearestNeighbors(n_neighbors=3, algorithm='ball_tree', metric='haversine')
# fit on a plain array: every later query passes a bare numpy array, and
# fitting on a DataFrame makes scikit-learn warn about missing feature names
neigh_ports.fit(ports_train.to_numpy())

# scale factor applied to the model's distances before printing
# (presumably the Earth's radius in km, turning radians into km)
EARTH_RADIUS_KM = 6370

# sanity check on one hand-picked (lat_rad, long_rad) point
dist, n = neigh_ports.kneighbors(np.array([0.677565, 0.469731]).reshape(1, -1))

print(dist[0] * EARTH_RADIUS_KM)
print([idx_ports[i] for i in n[0]])

[  3.30931026  46.44757565 255.13837716]
[82, 113, 44]


In [8]:
# using set differences i noticed two ports (42, 113) were never being assigned,
# so i used vincenty distance to figure out what was going on. in both cases the
# lat/long where the boats are docked is much closer to the port assigned by
# the model, so i cannot justify any change

coord_first_stop = (38.743843, 26.895678)
coord_second_stop = (38.821700, 26.913800)

viable_ports = [82, 113]

# vincenty distance from each candidate port to the two docking locations
vin_one = {}
vin_two = {}
for port_id in viable_ports:
  label = f'port{port_id}'
  vin_one[label] = [vincenty(ports[port_id], coord_first_stop)]
  vin_two[label] = [vincenty(ports[port_id], coord_second_stop)]

In [9]:
# there are five ports in Japan that are very close to each other. note ports
# 42 and 51 are the same port - they have precisely the same distances from two
# different docking locations.

jports = [42, 109, 65, 51, 30]

jc1 = (35.470100, 139.738700)    # has a predicted distance of 0.000022 from 51
jc2 = (35.461105, 139.718472)    # has a predicted distance of 0.000347 from 51

# vincenty distance from every Japanese port to each docking location
jc1_dist = {}
jc2_dist = {}
for port_id in jports:
  label = f'port_{port_id}'
  jc1_dist[label] = [vincenty(ports[port_id], jc1)]
  jc2_dist[label] = [vincenty(ports[port_id], jc2)]

In [10]:
# functions to quickly inspect dataframe to figure out what's up

def get_dates(vessel=111, start='2019-01-01', end='2019-12-25'):
  """
  return the df2 rows for one vessel whose datetime falls between start and
  end (both bounds inclusive)
  """
  in_window = (df2['datetime'] >= start) & (df2['datetime'] <= end)
  is_vessel = df2['vessel'] == vessel

  return df2[in_window & is_vessel]

def where_zero(vessel = 75):
  """
  return the df2 rows for one vessel where its speed is at most 0.5
  """
  is_vessel = df2['vessel'] == vessel
  is_slow = df2['speed'] <= 0.5

  return df2[is_vessel & is_slow]

In [11]:
def nearest_port(df, radius=0.006):
  """
  return the identifier of the nearest port in the dataset, or -1 when no
  port lies within `radius` of the position.

  df     : one tracking row (or mapping) providing 'lat_rad' and 'long_rad'
  radius : search radius handed to the fitted neighbor model, in the units of
           that model's metric; the default is the value that used to be
           hard-coded here
  """

  data = np.array([df['lat_rad'], df['long_rad']]).reshape(1, -1)
  # sort_results=True puts the closest port first
  dist, pred = neigh_ports.radius_neighbors(data, radius=radius, sort_results=True)

  if len(pred[0]) == 0:
    return -1

  # the model answers with row indices, so translate back to the port id
  return idx_ports[pred[0][0]]

In [12]:
def nearest_distance(df, radius=0.0075):
  """
  return the distance to the nearest port in the dataset, or -1 when no port
  lies within `radius` of the position. the distance is whatever the fitted
  neighbor model reports (it is not converted to km here).

  df     : one tracking row (or mapping) providing 'lat_rad' and 'long_rad'
  radius : search radius handed to the fitted neighbor model; the default is
           the value that used to be hard-coded here
           NOTE(review): this default is wider than nearest_port's (0.006),
           so a row can get a distance but no port - confirm that is intended
  """

  data = np.array([df['lat_rad'], df['long_rad']]).reshape(1, -1)
  # sort_results=True puts the smallest distance first
  dist, pred = neigh_ports.radius_neighbors(data, radius=radius, sort_results=True)

  if len(dist[0]) == 0:
    return -1

  return dist[0][0]


In [13]:
def vincent_distance(row):
  """
  return the vincenty distance between a row's position ('lat_long') and the
  previous position ('lat_long_1back').

  returns -99 when the previous row belongs to a different vessel, or when the
  distance cannot be computed (e.g. the previous position is missing).
  """
  if row['vessel'] != row['vessel_1back']:
    return -99

  loc1 = row['lat_long']
  loc2 = row['lat_long_1back']

  try:
    return vincenty(loc1, loc2)
  except Exception:
    # still best-effort, but unlike a bare `except:` this no longer swallows
    # KeyboardInterrupt, so a long-running apply can be interrupted
    return -99

In [14]:
# using my simple neighbor model with haversine distance to assign the closest
# port along with its distance to the dataframe. i used a mask to speed up
# the process: only rows that are (nearly) stationary with a steady heading
# are sent to the model

# abs() has to wrap the difference itself, not the comparison; the previous
# `abs(x <= 2)` let any large negative heading change through the filter
steady_before = (df2['heading_diff_1back'].abs() <= 2) & (df2['heading_diff_2back'].abs() <= 2)
steady_after = (df2['heading_diff_1ahead'].abs() <= 2) & (df2['heading_diff_2ahead'].abs() <= 2)

mask = (df2['speed'] <= 0.2) & (steady_before | steady_after)

df_valid = df2[mask]

# -1 means "no port assigned"; pred_dist starts as float because the model's
# distances are floats
df2['pred_port'] = -1
df2['pred_dist'] = -1.0
df2.loc[mask, 'pred_port'] = df_valid.apply(nearest_port, axis=1)
df2.loc[mask, 'pred_dist'] = df_valid.apply(nearest_distance, axis=1)

In [15]:
# calculating the vincenty distances between each row in the dataframe to
# detect movement. this seems like it might allow me to more easily detect when
# a ship is in japanese (and other?) ports where they appear to unload/load
# without zero movement

# only slow rows are worth the per-row distance computation
vin_mask = df2['speed'] <= 1

df_vin = df2[vin_mask]

# -99 is the "not computed" sentinel; start the column as float so filling in
# the float distances below does not force a dtype change
df2['vin_delta'] = -99.0
df2.loc[vin_mask, 'vin_delta'] = df_vin.apply(vincent_distance, axis=1)

In [16]:
# coordinates of each row's predicted port; -99 where the port id is unknown
df2['pred_port_coords'] = [ports.get(port_id, -99) for port_id in df2['pred_port']]

In [17]:
def vincenty_port(row):
  """
  return the vincenty distance between a row's position ('lat_long') and the
  coordinates of its predicted port ('pred_port_coords').

  returns -99 when the previous row belongs to a different vessel, or when the
  distance cannot be computed (e.g. the row has no predicted port, so its
  'pred_port_coords' is the -99 placeholder).
  """

  if row['vessel'] != row['vessel_1back']:
    return -99

  try:
    return vincenty(row['pred_port_coords'], row['lat_long'])
  except Exception:
    # still best-effort, but unlike a bare `except:` this no longer swallows
    # KeyboardInterrupt, so a long-running apply can be interrupted
    return -99

In [18]:
# vincenty distance from every row's position to its predicted port; rows
# where vincenty_port cannot compute a distance get its -99 sentinel
df2['vindist_pred_port'] = df2.apply(vincenty_port, axis=1)

In [None]:
# the next few cells are a deep dive on the ports that aren't visited

port_60 = [31.6940667, 121.7672667]
port_88 = [21.48393, 109.0811]
port_109 = [35.49415, 139.7681333]
port_113 = [38.45433, 27.1685]


def _distances_from(port_coords):
  """
  map each df2 row index to the vincenty distance between that row's position
  and port_coords. rows for which vincenty returns None are left out, since a
  None cannot be sorted against the other distances.
  """
  distances = {}
  for row_idx, position in zip(df2.index, df2['lat_long']):
    distance = vincenty(port_coords, position)
    if distance is not None:
      distances[row_idx] = distance
  return distances


def _closest_rows(distances, top=10):
  """return the `top` (row index, distance) pairs with the smallest distance"""
  return sorted(distances.items(), key=lambda item: item[1])[:top]


distances_from_60 = _distances_from(port_60)
distances_from_88 = _distances_from(port_88)
distances_from_109 = _distances_from(port_109)
distances_from_113 = _distances_from(port_113)

# port 88 used to be skipped because of the None values; with those filtered
# out it can be ranked like the others
sorted_60 = _closest_rows(distances_from_60)
sorted_88 = _closest_rows(distances_from_88)
sorted_109 = _closest_rows(distances_from_109)
sorted_113 = _closest_rows(distances_from_113)

In [None]:
# looking more carefully at the nearest neighbor ports for Japan using vincenty
# distance. 42 doesn't require inclusion.

japan = df2[df2['pred_port'].isin([109, 51, 65, 30])]
# copy so the distance columns are added to an independent frame rather than
# to a slice of df2 (avoids pandas' chained-assignment warning)
jp_port = japan[japan['pred_port'] == 51].copy()

port_51 = 35.4700833, 139.7402667
port_65 = 35.4736489, 139.9677229
port_30 = 35.3421341, 139.8226354

# distance from every row assigned to port 51 to each neighbouring port;
# port_109 comes from the previous cell
for column, port_coords in (('dist_109', port_109),
                            ('dist_51', port_51),
                            ('dist_65', port_65),
                            ('dist_30', port_30)):
  jp_port[column] = [vincenty(port_coords, x) for x in jp_port['lat_long']]

In [19]:
# one dataframe per vessel, keyed by vessel id and indexed by timestamp

vessel_dfs = {
    vessel_id: df2[df2['vessel'] == vessel_id].set_index('datetime')
    for vessel_id in df2['vessel'].unique()
}

In [20]:
# one dataframe per predicted port, keyed by port id and indexed by timestamp

port_ds = {
    port_id: df2[df2['pred_port'] == port_id].set_index('datetime')
    for port_id in df2['pred_port'].unique()
}

# -1 is the "no port assigned" placeholder, not a real port
del port_ds[-1]

# attach the port's own coordinates to every row ...
for port_id, frame in port_ds.items():
  frame[f'{port_id}_coord'] = [ports[port_id]] * len(frame)

# ... and the vincenty distance between those coordinates and the position
for port_id, frame in port_ds.items():
  frame['vdist_port'] = [
      vincenty(port_xy, ship_xy)
      for port_xy, ship_xy in zip(frame[f'{port_id}_coord'], frame['lat_long'])
  ]


In [None]:
# number of rows assigned to each port
for p, port_frame in port_ds.items():
  print(f'{p}:', len(port_frame))

In [None]:
# number of rows recorded for each vessel
for v, vessel_frame in vessel_dfs.items():
  print(f'{v}:', len(vessel_frame))

In [None]:
# a cell to check distances: how close does each vessel ever get to port 63?

port_63 = ports[63]

# one [vessel, distance-to-port-63] pair per tracking row
d_from_63 = []
for v, d in zip(df2['vessel'], df2['lat_long']):
  d_from_63.append([v, vincenty(port_63, d)])

d63 = pd.DataFrame.from_records(d_from_63, columns=['vessel', 'distance'])

# the 30 vessels with the smallest minimum distance
(d63.groupby('vessel')['distance']
    .min()
    .sort_values()
    .head(30))

In [None]:
# largest vincenty distance between a port and any row assigned to it
maxes = {
    port_id: port_frame['vdist_port'].max()
    for port_id, port_frame in port_ds.items()
}

In [None]:
# smallest distance between each port and the rows assigned to it, measured
# two ways: (vincenty distance, neighbor-model distance)

# same scale factor the neighbor-model sanity check prints with; the previous
# `* 1000` did not put pred_dist on the same scale as the vincenty value
# NOTE(review): assumes both are meant to be compared in km - confirm
EARTH_RADIUS_KM = 6370

mins = {}

for n, port_frame in port_ds.items():
  min_ = port_frame['vdist_port'].min()
  minb = port_frame['pred_dist'].min() * EARTH_RADIUS_KM
  mins[n] = (min_, minb)

In [48]:
# build one record per consecutive run of rows with the same predicted port
# for a single vessel: [vessel, first timestamp, port, last timestamp,
# vincenty distance to the port on the last row]

df66 = vessel_dfs[99]

# keep only rows with a predicted port
# NOTE(review): `> 0` would also drop a port whose id is 0 - confirm no such
# port exists
nz = df66[df66['pred_port'] > 0].reset_index()

dt = nz['datetime']
pred = nz['pred_port']

records = []
start = 0
last = len(nz) - 1

for i in range(len(nz)):
  # a run ends where the next row has a different port, and also on the very
  # last row (previously the final run was never recorded)
  if i == last or pred[i] != pred[i+1]:
    records.append([nz.vessel[i], dt[start], pred[i], dt[i], nz.vindist_pred_port[i]])
    start = i + 1

In [None]:
# display the records built in the previous cell as a table, one row per
# record: vessel, start timestamp, port, end timestamp, and the vincenty
# distance to the port (the 'vidst' column)
pd.DataFrame.from_records(records, columns = ['vessel', 'start', 'port', 'end', 'vidst'])

In [None]:
# simple mapping tool to inspect routes

# NOTE(review): `f` was never defined in this notebook, so this cell raised a
# NameError on a fresh kernel. defaulting to the inspection helper's default
# vessel and date window - change the arguments to map a different route
f = get_dates()

fig = px.scatter_geo(f, lat='lat', lon='long', hover_name="datetime")

fig.update_layout(title = 'Vessel Locations', title_x=0.5)
fig.show()

In [None]:
# empty frames, with the expected columns, for the two tables to submit

voyage_columns = ['vessel', 'begin_date', 'end_date', 'begin_port_id', 'end_port_id']
predict_columns = ['vessel', 'begin_port_id', 'end_port_id', 'voyage']

voyages = pd.DataFrame(columns=voyage_columns)
predict = pd.DataFrame(columns=predict_columns)