<a href="https://colab.research.google.com/github/yaobviously/sym-cargo/blob/main/sym_cargo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [27]:
!pip install plotly-express --quiet
!pip install vincenty --quiet

In [28]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from xgboost import XGBClassifier
from sklearn.preprocessing import OneHotEncoder
from vincenty import vincenty
from sklearn.cluster import DBSCAN
from sklearn.neighbors import NearestNeighbors
import plotly_express as px

In [3]:
port_file = '/content/drive/MyDrive/Ports/ports.csv'
tracking_file = '/content/drive/MyDrive/Ports/tracking.csv'

In [4]:
def wrangle():
  
  df1 = pd.read_csv(port_file)

  # converting lat and long to radians to compute haversine distance
  df1['lat_rad'] = np.radians(df1['lat'])
  df1['long_rad'] = np.radians(df1['long'])
  
  # rounding lat and long in port df
  df1['lat'] = df1['lat'].round(1)
  df1['long'] = df1['long'].round(1)

  df2 = pd.read_csv(tracking_file, parse_dates=['datetime'])
  df2 = df2.drop_duplicates()
  df2 = df2.sort_values(['vessel', 'datetime'])
  df2['vessel_1back'] = df2['vessel'].shift()
  
  # converting lat and long to radians to compute haversine distance  
  df2['lat_rad'] = np.radians(df2['lat'])
  df2['long_rad'] = np.radians(df2['long'])

  # adding lat/long column and lat/long 1 back to later compute delta
  df2['lat_long'] = [[x,y] for x, y in zip(df2['lat'], df2['long'])]
  df2['lat_long_1back'] = df2.groupby('vessel')['lat_long'].transform(lambda x: x.shift())   
  
  # time deltas to compute impossible distances travelled
  df2['time_delta'] = df2.groupby('vessel')['datetime'].transform(lambda x: x - x.shift(1))
  df2['hour_delta'] = df2['time_delta'].apply(lambda x: x.seconds / 3600)

  # filtering using query to eliminate unneeded/impossible values
  df2 = df2.query('hour_delta > 0.25 & speed <30 & heading <=360')
  df2 = df2.reset_index(drop=True)

  return df1, df2

In [5]:
df1, df2 = wrangle()

In [23]:
df2['vin_diff'] = df2.apply(vincent_distance, axis=1)
df2['vin_per_hour'] = df2['vin_diff'] / df2['hour_delta']
df2 = df2.query('vin_per_hour <= 50')

49.90867685931024

In [213]:
df2[df2['hour_diff'] <0.25].shape

(27741, 16)

In [11]:
# creating a dictionary of ports with their lat and longs - will be used to
# repeatedly to explore data and assign values

ports = {port:(lat, long) for port, lat, long in zip(df1['port'], df1['lat'], df1['long'])}

# a dictionary to retrieve the port id from the index
idx_ports = {idx:port for idx, port in zip(df1.index, df1.port)}

In [12]:
# training a nearest neighbor model to  find the closest port when the 
# conditions indicating an extended stop have occurred. the metric is haversine
# in order to compute the 'great circle' distance. so i don't forget, the model
# returns the *index* of the port, not the port's identifying label

ports_train = df1[['lat_rad', 'long_rad']]

neigh_ports = NearestNeighbors(n_neighbors=3, algorithm='ball_tree', metric='haversine')
neigh_ports.fit(ports_train)

dist, n = neigh_ports.kneighbors(np.array([0.677565, 0.469731]).reshape(1,-1))

print(dist[0] * 6370)
print([idx_ports[n] for n in n[0]])

[  3.30931026  46.44757565 255.13837716]
[82, 113, 44]


In [75]:
def nearest_port(df, radius=0.0085):
  """
  returns the port identifier of the nearest port using the nearest neighbors
  model 
  """

  data = np.array([df['lat_rad'], df['long_rad']]).reshape(1, -1)
  dist, pred = neigh_ports.radius_neighbors(data, radius=radius, sort_results = True) 

  if len(dist[0]) == 0:
    return -1
  
  else:
    return idx_ports[pred[0][0]]

In [76]:
def nearest_distance(df, radius=0.0085):
  """
  returns the distance of the nearest port in the dataset
  """
  
  data = np.array([df['lat_rad'], df['long_rad']]).reshape(1, -1)
  dist, pred = neigh_ports.radius_neighbors(data, radius=radius, sort_results = True) 

  if len(dist[0]) == 0:
    return -1
  
  else:
    return dist[0][0]


In [6]:
def vincent_distance(row):
  """ 
  returns the vincenty distance for contiguous rows - will be used to identify
  impossible distances travelled, and so on. could be used to create distance
  matrix, but this may not be worthwhile 
  """
  if row['vessel'] != row['vessel_1back']:
    return -99

  loc1 = row['lat_long']
  loc2 = row['lat_long_1back']

  try:
    distance = vincenty(loc1, loc2)
    return distance
  except:
    return -99

In [16]:
def vincenty_port(row):
  """
  a function that computes the vincenty distance between the assigned port
  and the latitude and longitude of the location data. 
  """
  port = row['port_coords']
  loc = row['lat_long']
  return vincenty(port, loc)

In [None]:
# calculating the minimum and maximum vincenty distances from ports and 
# storing the values in a dictionary. it may prove useful when distinguishing
# between 'in port' and 'waiting around to enter the port'

# port_dist = {}

# for port in port_ds.keys():
#   max_vincenty = port_ds[port]['vdist_port'].max()
#   min_vincenty = port_ds[port]['vdist_port'].min()
#   port_dist[port] = [max_vincenty, min_vincenty]

In [17]:
def assign_ports(df):
  """
  a function that prepares a dataframe for predictions

  Parameters:
  ----------
        df: pandas dataframe
  
  Returns:
  -------
        df: processed pandas dataframe

  """

  # resampling the data to standardize time intervals
  df = df.resample('4H').mean().ffill()
  df['lat_rad'] = df['lat_rad'].resample('4H').median().ffill()
  df['long_rad'] = df['long_rad'].resample('4H').median().ffill()
  
  # applying a mask to limit the rows the predict function is applied to
  mask = df['speed'] <= 3.31
  df_temp = df[mask]

  df['pred_port'] = 0
  df.loc[mask, 'pred_port'] = df_temp.apply(nearest_port, axis=1)  
  df['port_coords'] = [list(ports[k]) if k in ports else -99 for k in df['pred_port']]
  df['lat_long'] = [[x,y] for x, y in zip(df['lat'], df['long'])]

  # applying a mask to limit the rows vincenty func is applied to
  vin_mask = (df['pred_port'] > 0)
  vin_temp = df[vin_mask]
  
  df['port_dist'] = 0
  df.loc[vin_mask, 'port_dist'] = vin_temp.apply(vincenty_port, axis=1)

  # eliminating entries in sequences where max distance greatly differs from min
  # indicates 'waiting to enter port'
  df['seq'] = df['pred_port'].diff().ne(0).cumsum()
  df['seq_count'] = df.groupby('seq')['seq'].transform('count')
  df['dist_diff'] = df.groupby('seq')['port_dist'].transform(lambda x: x - x.min())
  df['pred_port'] = np.where(df['dist_diff'] >= 15, 0, df['pred_port'])         # 15km away from closest point this trip to the port 
  df['seq'] = df['pred_port'].diff().ne(0).cumsum()

  # identifying draft changes indicating 'ports' that are not included in the
  # ports .csv. without tagging voyages will be mislabelled
  df['draft_delta'] = df['draft'].transform(lambda x: abs(x.diff(-1)).ge(0.5))
  df['sum_draft_delta'] = df.groupby('seq')['draft_delta'].transform(sum)
  
  mask = ((df['pred_port'] == -1) & (df['sum_draft_delta'] >= 1) & \
          (df['seq_count'] >=4))                                                # indicates 16 hrs at the location - a minimum threshold for unknown 'ports'
  
  df['pred_port'] = np.where(mask, -75, df['pred_port'])

  # recasting vessel and pred_port columns as integers
  df[['vessel', 'pred_port']] = df[['vessel', 'pred_port']].astype(int)

  return df

In [18]:
# creating a dictionary of vessel dataframes with a datetime index

vessel_dfs = {}

for vessel in df2.vessel.unique():
  df_ = df2[df2['vessel'] == vessel]
  vessel_dfs[vessel] = df_.set_index('datetime')

# assigning ports to each dataframe
processed_dfs = {f'vessel_{key}':assign_ports(vessel_dfs[key]) for key in vessel_dfs.keys()}

In [19]:
def get_voyages(df):
  """
  a function to convert the port sequences in each dataframe into voyages
  with the proper formatting

  Parameters:
  ----------
      df: pandas DataFrame
  
  Returns:
  -------
      df: processed pandas DataFrame
      
  """
  # filtering out columns without an assigned port
  nz = df[(df['pred_port'] > 0) | (df['pred_port'] == -75)].reset_index()

  vessel = nz['vessel'][0]
  dt = nz['datetime']
  pred = nz['pred_port']

  records = []

  for i in range(len(dt)-1):
    if pred[i] != pred[i+1]:
      start_port = pred[i]
      end_port = pred[i+1]
      begin_date = dt[i]
      end_date = dt[i+1]
      records.append([vessel, begin_date, end_date, start_port, end_port])

  df = pd.DataFrame.from_records(records, columns = ['vessel', 'begin_date', 'end_date', 'begin_port_id', 'end_port_id'])
  
  return df

In [20]:
voyages_df = pd.concat([get_voyages(processed_dfs[key]) for key in processed_dfs.keys()])
voyages_df['len_voyage'] = voyages_df['end_date'] - voyages_df['begin_date']

In [21]:
def prepare_data(df, n_input = 3):
  """
  preparing the sequences for models. it was setup to be flexible but i chipped
  away at it until it returned data for an xgboost 
  """

  df = get_voyages(df)
  vessel = df['vessel'].iloc[0]
  ports_ = np.array(df['begin_port_id'].append(pd.Series(df['end_port_id'].iloc[-1])))  

  X = []
  Y = []
  start = 0

  for i in range(len(ports_)):
    last_input = start + n_input
    last_output = last_input + 3
    if last_output <= len(ports_):
      x = ports_[start:last_input]
      y = ports_[last_input: last_output]
      X.append(x)
      Y.append(y)
      start += 1
  try:
    df = pd.concat([pd.DataFrame(X),
                  pd.DataFrame(Y, columns=['port_1ahead', 'port_2ahead', 'port_3ahead'])], axis=1)
    
  except:
    df = pd.DataFrame()
    
  # X = []

  # for x in X:
  #   for n in x:
  #     if n == -75:
  #       port_coords = [33, 140]
  #     else:
  #       port_coords = list(ports[n])
  #     port = [n]
  #     port.extend(port_coords)
  #     new_X.append(port)
  
  df['vessel'] = len(df) * [vessel]
  
  return df.astype(int)

In [22]:
df = prepare_data(processed_dfs['vessel_133'], 6)

In [23]:
dfs = [prepare_data(processed_dfs[key], 6) for key in processed_dfs.keys()]
model_df = pd.concat(dfs)

In [33]:
alldf = pd.concat([processed_dfs[key] for key in processed_dfs.keys()])

In [None]:
# unport = alldf[(alldf['sum_draft_delta'] >= 2) & (alldf['pred_port'] < 0)]
# unport.groupby(['vessel', 'seq'])['draft'].agg(first = 'first',
#                                    last = 'last')

unport['pred_port_alt'] = unport.apply(nearest_port, radius=0.015, axis=1)
unport['pred_port_dist_alt'] = unport.apply(nearest_distance, radius=0.015, axis=1)

In [None]:
creating a dictionary of dataframes for each port

ports_dfs = {}

for port in alldf.pred_port.unique():
  ports_dfs[port] = alldf[alldf['pred_port'] == port]

In [None]:
model = XGBClassifier(learning_rate=0.02)
 
model_df['port_2back_lat'] = [ports[key][0] if key in ports else None for key in model_df['port_2back']]
model_df['port_2back_long'] = [ports[key][1] if key in ports else None for key in model_df['port_2back']]
model_df['port_1back_lat'] = [ports[key][0] if key in ports else None for key in model_df['port_2back']]
model_df['port_1back_long'] = [ports[key][1] if key in ports else None for key in model_df['port_2back']]
model_df['last_port_lat'] = [ports[key][0] if key in ports else None for key in model_df['port_2back']]
model_df['last_port_long'] = [ports[key][1] if key in ports else None for key in model_df['port_2back']]

In [None]:
from sklearn.model_selection import train_test_split

target = 'port_1ahead'
features = model_df.drop(columns=['port_1ahead', 'port_2ahead', 'port_3ahead', 'pred', 'correct'])
model_df = model_df.groupby('vessel').filter(lambda x: len(x) > 8)

X = features
y = model_df[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15)

eval_set = [(X_test, y_test)]
model.fit(X_train, y_train, eval_set=eval_set);

In [None]:
smallsamples = list(set(df1.port).difference(y.unique()))

# all = pd.concat(processed_dfs)

# uniques = all[all['vessel'].isin(smallsamples)].groupby('vessel')['pred_port'].unique().to_list()

# flattened = [item for sublist in uniques for item in sublist]

# dict_ = {}

# for f in flattened:
#   if f not in dict_:
#     dict_[f] = 1
#   else:
#     dict_[f] += 1

dict_

In [124]:
all_vessels = pd.concat([processed_dfs[key] for key in processed_dfs.keys()])

In [150]:
# all_vessels['vessel_1back'] = all_vessels['vessel'].shift()
# all_vessels['lat_long_1back'] = all_vessels['lat_long'].shift()
# all_vessels['dist_diff'] = all_vessels.apply(vincent_distance, axis=1)

dist_check = all_vessels.groupby(['vessel', 'seq'], as_index=False)[['dist_diff', 'seq_count', 'pred_port']].apply(lambda x: x.max())

In [None]:
dist_check['diff_seq'] = dist_check['dist_diff'] / dist_check['seq_count'] 
print(dist_check.query('diff_seq > 200').to_string())

In [None]:
print(all_vessels[all_vessels['vessel'] == 149].to_string())

In [None]:
v1 = all_vessels[all_vessels['vessel'] == 10]
v2 = all_vessels[all_vessels['pred_port'] == 54]

In [None]:
mask = ((all_vessels['speed'] > 1) & (all_vessels['pred_port'] >0))

fast = all_vessels[mask]
fast.sort_values(by='speed', ascending=False).head(25)

In [157]:
fig = px.scatter_geo(all_vessels[all_vessels.vessel==149], lat='lat', lon='long')

fig.show()

In [None]:
# the dataframes to submit

voyages = pd.DataFrame(columns= ['vessel', 'begin_date', 'end_date', 'begin_port_id', 'end_port_id'])
predict = pd.DataFrame(columns= ['vessel', 'begin_port_id', 'end_port_id', 'voyage'])