In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import xgboost as xgb
from datetime import datetime

# Read the pipe-delimited AIS positions and port-call schedules.
vessel_data, schedule_data = (
    pd.read_csv(csv_path, sep='|')
    for csv_path in ('../ais_train.csv', '../schedules_to_may_2024.csv')
)

# Preview the first rows of each frame as a quick sanity check on the parse.
display(vessel_data.head(), schedule_data.head())

Unnamed: 0,time,cog,sog,rot,heading,navstat,etaRaw,latitude,longitude,vesselId,portId
0,2024-01-01 00:00:25,284.0,0.7,0,88,0,01-09 23:00,-34.7437,-57.8513,61e9f3a8b937134a3c4bfdf7,61d371c43aeaecc07011a37f
1,2024-01-01 00:00:36,109.6,0.0,-6,347,1,12-29 20:00,8.8944,-79.47939,61e9f3d4b937134a3c4bff1f,634c4de270937fc01c3a7689
2,2024-01-01 00:01:45,111.0,11.0,0,112,0,01-02 09:00,39.19065,-76.47567,61e9f436b937134a3c4c0131,61d3847bb7b7526e1adf3d19
3,2024-01-01 00:03:11,96.4,0.0,0,142,1,12-31 20:00,-34.41189,151.02067,61e9f3b4b937134a3c4bfe77,61d36f770a1807568ff9a126
4,2024-01-01 00:03:51,214.0,19.7,0,215,0,01-25 12:00,35.88379,-5.91636,61e9f41bb937134a3c4c0087,634c4de270937fc01c3a74f3


Unnamed: 0,vesselId,shippingLineId,shippingLineName,arrivalDate,sailingDate,portName,portId,portLatitude,portLongitude
0,61e9f3b1b937134a3c4bfe53,61a8e672f9cba188601e84ac,Wallenius Wilhelmsen Ocean,2023-10-02 00:00:00+00:00,2023-10-03 00:00:00+00:00,Port of Brunswick,61d38499b7b7526e1adf3d54,31.140556,-81.496667
1,61e9f3b1b937134a3c4bfe53,61a8e672f9cba188601e84ac,Wallenius Wilhelmsen Ocean,2023-10-27 00:00:00+00:00,2023-10-27 00:00:00+00:00,Port of Southampton,61d3832bb7b7526e1adf3b63,50.9025,-1.428889
2,61e9f3b1b937134a3c4bfe53,61a8e672f9cba188601e84ac,Wallenius Wilhelmsen Ocean,2023-10-19 00:00:00+00:00,2023-10-20 00:00:00+00:00,Port of Bremerhaven,61d375e793c6feb83e5eb3e2,53.563611,8.554722
3,61e9f3b1b937134a3c4bfe53,61a8e672f9cba188601e84ac,Wallenius Wilhelmsen Ocean,2023-10-09 00:00:00+00:00,2023-10-10 00:00:00+00:00,Port of New York,61d38481b7b7526e1adf3d23,40.688333,-74.028611
4,61e9f3b1b937134a3c4bfe53,61a8e672f9cba188601e84ac,Wallenius Wilhelmsen Ocean,2023-09-25 00:00:00+00:00,2023-09-26 00:00:00+00:00,Manzanillo International Terminal,61d37d0199db2ccf7339eee1,9.37237,-79.87979


In [2]:
# Work on a small random subset to keep iteration fast.
# SAMPLE_FRAC is the fraction of rows kept from each frame (0.01 == 1%).
SAMPLE_FRAC = 0.01

# Bind the samples to new names instead of overwriting the loaded frames:
# re-running this cell then always samples from the full data rather than
# shrinking an already-sampled frame a second time.
vessel_sample = vessel_data.sample(frac=SAMPLE_FRAC, random_state=42)
schedule_sample = schedule_data.sample(frac=SAMPLE_FRAC, random_state=42)

# Inner-join the sampled AIS rows with the sampled schedule rows per vessel.
# Columns present in both frames (e.g. portId) get pandas' default _x/_y suffixes.
# NOTE(review): vesselId is presumably not unique in either frame, so this is a
# many-to-many join pairing every AIS row with every schedule row of the same
# vessel -- confirm this is the intended training table.
data = pd.merge(vessel_sample, schedule_sample, on='vesselId')

In [3]:
# Fill each missing value with the last non-missing value above it in the same
# column. Leading gaps (no earlier value in the column) stay missing.
# NOTE(review): the fill follows the merged row order, so values can carry over
# between rows of different vessels -- confirm that is acceptable.
data = data.ffill()

In [4]:
# One-hot encode the categorical columns into a dense indicator matrix
# (one column per distinct category value).
categorical_features = ['portName']  # alternative noted by the author: ['vesselType']
encoder = OneHotEncoder()
categorical_sparse = encoder.fit_transform(data[categorical_features])
encoded_features = categorical_sparse.toarray()

In [5]:
# --- Target: arrivalDate as seconds since the Unix epoch -------------------
# The conversion overwrites data['arrivalDate'], so it is guarded: once the
# column is numeric, re-running this cell must not parse the float seconds
# again (pd.to_datetime would read them as nanoseconds and corrupt the target).
if not pd.api.types.is_numeric_dtype(data['arrivalDate']):
    arrival_utc = pd.to_datetime(data['arrivalDate'], utc=True)
    unix_epoch = pd.Timestamp('1970-01-01', tz='UTC')
    # Vectorized equivalent of calling Timestamp.timestamp() on every row.
    data['arrivalDate'] = (arrival_utc - unix_epoch).dt.total_seconds()

# Fail fast on missing targets instead of passing NaN labels to the model.
if data['arrivalDate'].isna().any():
    raise ValueError("arrivalDate contains missing values; cannot build the regression target")

# --- Features ---------------------------------------------------------------
# Standardize the numerical port coordinates (zero mean, unit variance).
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so hold-out statistics leak into the scaling -- confirm this is acceptable.
scaler = StandardScaler()
numerical_features = ['portLatitude', 'portLongitude']
scaled_features = scaler.fit_transform(data[numerical_features])

# Design matrix: one-hot encoded columns followed by the scaled coordinates.
X = np.hstack((encoded_features, scaled_features))
y = data['arrivalDate']

# --- Train / hold-out split -------------------------------------------------
# NOTE(review): only 1% of the rows are held out, so the evaluation metrics
# are computed on very few samples -- consider a larger test_size.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.01, random_state=42)

# --- Model -------------------------------------------------------------------
# Gradient-boosted regression trees trained with squared-error loss.
model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, learning_rate=0.1)
model.fit(X_train, y_train)

# Predict arrival timestamps (epoch seconds) for the held-out rows.
predictions = model.predict(X_test)

In [6]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Score the hold-out predictions against the true targets. The target is in
# epoch seconds, so MAE is in seconds and MSE in seconds squared.
mae, mse, r2 = (
    score(y_test, predictions)
    for score in (mean_absolute_error, mean_squared_error, r2_score)
)

# Report one metric per line.
print(
    f"Mean Absolute Error (MAE): {mae}",
    f"Mean Squared Error (MSE): {mse}",
    f"R-squared (R2): {r2}",
    sep="\n",
)

Mean Absolute Error (MAE): 7984602.162162162
Mean Squared Error (MSE): 125747285583077.73
R-squared (R2): 0.08865421081516311
