In [4]:
from datetime import datetime

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Load datasets
# All four inputs are pipe-delimited CSV files referenced by relative path.
# The generator is consumed left to right, so the files are read in the
# order they are listed.
vessel_data, schedule_data, ports_data, vessels_info = (
    pd.read_csv(csv_path, sep='|')
    for csv_path in (
        '../ais_train.csv',              # AIS position reports
        '../schedules_to_may_2024.csv',  # port-call schedules
        '../ports.csv',                  # port reference data
        '../vessels.csv',                # vessel specifications
    )
)

# Preview the first five rows of every frame, in load order.
display(
    vessel_data.head(),
    schedule_data.head(),
    ports_data.head(),
    vessels_info.head(),
)

Unnamed: 0,time,cog,sog,rot,heading,navstat,etaRaw,latitude,longitude,vesselId,portId
0,2024-01-01 00:00:25,284.0,0.7,0,88,0,01-09 23:00,-34.7437,-57.8513,61e9f3a8b937134a3c4bfdf7,61d371c43aeaecc07011a37f
1,2024-01-01 00:00:36,109.6,0.0,-6,347,1,12-29 20:00,8.8944,-79.47939,61e9f3d4b937134a3c4bff1f,634c4de270937fc01c3a7689
2,2024-01-01 00:01:45,111.0,11.0,0,112,0,01-02 09:00,39.19065,-76.47567,61e9f436b937134a3c4c0131,61d3847bb7b7526e1adf3d19
3,2024-01-01 00:03:11,96.4,0.0,0,142,1,12-31 20:00,-34.41189,151.02067,61e9f3b4b937134a3c4bfe77,61d36f770a1807568ff9a126
4,2024-01-01 00:03:51,214.0,19.7,0,215,0,01-25 12:00,35.88379,-5.91636,61e9f41bb937134a3c4c0087,634c4de270937fc01c3a74f3


Unnamed: 0,vesselId,shippingLineId,shippingLineName,arrivalDate,sailingDate,portName,portId,portLatitude,portLongitude
0,61e9f3b1b937134a3c4bfe53,61a8e672f9cba188601e84ac,Wallenius Wilhelmsen Ocean,2023-10-02 00:00:00+00:00,2023-10-03 00:00:00+00:00,Port of Brunswick,61d38499b7b7526e1adf3d54,31.140556,-81.496667
1,61e9f3b1b937134a3c4bfe53,61a8e672f9cba188601e84ac,Wallenius Wilhelmsen Ocean,2023-10-27 00:00:00+00:00,2023-10-27 00:00:00+00:00,Port of Southampton,61d3832bb7b7526e1adf3b63,50.9025,-1.428889
2,61e9f3b1b937134a3c4bfe53,61a8e672f9cba188601e84ac,Wallenius Wilhelmsen Ocean,2023-10-19 00:00:00+00:00,2023-10-20 00:00:00+00:00,Port of Bremerhaven,61d375e793c6feb83e5eb3e2,53.563611,8.554722
3,61e9f3b1b937134a3c4bfe53,61a8e672f9cba188601e84ac,Wallenius Wilhelmsen Ocean,2023-10-09 00:00:00+00:00,2023-10-10 00:00:00+00:00,Port of New York,61d38481b7b7526e1adf3d23,40.688333,-74.028611
4,61e9f3b1b937134a3c4bfe53,61a8e672f9cba188601e84ac,Wallenius Wilhelmsen Ocean,2023-09-25 00:00:00+00:00,2023-09-26 00:00:00+00:00,Manzanillo International Terminal,61d37d0199db2ccf7339eee1,9.37237,-79.87979


Unnamed: 0,portId,name,portLocation,longitude,latitude,UN_LOCODE,countryName,ISO
0,61d36ed80a1807568ff9a064,Port of Algiers,Algiers,3.067222,36.773611,DZALG,Algeria,DZ
1,61d36ed80a1807568ff9a065,Port of Annaba,Annaba,7.7725,36.900556,DZAAE,Algeria,DZ
2,61d36edf0a1807568ff9a070,Port of Oran,Oran,-0.639722,35.712222,DZORN,Algeria,DZ
3,61d36ee00a1807568ff9a072,Port of Skikda,Skikda,6.905833,36.8875,DZSKI,Algeria,DZ
4,61d36ee10a1807568ff9a074,Port of Pago-Pago,Pago-Pago,-170.690556,-14.274167,ASPPG,American Samoa,AS


Unnamed: 0,shippingLineId,vesselId,CEU,DWT,GT,NT,vesselType,breadth,depth,draft,enginePower,freshWater,fuel,homePort,length,maxHeight,maxSpeed,maxWidth,rampCapacity,yearBuilt
0,61a8e672f9cba188601e84ab,61e9f38eb937134a3c4bfd8b,6500,21200.0,58684,17606.0,83.0,32.0,22.2,,0.0,,,OSLO,199.0,5.0,18.6,15.2,150.0,2000
1,61ec94f1a8cafc0e93f0e92a,61e9f38eb937134a3c4bfd8d,4902,12325.0,46800,,83.0,31.0,,,14220.0,,,MONROVIA,182.0,,,,,2006
2,61e213d5d612676a0f0fb755,61e9f38eb937134a3c4bfd8f,5000,13059.0,46800,,83.0,31.0,,,14220.0,,,SAINT JOHN'S,182.0,,,,,2010
3,61be24574ea00ae59d0fe388,61e9f38eb937134a3c4bfd91,4200,12588.0,39362,,83.0,28.0,,,11060.0,,,,167.0,,,,,2011
4,61a8e673f9cba188601e84ae,61e9f390b937134a3c4bfd93,7450,21052.0,75528,24391.0,83.0,37.2,22.23,,13140.0,491.47,3236.78,Panama,199.98,,,,,2018


In [None]:
# Merging datasets
# Several column names occur in more than one input frame (`portId` in the
# AIS and schedule data, `latitude`/`longitude` in the AIS and ports data,
# `shippingLineId` in the schedule and vessel data). With pandas' default
# suffixes the first merge would rename both `portId` columns to
# `portId_x`/`portId_y`, and the subsequent merge on 'portId' would fail with
# a KeyError. An empty left suffix keeps the names already present in
# `combined_data` unchanged and tags only the overlapping right-hand columns.

# Attach schedule rows to every AIS record of the same vessel.
# NOTE(review): schedule_data appears to hold several rows per vesselId, so
# this join emits one row per (AIS record, schedule entry) pair and can grow
# the frame considerably — confirm this is intended.
combined_data = pd.merge(
    vessel_data,
    schedule_data,
    on='vesselId',
    how='left',
    suffixes=('', '_schedule'),
)

# Attach port reference data using the AIS record's own `portId`
# (the schedule's port is available as `portId_schedule`).
combined_data = pd.merge(
    combined_data,
    ports_data,
    on='portId',
    how='left',
    suffixes=('', '_port'),
)

# Attach vessel specifications by vesselId.
combined_data = pd.merge(
    combined_data,
    vessels_info,
    on='vesselId',
    how='left',
    suffixes=('', '_vessel'),
)

In [None]:
# Feature Engineering
# Parse the timestamp strings so the `.dt` accessor can be used below.
combined_data['time'] = pd.to_datetime(combined_data['time'])

# Calendar features derived from the record timestamp.
combined_data['hour'] = combined_data['time'].dt.hour
combined_data['day_of_week'] = combined_data['time'].dt.dayofweek  # Monday=0 ... Sunday=6
combined_data['month'] = combined_data['time'].dt.month

# Fill missing numeric values with their column mean.
# numeric_only=True is required here: the frame also contains string columns
# (ids, names), and DataFrame.mean() raises a TypeError on those in
# pandas >= 2.0. Non-numeric columns are left as they are.
# The result is rebound instead of using inplace=True so the statement is a
# plain expression with no hidden mutation.
combined_data = combined_data.fillna(combined_data.mean(numeric_only=True))

In [None]:
# --- Configuration -----------------------------------------------------------
# Replace the placeholder names below with real columns of `combined_data`.
TARGET_COLUMN = 'target_column'
CATEGORICAL_COLUMNS = ['categorical_column1', 'categorical_column2']
NUMERICAL_COLUMNS = ['numerical_column1', 'numerical_column2']
TEST_SIZE = 0.02
RANDOM_STATE = 42

# Fail early, with one message naming every unknown column, instead of
# surfacing a KeyError from deep inside pandas.
missing_columns = [
    col
    for col in [TARGET_COLUMN, *CATEGORICAL_COLUMNS, *NUMERICAL_COLUMNS]
    if col not in combined_data.columns
]
if missing_columns:
    raise KeyError(
        f'Columns not found in combined_data: {missing_columns}. '
        'Set TARGET_COLUMN, CATEGORICAL_COLUMNS and NUMERICAL_COLUMNS '
        'to real column names.'
    )

# Split into features (X) and target (y)
X = combined_data.drop(columns=[TARGET_COLUMN])
y = combined_data[TARGET_COLUMN]

# Choose the train/test rows BEFORE fitting any preprocessing so that the
# encoder and scaler only learn from training rows (no test-set leakage).
# Splitting positional indices keeps the full-length X_encoded / X_scaled /
# X_final arrays available under their usual names.
train_idx, test_idx = train_test_split(
    np.arange(len(X)), test_size=TEST_SIZE, random_state=RANDOM_STATE
)

# Encode categorical features.
# handle_unknown='ignore' encodes categories that appear only outside the
# training rows as all-zero vectors instead of raising at transform time.
encoder = OneHotEncoder(handle_unknown='ignore')
encoder.fit(X[CATEGORICAL_COLUMNS].iloc[train_idx])
X_encoded = encoder.transform(X[CATEGORICAL_COLUMNS]).toarray()

# Standardize numerical features using training-row statistics only.
scaler = StandardScaler()
scaler.fit(X[NUMERICAL_COLUMNS].iloc[train_idx])
X_scaled = scaler.transform(X[NUMERICAL_COLUMNS])

# Combine the scaled and encoded features (scaled columns first).
X_final = np.hstack((X_scaled, X_encoded))

# Materialize the training and testing sets from the chosen row indices.
X_train, X_test = X_final[train_idx], X_final[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

# Train the model
model = xgb.XGBRegressor()
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Evaluation on the held-out rows
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')