In [11]:
import pandas as pd
import numpy as np
from geopy.distance import geodesic
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score
from datetime import datetime

# Read the four pipe-delimited input tables from the parent directory.
vessel_data = pd.read_csv('../ais_train.csv', sep='|')
schedule_data = pd.read_csv('../schedules_to_may_2024.csv', sep='|')
ports_data = pd.read_csv('../ports.csv', sep='|')
vessels_info = pd.read_csv('../vessels.csv', sep='|')

# Step 1: parse 'time' into datetimes, then derive calendar features from it.
# The keyword arguments of assign() are evaluated in order, so the three
# feature lambdas already see the parsed 'time' column.
vessel_data = vessel_data.assign(
    time=lambda frame: pd.to_datetime(frame['time']),
    hour=lambda frame: frame['time'].dt.hour,
    day_of_week=lambda frame: frame['time'].dt.dayofweek,  # Monday=0 ... Sunday=6
    month=lambda frame: frame['time'].dt.month,
)

In [12]:
# Step 2 (lag features): attach each report's predecessor from the same vessel.
# Rows are ordered by vessel and timestamp first, so a one-step shift inside
# each vesselId group yields the immediately preceding report; the first
# report of every vessel has no predecessor and receives missing values.
vessel_data = vessel_data.sort_values(by=['vesselId', 'time'])
previous_report = vessel_data.groupby('vesselId')[['latitude', 'longitude', 'time']].shift(1)
vessel_data['prev_latitude'] = previous_report['latitude']
vessel_data['prev_longitude'] = previous_report['longitude']
vessel_data['prev_time'] = previous_report['time']

In [13]:
# Hours elapsed between a report and the previous report of the same vessel.
# Rows whose 'prev_time' is missing produce NaN.
vessel_data['time_diff'] = (
    vessel_data['time']
    .sub(vessel_data['prev_time'])
    .dt.total_seconds()
    .div(3600)  # seconds -> hours
)

In [14]:
def _distance_from_previous_km(row):
    """Return the geodesic distance in kilometres from the previous position to the current one.

    Rows without a previous latitude yield NaN.
    """
    if pd.isnull(row['prev_latitude']):
        return np.nan
    previous_position = (row['prev_latitude'], row['prev_longitude'])
    current_position = (row['latitude'], row['longitude'])
    return geodesic(previous_position, current_position).kilometers


# Distance covered since the previous report, evaluated row by row.
vessel_data['distance_traveled'] = vessel_data.apply(_distance_from_previous_km, axis=1)

In [15]:
# Calculate sailing velocity (km/h) as distance travelled divided by elapsed hours.
# Two consecutive reports of a vessel can share a timestamp, which makes
# time_diff equal to 0; dividing by it would produce inf (or NaN for 0/0).
# Non-positive intervals are therefore masked to NaN so the velocity is
# reported as missing instead of infinite. Rows whose time_diff is already
# NaN stay NaN, exactly as before.
elapsed_hours = vessel_data['time_diff'].where(vessel_data['time_diff'] > 0)
vessel_data['sailing_velocity'] = vessel_data['distance_traveled'] / elapsed_hours

In [16]:
# Step 3: attach static vessel attributes (CEU, length, maxSpeed) by vesselId.
# A left join keeps every AIS row; the join itself does not deduplicate
# vessels_info, so a vesselId repeated there would repeat the matching AIS rows.
vessel_attributes = vessels_info[['vesselId', 'CEU', 'length', 'maxSpeed']]
vessel_data = vessel_data.merge(vessel_attributes, how='left', on='vesselId')

# Step 4: attach the coordinates of the port referenced by each row's portId.
# The port's latitude/longitude clash with the AIS columns of the same name,
# so the right-hand ones are suffixed: latitude_port / longitude_port.
port_coordinates = ports_data[['portId', 'latitude', 'longitude']]
vessel_data = vessel_data.merge(
    port_coordinates,
    how='left',
    on='portId',
    suffixes=('', '_port'),
)

In [17]:
def _distance_to_row_port_km(row):
    """Return the geodesic distance in kilometres from the row's position to its merged port.

    The port is the one whose coordinates were merged in via the row's portId
    (no nearest-port search is performed). Rows without a port latitude yield NaN.
    """
    if pd.isnull(row['latitude_port']):
        return np.nan
    vessel_position = (row['latitude'], row['longitude'])
    port_position = (row['latitude_port'], row['longitude_port'])
    return geodesic(vessel_position, port_position).kilometers


# Distance between each report and the port associated with that report.
vessel_data['distance_to_port'] = vessel_data.apply(_distance_to_row_port_km, axis=1)

In [18]:
# Inspect the frame after the vessel and port merges by printing its text
# representation (pandas abbreviates long frames with '...' rows, as seen in
# the recorded output).
print(vessel_data)

                       time    cog   sog  rot  heading  navstat       etaRaw  \
0       2024-01-12 14:07:47  308.1  17.1   -6      316        0  01-08 06:00   
1       2024-01-12 14:31:00  307.6  17.3    5      313        0  01-14 23:30   
2       2024-01-12 14:57:23  306.8  16.9    5      312        0  01-14 23:30   
3       2024-01-12 15:18:48  307.9  16.9    6      313        0  01-14 23:30   
4       2024-01-12 15:39:47  307.0  16.3    7      313        0  01-14 23:30   
...                     ...    ...   ...  ...      ...      ...          ...   
1522060 2024-05-07 22:36:16  324.1  13.5   -2      325        0  05-08 03:00   
1522061 2024-05-07 22:57:05  324.2  13.3   -3      326        0  05-08 03:00   
1522062 2024-05-07 23:17:54  356.5  12.2   -1      354        0  05-08 03:00   
1522063 2024-05-07 23:38:13   52.6  17.3    3       50        0  05-08 03:00   
1522064 2024-05-07 23:59:01   53.6  17.7   -1       51        0  05-08 03:00   

         latitude  longitude           

In [19]:
# Step 5: Merge with schedule_data

# Schedule rows belonging to vessels that appear more than once.
# Diagnostic only: nothing in this cell reads this frame.
duplicate_schedule = schedule_data[schedule_data.duplicated('vesselId', keep=False)]

# Parse arrivalDate to timezone-naive datetimes BEFORE ordering, so the sort is
# chronological rather than a comparison of raw strings. assign() returns a new
# frame, so schedule_data itself is left untouched.
schedule_data_unique = schedule_data.assign(
    arrivalDate=pd.to_datetime(schedule_data['arrivalDate']).dt.tz_localize(None)
)

# Keep one row per vessel: the one with the latest known arrival date.
# Rows without an arrival date are removed first; sort_values places missing
# values last by default, so keep='last' would otherwise select a missing date
# for any vessel that has at least one.
schedule_data_unique = (
    schedule_data_unique
    .dropna(subset=['arrivalDate'])
    .sort_values(by='arrivalDate')
    .drop_duplicates('vesselId', keep='last')
)

# Merge vessel_data with the unique schedule_data (left join keeps every AIS row;
# vessels without a schedule entry get a missing arrivalDate).
vessel_data = pd.merge(vessel_data, schedule_data_unique[['vesselId', 'arrivalDate']], on='vesselId', how='left')

# Ensure that vessel_data['time'] is timezone-naive so it can be subtracted
# from the timezone-naive arrivalDate.
vessel_data['time'] = vessel_data['time'].dt.tz_localize(None)

# Create the time until scheduled arrival feature (hours).
# NOTE(review): every report of a vessel is compared against that vessel's
# single latest arrival date, not the next arrival after the report's own
# timestamp -- confirm this is the intended feature.
vessel_data['time_until_arrival'] = (vessel_data['arrivalDate'] - vessel_data['time']).dt.total_seconds() / 3600

In [20]:
# Step 6: remove rows that are identical to an earlier row in every column,
# keeping the first occurrence (the defaults, spelled out explicitly).
vessel_data = vessel_data.drop_duplicates(subset=None, keep='first')

In [21]:
# Assemble the model inputs and the two regression targets.
# NOTE(review): 'sailing_velocity' and 'distance_to_port' are computed from the
# same row's latitude/longitude, which are also the targets here -- confirm
# this is acceptable for the intended evaluation.
feature_columns = [
    'hour', 'day_of_week', 'month',
    'sailing_velocity',
    'CEU', 'length', 'maxSpeed',
    'distance_to_port', 'time_until_arrival',
]
X = vessel_data[feature_columns]
y_latitude = vessel_data['latitude']
y_longitude = vessel_data['longitude']

# Hold out 20% of the rows for evaluation; both targets are split with the
# same seed.
split_options = {'test_size': 0.2, 'random_state': 42}
X_train_lat, X_test_lat, y_train_lat, y_test_lat = train_test_split(X, y_latitude, **split_options)
X_train_lon, X_test_lon, y_train_lon, y_test_lon = train_test_split(X, y_longitude, **split_options)

# Both regressors share one hyper-parameter set.
regressor_options = {
    'objective': 'reg:squarederror',
    'n_estimators': 100,
    'learning_rate': 0.1,
}

# Latitude: fit, predict on the held-out rows, score, report.
model_latitude = xgb.XGBRegressor(**regressor_options)
model_latitude.fit(X_train_lat, y_train_lat)
y_pred_latitude = model_latitude.predict(X_test_lat)
mse_latitude = mean_squared_error(y_test_lat, y_pred_latitude)
r2_latitude = r2_score(y_test_lat, y_pred_latitude)
print(f'Mean Squared Error (Latitude): {mse_latitude}')
print(f'R-squared (Latitude): {r2_latitude}')

# Longitude: same procedure with the longitude target.
model_longitude = xgb.XGBRegressor(**regressor_options)
model_longitude.fit(X_train_lon, y_train_lon)
y_pred_longitude = model_longitude.predict(X_test_lon)
mse_longitude = mean_squared_error(y_test_lon, y_pred_longitude)
r2_longitude = r2_score(y_test_lon, y_pred_longitude)
print(f'Mean Squared Error (Longitude): {mse_longitude}')
print(f'R-squared (Longitude): {r2_longitude}')

Mean Squared Error (Latitude): 223.39886129231058
R-squared (Latitude): 0.5749278862648384
Mean Squared Error (Longitude): 2462.0760602182004
R-squared (Longitude): 0.4794367412804249


In [23]:
# Retrain using only the calendar features, which are the only inputs that this
# cell derives for the test file as well.
limited_feature_columns = ['hour', 'day_of_week', 'month']
X_limited = vessel_data[limited_feature_columns]
y_latitude = vessel_data['latitude']
y_longitude = vessel_data['longitude']

# Same 80/20 split and seed for both targets; only the training portions are
# used below.
limited_split_options = {'test_size': 0.2, 'random_state': 42}
X_train_lat, X_test_lat, y_train_lat, y_test_lat = train_test_split(
    X_limited, y_latitude, **limited_split_options
)
X_train_lon, X_test_lon, y_train_lon, y_test_lon = train_test_split(
    X_limited, y_longitude, **limited_split_options
)

# Fit one regressor per coordinate with identical hyper-parameters.
limited_regressor_options = {
    'objective': 'reg:squarederror',
    'n_estimators': 100,
    'learning_rate': 0.1,
}
model_latitude = xgb.XGBRegressor(**limited_regressor_options)
model_latitude.fit(X_train_lat, y_train_lat)
model_longitude = xgb.XGBRegressor(**limited_regressor_options)
model_longitude.fit(X_train_lon, y_train_lon)

# Read the comma-separated test file and derive the same calendar features
# from its parsed 'time' column.
ais_test = pd.read_csv('../ais_test.csv', sep=',')
ais_test['time'] = pd.to_datetime(ais_test['time'])
ais_test = ais_test.assign(
    hour=lambda frame: frame['time'].dt.hour,
    day_of_week=lambda frame: frame['time'].dt.dayofweek,
    month=lambda frame: frame['time'].dt.month,
)
X_test_limited = ais_test[limited_feature_columns]

# Predict both coordinates for every test row.
latitude_predictions = model_latitude.predict(X_test_limited)
longitude_predictions = model_longitude.predict(X_test_limited)

# Write the predictions into the sample submission and save it.
# NOTE(review): values are assigned by position, which presumes the rows of
# ais_test.csv are in the same order as those of the sample submission --
# verify before submitting.
sample_submission = pd.read_csv('../ais_sample_submission.csv')
sample_submission['latitude_predicted'] = latitude_predictions
sample_submission['longitude_predicted'] = longitude_predictions
sample_submission.to_csv('submission02.csv', index=False)

# Show the first rows as a quick sanity check of the written file's content.
print(sample_submission.head())


   ID  longitude_predicted  latitude_predicted
0   0            10.714623           37.924057
1   1            10.714623           37.924057
2   2            10.714623           37.924057
3   3            10.714623           37.924057
4   4            10.714623           37.924057
