In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from keras import Sequential
from keras import layers
from sklearn.metrics import mean_squared_error, r2_score
from datetime import datetime

# Load datasets
vessel_data = pd.read_csv('../ais_train.csv', sep='|')

# Convert 'time' column to datetime in vessel_data
vessel_data['time'] = pd.to_datetime(vessel_data['time'])

# Sort the data by vesselId and time
vessel_data = vessel_data.sort_values(by=['vesselId', 'time'])

# Create time-based features (hour, day_of_week, month)
vessel_data['hour'] = vessel_data['time'].dt.hour
vessel_data['day_of_week'] = vessel_data['time'].dt.dayofweek
vessel_data['month'] = vessel_data['time'].dt.month

# Select the features and target for training
features = ['hour', 'day_of_week', 'month']
target_latitude = 'latitude'
target_longitude = 'longitude'

# Normalize the data (standardization)
scaler = StandardScaler()
vessel_data[features] = scaler.fit_transform(vessel_data[features])

# Function to create sequences for time-series
def create_sequences(data, target, sequence_length=10):
    sequences = []
    labels = []
    for i in range(len(data) - sequence_length):
        seq = data.iloc[i:i+sequence_length].values
        label = target.iloc[i+sequence_length]
        sequences.append(seq)
        labels.append(label)
    return np.array(sequences), np.array(labels)

# Create sequences for latitude prediction
sequence_length = 10
X_lat, y_lat = create_sequences(vessel_data[features], vessel_data[target_latitude], sequence_length)

# Create sequences for longitude prediction
X_lon, y_lon = create_sequences(vessel_data[features], vessel_data[target_longitude], sequence_length)

# Split into training and test sets
X_train_lat, X_test_lat, y_train_lat, y_test_lat = train_test_split(X_lat, y_lat, test_size=0.2, random_state=42)
X_train_lon, X_test_lon, y_train_lon, y_test_lon = train_test_split(X_lon, y_lon, test_size=0.2, random_state=42)


In [11]:
from keras import Sequential
from keras import layers

# Define the LSTM model for latitude prediction
model_latitude = Sequential()
model_latitude.add(layers.Input(shape=(sequence_length, len(features))))  # Define input shape
model_latitude.add(layers.LSTM(units=50, return_sequences=False))  # LSTM layer
model_latitude.add(layers.Dense(1))  # Output layer to predict latitude

model_latitude.compile(optimizer='adam', loss='mean_squared_error')

# Define the LSTM model for longitude prediction
model_longitude = Sequential()
model_longitude.add(layers.Input(shape=(sequence_length, len(features))))  # Define input shape
model_longitude.add(layers.LSTM(units=50, return_sequences=False))  # LSTM layer
model_longitude.add(layers.Dense(1))  # Output layer to predict longitude

model_longitude.compile(optimizer='adam', loss='mean_squared_error')

In [12]:
# Step 4: Train the Models

# Train the latitude model
model_latitude.fit(X_train_lat, y_train_lat, epochs=10, batch_size=32, validation_data=(X_test_lat, y_test_lat))

# Train the longitude model
model_longitude.fit(X_train_lon, y_train_lon, epochs=10, batch_size=32, validation_data=(X_test_lon, y_test_lon))

Epoch 1/10
[1m38052/38052[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m90s[0m 2ms/step - loss: 553.7634 - val_loss: 524.8231
Epoch 2/10
[1m38052/38052[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m93s[0m 2ms/step - loss: 520.1207 - val_loss: 515.1726
Epoch 3/10
[1m38052/38052[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m94s[0m 2ms/step - loss: 512.2949 - val_loss: 509.1985
Epoch 4/10
[1m38052/38052[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m94s[0m 2ms/step - loss: 509.5121 - val_loss: 508.1026
Epoch 5/10
[1m38052/38052[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m93s[0m 2ms/step - loss: 508.2950 - val_loss: 506.5005
Epoch 6/10
[1m38052/38052[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m85s[0m 2ms/step - loss: 504.5497 - val_loss: 505.6689
Epoch 7/10
[1m38052/38052[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m77s[0m 2ms/step - loss: 504.4155 - val_loss: 505.7750
Epoch 8/10
[1m38052/38052[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m106s[0m 3ms/step - los

<keras.src.callbacks.history.History at 0x2235dcd8710>

In [13]:
# Step 5: Make Predictions and Evaluate

# Make predictions for latitude
y_pred_latitude = model_latitude.predict(X_test_lat)

# Make predictions for longitude
y_pred_longitude = model_longitude.predict(X_test_lon)

# Evaluate the models using Mean Squared Error and R-squared
mse_latitude = mean_squared_error(y_test_lat, y_pred_latitude)
r2_latitude = r2_score(y_test_lat, y_pred_latitude)

mse_longitude = mean_squared_error(y_test_lon, y_pred_longitude)
r2_longitude = r2_score(y_test_lon, y_pred_longitude)

print(f'Mean Squared Error (Latitude): {mse_latitude}')
print(f'R-squared (Latitude): {r2_latitude}')
print(f'Mean Squared Error (Longitude): {mse_longitude}')
print(f'R-squared (Longitude): {r2_longitude}')

[1m9513/9513[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 2ms/step
[1m9513/9513[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 1ms/step
Mean Squared Error (Latitude): 504.75864998643164
R-squared (Latitude): 0.038775136494696216
Mean Squared Error (Longitude): 4658.697550768376
R-squared (Longitude): 0.01301266271424506


In [15]:
# Function to create sequences with padding for the initial missing rows
def create_sequences_with_padding(data, sequence_length=10):
    sequences = []
    for i in range(len(data)):
        # Start the sequence at 0 and pad the initial rows with zeros if needed
        start_idx = max(0, i - sequence_length + 1)
        seq = data.iloc[start_idx:i+1].values
        # If the sequence is shorter than required, pad with zeros at the beginning
        if len(seq) < sequence_length:
            seq = np.pad(seq, ((sequence_length - len(seq), 0), (0, 0)), mode='constant')
        sequences.append(seq)
    return np.array(sequences)

# Step 6: Predict Test Data and Create Submission File

# Load and preprocess test data
ais_test = pd.read_csv('../ais_test.csv', sep=',')
ais_test['time'] = pd.to_datetime(ais_test['time'])
ais_test['hour'] = ais_test['time'].dt.hour
ais_test['day_of_week'] = ais_test['time'].dt.dayofweek
ais_test['month'] = ais_test['time'].dt.month

# Normalize the test data
ais_test[features] = scaler.transform(ais_test[features])

# Create sequences for test data with padding
X_test_sequences = create_sequences_with_padding(ais_test[features], sequence_length)

# Make predictions on the test set
latitude_predictions = model_latitude.predict(X_test_sequences)
longitude_predictions = model_longitude.predict(X_test_sequences)

# Fix the length mismatch in the sample submission
print(f"Length of test set: {len(ais_test)}")
print(f"Length of predictions: {len(latitude_predictions)}")

# Load the sample submission file
sample_submission = pd.read_csv('../ais_sample_submission.csv')

# Ensure the number of predictions matches the test set size
assert len(latitude_predictions) == len(sample_submission), "Prediction length does not match submission length"
assert len(longitude_predictions) == len(sample_submission), "Prediction length does not match submission length"

# Fill in the predicted latitude and longitude
sample_submission['latitude_predicted'] = latitude_predictions.flatten()
sample_submission['longitude_predicted'] = longitude_predictions.flatten()

# Save the submission file
sample_submission.to_csv('submission_lstm3.csv', index=False)

# Display the first few rows of the submission file to verify
print(sample_submission.head())


[1m1617/1617[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step
[1m1617/1617[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step
Length of test set: 51739
Length of predictions: 51739
   ID  longitude_predicted  latitude_predicted
0   0            27.633219           34.024612
1   1            32.688614           21.310032
2   2            29.410028           16.319891
3   3            23.703220           15.083231
4   4            18.782597           15.887377
