In [33]:

import pandas as pd
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from pre_processing import pre_process
from validation import prepare_output, calculate_score, calculate_distance

# Split Data into Training and Validation Sets


In [34]:
# Load the feature and label tables. The split below selects labels through the
# row index of the feature frames, so row i of `labels` must be the target that
# belongs to row i of `features`.
features = pd.read_csv('data/features.csv')
labels = pd.read_csv('data/labels.csv')

# Fail fast when the two tables are not row-aligned. Without this guard,
# `labels.loc[...]` further down would either raise an opaque KeyError or pair
# feature rows with the wrong targets.
if not features.index.equals(labels.index):
    raise ValueError(
        f"features and labels are not row-aligned: {features.shape} vs {labels.shape}"
    )

# Size of the hold-out window, expressed in the units of 'time_diff'.
# assumes 'time_diff' is in seconds and grows with time — TODO confirm against pre-processing
five_days_in_seconds = 5 * 24 * 60 * 60

# Rows at or after this threshold (five days before the largest time_diff) are validation.
cutoff_time = features['time_diff'].max() - five_days_in_seconds

# Time-based split of the features: earlier rows train, the last five days validate.
X_train = features[features['time_diff'] < cutoff_time]
X_val = features[features['time_diff'] >= cutoff_time]

# Carry the same split over to the labels via the shared row index.
y_train = labels.loc[X_train.index]
y_val = labels.loc[X_val.index]

# 'time_diff' is deliberately left in X_train / X_val; only 'vesselId' is dropped
# later, right before the models are fitted.

print("X_train", X_train.shape, X_train.head())
print("X_val", X_val.shape, X_val.head())
print("y_train", y_train.shape, y_train.head())
print("y_val", y_val.shape, y_val.head())


X_train (551827, 4)    latitude  longitude                  vesselId  time_diff
0  53.33566    7.20093  61e9f3f3b937134a3c4bffa9      779.0
1  55.85554   12.83279  61e9f466b937134a3c4c0277      816.0
2  43.44188   -3.81814  61e9f440b937134a3c4c017d      817.0
3  22.62450  113.70109  61e9f3d7b937134a3c4bff2b      818.0
4  51.26160    4.23430  61e9f3a7b937134a3c4bfdeb      826.0
X_val (23012, 4)         latitude  longitude                  vesselId   time_diff
551827   1.28482  103.75267  61e9f451b937134a3c4c01df  10628110.0
551828  43.10853  131.88221  61e9f414b937134a3c4c005b  10628187.0
551829  38.90542    3.03312  61e9f3c9b937134a3c4bfef7  10628187.0
551830   6.03841  116.06873  61e9f3cfb937134a3c4bff11  10628198.0
551831  35.46137  139.69726  61e9f396b937134a3c4bfda9  10628228.0
y_train (551827, 3)    latitude  longitude                  vesselId
0  45.55133   13.73575  61e9f440b937134a3c4c0181
1  53.33458    7.16221  61e9f444b937134a3c4c019f
2  36.13732   -5.37673  61e9f434b937134a

## Sanity-Check the Training Split


In [35]:
# Structural overview of the training split: shapes and columns first, then the
# first rows of both frames. Each entry is printed as `print(label, value)`.
for label, value in (
    ("X_train shape: ", X_train.shape),
    ("X_train columns: ", X_train.columns),
    ("y_train shape: ", y_train.shape),
    ("y_train latitude shape: ", y_train['latitude'].shape),
    ("y_train longitude shape: ", y_train['longitude'].shape),
    ("X_train head: \n", X_train.head()),
    ("y_train head: \n", y_train.head()),
):
    print(label, value)

# Count missing target values per coordinate column of y_train.
missing_latitude = y_train['latitude'].isnull().sum()
missing_longitude = y_train['longitude'].isnull().sum()
print("Check for NaNs in latitude and longitude:")
print(missing_latitude, "missing values in latitude")
print(missing_longitude, "missing values in longitude")

X_train shape:  (551827, 4)
X_train columns:  Index(['latitude', 'longitude', 'vesselId', 'time_diff'], dtype='object')
y_train shape:  (551827, 3)
y_train latitude shape:  (551827,)
y_train longitude shape:  (551827,)
X_train head: 
    latitude  longitude                  vesselId  time_diff
0  53.33566    7.20093  61e9f3f3b937134a3c4bffa9      779.0
1  55.85554   12.83279  61e9f466b937134a3c4c0277      816.0
2  43.44188   -3.81814  61e9f440b937134a3c4c017d      817.0
3  22.62450  113.70109  61e9f3d7b937134a3c4bff2b      818.0
4  51.26160    4.23430  61e9f3a7b937134a3c4bfdeb      826.0
y_train head: 
    latitude  longitude                  vesselId
0  45.55133   13.73575  61e9f440b937134a3c4c0181
1  53.33458    7.16221  61e9f444b937134a3c4c019f
2  36.13732   -5.37673  61e9f434b937134a3c4c0121
3  51.29922    3.23907  61e9f43db937134a3c4c0169
4  43.57881   10.30563  61e9f43bb937134a3c4c0161
Check for NaNs in latitude and longitude:
0 missing values in latitude
0 missing values in long

# Initialize and Train the XGBoost Model


In [57]:
# One hyperparameter set shared by both coordinate regressors.
xgb_params = dict(
    n_estimators=2500,
    max_depth=5,
    learning_rate=0.1,
    objective='reg:squarederror',
)

# Independent regressors: one predicts latitude, the other longitude.
model_lat = XGBRegressor(**xgb_params)
model_long = XGBRegressor(**xgb_params)

# Model inputs are every training column except 'vesselId'.
train_inputs = X_train.drop(columns=['vesselId'])

# Fit each regressor on the same inputs against its own target column.
model_lat.fit(train_inputs, y_train['latitude'])
model_long.fit(train_inputs, y_train['longitude'])

# Evaluate on the Validation Set


### Step 1: Prediction on Validation Data


In [59]:
# Run both regressors on the hold-out rows, using the same inputs as in
# training (every column except 'vesselId').
val_inputs = X_val.drop(columns=['vesselId'])
val_preds_lat = model_lat.predict(val_inputs)
val_preds_long = model_long.predict(val_inputs)

# Two-column frame of predictions, longitude first. It is built from plain
# arrays, so it carries a fresh 0..n-1 index rather than X_val's index.
val_preds_combined = pd.DataFrame({
    'longitude_predicted': val_preds_long,
    'latitude_predicted': val_preds_lat,
})
print(val_preds_combined.head())

# Copy of the validation features in which the 'latitude' / 'longitude' columns
# are replaced by the actual target coordinates from y_val (aligned on the
# row index the two frames share).
validation_data = X_val.assign(
    latitude=y_val['latitude'],
    longitude=y_val['longitude'],
)


   longitude_predicted  latitude_predicted
0           -12.006053           41.805637
1            15.718575           37.737095
2            27.760981           35.787987
3           -27.166487           38.515636
4             0.123962           41.512691


### Step 2: Prepare the Output and Compute the Validation Score


In [55]:
# Hand the raw prediction array (columns: longitude, latitude) and the frame of
# actual coordinates to the project helper from `validation`.
# NOTE(review): the printed head shows the result as validation_data plus the two
# *_predicted columns; the helper's exact contract is not visible here — confirm.
predicted_coordinates = val_preds_combined.to_numpy()
validation_output = prepare_output(predicted_coordinates, validation_data)
print(validation_output.head())

# Score the prepared frame with the project's distance-based metric.
validation_score = calculate_score(validation_output)

print(f"Validation Score (Weighted Geodesic Distance in km): {validation_score}")


        latitude  longitude                  vesselId   time_diff  \
551827  36.91718   23.64930  61e9f451b937134a3c4c01df  10628110.0   
551828  38.90542    3.03312  61e9f414b937134a3c4c005b  10628187.0   
551829  36.12969   -5.43158  61e9f3c9b937134a3c4bfef7  10628187.0   
551830  51.29672    3.24144  61e9f3cfb937134a3c4bff11  10628198.0   
551831  39.44099   -0.30523  61e9f396b937134a3c4bfda9  10628228.0   

        longitude_predicted  latitude_predicted  
551827           -45.305248           38.432087  
551828            -7.471652           36.659481  
551829            19.617462           38.531727  
551830            -5.300850           38.959465  
551831             2.853654           41.267761  
Validation Score (Weighted Geodesic Distance in km): 1217.9058407392524


### Step 3: Make Predictions on the Test Set


In [None]:
# Load the already pre-processed test rows and show what the file contains.
test_data = pd.read_csv('data/pre_processed_test_data.csv')
print(test_data.head())

# Drop the raw 'time' column when present; it is not a model input.
test_data = test_data.drop(columns=['time'], errors='ignore')
print(test_data.head())

# NOTE(review): judging by the printed head, the test file has no position
# columns, so constant placeholders are inserted only to satisfy the models'
# input schema. The models were fitted on real latitude/longitude values, so
# with constant coordinates the predictions can vary only through 'time_diff'
# and are unlikely to be meaningful. Replace the placeholders with each
# vessel's last known position once that is available in the test data.
test_data['longitude'] = 0.0
test_data['latitude'] = 0.0

# Select the columns in exactly the order the models were fitted on (the
# training columns minus 'vesselId') instead of hard-coding the list, so this
# cell cannot drift out of sync with the training cell.
feature_columns = list(X_train.columns.drop('vesselId'))
test_data = test_data[feature_columns]

print(test_data.head())

# Predict each coordinate with its own trained regressor.
preds_lat = model_lat.predict(test_data)
preds_long = model_long.predict(test_data)

# Collect the predictions and write them out, one row per test row.
submission = pd.DataFrame({
    'predicted_latitude': preds_lat,
    'predicted_longitude': preds_long
})

print(submission.head())
submission.to_csv('data/final_predictions.csv', index=False)


                  time  time_diff
0  2024-05-08 00:03:16     1978.0
1  2024-05-08 00:06:17     2523.0
2  2024-05-08 00:10:02    40674.0
3  2024-05-08 00:10:34     1761.0
4  2024-05-08 00:12:27     1620.0
   time_diff
0     1978.0
1     2523.0
2    40674.0
3     1761.0
4     1620.0
   latitude  longitude  time_diff
0      12.0       12.0     1978.0
1      12.0       12.0     2523.0
2      12.0       12.0    40674.0
3      12.0       12.0     1761.0
4      12.0       12.0     1620.0
   predicted_latitude  predicted_longitude
0           10.627735            13.323826
1           10.631050            13.331933
2           10.373518            11.771955
3           10.628657            13.322577
4           10.629682            13.324364
