In [252]:
import pandas as pd
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from pre_processing import pre_process
from validation import prepare_output, calculate_score, calculate_distance
import datetime

# Split Data into Training and Validation Sets


In [253]:
# Read the feature and label tables from disk
features = pd.read_csv('data/features.csv')
labels = pd.read_csv('data/labels.csv')

# Hold out 20% of the rows for validation; the fixed random_state keeps the split reproducible
X_train, X_val, y_train, y_val = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
)

# Report the shape and the first rows of every split
for split_name, split_frame in (
    ("X_train", X_train),
    ("X_val", X_val),
    ("y_train", y_train),
    ("y_val", y_val),
):
    print(split_name, split_frame.shape, split_frame.head())

X_train (1216552, 5)          time_diff  longitude  latitude  previous_long  previous_lat
507247      1078.0    8.56608  53.56380        8.56607      53.56381
1163637     1272.0   14.45352  55.25410       14.30903      55.16020
184329      1080.0  -79.88520   9.36768      -79.88521       9.36765
1280257     1216.0    9.78959  41.49199        9.77107      41.38357
1459988     2879.0  114.09895  22.03080      114.09887      22.03024
X_val (304138, 5)          time_diff  longitude  latitude  previous_long  previous_lat
262569      1082.0    7.18055  53.33738        7.18050      53.33740
1362118     1206.0 -123.21660  48.55060     -123.24372      48.64326
362220      1224.0   -7.80658  36.22110       -7.93353      36.24835
1082131     1200.0  -16.07350  25.34050      -16.11867      25.25200
1259325     1079.0   -5.43157  36.12972       -5.43157      36.12972
y_train (1216552, 4)          longitude  latitude  previous_long  previous_lat
507247     8.56446  53.56256        8.56608      53.56

## Encode the vesselIds


# Initialize and Train the XGBoost Model


In [254]:
# Hyperparameters shared by the latitude and longitude regressors
xgb_params = dict(
    n_estimators=3000,
    max_depth=8,
    learning_rate=0.01,
    objective='reg:squarederror',
)

# One independent regressor per target coordinate
model_lat = XGBRegressor(**xgb_params)
model_long = XGBRegressor(**xgb_params)

# Each model is trained without the feature column that shares its target's name.
# The column order chosen here is the order the fitted models expect at predict time.
lat_training_inputs = X_train[['time_diff', 'longitude', 'previous_long', 'previous_lat']]
long_training_inputs = X_train[['time_diff', 'latitude', 'previous_long', 'previous_lat']]

# Fit latitude first, then longitude
model_lat.fit(lat_training_inputs, y_train['latitude'])
model_long.fit(long_training_inputs, y_train['longitude'])


# Evaluate on val set


### Step 1: Prediction on Validation Data


In [255]:
# Score the validation features; each model gets every column except the one
# that shares its target's name
val_preds_lat = model_lat.predict(X_val.drop(columns=['latitude']))
val_preds_long = model_long.predict(X_val.drop(columns=['longitude']))

# Put both predicted coordinates side by side (longitude first, then latitude)
val_preds_combined = pd.DataFrame(
    {'longitude_predicted': val_preds_long, 'latitude_predicted': val_preds_lat}
)
print(val_preds_combined.head())

# Validation features with the latitude/longitude columns replaced by the
# actual label values (aligned on the shared row index)
validation_data = X_val.assign(
    latitude=y_val['latitude'],
    longitude=y_val['longitude'],
)


   longitude_predicted  latitude_predicted
0             7.182348           53.343063
1          -123.246849           48.818954
2            -7.737917           36.204273
3           -15.654371           25.296524
4            -5.483555           36.139095


### Step 2: Prepare the Output for Validation


In [256]:
# Hand the raw prediction array plus the validation frame to the project helper
# that assembles the table used for scoring
predictions_array = val_preds_combined.to_numpy()
validation_output = prepare_output(predictions_array, validation_data)
print(validation_output.head())

# Score the assembled table (reported below as a weighted geodesic distance in km)
validation_score = calculate_score(validation_output)
print(f"Validation Score (Weighted Geodesic Distance in km): {validation_score}")


         time_diff  longitude  latitude  previous_long  previous_lat  \
262569      1082.0    7.18053  53.33739        7.18050      53.33740   
1362118     1206.0 -123.20305  48.47902     -123.24372      48.64326   
362220      1224.0   -7.68097  36.19265       -7.93353      36.24835   
1082131     1200.0  -16.02597  25.42981      -16.11867      25.25200   
1259325     1079.0   -5.38942  36.09497       -5.43157      36.12972   

         longitude_predicted  latitude_predicted  
262569              7.182348           53.343063  
1362118          -123.246849           48.818954  
362220             -7.737917           36.204273  
1082131           -15.654371           25.296524  
1259325            -5.483555           36.139095  
Validation Score (Weighted Geodesic Distance in km): 16.409809794519674


### Step 3: Make predictions on the test set


In [257]:
# Roll every vessel's trajectory forward one test row at a time and write the submission file.
# Assumes model_lat and model_long are already fitted and that the test CSV has been preprocessed.

# Load the preprocessed test data
test_data = pd.read_csv('data/merged_test_and_train_data.csv')

# Feature columns in exactly the order the models were fitted with. Selecting them
# by name (and passing DataFrames to predict) keeps previous_long / previous_lat
# from being silently swapped, which is what happens when a bare array is built
# with the two columns in the opposite order.
lat_feature_columns = ['time_diff', 'longitude', 'previous_long', 'previous_lat']
long_feature_columns = ['time_diff', 'latitude', 'previous_long', 'previous_lat']

# Group by vesselId to process each vessel separately
grouped_data = test_data.groupby('vesselId')

# Accumulators across all vessels. IDs are collected in the same order as the
# predictions so each prediction stays attached to the row it was made for.
total_ids = []
total_predicted_latitudes = []
total_predicted_longitudes = []

# Iterate through each vessel's data for row-by-row predictions
for vessel_id, vessel_data in grouped_data:
    vessel_data = vessel_data.sort_values(by='time_diff')  # Sort by time_diff to ensure sequential predictions

    # Predictions for the current vessel, in sorted order
    predicted_latitudes = []
    predicted_longitudes = []

    # Walk the rows by position: the index labels come from test_data and say
    # nothing about which row follows which inside this sorted group.
    for position in range(len(vessel_data)):
        # One-row DataFrame (not a Series) so column names and dtypes are preserved
        row = vessel_data.iloc[[position]].copy()

        # From the second row on, the previous position is the prediction made
        # for the preceding row of the same vessel.
        # NOTE(review): only previous_lat/previous_long are carried forward, as the
        # original comments describe; longitude/latitude are left as stored — confirm
        # this matches how the test features were built.
        if position > 0:
            row['previous_lat'] = predicted_latitudes[-1]
            row['previous_long'] = predicted_longitudes[-1]

        # Predict latitude and longitude for this row
        predicted_lat = model_lat.predict(row[lat_feature_columns])
        predicted_long = model_long.predict(row[long_feature_columns])

        # Store the predictions for this vessel
        predicted_latitudes.append(predicted_lat[0])
        predicted_longitudes.append(predicted_long[0])

    # Append this vessel's IDs and predictions in matching (sorted) order
    total_ids.extend(vessel_data['ID'].tolist())
    total_predicted_latitudes.extend(predicted_latitudes)
    total_predicted_longitudes.extend(predicted_longitudes)

# Every test row must have received exactly one prediction (groupby skips rows
# whose vesselId is missing, which would otherwise go unnoticed).
assert len(total_ids) == len(test_data), "some test rows received no prediction"

# Now, store the predictions into a DataFrame
submission = pd.DataFrame({
    'ID': total_ids,  # ID of the row each prediction was made for
    'longitude_predicted': total_predicted_longitudes,
    'latitude_predicted': total_predicted_latitudes
})

# Get the current date and time
current_time = datetime.datetime.now()

# Format the current date and time as a string (e.g., "10-22_14-45"; the year is not included)
time_str = current_time.strftime("%m-%d_%H-%M")
filename = f'data/final_preds/final_predictions_{time_str}.csv'

# Sort the submission by ID to match the original test set order
submission = submission.sort_values(by='ID').reset_index(drop=True)

# Output to CSV with the timestamp in the filename
submission.to_csv(filename, index=False)

print(f"Row-by-row predictions for each vessel completed and saved as {filename}.")


Row-by-row predictions for each vessel completed and saved as data/final_preds/final_predictions_10-22_21-56.csv.
