In [1]:

import pandas as pd
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from pre_processing import pre_process
from validation import prepare_output, calculate_score, calculate_distance

# Split Data into Training and Validation Sets


In [20]:
# Load your features and labels datasets
features = pd.read_csv('data/features.csv')
labels = pd.read_csv('data/labels.csv')

# Check the structure of both dataframes to ensure they match
# print(features.shape, features.head())  # Ensure the number of rows match in features and labels
# print(labels.shape, labels.head())

# Split the features and labels into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(features, labels, test_size=0.2) #, random_state=42)

print(X_train.head())
print(X_val.head())
print(y_train.head())
print(y_val.head())


         latitude  longitude  time_diff
1356432  53.15219    4.34956     1240.0
703168   53.35950    4.70711     1220.0
600269   51.13660    1.64029     1212.0
573367   50.38845   -0.85260     1213.0
439941   51.47689    2.57269     1230.0
        latitude  longitude  time_diff
726933  33.71005 -118.19548     1259.0
261665  54.02101    7.97893     1045.0
434205  51.30551    3.23303     1270.0
2193    42.25120   -8.73768     1375.0
673362  43.66389   -9.51715     1129.0
         latitude  longitude
1356432  53.13348    4.31476
703168   53.40098    4.76084
600269   51.08389    1.55553
573367   50.45288   -0.85590
439941   51.45846    2.55301
        latitude  longitude
726933  33.71005 -118.19547
261665  54.03354    7.82737
434205  51.30551    3.23304
2193    42.24587   -8.75465
673362  43.74076   -9.46913


# Initialize and Train the XGBoost Model


In [21]:
# Initialize the XGBoost model for predicting latitude
model_lat = XGBRegressor(n_estimators=1000, max_depth=5, learning_rate=0.1, objective='reg:squarederror')

# Initialize the XGBoost model for predicting longitude
model_long = XGBRegressor(n_estimators=1000, max_depth=5, learning_rate=0.1, objective='reg:squarederror')

# Fit the models
model_lat.fit(X_train, y_train['latitude'])
model_long.fit(X_train, y_train['longitude'])


# Evaluate on val set


### Step 1: Prediction on Validation Data


In [22]:
# Predict on validation set
val_preds_lat = model_lat.predict(X_val)
val_preds_long = model_long.predict(X_val)

# Combine the predicted latitudes and longitudes
val_preds_combined = pd.DataFrame({
    'longitude_predicted': val_preds_long,
    'latitude_predicted': val_preds_lat
})
print(val_preds_combined.head())

# Combine with actual validation data
validation_data = X_val.copy()  # This includes the validation features
validation_data['latitude'] = y_val['latitude']  # Adding actual latitude
validation_data['longitude'] = y_val['longitude']  # Adding actual longitude


   longitude_predicted  latitude_predicted
0          -118.140862           33.685444
1             7.893031           54.040916
2             3.236534           51.309998
3            -8.546696           42.301369
4            -9.430494           43.624920


### Step 2: Prepare the Output for Validation


In [23]:
# Prepare the validation output with predictions
validation_output = prepare_output(val_preds_combined.to_numpy(), validation_data)
print(validation_output.head())

# Calculate the geodesic distance-based score
validation_score = calculate_score(validation_output)

print(f"Validation Score (Weighted Geodesic Distance in km): {validation_score}")


        latitude  longitude  time_diff  longitude_predicted  \
726933  33.71005 -118.19547     1259.0          -118.140862   
261665  54.03354    7.82737     1045.0             7.893031   
434205  51.30551    3.23304     1270.0             3.236534   
2193    42.24587   -8.75465     1375.0            -8.546696   
673362  43.74076   -9.46913     1129.0            -9.430494   

        latitude_predicted  
726933           33.685444  
261665           54.040916  
434205           51.309998  
2193             42.301369  
673362           43.624920  
Validation Score (Weighted Geodesic Distance in km): 9.3604395026191


### Step 3: Make predictions on the test set


In [7]:
# Assuming you have already preprocessed the test_data and it's ready for predictions
test_data = pd.read_csv('data/pre_processed_test_data.csv')
print(test_data.head())
# Use the trained model to predict on the test data
preds_lat = model_lat.predict(test_data)
preds_long = model_long.predict(test_data)

# Save the predictions
submission = pd.DataFrame({
    'predicted_latitude': preds_lat,
    'predicted_longitude': preds_long
})

submission.to_csv('data/final_predictions.csv', index=False)


                  time  time_diff
0  2024-05-08 00:12:27      551.0
1  2024-05-08 00:39:27     2171.0
2  2024-05-08 01:33:28     5412.0
3  2024-05-08 01:51:26     6490.0
4  2024-05-08 02:03:29     7213.0


ValueError: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, the experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:time: object

In [None]:
# # Load your test dataset
# test_data = pd.read_csv('data/pre_processed_test_data')

# # Check the processed test data format
# # print(test_data.head())

# # Convert 'time' to time_diff and add placeholders for latitude and longitude
# # test_data['time'] = pd.to_datetime(test_data['time'])
# # reference_time = test_data['time'].min()
# # test_data['time_diff'] = (test_data['time'] - reference_time).dt.total_seconds()

# # Add placeholder columns for latitude and longitude (since these will be predicted)
# prediction_data = pd.read_csv('data/predictions_test.csv')
# # Add ID column (index based on row numbers, starting from 0)
# test_data['ID'] = pd.Series(range(len(final_predictions)))

# test_data['longitude_predicted'] = prediction_data['predicted_longitude']
# test_data['latitude_predicted'] = prediction_data['predicted_latitude']

# # Drop any unnecessary columns like 'vesselId'
# test_data = test_data.drop(columns=['time', 'time_diff'])

# test_data.to_csv('data/final_predictions.csv', index=False)
# print(test_data)


          ID  longitude_predicted  latitude_predicted
0          0            10.866823           53.945590
1          1           -65.545000           18.892086
2          2             1.206126           50.647625
3          3           139.172500           34.554300
4          4            15.138873           36.172276
...      ...                  ...                 ...
51734  51734            -3.830854           43.436752
51735  51735            23.591072           37.984165
51736  51736            13.637828           44.517950
51737  51737             0.279041           50.370780
51738  51738           -76.526420           39.243580

[51739 rows x 3 columns]
