In [73]:
import pandas as pd
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from pre_processing import pre_process

# Split Data into Training and Validation Sets


In [74]:
# Load your features and labels datasets
features = pd.read_csv('data/features.csv')
labels = pd.read_csv('data/labels.csv')

# Check the structure of both dataframes to ensure they match
print(features.shape)  # Ensure the number of rows match in features and labels
print(labels.shape)

# Split the features and labels into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(features, labels, test_size=0.2, random_state=42)

print(X_train.shape)
print(X_val.shape)
print(y_train.shape)
print(y_val.shape)


(1521377, 3)
(1521377, 2)
(1217101, 3)
(304276, 3)
(1217101, 2)
(304276, 2)


# Initialize and Train the XGBoost Model


In [75]:
# Initialize the XGBoost model for predicting latitude
model_lat = XGBRegressor(n_estimators=1000, max_depth=5, learning_rate=0.1, objective='reg:squarederror')

# Initialize the XGBoost model for predicting longitude
model_long = XGBRegressor(n_estimators=1000, max_depth=5, learning_rate=0.1, objective='reg:squarederror')

# Fit the models
model_lat.fit(X_train, y_train['latitude'])
model_long.fit(X_train, y_train['longitude'])


# Make predictions on the test set


In [76]:
# Now, the test data can be used for prediction with the trained models
preds_lat = model_lat.predict(X_val)
preds_long = model_long.predict(X_val)

# Save predictions if necessary
submission = pd.DataFrame({
    'predicted_latitude': preds_lat,
    'predicted_longitude': preds_long
})

submission.to_csv('data/predictions_test.csv', index=False)

In [77]:
# Load your test dataset
test_data = pd.read_csv('data/pre_processed_test_data')

# Check the processed test data format
# print(test_data.head())

# Convert 'time' to time_diff and add placeholders for latitude and longitude
# test_data['time'] = pd.to_datetime(test_data['time'])
# reference_time = test_data['time'].min()
# test_data['time_diff'] = (test_data['time'] - reference_time).dt.total_seconds()

# Add placeholder columns for latitude and longitude (since these will be predicted)
prediction_data = pd.read_csv('data/predictions_test.csv')
# Add ID column (index based on row numbers, starting from 0)
test_data['ID'] = pd.Series(range(len(final_predictions)))

test_data['longitude_predicted'] = prediction_data['predicted_longitude']
test_data['latitude_predicted'] = prediction_data['predicted_latitude']

# Drop any unnecessary columns like 'vesselId'
test_data = test_data.drop(columns=['time', 'time_diff'])

test_data.to_csv('data/final_predictions.csv', index=False)
print(test_data)


          ID  longitude_predicted  latitude_predicted
0          0            10.866823           53.945590
1          1           -65.545000           18.892086
2          2             1.206126           50.647625
3          3           139.172500           34.554300
4          4            15.138873           36.172276
...      ...                  ...                 ...
51734  51734            -3.830854           43.436752
51735  51735            23.591072           37.984165
51736  51736            13.637828           44.517950
51737  51737             0.279041           50.370780
51738  51738           -76.526420           39.243580

[51739 rows x 3 columns]
