In [38]:

import pandas as pd
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from pre_processing import pre_process
from validation import prepare_output, calculate_score, calculate_distance

# Split Data into Training and Validation Sets


In [39]:
# Load the feature matrix and the matching target coordinates.
features = pd.read_csv('data/features.csv')
labels = pd.read_csv('data/labels.csv')

# Hold out 10% of the rows for validation.
# The seed is fixed so the split (and therefore every score computed from it)
# is reproducible after a kernel restart; outputs recorded from earlier
# unseeded runs will show different rows.
X_train, X_val, y_train, y_val = train_test_split(
    features, labels, test_size=0.1, random_state=42
)

# Quick visual check that features and labels share the same row index.
print("X-train", X_train.head())
print("X-val", X_val.head())
print("y-train", y_train.head())
print("y-val", y_val.head())


X-train          latitude  longitude  time_diff
280335   53.81436    6.94719     1218.0
439387   39.32004    0.65303     1247.0
1462433 -36.84273  174.76921     1080.0
638157   53.56716    8.55684     1080.0
774608   53.34197    7.19884     1261.0
X-val          latitude  longitude  time_diff
753652   52.56969    3.86592     1201.0
1241803  39.50363    1.22598     1292.0
691216   53.46115   -3.01964     1266.0
1427736  49.68189   -0.07656     1210.0
720525   25.00603   51.60019     1991.0
y-train          latitude  longitude
280335   53.83775    7.10751
439387   39.22727    0.61780
1462433 -36.84274  174.76922
638157   53.56715    8.55683
774608   53.34199    7.19880
y-val          latitude  longitude
753652   52.63424    3.96285
1241803  39.51232    1.08328
691216   53.46115   -3.01966
1427736  49.73847   -0.03331
720525   25.00601   51.60019


# Initialize and Train the XGBoost Model


In [40]:
# Both coordinate regressors share one hyperparameter set, so define it once.
xgb_params = dict(
    n_estimators=1500,
    max_depth=5,
    learning_rate=0.1,
    objective='reg:squarederror',
)

# One independent regressor per target coordinate.
model_lat = XGBRegressor(**xgb_params)
model_long = XGBRegressor(**xgb_params)

# Train each model on the same features against its own target column.
model_lat.fit(X_train, y_train['latitude'])
model_long.fit(X_train, y_train['longitude'])


# Evaluate on the Validation Set


### Step 1: Prediction on Validation Data


In [None]:
# Run both coordinate models over the held-out features.
val_preds_lat = model_lat.predict(X_val)
val_preds_long = model_long.predict(X_val)

# Collect the predictions in one frame (longitude first, then latitude).
# Note: this frame gets a fresh 0..n-1 index, not X_val's original index.
val_preds_combined = pd.DataFrame(
    {'longitude_predicted': val_preds_long, 'latitude_predicted': val_preds_lat}
)
print(val_preds_combined.head())

# Validation features with the 'latitude'/'longitude' columns replaced by the
# actual target values from y_val (aligned on the shared row index).
validation_data = X_val.assign(
    latitude=y_val['latitude'],
    longitude=y_val['longitude'],
)


   longitude_predicted  latitude_predicted
0             2.140414           41.326862
1            12.161174           54.159271
2            15.761900           37.814476
3             9.651759           54.200748
4           -80.421547           31.270767


### Step 2: Prepare the Output for Validation


In [None]:
# Hand the raw prediction array plus the ground-truth frame to the project
# helper, which returns the combined validation frame.
val_pred_array = val_preds_combined.to_numpy()
validation_output = prepare_output(val_pred_array, validation_data)
print(validation_output.head())

# Score the combined frame with the project's scoring helper
# (reported below as a weighted geodesic distance in km).
validation_score = calculate_score(validation_output)

print(f"Validation Score (Weighted Geodesic Distance in km): {validation_score}")


         latitude  longitude  time_diff  longitude_predicted  \
1317831  41.33202    2.14637     1261.0             2.140414   
1379907  54.14543   12.10472     1261.0            12.161174   
339206   37.93537   15.60868     1254.0            15.761900   
1375999  54.21292    9.58242     1219.0             9.651759   
34927    31.13409  -80.71813     1211.0           -80.421547   

         latitude_predicted  
1317831           41.326862  
1379907           54.159271  
339206            37.814476  
1375999           54.200748  
34927             31.270767  
Validation Score (Weighted Geodesic Distance in km): 9.190767113417733


### Step 3: Make predictions on the test set


In [None]:
# Load the already pre-processed test set.
test_data = pd.read_csv('data/pre_processed_test_data.csv')
print(test_data.head())

# Drop the raw 'time' column if present; it is not one of the training features.
test_data = test_data.drop(columns=['time'], errors='ignore')
print(test_data.head())

# The models were fitted on the columns of X_train, which include 'latitude'
# and 'longitude'; the test file does not carry them, so placeholders are added.
# NOTE(review): with constant 0.0 placeholders the predictions can only vary
# with the remaining feature(s) — confirm this is intended, or supply real
# coordinate features for the test rows.
test_data = test_data.assign(longitude=0.0, latitude=0.0)

# Select the columns in exactly the order the models were trained on, taken
# from the training frame rather than hard-coded so the two cannot drift apart.
test_data = test_data[list(X_train.columns)]

print(test_data.head())

# Predict each coordinate with its own trained model.
preds_lat = model_lat.predict(test_data)
preds_long = model_long.predict(test_data)

# Assemble and save the predictions.
submission = pd.DataFrame({
    'predicted_latitude': preds_lat,
    'predicted_longitude': preds_long
})

print(submission.head())
submission.to_csv('data/final_predictions.csv', index=False)


                  time  time_diff
0  2024-05-08 00:03:16     1978.0
1  2024-05-08 00:06:17     2523.0
2  2024-05-08 00:10:02    40674.0
3  2024-05-08 00:10:34     1761.0
4  2024-05-08 00:12:27     1620.0
   time_diff
0     1978.0
1     2523.0
2    40674.0
3     1761.0
4     1620.0
   latitude  longitude  time_diff
0       0.0        0.0     1978.0
1       0.0        0.0     2523.0
2       0.0        0.0    40674.0
3       0.0        0.0     1761.0
4       0.0        0.0     1620.0
   predicted_latitude  predicted_longitude
0          -13.215732             2.193227
1          -13.207327             2.219582
2          -12.670565             4.478564
3          -12.965896             2.193096
4          -12.951620             2.193096
