In [174]:
import pandas as pd
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from pre_processing import pre_process
from validation import prepare_output, calculate_score, calculate_distance
import datetime

# Split Data into Training and Validation Sets


In [175]:
# Load your features and labels datasets
features = pd.read_csv('data/features.csv')
labels = pd.read_csv('data/labels.csv')

# Split the features and labels into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(features, labels, test_size=0.2) # , random_state=42)

print("X_train", X_train.shape, X_train.head())
print("X_val", X_val.shape, X_val.head())
print("y_train", y_train.shape, y_train.head())
print("y_val", y_val.shape, y_val.head())

X_train (1216552, 11)          time_diff  longitude  latitude  previous_long  previous_lat  \
1475051     1190.0    6.04662  53.63442        6.13000      53.63562   
1487707     1260.0    8.55697  53.57617        8.55699      53.57617   
558535      1260.0  103.86520   1.25848      103.86450       1.25888   
113276      1257.0    0.20989  49.47205        0.20988      49.47207   
1401542     1080.0   -1.74664  50.40393       -1.64725      50.41961   

         previous_sog  previous_heading  previous_cog   sog  heading    cog  
1475051          14.0             179.0         180.0   0.1    210.0   36.1  
1487707           1.3              69.0         240.4  12.6    163.0  164.2  
558535           16.5             306.0         304.1   7.6    301.0  299.7  
113276           14.9             319.0         318.3   0.0    341.0    0.0  
1401542          13.1             231.0         229.0   2.8    157.0   65.0  
X_val (304138, 11)          time_diff  longitude  latitude  previous_long  pr

## Encode the vesselIds


# Initialize and Train the XGBoost Model


In [180]:
# Initialize the XGBoost models with appropriate hyperparameters
model_lat = XGBRegressor(
    n_estimators=2000,
    max_depth=6,
    learning_rate=0.02,
    subsample=0.8,
    colsample_bytree=0.8,
    gamma=0.5,
    min_child_weight=3,
    objective='reg:squarederror'
)

model_long = XGBRegressor(
    n_estimators=2000,
    max_depth=6,
    learning_rate=0.02,
    subsample=0.8,
    colsample_bytree=0.8,
    gamma=0.5,
    min_child_weight=3,
    objective='reg:squarederror'
)

model_heading = XGBRegressor(
    n_estimators=2000,
    max_depth=6,
    learning_rate=0.02,
    subsample=0.8,
    colsample_bytree=0.8,
    gamma=0.5,
    min_child_weight=3,
    objective='reg:squarederror'
)

model_sog = XGBRegressor(
    n_estimators=2000,
    max_depth=6,
    learning_rate=0.02,
    subsample=0.8,
    colsample_bytree=0.8,
    gamma=0.5,
    min_child_weight=3,
    objective='reg:squarederror'
)

model_cog = XGBRegressor(
    n_estimators=2000,
    max_depth=6,
    learning_rate=0.02,
    subsample=0.8,
    colsample_bytree=0.8,
    gamma=0.5,
    min_child_weight=3,
    objective='reg:squarederror'
)


# Train the models on the updated feature set
# Include 'sog', 'heading', and 'cog' in addition to the previous features
model_cog.fit(X_train[['time_diff', 'previous_long', 'previous_lat', 'previous_sog', 'previous_heading', 'previous_cog']], y_train['cog'])
model_sog.fit(X_train[['time_diff', 'previous_long', 'previous_lat', 'previous_sog', 'previous_heading', 'previous_cog', 'cog']], y_train['sog'])
model_heading.fit(X_train[['time_diff', 'previous_long', 'previous_lat', 'previous_sog', 'previous_heading', 'previous_cog', 'cog', 'sog']], y_train['heading'])
model_lat.fit(X_train[['time_diff', 'previous_long', 'previous_lat', 'previous_sog', 'previous_heading', 'previous_cog', 'cog', 'sog', 'heading']], y_train['latitude'])
model_long.fit(X_train[['time_diff', 'previous_long', 'previous_lat', 'previous_sog', 'previous_heading', 'previous_cog', 'cog', 'sog', 'heading', 'latitude']], y_train['longitude'])


# Evaluate on val set


### Step 1: Prediction on Validation Data


In [191]:
# Predict on validation set with the new features

val_preds_cog = model_cog.predict(X_val[['time_diff', 'previous_long', 'previous_lat', 'previous_sog', 'previous_heading', 'previous_cog']])
val_preds_sog = model_sog.predict(X_val[['time_diff', 'previous_long', 'previous_lat', 'previous_sog', 'previous_heading', 'previous_cog', 'cog']])
val_preds_heading = model_heading.predict(X_val[['time_diff', 'previous_long', 'previous_lat', 'previous_sog', 'previous_heading', 'previous_cog', 'cog', 'sog']])
val_preds_lat = model_lat.predict(X_val[['time_diff', 'previous_long', 'previous_lat', 'previous_sog', 'previous_heading', 'previous_cog', 'cog', 'sog', 'heading']])
val_preds_long = model_long.predict(X_val[['time_diff', 'previous_long', 'previous_lat', 'previous_sog', 'previous_heading', 'previous_cog', 'cog', 'sog', 'heading', 'latitude']])

# Combine the predicted latitudes and longitudes
val_preds_combined = pd.DataFrame({
    'longitude_predicted': val_preds_long,
    'latitude_predicted': val_preds_lat
})

# Optionally, validate the output as before
validation_data = X_val.copy()
validation_data['latitude'] = y_val['latitude']
validation_data['longitude'] = y_val['longitude']

### Step 2: Prepare the Output for Validation


In [192]:
# Prepare the validation output with predictions
validation_output = prepare_output(val_preds_combined.to_numpy(), validation_data)
print(validation_output.head())

# Calculate the geodesic distance-based score
validation_score = calculate_score(validation_output)

print(f"Validation Score (Weighted Geodesic Distance in km): {validation_score}")


         time_diff  longitude  latitude  previous_long  previous_lat  \
126884      2704.0  -79.54179   8.81252      -79.54202       8.81256   
300064       996.0    4.28086  53.10251        4.44182      53.22589   
1362187      300.0  125.10536  36.68591      125.02137      36.62131   
100171       799.0 -119.10187  33.87926     -119.11422      33.87814   
1381002     1260.0   12.99928  55.62335       12.99927      55.62335   

         previous_sog  previous_heading  previous_cog   sog  heading    cog  \
126884            0.1             139.0         170.6   0.1    145.0   14.8   
300064            0.0             114.0         202.9  11.5    114.0  113.9   
1362187          20.9              22.0          22.2   0.0    345.0   67.7   
100171            0.0             205.0         141.6   0.2    121.0   53.1   
1381002           0.2             162.0         300.9  10.6    255.0  252.0   

         longitude_predicted  latitude_predicted  
126884            -78.733482            8

### Step 3: Make predictions on the test set


In [193]:
import pandas as pd
import datetime

# Assuming model_lat and model_long are your trained models, and test_data has already been preprocessed

# Load the preprocessed test data
test_data = pd.read_csv('data/merged_test_and_train_data.csv')
test_data_length = len(test_data)

print(test_data.head())

# Iterate through each row of the dataset (already sorted by vesselId)
for idx, row in test_data.iterrows():
    # Predict latitude first using the current row's features
    lat_features = row[['time_diff', 'previous_lat', 'previous_long', 'previous_sog', 'previous_heading', 'previous_cog']].values.reshape(1, -1)
    predicted_lat = model_lat.predict(lat_features)[0]
    
    # Insert the predicted latitude back into the current row for the next prediction
    test_data.at[idx, 'latitude'] = predicted_lat  # Update the row's latitude with the predicted value
    
    # Now, predict longitude using the updated latitude and other features
    long_features = row[['time_diff', 'previous_lat', 'previous_long', 'previous_sog', 'previous_heading', 'previous_cog']].values.tolist()
    long_features.insert(1, predicted_lat)  # Insert the predicted latitude into the second position (index 1)
    long_features = [long_features]  # Reshape into the correct input shape for the model
    
    predicted_long = model_long.predict(long_features)[0]
    test_data.at[idx, 'longitude'] = predicted_long  # Update the current row's longitude with the predicted value

    # Check if the next row exists and has the same vesselId
    if idx + 1 < test_data_length and test_data.iloc[idx + 1]['vesselId'] == row['vesselId']:
        # Only update the next row's previous_lat and previous_long if the vesselId is the same
        test_data.at[idx + 1, 'previous_lat'] = predicted_lat
        test_data.at[idx + 1, 'previous_long'] = predicted_long
        
        # Check if the current row's sog, heading, and cog should be passed on
        # Only overwrite if they're currently set to zero in the next row
        if test_data.at[idx + 1, 'previous_sog'] == 0.0 and test_data.at[idx + 1, 'previous_heading'] == 0.0 and test_data.at[idx + 1, 'previous_cog'] == 0.0:
            test_data.at[idx + 1, 'previous_sog'] = row['previous_sog']
            test_data.at[idx + 1, 'previous_heading'] = row['previous_heading']
            test_data.at[idx + 1, 'previous_cog'] = row['previous_cog']

    if idx == 5:  # Print after processing a few rows for debugging
        print(test_data.head())
        break
    

# Extract the relevant columns (ID, latitude, and longitude) for the submission
submission = test_data[['ID', 'longitude', 'latitude']]

# Sort the submission by ID to match the original test set order
submission = submission.sort_values(by='ID').reset_index(drop=True)

# Get the current date and time
current_time = datetime.datetime.now()

# Format the current date and time as a string (e.g., "2024-10-22_14-45")
time_str = current_time.strftime("%m-%d_%H-%M")
filename = f'data/final_preds/final_predictions_{time_str}.csv'

# Output to CSV with the timestamp in the filename
submission.to_csv(filename, index=False)

print(f"Row-by-row predictions for each vessel completed and saved as {filename}.")




    ID                  vesselId  time_diff  latitude  longitude  \
0    4  61e9f38eb937134a3c4bfd8d        0.0       0.0        0.0   
1  201  61e9f38eb937134a3c4bfd8d     1620.0       0.0        0.0   
2  583  61e9f38eb937134a3c4bfd8d     3241.0       0.0        0.0   
3  701  61e9f38eb937134a3c4bfd8d     1078.0       0.0        0.0   
4  829  61e9f38eb937134a3c4bfd8d      723.0       0.0        0.0   

   previous_lat  previous_long  previous_sog  previous_heading  previous_cog  
0       48.5332       -6.12003          15.0              58.0          57.8  
1        0.0000        0.00000           0.0               0.0           0.0  
2        0.0000        0.00000           0.0               0.0           0.0  
3        0.0000        0.00000           0.0               0.0           0.0  
4        0.0000        0.00000           0.0               0.0           0.0  


ValueError: Feature shape mismatch, expected: 9, got 6

In [196]:
import pandas as pd
import datetime

# Assuming the XGBoost models are trained and test_data has already been preprocessed

# Load the preprocessed test data
test_data = pd.read_csv('data/merged_test_and_train_data.csv')
test_data_length = len(test_data)

# Iterate through each row of the dataset (already sorted by vesselId)
for idx in range(test_data_length):
    # Extract the row from the updated test_data at each iteration
    row = test_data.iloc[idx]
    
    # 1. Predict COG using the current row's features
    cog_features = test_data.loc[idx, ['time_diff', 'previous_long', 'previous_lat', 'previous_sog', 'previous_heading', 'previous_cog']].values.reshape(1, -1)
    predicted_cog = model_cog.predict(cog_features)[0]
    test_data.at[idx, 'cog'] = predicted_cog  # Update COG with predicted value
    
    # 2. Predict SOG using the dynamically updated COG value
    sog_features = test_data.loc[idx, ['time_diff', 'previous_long', 'previous_lat', 'previous_sog', 'previous_heading', 'previous_cog', 'cog']].values.reshape(1, -1)
    predicted_sog = model_sog.predict(sog_features)[0]
    test_data.at[idx, 'sog'] = predicted_sog  # Update SOG with predicted value
    
    # 3. Predict heading using the dynamically updated COG and SOG
    heading_features = test_data.loc[idx, ['time_diff', 'previous_long', 'previous_lat', 'previous_sog', 'previous_heading', 'previous_cog', 'cog', 'sog']].values.reshape(1, -1)
    predicted_heading = model_heading.predict(heading_features)[0]
    test_data.at[idx, 'heading'] = predicted_heading  # Update heading with predicted value
    
    # 4. Predict latitude using the dynamically updated COG, SOG, and heading
    lat_features = test_data.loc[idx, ['time_diff', 'previous_long', 'previous_lat', 'previous_sog', 'previous_heading', 'previous_cog', 'cog', 'sog', 'heading']].values.reshape(1, -1)
    predicted_lat = model_lat.predict(lat_features)[0]
    test_data.at[idx, 'latitude'] = predicted_lat  # Update latitude with predicted value
    
    # 5. Predict longitude using the dynamically updated latitude, COG, SOG, and heading
    long_features = test_data.loc[idx, ['time_diff', 'previous_long', 'previous_lat', 'previous_sog', 'previous_heading', 'previous_cog', 'cog', 'sog', 'heading', 'latitude']].values.reshape(1, -1)
    predicted_long = model_long.predict(long_features)[0]
    test_data.at[idx, 'longitude'] = predicted_long  # Update longitude with predicted value

    # Check if the next row exists and has the same vesselId
    if idx + 1 < test_data_length and test_data.iloc[idx + 1]['vesselId'] == row['vesselId']:
        # Update the next row's previous values (previous_lat, previous_long, previous_sog, etc.) only if the vesselId is the same
        test_data.at[idx + 1, 'previous_lat'] = predicted_lat
        test_data.at[idx + 1, 'previous_long'] = predicted_long
        test_data.at[idx + 1, 'previous_sog'] = predicted_sog
        test_data.at[idx + 1, 'previous_heading'] = predicted_heading
        test_data.at[idx + 1, 'previous_cog'] = predicted_cog
# Extract the relevant columns (ID, latitude, and longitude) for the submission
submission = test_data[['ID', 'longitude', 'latitude']]

# Sort the submission by ID to match the original test set order
submission = submission.sort_values(by='ID').reset_index(drop=True)

# Get the current date and time
current_time = datetime.datetime.now()

# Format the current date and time as a string (e.g., "2024-10-22_14-45")
time_str = current_time.strftime("%m-%d_%H-%M")
filename = f'data/final_preds/final_predictions_{time_str}.csv'

# Output to CSV with the timestamp in the filename
submission.to_csv(filename, index=False)

print(f"Row-by-row predictions for each vessel completed and saved as {filename}.")



Row-by-row predictions for each vessel completed and saved as data/final_preds/final_predictions_10-23_17-58.csv.
