In [24]:
import pandas as pd
import numpy as np 
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from validation import prepare_output, calculate_score, calculate_distance
from pre_processing import preprocess_AIS_test
import datetime
from geopy.distance import geodesic

# Split Data into Training and Validation Sets


In [23]:
def split_train_test_data(timecutoff: bool, n_days: int, test_size: float = 0.05, random_state=None):
    """
    Loads 'data/features.csv' and 'data/labels.csv' and splits them into training
    and validation sets.

    If timecutoff is True, every row whose 'time' lies within the last 'n_days'
    before the most recent timestamp goes to the validation set, and the 'time'
    column is dropped from both feature frames. Otherwise a random split is made
    and 'n_days' is ignored.

    Both modes return the label columns exactly as they are stored in the labels
    file, so code that reads y_train / y_val by column name behaves the same
    whichever split mode was used.

    :param timecutoff: Whether to split based on the time.
    :param n_days: Number of days to include in the validation set if timecutoff is True.
    :param test_size: Fraction of rows held out for validation in the random split.
    :param random_state: Seed forwarded to train_test_split; None leaves the split unseeded.
    :return: X_train, X_val, y_train, y_val
    :raises ValueError: If the feature and label files do not have the same number of rows.
    """
    features = pd.read_csv('data/features.csv')
    labels = pd.read_csv('data/labels.csv')

    # Rows are paired by position, so a length mismatch would silently pair
    # features with the wrong labels (or with NaN).
    if len(features) != len(labels):
        raise ValueError(
            f"features ({len(features)} rows) and labels ({len(labels)} rows) must have the same length"
        )

    if timecutoff:
        # Parse the timestamps once; they are only needed to build the masks.
        timestamps = pd.to_datetime(features['time'])

        # Everything from 'n_days' before the latest observation onwards is validation.
        cutoff_time = timestamps.max() - pd.Timedelta(days=n_days)
        in_training = timestamps < cutoff_time
        in_validation = timestamps >= cutoff_time

        # Rows with an unparseable time (NaT) satisfy neither mask and are left out of both sets.
        X_train = features.loc[in_training].drop(columns=['time'])
        X_val = features.loc[in_validation].drop(columns=['time'])

        # Keep every label column under its original name, matching the random split.
        y_train = labels.loc[in_training]
        y_val = labels.loc[in_validation]
    else:
        X_train, X_val, y_train, y_val = train_test_split(
            features, labels, test_size=test_size, random_state=random_state
        )

    # Display split information
    print("y_train", y_train)
    print("y_val", y_val)

    return X_train, X_val, y_train, y_val

# Build the train/validation frames with a random split
# (n_days is only consulted by the time-based split).
X_train, X_val, y_train, y_val = split_train_test_data(timecutoff=False, n_days=5)



y_train          latitude  longitude       cog  sog   heading
736594   53.63668    6.17281 -0.783693  0.2  0.984808
352380  -34.47433  151.21628 -0.573576  1.8  0.866025
415773   44.62101  -63.51250 -0.140901  0.0 -0.754710
278075   35.45768  139.67806  0.356738  0.0 -0.874620
658870   34.98536  128.80723 -0.221548  9.1 -0.325568
...           ...        ...       ...  ...       ...
503744   59.88389   10.76660 -0.463296  0.0 -0.342020
895702   39.20896    9.10842  0.241922  0.0  0.515038
458537   48.34437 -123.34129  0.282341  0.0 -0.422618
495015   47.33685 -122.47502 -0.923880  0.0  0.891007
1019653  22.03829  114.10022  0.601815  0.9  0.997564

[1002751 rows x 5 columns]
y_val         latitude  longitude       cog   sog   heading
206566  38.77746  -75.01903 -0.913545   9.3 -0.945519
361375  53.57617    8.55724 -0.964557   0.0 -0.891007
690951  49.78771   -4.31263 -0.987414  16.6 -0.981627
186094 -38.95484  147.28345 -0.951057  17.7 -0.961262
291063  50.56753   -0.83313 -0.500000   

## Encode the vesselIds


# Initialize and Train the XGBoost Model


In [25]:
# All five regressors share one hyperparameter set, so it is written once.
xgb_params = dict(
    n_estimators=300,
    max_depth=5,
    learning_rate=0.12,
    subsample=0.8,
    colsample_bytree=0.8,
    gamma=0.5,
    min_child_weight=3,
    objective='reg:squarederror',
)

model_lat = XGBRegressor(**xgb_params)
model_long = XGBRegressor(**xgb_params)
model_heading = XGBRegressor(**xgb_params)
model_sog = XGBRegressor(**xgb_params)
model_cog = XGBRegressor(**xgb_params)

# The models form a chain: each one is trained on the base features plus the
# targets of every model before it (cog -> sog -> heading -> latitude -> longitude).
training_inputs = ['time_diff', 'previous_long', 'previous_lat', 'previous_sog', 'previous_heading', 'previous_cog']
training_chain = [
    (model_cog, 'cog'),
    (model_sog, 'sog'),
    (model_heading, 'heading'),
    (model_lat, 'latitude'),
    (model_long, 'longitude'),
]

for estimator, target_column in training_chain:
    estimator.fit(X_train[training_inputs], y_train[target_column])
    # The target just fitted becomes an input feature for the remaining models.
    training_inputs = training_inputs + [target_column]


# Evaluate on the Validation Set


### Step 1: Prediction on Validation Data


In [26]:
# Predict COG -> SOG -> heading -> latitude -> longitude for the validation rows.
#
# X_val keeps the row labels it had before the split, so positional access
# (.iloc[i]) and label access (.at[i] / .loc[i]) address different rows.
# Work on a copy with a fresh 0..n-1 index so both agree, and attach the
# original index again only when the prediction frame is built.
validation_data = X_val.reset_index(drop=True)

chain_base_features = ['time_diff', 'previous_long', 'previous_lat', 'previous_sog', 'previous_heading', 'previous_cog']
prediction_chain = [
    (model_cog, 'cog'),
    (model_sog, 'sog'),
    (model_heading, 'heading'),
    (model_lat, 'latitude'),
    (model_long, 'longitude'),
]
# Predicted column -> the "previous_*" feature it feeds on the following row.
carry_over = {
    'cog': 'previous_cog',
    'sog': 'previous_sog',
    'heading': 'previous_heading',
    'latitude': 'previous_lat',
    'longitude': 'previous_long',
}
predicted_columns = list(carry_over)
previous_columns = list(carry_over.values())

# Every column that receives predictions must exist and hold floats.
for column in predicted_columns:
    if column not in validation_data.columns:
        validation_data[column] = np.nan
for column in predicted_columns + previous_columns:
    validation_data[column] = validation_data[column].astype('float64')

# A "run" is a stretch of adjacent rows with the same vesselId; step_in_run is
# a row's position inside its run. A row depends only on the row directly
# before it in the same run, so all rows with the same step can be predicted
# in one batch instead of one model call per row.
vessel_ids = validation_data['vesselId']
run_id = (vessel_ids != vessel_ids.shift()).cumsum()
step_in_run = run_id.groupby(run_id).cumcount().to_numpy()
n_steps = int(step_in_run.max()) + 1 if len(step_in_run) else 0

for step in range(n_steps):
    rows = np.flatnonzero(step_in_run == step)

    # Each model sees the base features plus everything predicted before it.
    chain_inputs = list(chain_base_features)
    for estimator, target_column in prediction_chain:
        batch = validation_data.loc[rows, chain_inputs].to_numpy(dtype='float64')
        validation_data.loc[rows, target_column] = estimator.predict(batch)
        chain_inputs.append(target_column)

    # Hand the predictions to the next row only when it belongs to the same run.
    # NOTE(review): after a random split adjacent rows are not necessarily
    # consecutive in time; this mirrors the test-time procedure - confirm it is intended.
    followers = rows + 1
    followers = followers[followers < len(validation_data)]
    followers = followers[step_in_run[followers] == step + 1]
    if len(followers):
        validation_data.loc[followers, previous_columns] = (
            validation_data.loc[followers - 1, predicted_columns].to_numpy()
        )

# Collect the predicted coordinates positionally, labelled with X_val's original index.
val_preds_combined = pd.DataFrame(
    {
        'latitude_predicted': validation_data['latitude'].to_numpy(),
        'longitude_predicted': validation_data['longitude'].to_numpy(),
    },
    index=X_val.index,
)

# Prepare the validation output with predictions using the prepare_output function
validation_output = prepare_output(val_preds_combined[['longitude_predicted', 'latitude_predicted']].to_numpy(), y_val)

print(validation_output.head())

# Calculate the geodesic distance-based score using the calculate_score function
validation_score = calculate_score(validation_output)

print(f"Validation Score (Weighted Geodesic Distance in km): {validation_score}")


KeyboardInterrupt: 

### Step 2: Prepare the Output for Validation


In [None]:
# Reassigns validation_data to the result of the project-local preprocess_AIS_test helper.
# NOTE(review): the saved output of this cell ends with KeyError: 'ID' - presumably the
# helper expects an 'ID' column that X_val does not carry; confirm against pre_processing
# before re-running.
validation_data = preprocess_AIS_test(X_train, X_val)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_data['time_diff'].fillna(0, inplace=True)


KeyError: 'ID'

### Step 3: Make predictions on the test set


In [27]:
# Load the preprocessed test data (read_csv gives a default 0..n-1 index, so
# row labels and row positions coincide below).
test_data = pd.read_csv('data/merged_test_and_train_data.csv')
test_data_length = len(test_data)

test_base_features = ['time_diff', 'previous_long', 'previous_lat', 'previous_sog', 'previous_heading', 'previous_cog']
test_chain = [
    (model_cog, 'cog'),
    (model_sog, 'sog'),
    (model_heading, 'heading'),
    (model_lat, 'latitude'),
    (model_long, 'longitude'),
]
# Predicted column -> the "previous_*" feature it feeds on the following row.
test_carry_over = {
    'cog': 'previous_cog',
    'sog': 'previous_sog',
    'heading': 'previous_heading',
    'latitude': 'previous_lat',
    'longitude': 'previous_long',
}
test_predicted_columns = list(test_carry_over)
test_previous_columns = list(test_carry_over.values())

# Every column that receives predictions must exist and hold floats.
for column in test_predicted_columns:
    if column not in test_data.columns:
        test_data[column] = np.nan
for column in test_predicted_columns + test_previous_columns:
    test_data[column] = test_data[column].astype('float64')

# A "run" is a stretch of adjacent rows with the same vesselId (the file is
# already sorted by vesselId); test_step is a row's position inside its run.
# A row depends only on the row directly before it in the same run, so all
# rows with the same step are predicted in one batch instead of one model
# call per row.
test_vessel_ids = test_data['vesselId']
test_run_id = (test_vessel_ids != test_vessel_ids.shift()).cumsum()
test_step = test_run_id.groupby(test_run_id).cumcount().to_numpy()
test_n_steps = int(test_step.max()) + 1 if test_data_length else 0

for step in range(test_n_steps):
    rows = np.flatnonzero(test_step == step)

    # Each model sees the base features plus everything predicted before it:
    # COG, then SOG, heading, latitude and finally longitude.
    chain_inputs = list(test_base_features)
    for estimator, target_column in test_chain:
        batch = test_data.loc[rows, chain_inputs].to_numpy(dtype='float64')
        test_data.loc[rows, target_column] = estimator.predict(batch)
        chain_inputs.append(target_column)

    # Update the next row's previous_* values only if it has the same vesselId.
    followers = rows + 1
    followers = followers[followers < test_data_length]
    followers = followers[test_step[followers] == step + 1]
    if len(followers):
        test_data.loc[followers, test_previous_columns] = (
            test_data.loc[followers - 1, test_predicted_columns].to_numpy()
        )

# Extract the relevant columns (ID, latitude, and longitude) for the submission
submission = test_data[['ID', 'longitude', 'latitude']]

# Sort the submission by ID to match the original test set order
submission = submission.sort_values(by='ID').reset_index(drop=True)
submission = submission.rename(columns={
    'longitude': 'longitude_predicted',
    'latitude': 'latitude_predicted'
})

# Get the current date and time
current_time = datetime.datetime.now()

# Format the current date and time as month-day_hour-minute (e.g., "10-22_14-45");
# the year is not part of the name.
time_str = current_time.strftime("%m-%d_%H-%M")
filename = f'data/final_preds/final_predictions_{time_str}.csv'

# Output to CSV with the timestamp in the filename
submission.to_csv(filename, index=False)

print(f"Row-by-row predictions for each vessel completed and saved as {filename}.")



Row-by-row predictions for each vessel completed and saved as data/final_preds/final_predictions_10-24_14-41.csv.
