# Map-matching error detection model

In [None]:
import os
import math
import pickle

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tqdm import tqdm, trange

from sklearn.model_selection import train_test_split
from sklearn.metrics import cohen_kappa_score, accuracy_score

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

from utils import calc_eval_metrics, collate_fn

In [None]:
# Root directory of the project; data and model paths below are joined onto it.
proj_dir = "<YOUR_PROJECT_DIRECTORY>"

# Select the compute device used by every later cell.
if torch.cuda.is_available():
    print("GPU is available:", torch.cuda.get_device_name())
    device = torch.device("cuda:0")
else:
    print("GPU is unavailable")
    # Fall back to the CPU: a CUDA device cannot be used when no GPU is present.
    device = torch.device("cpu")

## LSTM-based model

In [None]:
from lstm_error_detect_model import MatchErrorDetectModel

In [None]:
class Trainer():
    """Train a match-error detection model and checkpoint it on validation-kappa gains.

    The model must expose ``train_model(...)`` and ``eval_model(...)``, each
    returning a ``(loss, kappa)`` pair (see ``fit``). The class relies on the
    notebook-level globals ``device`` and ``proj_dir``.

    Args:
        model: Model whose ``parameters()`` are handed to the AdamW optimizer.
        train_loader: Loader passed to ``model.train_model`` every epoch.
        val_loader: Loader passed to ``model.eval_model`` every epoch.
        save_dir: Directory (joined onto ``proj_dir``) where checkpoints are written.
        criterion: Loss module; it is moved to ``device`` on construction.
        n_epochs: Number of epochs run by ``fit``.
        patience: Number of initial epochs during which no checkpoint is saved.
        lr: AdamW learning rate.
        train_batch_size: Forwarded to ``model.train_model``.
        val_batch_size: Forwarded to ``model.eval_model`` as ``eval_batch_size``.
        weight_decay: AdamW weight decay.
    """

    def __init__(
            self, model, train_loader, val_loader, save_dir, criterion=nn.BCEWithLogitsLoss(),
            n_epochs=100, patience=1, lr=1e-3, train_batch_size=32, val_batch_size=256, weight_decay=1e-3
    ):
        self.n_epochs = n_epochs
        self.patience = patience

        self.save_dir = save_dir

        self.train_loader = train_loader
        self.val_loader = val_loader
        self.train_batch_size = train_batch_size
        self.val_batch_size = val_batch_size

        self.criterion = criterion.to(device)
        self.optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
        self.lr = lr

    def fit(self, model, model_save_prefix, max_kappa=None):
        """Run the training loop and save the model whenever validation kappa improves.

        A checkpoint is written only when the epoch index is at least
        ``self.patience`` and validation kappa beats the best saved value by
        more than 0.001.

        Args:
            model: Model to train; it is moved to ``device`` first.
            model_save_prefix: Prefix of every checkpoint file name.
            max_kappa: Best validation kappa so far; ``None`` means no baseline.

        Returns:
            DataFrame with one row per epoch and the columns ``train_loss``,
            ``train_kappa``, ``val_loss``, ``val_kappa`` and ``save_name``
            (the checkpoint file name, or ``"no_save"``). The same frame is
            stored on ``self.loss_log_df``.
        """
        print("Training model...")
        model = model.to(device)

        # Without a baseline, any validation kappa counts as an improvement.
        if max_kappa is None:
            max_kappa = -float('inf')

        loss_log = []
        model_save_name_log = []

        for epoch in range(self.n_epochs):
            train_loss, train_kappa = model.train_model(
                train_loader=self.train_loader, epoch=epoch, train_batch_size=self.train_batch_size,
                optimizer=self.optimizer, criterion=self.criterion
            )
            val_loss, val_kappa = model.eval_model(
                val_loader=self.val_loader, eval_batch_size=self.val_batch_size, criterion=self.criterion
            )

            # Save to log
            loss_log.append([train_loss, train_kappa, val_loss, val_kappa])

            # Compare to the existing best model
            saved_name = "no_save"
            if val_kappa > max_kappa:
                # Performance increase
                perf_incr = val_kappa - max_kappa

                if epoch > self.patience-1 and perf_incr > 0.001:
                    max_kappa = val_kappa
                    model_save_name = model_save_prefix + f"_epoch{epoch+1}_trainkappa{train_kappa:.4f}_valkappa{val_kappa:.4f}.pt"

                    # Save the whole model object (not just its state dict)
                    model_save_path = os.path.join(
                        proj_dir, self.save_dir, model_save_name
                    )
                    torch.save(model, model_save_path)

                    print(f"val_kappa increased to {val_kappa:.4f} at epoch {epoch+1}. Model saved to pt: {model_save_path}")
                    saved_name = model_save_name
                else:
                    # Improvement is too small or still within the initial no-save epochs
                    print(f"val_kappa increased to {val_kappa:.4f} at epoch {epoch+1}. Patience.")

            model_save_name_log.append(saved_name)

        loss_log_arr = np.array(loss_log)
        loss_log_df = pd.DataFrame(
            loss_log_arr, columns=["train_loss", "train_kappa", "val_loss", "val_kappa"]
        )
        loss_log_df["save_name"] = model_save_name_log
        self.loss_log_df = loss_log_df

        return loss_log_df

    def plot_train_log(self, loss_log_df=None):
        """Plot loss (left) and kappa (right) curves per epoch.

        Args:
            loss_log_df: Log frame as returned by ``fit``. Defaults to the
                frame stored by the last ``fit`` call.
        """
        # Identity check: `== None` on a DataFrame compares element-wise and
        # cannot be used as a condition.
        if loss_log_df is None:
            loss_log_df = self.loss_log_df

        f, ax = plt.subplots(1, 2, figsize=(12, 8))
        # Size the x-axis from the frame actually being plotted.
        x = np.arange(len(loss_log_df))

        # Loss curves
        ax[0].plot(x, loss_log_df['train_loss'], color='dodgerblue', label='Train loss')
        ax[0].plot(x, loss_log_df['val_loss'], color='coral', label='Validation loss')

        # Kappa curves
        ax[1].plot(x, loss_log_df['train_kappa'], color='blue', label='Train kappa')
        ax[1].plot(x, loss_log_df['val_kappa'], color='red', label='Validation kappa')

        ax[0].legend(loc="best")
        ax[1].legend(loc="best")

        ax[0].set_xlabel("Epoch")
        ax[0].set_ylabel("Loss")
        ax[1].set_xlabel("Epoch")
        ax[1].set_ylabel("Kappa")

### Pre-training

In [None]:
# Selects which synthetic dataset file is loaded
# (presumably the sampling interval in seconds, cf. the pos_weight table -- confirm).
tint = 5

# Read the pre-split (train, val, test) datasets from one pickle file
pk_name = f"train_val_test_set_stride20_tint{tint}_ntimes10k_aa.pkl"
pk_path = os.path.join(proj_dir, "Data", pk_name)
with open(pk_path, 'rb') as my_file_obj:
    train_set, val_set, test_set = pickle.load(my_file_obj)

print(
    f"Train set size: {len(train_set)}",
    f"Val set size: {len(val_set)}",
    f"Test set size: {len(test_set)}",
    sep="\n",
)

# Batch sizes
train_batch_size = 32
val_batch_size = 1024
test_batch_size = 1024

# Only the training loader shuffles its samples
train_loader = DataLoader(
    train_set, batch_size=train_batch_size, shuffle=True, collate_fn=collate_fn
)
val_loader = DataLoader(
    val_set, batch_size=val_batch_size, collate_fn=collate_fn
)
test_loader = DataLoader(
    test_set, batch_size=test_batch_size, collate_fn=collate_fn
)

| Dataset             | pos_weight |
|---------------------|------------|
| AA synthetic (5 s)  | 3.9606     |
| AA synthetic (10 s) | 4.2149     |
| AA synthetic (30 s) | 5.8076     |
| eVED real (5 s)     | 5.2330     |

In [None]:
# Match error detection model
input_dim = 4
hidden_size = 1024
n_enc_layers = 1
edge_embed_dim = 4
n_dec_layers = 1
dropout = 0.2

error_clf = MatchErrorDetectModel(
    input_dim=input_dim, hidden_size=hidden_size, edge_embed_dim=edge_embed_dim,
    n_enc_layers=n_enc_layers, n_dec_layers=n_dec_layers, dropout=dropout
)

# Trainer
save_dir = "<YOUR_MODEL_SAVE_DIRECTION>"
pos_weight = torch.tensor(3.9606)
trainer = Trainer(
    model=error_clf, train_loader=train_loader, val_loader=val_loader, save_dir=save_dir,
    criterion=nn.BCEWithLogitsLoss(pos_weight=pos_weight), n_epochs=100, patience=1, lr=1e-4,
    train_batch_size=train_batch_size, val_batch_size=val_batch_size, weight_decay=1e-3
)

# Fit the model
model_save_prefix = f"trajedge_lstm_tint{tint}_hidden{hidden_size}_nlayers{n_enc_layers}_dp02"
loss_log_df = trainer.fit(error_clf, model_save_prefix)

# save to pickle
pk_name = f"losslogdf_trajedge_lstm_tint{tint}_hidden{hidden_size}_nlayers{n_enc_layers}_dp02.pkl"
with open(os.path.join(proj_dir, save_dir, pk_name), 'wb') as my_file_obj:
    pickle.dump(loss_log_df, my_file_obj)

# print the best model info
loss_min_idx = loss_log_df['val_kappa'].idxmax()
print("\nBest model:", loss_log_df.loc[loss_min_idx, :])

#### Test

In [None]:
# Load best model
save_dir = "<YOUR_MODEL_SAVE_DIRECTION>"
model_save_name = "<YOUR_BEST_MODEL>"
model_save_path = os.path.join(proj_dir, save_dir, model_save_name)
error_clf = torch.load(model_save_path)

In [None]:
# Selects which synthetic dataset file is loaded
# (presumably the sampling interval in seconds, cf. the pos_weight table -- confirm).
tint = 5

# Read the pre-split (train, val, test) datasets from one pickle file
pk_name = f"train_val_test_set_stride20_tint{tint}_ntimes10k_aa.pkl"
pk_path = os.path.join(proj_dir, "Data", pk_name)
with open(pk_path, 'rb') as my_file_obj:
    train_set, val_set, test_set = pickle.load(my_file_obj)

print(
    f"Train set size: {len(train_set)}",
    f"Val set size: {len(val_set)}",
    f"Test set size: {len(test_set)}",
    sep="\n",
)

# Batch sizes
train_batch_size = 32
val_batch_size = 1024
test_batch_size = 1024

# Only the training loader shuffles its samples
train_loader = DataLoader(
    train_set, batch_size=train_batch_size, shuffle=True, collate_fn=collate_fn
)
val_loader = DataLoader(
    val_set, batch_size=val_batch_size, collate_fn=collate_fn
)
test_loader = DataLoader(
    test_set, batch_size=test_batch_size, collate_fn=collate_fn
)

In [None]:
# Evaluate the model
pos_weight = torch.tensor(3.9606)
val_loss, val_kappa = error_clf.eval_model(test_loader, test_batch_size, criterion=nn.BCEWithLogitsLoss(pos_weight=pos_weight))

### Fine-tuning

Fine-tuning is performed multiple times on various datasets. Below is an example using the real trajectory dataset.

In [None]:
# Load best model
save_dir = "<YOUR_MODEL_SAVE_DIRECTION>"
model_save_name = "<YOUR_BEST_MODEL>"
model_save_path = os.path.join(proj_dir, save_dir, model_save_name)
error_clf = torch.load(model_save_path)

In [None]:
# Load pickle
pk_name = "train_val_test_set_stride2_labeled184.pkl"
with open(
        os.path.join(proj_dir, "Data", pk_name), 'rb'
) as my_file_obj:
    train_set, val_set, test_set = pickle.load(my_file_obj)

print(f"Train set size: {len(train_set)}")
print(f"Val set size: {len(val_set)}")
print(f"Test set size: {len(test_set)}")

# Training datasets
train_batch_size = 32
val_batch_size = 128
test_batch_size = 128

train_loader = DataLoader(train_set, batch_size=train_batch_size, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_set, batch_size=val_batch_size, collate_fn=collate_fn)
test_loader = DataLoader(test_set, batch_size=test_batch_size, collate_fn=collate_fn)

In [None]:
# Only used to build the file names below; the model itself comes from the loaded checkpoint
hidden_size = 1024
n_enc_layers = 1

# Trainer; pos_weight is the table value for the eVED real dataset
save_dir = "<YOUR_MODEL_SAVE_DIRECTION>"
pos_weight = torch.tensor(5.233)
trainer = Trainer(
    model=error_clf,
    train_loader=train_loader,
    val_loader=val_loader,
    save_dir=save_dir,
    criterion=nn.BCEWithLogitsLoss(pos_weight=pos_weight),
    n_epochs=50,
    patience=1,
    lr=1e-4,
    train_batch_size=train_batch_size,
    val_batch_size=val_batch_size,
    weight_decay=1e-3,
)

# Fine-tune; checkpoints are named after this prefix
model_save_prefix = f"syn5real_trajedge_lstm_hidden{hidden_size}_nlayers{n_enc_layers}_dp02"
loss_log_df = trainer.fit(error_clf, model_save_prefix)

# Persist the per-epoch training log next to the checkpoints
pk_name = f"losslogdf_syn5real_trajedge_lstm_hidden{hidden_size}_nlayers{n_enc_layers}_dp02.pkl"
log_path = os.path.join(proj_dir, save_dir, pk_name)
with open(log_path, 'wb') as my_file_obj:
    pickle.dump(loss_log_df, my_file_obj)

# Report the epoch with the highest validation kappa
# (despite its name, loss_min_idx indexes the maximum of val_kappa)
loss_min_idx = loss_log_df['val_kappa'].idxmax()
print("\nBest model:", loss_log_df.loc[loss_min_idx])

## Transformer-based model

In [None]:
from transformer_error_detect_model import MatchErrorDetectModel

In [None]:
class Trainer():
    """Train a match-error detection model and checkpoint it on validation-kappa gains.

    The model must expose ``train_model(...)`` and ``eval_model(...)``, each
    returning a ``(loss, kappa)`` pair (see ``fit``). The class relies on the
    notebook-level globals ``device`` and ``proj_dir``.

    Args:
        model: Model whose ``parameters()`` are handed to the AdamW optimizer.
        train_loader: Loader passed to ``model.train_model`` every epoch.
        val_loader: Loader passed to ``model.eval_model`` every epoch.
        save_dir: Directory (joined onto ``proj_dir``) where checkpoints are written.
        criterion: Loss module; it is moved to ``device`` on construction.
        n_epochs: Number of epochs run by ``fit``.
        patience: Number of initial epochs during which no checkpoint is saved.
        lr: AdamW learning rate.
        train_batch_size: Forwarded to ``model.train_model``.
        val_batch_size: Forwarded to ``model.eval_model`` as ``eval_batch_size``.
        weight_decay: AdamW weight decay.
    """

    def __init__(
            self, model, train_loader, val_loader, save_dir, criterion=nn.BCEWithLogitsLoss(),
            n_epochs=100, patience=1, lr=1e-3, train_batch_size=32, val_batch_size=256, weight_decay=1e-3
    ):
        self.n_epochs = n_epochs
        self.patience = patience

        self.save_dir = save_dir

        self.train_loader = train_loader
        self.val_loader = val_loader
        self.train_batch_size = train_batch_size
        self.val_batch_size = val_batch_size

        self.criterion = criterion.to(device)
        self.optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
        self.lr = lr

    def fit(self, model, model_save_prefix, max_kappa=None):
        """Run the training loop and save the model whenever validation kappa improves.

        A checkpoint is written only when the epoch index is at least
        ``self.patience`` and validation kappa beats the best saved value by
        more than 0.001.

        Args:
            model: Model to train; it is moved to ``device`` first.
            model_save_prefix: Prefix of every checkpoint file name.
            max_kappa: Best validation kappa so far; ``None`` means no baseline.

        Returns:
            DataFrame with one row per epoch and the columns ``train_loss``,
            ``train_kappa``, ``val_loss``, ``val_kappa`` and ``save_name``
            (the checkpoint file name, or ``"no_save"``). The same frame is
            stored on ``self.loss_log_df``.
        """
        print("Training model...")
        model = model.to(device)

        # Without a baseline, any validation kappa counts as an improvement.
        if max_kappa is None:
            max_kappa = -float('inf')

        loss_log = []
        model_save_name_log = []

        for epoch in range(self.n_epochs):
            train_loss, train_kappa = model.train_model(
                train_loader=self.train_loader, epoch=epoch, train_batch_size=self.train_batch_size,
                optimizer=self.optimizer, criterion=self.criterion
            )
            val_loss, val_kappa = model.eval_model(
                val_loader=self.val_loader, eval_batch_size=self.val_batch_size, criterion=self.criterion
            )

            # Save to log
            loss_log.append([train_loss, train_kappa, val_loss, val_kappa])

            # Compare to the existing best model
            saved_name = "no_save"
            if val_kappa > max_kappa:
                # Performance increase
                perf_incr = val_kappa - max_kappa

                if epoch > self.patience-1 and perf_incr > 0.001:
                    max_kappa = val_kappa
                    model_save_name = model_save_prefix + f"_epoch{epoch+1}_trainkappa{train_kappa:.4f}_valkappa{val_kappa:.4f}.pt"

                    # Save the whole model object (not just its state dict)
                    model_save_path = os.path.join(
                        proj_dir, self.save_dir, model_save_name
                    )
                    torch.save(model, model_save_path)

                    print(f"val_kappa increased to {val_kappa:.4f} at epoch {epoch+1}. Model saved to pt: {model_save_path}")
                    saved_name = model_save_name
                else:
                    # Improvement is too small or still within the initial no-save epochs
                    print(f"val_kappa increased to {val_kappa:.4f} at epoch {epoch+1}. Patience.")

            model_save_name_log.append(saved_name)

        loss_log_arr = np.array(loss_log)
        loss_log_df = pd.DataFrame(
            loss_log_arr, columns=["train_loss", "train_kappa", "val_loss", "val_kappa"]
        )
        loss_log_df["save_name"] = model_save_name_log
        self.loss_log_df = loss_log_df

        return loss_log_df

    def plot_train_log(self, loss_log_df=None):
        """Plot loss (left) and kappa (right) curves per epoch.

        Args:
            loss_log_df: Log frame as returned by ``fit``. Defaults to the
                frame stored by the last ``fit`` call.
        """
        # Identity check: `== None` on a DataFrame compares element-wise and
        # cannot be used as a condition.
        if loss_log_df is None:
            loss_log_df = self.loss_log_df

        f, ax = plt.subplots(1, 2, figsize=(12, 8))
        # Size the x-axis from the frame actually being plotted.
        x = np.arange(len(loss_log_df))

        # Loss curves
        ax[0].plot(x, loss_log_df['train_loss'], color='dodgerblue', label='Train loss')
        ax[0].plot(x, loss_log_df['val_loss'], color='coral', label='Validation loss')

        # Kappa curves
        ax[1].plot(x, loss_log_df['train_kappa'], color='blue', label='Train kappa')
        ax[1].plot(x, loss_log_df['val_kappa'], color='red', label='Validation kappa')

        ax[0].legend(loc="best")
        ax[1].legend(loc="best")

        ax[0].set_xlabel("Epoch")
        ax[0].set_ylabel("Loss")
        ax[1].set_xlabel("Epoch")
        ax[1].set_ylabel("Kappa")

In [None]:
# Load pickle
pk_name = "train_val_test_set_stride2_labeled184.pkl"
with open(
        os.path.join(proj_dir, "Data", pk_name), 'rb'
) as my_file_obj:
    train_set, val_set, test_set = pickle.load(my_file_obj)

print(f"Train set size: {len(train_set)}")
print(f"Val set size: {len(val_set)}")
print(f"Test set size: {len(test_set)}")

# Training datasets
train_batch_size = 32
val_batch_size = 1024
test_batch_size = 1024

train_loader = DataLoader(train_set, batch_size=train_batch_size, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_set, batch_size=val_batch_size, collate_fn=collate_fn)
test_loader = DataLoader(test_set, batch_size=test_batch_size, collate_fn=collate_fn)

In [None]:
# Match error detection model
input_dim = 4
n_head = 2
edge_embed_dim = 4
n_enc_layers = 2
n_dec_layers = 2
dropout = 0.2

d_model_ls = [16, 32, 64]
d_hid_ls = [512, 1024, 2048, 4096]

for d_model in d_model_ls:
    for d_hid in d_hid_ls:
        print(f"Now experimenting with d_model: {d_model}, d_hid: {d_hid}")

        error_clf = MatchErrorDetectModel(
            input_dim=input_dim, d_model=d_model, n_head=n_head, d_hid=d_hid, edge_embed_dim=edge_embed_dim,
            n_enc_layers=n_enc_layers, n_dec_layers=n_dec_layers, dropout=dropout
        )

        # Trainer
        save_dir = "<YOUR_MODEL_SAVE_DIRECTION>"
        pos_weight = torch.tensor(5.233)
        trainer = Trainer(
            model=error_clf, train_loader=train_loader, val_loader=val_loader, save_dir=save_dir,
            criterion=nn.BCEWithLogitsLoss(pos_weight=pos_weight), n_epochs=20, patience=1, lr=1e-3,
            train_batch_size=train_batch_size, val_batch_size=val_batch_size, weight_decay=1e-3
        )

        # Fit the model
        model_save_prefix = f"trajedge_trans_dmodel{d_model}_nhead{n_head}_dhid{d_hid}_nlayers{n_enc_layers}_dp02"
        loss_log_df = trainer.fit(error_clf, model_save_prefix)

        # save to pickle
        pk_name = f"losslogdf_trajedge_trans_dmodel{d_model}_nhead{n_head}_dhid{d_hid}_nlayers{n_enc_layers}_dp02.pkl"
        with open(os.path.join(proj_dir, save_dir, pk_name), 'wb') as my_file_obj:
            pickle.dump(loss_log_df, my_file_obj)

        # print the best model info
        loss_min_idx = loss_log_df['val_kappa'].idxmax()
        print("\nBest model:", loss_log_df.loc[loss_min_idx, :])

## Prediction (error detection) using the LSTM-based model

In [None]:
# Load best model
save_dir = "Data"
model_save_name = "syn5real_trajedge_lstm_hidden1024_nlayers1_dp02_epoch47_trainkappa1.0000_valkappa0.9847.pt"
model_save_path = os.path.join(proj_dir, save_dir, model_save_name)
error_clf = torch.load(model_save_path)

### Predictions on the test set

In [None]:
# Load pickle
pk_name = "train_val_test_set_stride2_labeled184.pkl"
with open(
        os.path.join(proj_dir, "Data", pk_name), 'rb'
) as my_file_obj:
    train_set, val_set, test_set = pickle.load(my_file_obj)

print("eVED real")
print(f"Train set size: {len(train_set)}")
print(f"Val set size: {len(val_set)}")
print(f"Test set size: {len(test_set)}")

In [None]:
# Per-sample outputs, in test_set order
probs_ls = []
pred_labels_ls = []

# NOTE(review): assumes error_clf.predict() runs the model in eval mode -- confirm.
for sample_i in trange(len(test_set)):
    # Take the sample's input (element 0) and add a leading batch dimension of size 1
    single_sample = test_set[sample_i]
    src = single_sample[0].unsqueeze(0).to(device)
    # Length of the single sequence in the batch; this tensor stays on the CPU
    src_lengths = torch.tensor([src.shape[1]], dtype=torch.int64)

    with torch.no_grad():
        # Predict without tracking gradients
        probs, pred_labels = error_clf.predict(src=src, src_lengths=src_lengths)

        # Drop the batch dimension after converting to NumPy on the CPU
        probs_arr = probs.cpu().numpy()[0]
        pred_labels_arr = pred_labels.cpu().numpy()[0]

    # Collect the results
    probs_ls.append(probs_arr)
    pred_labels_ls.append(pred_labels_arr)

### Predictions on all samples

In [None]:
# Load pickle
pk_name = "xy_ls_stride2_labeled184.pkl"
with open(os.path.join(proj_dir, "Data", pk_name), 'rb'
) as my_file_obj:
    x_ls, y_ls = pickle.load(my_file_obj)

print(f"x_ls len: {len(x_ls)}, y_ls len: {len(y_ls)}")

In [None]:
# PyTorch dataset
all_sample_set = MatchErrorDataset(x_ls, y_ls)

probs_ls = []
pred_labels_ls = []

for sample_i in trange(len(all_sample_set)):
    # Sample
    single_sample = all_sample_set[sample_i]
    src = torch.unsqueeze(single_sample[0], 0).to(device)
    src_lengths = torch.tensor([src.shape[1]], dtype=torch.int64)

    with torch.no_grad():
        # Prediction
        probs, pred_labels = error_clf.predict(src=src, src_lengths=src_lengths)

        # Convert to np.arr on CPU
        probs_arr = probs.cpu().numpy()[0]
        pred_labels_arr = pred_labels.cpu().numpy()[0]

    # Save to ls
    probs_ls.append(probs_arr)
    pred_labels_ls.append(pred_labels_arr)