# Data Clustering

This is how we obtain the `distribshift` data split (see `./data_patch/split.json`) used in our *distribution shift* experiments.

The following code is for demonstration only. Some modifications (e.g. adapting the data paths and the `[EXPERIMENT_NAME]` placeholder) are needed to run it.

In [1]:
import json
import pickle
import random
from typing import Counter

import matplotlib.pyplot as plt
import numpy as np
import torch
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.mixture import GaussianMixture

# Seed every random number generator this notebook touches (torch CPU and
# CUDA, NumPy, and the stdlib `random` module) with the same fixed value.
torch.manual_seed(0)
torch.cuda.manual_seed(0)
torch.cuda.manual_seed_all(0)
np.random.seed(0)
random.seed(0)
# Ask cuDNN for deterministic behaviour and turn off its benchmark mode.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [2]:
def gen_vector_keys(data_dict):
    """Assemble the three key vectors that describe one sample.

    Args:
        data_dict: mapping that provides the entries ``"ego_states"``,
            ``"goal"``, ``"ego_hist_traj"`` and ``"ego_hist_traj_diff"``.
            ``"ego_states"`` is read at positions 0-4, 7 and 8;
            ``"ego_hist_traj_diff"`` is indexed as a 2-D array (its last two
            rows, columns 0 and 1); ``"ego_hist_traj"`` must offer
            ``.flatten()``.

    Returns:
        A list of three items:
        ``[state_vector, data_dict["goal"], flattened ego_hist_traj]`` where
        ``state_vector`` is a 9-element NumPy array ordered as
        ``[vx, vy, v_yaw, ax, ay, cx, cy, vhead, steering]``.
    """
    ego = data_dict["ego_states"]
    hist_diff = data_dict["ego_hist_traj_diff"]

    # Difference between the last two history-diff rows, per axis.
    accel_x = hist_diff[-1, 0] - hist_diff[-2, 0]
    accel_y = hist_diff[-1, 1] - hist_diff[-2, 1]

    state_vector = np.array(
        [
            ego[0] * 0.5,  # vx (scaled by 0.5)
            ego[1] * 0.5,  # vy (scaled by 0.5)
            ego[4],  # v_yaw
            accel_x,  # ax
            accel_y,  # ay
            ego[2],  # cx
            ego[3],  # cy
            ego[7] * 0.5,  # vhead (scaled by 0.5)
            ego[8],  # steering
        ]
    )

    return [
        state_vector,
        data_dict["goal"],
        data_dict["ego_hist_traj"].flatten(),
    ]

## Clustering

In [None]:
# Re-seed right before sampling so the drawn subset does not depend on how
# much randomness earlier cells consumed.
random.seed(0)

# Read the split file and draw 2000 tokens from its "val" entry.
with open("./data/split.json", "r") as f:
    split_data = json.load(f)
val_tokens = split_data["val"]
token_list = random.sample(val_tokens, 2000)
# Sanity check: the first sampled token must match the expected one, otherwise
# the sample (and everything derived from it) differs from the reference run.
assert token_list[0] == "6a1f2ebe1ef8437c87a5a742362b09b4"
token_list[:5]

In [None]:
# Load the ground-truth trajectories for all tokens, then keep only the
# entries for the sampled tokens.
# NOTE: pickle.load can execute arbitrary code; only use it on trusted files.
with open("./data/metrics/gt_traj.pkl", "rb") as f:
    all_gt_traj = pickle.load(f)

gt_traj = {key: all_gt_traj[key] for key in token_list}
gt_traj["6a1f2ebe1ef8437c87a5a742362b09b4"] # shape: (1, 6, 2)

In [None]:
# Build the per-token feature vectors used for visualisation and clustering.
test_mem_input = []  # will be keys
test_mem_output = []  # will be trajectories
test_mem_joint = []  # will be keys + trajectories

for token in token_list:
    file_path = f"./data/val/{token}.pkl"
    # Use a context manager so each per-token file is closed right away
    # instead of leaking one open handle per sample.
    # NOTE: pickle.load can execute arbitrary code; only use it on trusted files.
    with open(file_path, "rb") as sample_file:
        sample = pickle.load(sample_file)

    # Pass on only the entries that gen_vector_keys reads.
    data_dict = {
        "ego_states": sample["ego_states"],
        "goal": sample["goal"],
        "ego_hist_traj": sample["ego_hist_traj"],
        "ego_hist_traj_diff": sample["ego_hist_traj_diff"],
    }
    key = gen_vector_keys(data_dict)
    # Join the three key vectors into one flat feature vector.
    key = np.concatenate(key, axis=0)
    test_mem_input.append(key)

    # First entry of the ground-truth trajectory, flattened to 1-D.
    traj = gt_traj[token][0].flatten()
    test_mem_output.append(traj)

    concat_input_output = np.concatenate([key, traj])
    test_mem_joint.append(concat_input_output)


print(len(test_mem_input), len(test_mem_output), len(test_mem_joint))

In [None]:
# Project the key vectors to 2-D with t-SNE and scatter-plot the embedding.
# NOTE(review): recent scikit-learn releases rename TSNE's `n_iter` argument
# to `max_iter` - confirm against the installed version.
tsne_input = TSNE(n_components=2, random_state=0, perplexity=30, n_iter=300)
test_mem_input_tsne = tsne_input.fit_transform(np.array(test_mem_input))

plt.figure()
plt.scatter(test_mem_input_tsne[:, 0], test_mem_input_tsne[:, 1], marker='.')
plt.show()

In [None]:
# Project the flattened ground-truth trajectories to 2-D with t-SNE and
# scatter-plot the embedding.
# NOTE(review): recent scikit-learn releases rename TSNE's `n_iter` argument
# to `max_iter` - confirm against the installed version.
tsne_output = TSNE(n_components=2, random_state=0, perplexity=30, n_iter=300)
test_mem_output_tsne = tsne_output.fit_transform(np.array(test_mem_output))

plt.figure()
plt.scatter(test_mem_output_tsne[:, 0], test_mem_output_tsne[:, 1], marker='.')
plt.show()

In [None]:
# Project the joint (key + trajectory) vectors to 2-D with t-SNE and
# scatter-plot the embedding.
# NOTE(review): recent scikit-learn releases rename TSNE's `n_iter` argument
# to `max_iter` - confirm against the installed version.
tsne_concat = TSNE(n_components=2, random_state=0, perplexity=30, n_iter=300)
test_mem_joint_tsne = tsne_concat.fit_transform(np.array(test_mem_joint))

plt.figure()
plt.scatter(test_mem_joint_tsne[:, 0], test_mem_joint_tsne[:, 1], marker='.')
plt.show()

In [None]:
# Cluster the key vectors with a freshly fitted Gaussian mixture model
# (3 components, diagonal covariances) and label every sample.
gmm_input = GaussianMixture(n_components=3, random_state=0, covariance_type="diag")
gmm_input.fit(test_mem_input)
test_mem_input_labels = gmm_input.predict(test_mem_input)

# Draw the t-SNE embedding of the keys, one scatter series (and legend entry)
# per cluster.
plt.figure()
for cluster_id in range(3):
    in_cluster = test_mem_input_labels == cluster_id
    plt.scatter(
        test_mem_input_tsne[in_cluster, 0],
        test_mem_input_tsne[in_cluster, 1],
        marker=".",
        label=f"cluster {cluster_id}",
    )
plt.legend()
plt.show()

# Report how many samples landed in each cluster.
print(Counter(test_mem_input_labels))

In [None]:
# Reorder the sampled tokens so that all cluster-0 tokens come first, then
# cluster 1, then cluster 2; the original order is kept inside each cluster.
new_token_list = [
    token
    for cluster_id in (0, 1, 2)
    for token, assigned in zip(token_list, test_mem_input_labels)
    if assigned == cluster_id
]

print(len(new_token_list))

In [None]:
# Preview the first five tokens of the cluster-ordered list.
new_token_list[:5]

In [17]:
# Write the cluster-ordered token list to a split file, stored under the
# split name "distribshift2".
with open("./data/split_distribution_shift_everything.json", "w") as f:
    json.dump({"distribshift2": new_token_list}, f)

In [16]:
# split first 180 and rest to two different files
# NOTE(review): 180 is a hard-coded cut-off. The split name "cluster0_180mem"
# suggests the first 180 tokens are expected to all belong to cluster 0 -
# confirm against the cluster sizes printed by the GMM cell.
token_list_part1 = new_token_list[:180]
token_list_part2 = new_token_list[180:]

# Each part goes to its own split file under its own split name.
with open("./data/split_part1.json", "w") as f:
    json.dump({"cluster0_180mem": token_list_part1}, f)

with open("./data/split_part2.json", "w") as f:
    json.dump({"cluster01_restmem": token_list_part2}, f)

In [17]:
# Save the cluster-ordered token list as a bare JSON array (no split-name
# wrapper around it).
with open("./data/split_val_distributionshift_joint_01.json", "w") as f:
    json.dump(new_token_list, f)

In [None]:
# Make a reverse lookup (token -> cluster label) with test_mem_input_labels.
# These are the labels predicted by the input GMM and the same ones used to
# order new_token_list. The labels are aligned with token_list, so zip pairs
# each token with its own label; int() converts NumPy integers so that json
# can serialise them.
reverse_lookup = {
    token: int(label) for token, label in zip(token_list, test_mem_input_labels)
}
# save it
with open("./data/reverse_lookup_val_distributionshift_joint_01.json", "w") as f:
    json.dump(reverse_lookup, f, indent=4)
reverse_lookup

## Cluster-wise Evaluation

In [19]:
# Load the token -> cluster-label mapping from its JSON file.
with open("./data/reverse_lookup_val_distributionshift_joint_01.json", "r") as f:
    reverse_lookup = json.load(f)

In [23]:
# Path to the pickled prediction results of the run to evaluate.
# Replace [EXPERIMENT_NAME] with the actual results directory before running.
RESULT_FILE = "./results/[EXPERIMENT_NAME]/prediction_results.pkl"

# NOTE: pickle.load can execute arbitrary code; only use it on trusted files.
with open(RESULT_FILE, "rb") as f:
    results = pickle.load(f)

# Example of inspecting a single entry (presumably keyed by sample token - confirm):
# results["64d019bf33ba4dcb9eca5e5ab2ef967e"]

In [None]:
# Split the results into clusters, put into a dictionary
cluster_results = {0: {}, 1: {}, 2: {}}
for token, result in results.items():
    label = reverse_lookup[token]
    cluster_results[label][token] = result

print(len(cluster_results[0]), len(cluster_results[1]), len(cluster_results[2]))

In [None]:
from agentdriver.evaluation.evaluation import planning_evaluation

class AttributeDict(dict):
    """A dict whose items can also be read, set and deleted as attributes.

    ``d.key`` is equivalent to ``d["key"]``. Accessing or deleting a name that
    is not a key raises ``AttributeError`` rather than ``KeyError``, so the
    standard attribute protocol works: ``hasattr(d, name)`` returns ``False``
    and ``getattr(d, name, default)`` returns the default for missing names.
    """

    def __getattr__(self, name):
        # Only invoked when normal attribute lookup fails, so dict methods
        # such as .items() keep working.
        try:
            return self[name]
        except KeyError:
            raise AttributeError(name) from None

    # Attribute assignment stores the value as a dict item.
    __setattr__ = dict.__setitem__

    def __delattr__(self, name):
        try:
            del self[name]
        except KeyError:
            raise AttributeError(name) from None

# Evaluate each cluster's results on its own. The config is an AttributeDict,
# so its entries are available both as keys and as attributes.
# Other success thresholds previously tried here: 5.0 and 2.5.
for label, cluster_result in cluster_results.items():
    eval_config = AttributeDict(
        method=f"Cluster {label}",
        metric="uniad",
        gt_folder="data/metrics",
        success_threshold=7.5,
    )
    planning_evaluation(cluster_result, eval_config, success_threshold=7.5)
    # Blank line between the reports of consecutive clusters.
    print()