In [None]:
import numpy as np
import seaborn as sns
from copy import deepcopy
import os
from pathlib import Path
from tqdm.auto import tqdm

In [None]:
# Dataset location. Set the GFOS_NPZ_ROOT environment variable to point at another
# checkout; the fallback is the path this notebook was originally written against.
root = Path(os.environ.get("GFOS_NPZ_ROOT", "/home/edu/code/google_fast_or_slow/data/npz_all/npz"))
# Sub-directory of `root` holding the collection to process.
collection = "layout/xla"
# Sub-directory of the collection that contains the train/valid/test split folders.
ctype = "default"

In [None]:
def prune_graph(data):
    """Restrict a graph to the configurable nodes and their direct neighbours.

    Every edge that touches at least one id in ``data["node_config_ids"]`` is
    kept. The kept nodes are the endpoints of those edges plus the configurable
    nodes themselves, renumbered ``0..k-1`` in ascending order of original id.

    Args:
        data: mapping with ``node_feat`` and ``node_opcode`` (indexed by node on
            axis 0), ``edge_index`` (one row per edge, holding node ids) and
            ``node_config_ids``. It is not modified.

    Returns:
        dict: a deep copy of ``data`` in which those four entries are replaced
        by their pruned, renumbered versions. ``edge_index`` and
        ``node_config_ids`` are integer arrays.
    """
    print("Pruning graph...")
    new_data = deepcopy(dict(data))
    edge_index = np.asarray(data["edge_index"], dtype=np.int64)
    node_config_ids = np.asarray(data["node_config_ids"], dtype=np.int64)
    num_nodes = data["node_feat"].shape[0]
    print("Original graph has {} nodes and {} edges".format(num_nodes, edge_index.shape[0]))

    # Keep an edge when either endpoint is a configurable node.
    touches_config = np.isin(edge_index, node_config_ids).any(1)
    kept_edges = edge_index[touches_config]

    # Configurable nodes are always kept, even when they have no edge at all;
    # otherwise their renumbered id would stay at the -1 sentinel.
    kept_node_ids = np.unique(np.concatenate([kept_edges.ravel(), node_config_ids.ravel()]))

    # old id -> new id. Must be an integer array: the remapped edges and config
    # ids are used as indices downstream.
    lookup = np.full(num_nodes, -1, dtype=np.int64)
    lookup[kept_node_ids] = np.arange(kept_node_ids.shape[0])

    new_data["node_feat"] = data["node_feat"][kept_node_ids, :]
    new_data["node_opcode"] = data["node_opcode"][kept_node_ids]
    new_data["edge_index"] = lookup[kept_edges]
    new_data["node_config_ids"] = lookup[node_config_ids]
    print("New graph has {} nodes and {} edges".format(new_data["node_feat"].shape[0], new_data["edge_index"].shape[0]))
    return new_data


In [None]:
def remove_dupplicated_node_configs(data):
    """Drop configurations whose ``node_config_feat`` entry repeats an earlier one.

    The first occurrence of each distinct configuration is kept, in its
    original order, and the matching entries of ``config_runtime`` are dropped
    alongside the duplicates.

    Args:
        data: mutable mapping with ``node_config_feat`` (one configuration per
            index on axis 0) and ``config_runtime`` (one value per
            configuration). Both entries are replaced in place.

    Returns:
        The same ``data`` object, after the update.
    """
    config_feat = data["node_config_feat"]
    num_configs = config_feat.shape[0]
    if num_configs < 2:
        # Nothing can be a duplicate; also avoids reshaping an empty array.
        print("Removing {} duplicated node configs out of {}".format(0, num_configs))
        return data

    # Compare whole flattened rows exactly. This avoids both an n x n pairwise
    # matrix and any chance of two different rows sharing a hashed value.
    flat_feat = config_feat.reshape(num_configs, -1)
    _, first_ids = np.unique(flat_feat, axis=0, return_index=True)
    # np.unique orders by row value; sort the indices to restore file order.
    keep_ids = np.sort(first_ids)

    print("Removing {} duplicated node configs out of {}".format(num_configs - keep_ids.shape[0], num_configs))
    data["config_runtime"] = data["config_runtime"][keep_ids]
    data["node_config_feat"] = config_feat[keep_ids]
    return data

In [None]:
# Write a processed copy of every split under <root>/<collection>_pruned/<ctype>/<split>.
# NOTE(review): the output directory is suffixed "_pruned", but the prune_graph call
# below is commented out, so the only transformation applied is the de-duplication
# of the train split.
dst_dir = root / f"{collection}_pruned" / ctype
for split in ["train", "valid", "test"]:
    print("Loading {} data...".format(split))
    split_src_dir = root / collection / ctype / split
    split_dst_dir = dst_dir / split
    split_dst_dir.mkdir(parents=True, exist_ok=True)

    for npz_path in tqdm(list(split_src_dir.glob("*.npz"))):
        print(npz_path)
        # dict(...) reads every array of the archive into a plain dict that the
        # helper below can assign into.
        # NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which
        # can execute arbitrary code; only use it on trusted files.
        data = dict(np.load(str(npz_path), allow_pickle=True))
        # data = prune_graph(data)
        if split == "train":
            # Duplicated configs are dropped from the train split only; valid and
            # test are re-saved as loaded.
            data = remove_dupplicated_node_configs(data)
        np.savez(split_dst_dir / npz_path.name, **data)

In [None]:
split_src_dir

In [None]:
# Load one training graph for interactive inspection. The location is built from the
# configuration variables so that it follows `root` instead of repeating an absolute path.
path = str(root / collection / ctype / "train" / "ncf.2x2.fp32.npz")
# np.load on an .npz archive returns an NpzFile, not a dict; wrap it in dict(...)
# before handing it to code that assigns keys.
data = np.load(path)

In [None]:
list(data.keys())

In [None]:
data["node_config_feat"].shape

In [None]:
# Difference between two hard-coded counts. 100040 is also used below as the number of
# configs; presumably 99668 is the count left after de-duplication — TODO confirm.
100040 - 99668

In [None]:
# Scratch, step-by-step version of the duplicate search, with numbered prints to show
# which step is reached. Each config is collapsed to one number (a weighted sum of its
# flattened features) and configs with equal numbers are treated as duplicates.
# NOTE(review): with the fixed weights 1..F two different rows can share a sum
# (e.g. [2, 0] and [0, 1] both give 2), so matches found here are only candidates.
reshaped_config_feat = data["node_config_feat"].reshape(data["node_config_feat"].shape[0], -1)
print(1)
positional_array = (np.arange(reshaped_config_feat.shape[1]) + 1) # weight each value by its position so that permutations of a row do not trivially collide
reshaped_values = (reshaped_config_feat * positional_array[None, :]).sum(1)
print(2)
# NOTE(review): the next line allocates an n x n boolean matrix, n being the number of configs.
is_equal_matrix = reshaped_values[None, :] == reshaped_values[:, None] # quadratic matrix of all pairwise equalities
print(3)
is_equal_matrix = np.tril(is_equal_matrix, -1) # keep only the strictly lower triangle: row i stays True only where it equals an earlier index
print(4)
# Rows with at least one True are the indices that repeat an earlier config.
to_remove_ids = np.unique(np.where(is_equal_matrix)[0])
print(5)
print("Removing {} duplicated node configs".format(to_remove_ids.shape[0]))

In [None]:
# NOTE(review): `data` here is the object returned by np.load above, not a dict. The
# function assigns data["config_runtime"] = ..., which presumably fails on an NpzFile;
# the dict(data) call in a later cell looks like the working form — confirm.
remove_dupplicated_node_configs(data)

In [None]:
# Size in GB of a 100040 x 100040 matrix at 4 bytes per element
# (presumably an estimate for the pairwise equality matrix — confirm).
(100040 ** 2) * 4 / 1e9

In [None]:
remove_dupplicated_node_configs(dict(data))

In [None]:
data["node_config_feat"].shape

In [None]:
# NOTE(review): `x` is first assigned in a later cell (the small example array), so this
# raises NameError when the notebook is run top to bottom on a fresh kernel.
np.delete(x, to_remove_ids)


In [None]:
is_equal_matrix.sum(1)

In [None]:
# Toy input for the duplicate search: several values repeat at later positions.
x = np.array([1, 1, 2, 3, 3, 4, 7, 1, 2])
# Pairwise equality matrix: m[i, j] is True when x[i] == x[j]; the diagonal is always True.
m = x[None, :] == x[:, None]
# Expected indices to remove (later occurrences of an earlier value): 1, 4, 7, 8

In [None]:
m

In [None]:
np.tril(m, -1)

In [None]:

m

In [None]:
# Flag every index whose value equals an earlier element. Only the strictly lower
# triangle is used: the full matrix `m` has a True diagonal, which would mark every
# index for removal.
to_remove_ids = np.unique(np.where(np.tril(m, -1))[0])

In [None]:
np.delete(x, to_remove_ids)

In [None]:
# Number of entries, in billions, of a 5304 x 5304 matrix
# (presumably the pairwise matrix for a graph with 5304 configs — confirm).
(5304 ** 2) / 1e9

In [None]:
new_data = prune_graph(data)

In [None]:
data["edge_index"].shape#[3, :]

In [None]:

# Shape of the pruned node features. `in_node_feats` only exists as a local inside
# prune_graph, so the value is read from the dict that function returned.
new_data["node_feat"].shape

In [None]:
data["node_feat"].shape

In [None]:
# Distinct values, with their counts, found at index 98 of the last axis of node_config_feat.
# NOTE(review): other cells compare the sum over that axis with -18, which suggests it
# may be only 18 wide; if so, index 98 is out of range — confirm.
np.unique(data["node_config_feat"][:, :, 98], return_counts=True)

In [None]:
{k: data[k].shape for k in data.keys()}

In [None]:
# For every config, count the entries along axis 1 whose sum over axis 2 differs from
# -18, then tabulate how many configs have each count.
# presumably -18 means 18 features all equal to -1, i.e. an unset node config — TODO confirm
np.unique((data["node_config_feat"].sum(2) != -18).sum(1), return_counts=True)

In [None]:
sns.histplot((data["node_config_feat"].sum(2) != -18).sum(1))

In [None]:
((data["node_config_feat"][0] != data["node_config_feat"][3]).sum(1) > 0).sum()

In [None]:
# Size in GB of a 100040 x 100040 matrix at 1 byte per element
# (presumably the same pairwise matrix stored as bool/uint8 — confirm).
100040 * 100040 * 1 / 1e9

In [None]:
from tqdm.auto import tqdm

In [None]:
# Shape of a one-config slice that keeps the leading axis.
# NOTE(review): `i` is only bound by the tqdm loop in a later cell, so this fails on a
# fresh top-to-bottom run.
data["node_config_feat"][i:i+1].shape

In [None]:
# Compare config `i` against every config: an entry along axis 1 counts as different
# when at least one value along axis 2 differs, and the result holds that count per config.
# NOTE(review): `i` is only bound by the tqdm loop in a later cell.
((data["node_config_feat"][i:i+1] != data["node_config_feat"]).sum(2) > 0).sum(1)

In [None]:
# Pairwise comparison of the first `max_size` configs against all configs, in batches of
# `bz` rows. result[r, c] is the number of entries along axis 1 of node_config_feat whose
# values differ between config r and config c.
config_feat = data["node_config_feat"]
max_size = min(100, config_feat.shape[0])
bz = 100
# The width comes from the data rather than a hard-coded count, and int32 is used
# because a uint8 counter silently wraps once more than 255 entries differ.
result = np.zeros((max_size, config_feat.shape[0]), dtype=np.int32)
for i in tqdm(range(0, max_size, bz)):
    batch = config_feat[i:min(i + bz, max_size), None]
    result[i:i+bz] = (batch != config_feat[None, ...]).any(-1).sum(-1)

In [None]:
result.max(1)

In [None]:
(result == 1).sum(1)

In [None]:
(result == 0).sum(1)

In [None]:
result.min(1)

In [None]:
np.median(result, 1)

In [None]:
# Per-row percentile of the difference counts in `result`.
# NOTE(review): np.percentile takes q on a 0-100 scale, so 0.1 is the 0.1th percentile
# (close to the row minimum), not the 10th — confirm which one was intended.
np.percentile(result, 0.1, 1)

In [None]:
# Number of values in config 0 that differ from entry 1 of config 0: the right-hand side
# is a single entry of the same config, broadcast against every entry of that config
# (assuming node_config_feat is 3-D, as the axis-2 reductions in other cells imply).
(data["node_config_feat"][0] != data["node_config_feat"][0][1]).sum()

In [None]:
np.unique(data["node_feat"][:, 22])