In [2]:
import numpy as np
import seaborn as sns
from copy import deepcopy
import os
from pathlib import Path
from tqdm.auto import tqdm
import sys
import copy


def get_obj_mbs(obj, cp=True):
    """Return the size of ``obj`` in MiB as reported by ``sys.getsizeof``.

    Args:
        obj: object to measure.
        cp: when truthy, the measurement is taken on a deep copy of ``obj``
            instead of on ``obj`` itself.

    Returns:
        ``sys.getsizeof`` of the measured object divided by 2**20.
    """
    measured = obj
    if cp:
        measured = copy.deepcopy(obj)
    n_bytes = sys.getsizeof(measured)
    return n_bytes / (1 << 20)

In [3]:
# Directory holding the .npz files, given relative to the working directory.
root = Path("./npz_all/npz")
# Collection sub-directory under `root`; the alternative collection is kept commented out.
# collection = "layout/xla"
collection = "layout/nlp"
# Sub-directory of the collection that the conversion loop reads from and mirrors on output.
ctype = "default"

In [4]:
def prune_graph(data):
    """Keep only the nodes that share an edge with a configurable node and re-index the graph.

    An edge is kept when at least one of its endpoints is listed in
    ``data["node_config_ids"]``. The kept nodes are the endpoints of those
    edges, renumbered ``0..k-1`` in ascending order of their original id.

    Args:
        data: mapping with ``node_feat`` (one row per node), ``node_opcode``
            (one entry per node), ``edge_index`` (2-D, one edge per row) and
            ``node_config_ids``.

    Returns:
        A new dict, deep-copied from ``data``, in which those four entries are
        replaced by their pruned / re-indexed versions. ``edge_index`` and
        ``node_config_ids`` are integer arrays. ``data`` itself is not modified.

    Raises:
        AssertionError: if a configurable node does not appear on any edge.
    """
    new_data = deepcopy(dict(data))
    edge_index = data["edge_index"]
    node_config_ids = data["node_config_ids"]

    # Edges with at least one configurable endpoint, and the nodes they touch.
    in_edge_index = edge_index[np.isin(edge_index, node_config_ids).any(1)]
    in_node_ids = np.unique(in_edge_index)
    assert np.isin(node_config_ids, in_node_ids).all(), "a configurable node has no edge"

    # old node id -> new node id (-1 for dropped nodes). The table is integer
    # typed so that the re-indexed arrays remain usable as indices.
    lookup = np.full(data["node_feat"].shape[0], -1, dtype=np.int64)
    lookup[in_node_ids] = np.arange(in_node_ids.shape[0])

    new_data["node_feat"] = data["node_feat"][in_node_ids, :]
    new_data["node_opcode"] = data["node_opcode"][in_node_ids]
    new_data["edge_index"] = lookup[in_edge_index]
    new_data["node_config_ids"] = lookup[node_config_ids]
    return new_data


In [5]:
def remove_dupplicated_node_configs(data):
    """Drop every configuration whose features duplicate an earlier configuration.

    Each configuration (one slice along axis 0 of ``data["node_config_feat"]``)
    is flattened and the rows are compared exactly with ``np.unique``. The first
    occurrence of each distinct row is kept and the original order is preserved.
    Exact row comparison is deterministic, cannot merge distinct rows, and does
    not need an n-by-n pairwise matrix.

    Args:
        data: mutable mapping with ``node_config_feat`` and ``config_runtime``
            (one runtime per configuration).

    Returns:
        The same ``data`` object, with both entries replaced by filtered arrays.
    """
    config_feat = data["node_config_feat"]
    n_configs = config_feat.shape[0]
    if n_configs == 0:
        # Nothing to compare; also avoids reshaping an empty array with -1.
        return data

    flat = config_feat.reshape(n_configs, -1)
    # return_index yields the index of the first occurrence of each unique row.
    _, first_idx = np.unique(flat, axis=0, return_index=True)
    is_first = np.zeros(n_configs, dtype=bool)
    is_first[first_idx] = True
    to_remove_ids = np.flatnonzero(~is_first)

    data["config_runtime"] = np.delete(data["config_runtime"], to_remove_ids)
    data["node_config_feat"] = np.delete(config_feat, to_remove_ids, axis=0)
    return data

In [6]:
def find_duplicate_rows(data):
    """Group configurations whose flattened features are identical.

    The feature array is flattened to one row per configuration and cast to
    int32 before comparison, so values are compared after that cast.

    Args:
        data: mapping with ``node_config_feat``; axis 0 indexes configurations.

    Returns:
        A pair ``(dup_config_dct, all_dup_idx)``:

        * ``dup_config_dct`` maps the index of the first occurrence of each
          duplicated row to an ascending array of all indices holding that row
          (the first occurrence included). Keys are inserted in ascending order.
        * ``all_dup_idx`` holds every index that is a repeat of an earlier row,
          concatenated group by group; it is an empty list when there are none.
    """
    config_feat = data["node_config_feat"]
    if config_feat.shape[0] == 0:
        return {}, []
    matrix = config_feat.reshape(config_feat.shape[0], -1).astype(np.int32)

    # unique_idx[g] is the first row of group g; inverse[i] is the group of row i.
    _, unique_idx, inverse = np.unique(matrix, axis=0, return_index=True, return_inverse=True)
    inverse = np.asarray(inverse).reshape(-1)

    # One stable sort gathers the members of every group in ascending row
    # order, instead of rescanning the whole inverse array once per row.
    order = np.argsort(inverse, kind="stable")
    counts = np.bincount(inverse, minlength=len(unique_idx))
    groups = np.split(order, np.cumsum(counts)[:-1])

    dup_config_dct = {}
    for group_id in np.argsort(unique_idx):  # ascending first-occurrence index
        members = groups[group_id]
        if len(members) > 1:
            dup_config_dct[unique_idx[group_id]] = members

    all_dup_idx = [v[v != k] for k, v in dup_config_dct.items()]
    all_dup_idx = np.concatenate(all_dup_idx) if len(all_dup_idx) else []

    return dup_config_dct, all_dup_idx


def dedup_configs(data):
    """Collapse duplicated configurations into their first occurrence.

    For every group reported by ``find_duplicate_rows`` the runtime stored at
    the group's first index is overwritten, in place, with the rounded mean
    runtime of the whole group. The repeated configurations are then removed
    from both ``config_runtime`` and ``node_config_feat``.

    Args:
        data: mutable mapping with ``node_config_feat`` and ``config_runtime``.

    Returns:
        The same ``data`` object.
    """
    groups, redundant_idx = find_duplicate_rows(data)

    for keeper, members in groups.items():
        group_mean = np.mean(data["config_runtime"][members])
        data["config_runtime"][keeper] = round(group_mean)

    if not len(redundant_idx):
        return data

    data["config_runtime"] = np.delete(data["config_runtime"], redundant_idx)
    data["node_config_feat"] = np.delete(data["node_config_feat"], redundant_idx, axis=0)
    return data


def test_dedup_configs(data):
    """Check that both de-duplication routines keep the same number of configurations.

    Each routine runs on its own deep copy of ``data``; only the shapes of the
    resulting ``node_config_feat`` arrays are compared.

    Returns:
        True when the shapes match.

    Raises:
        AssertionError: when the shapes differ.
    """
    shape_pairwise = remove_dupplicated_node_configs(copy.deepcopy(data))["node_config_feat"].shape
    shape_grouped = dedup_configs(copy.deepcopy(data))["node_config_feat"].shape
    res = shape_pairwise == shape_grouped
    assert res
    return res


In [7]:
def vec_to_int(vec: np.ndarray) -> np.ndarray:
    """Pack vectors of six values into single integers, reading them as base-7 digits.

    The value at position ``i`` of the last axis is weighted by ``7**i``
    (least-significant digit first). The result is cast to int32.
    """
    weights = np.array([1, 7, 49, 343, 2401, 16807])  # 7**0 .. 7**5
    packed = np.dot(vec, weights)
    return packed.astype(np.int32)


def int_to_vec(integers: np.ndarray) -> np.ndarray:
    """Unpack integers into their six base-7 digits, least-significant digit first.

    Args:
        integers: 1-D array-like of integers.

    Returns:
        int32 array of shape ``(len(integers), 6)`` where column ``i`` holds
        ``(integers // 7**i) % 7``.

    The input is never modified: the digits are computed on an int64 copy.
    """
    values = np.array(integers, dtype=np.int64)  # np.array copies, so the caller's data stays intact
    powers = 7 ** np.arange(6, dtype=np.int64)
    digits = (values[:, None] // powers[None, :]) % 7
    return digits.astype(np.int32)


def compress_configs(node_configs):
    """Pack a configuration feature array into three integers per node.

    The array is cut into consecutive groups of six values, each value is
    shifted by +1 and each group is packed with ``vec_to_int``.

    Args:
        node_configs: array whose first two axes index configuration and node.
            The final reshape requires 18 values per (configuration, node).
            Presumably values lie in -1..5 so that the shifted values are valid
            base-7 digits (TODO confirm against the data).

    Returns:
        int32 array of shape ``(node_configs.shape[0], node_configs.shape[1], 3)``.
    """
    shifted = node_configs.reshape(-1, 6).astype(np.int32) + 1
    packed = vec_to_int(shifted)
    n_configs, n_nodes = node_configs.shape[0], node_configs.shape[1]
    return packed.reshape(n_configs, n_nodes, 3)


def decompress_configs(node_configs):
    """Invert ``compress_configs``: expand packed integers back to 18 values per node.

    Every integer is unpacked into six digits with ``int_to_vec``, the digits
    are regrouped as ``(n_configs, -1, 18)`` and the +1 shift is undone.
    """
    flat_codes = node_configs.astype(np.int32).reshape(-1)
    digits = int_to_vec(flat_codes)
    n_configs = node_configs.shape[0]
    return digits.reshape(n_configs, -1, 18) - 1


def test_compression(data, db=False):
    """Round-trip ``data["node_config_feat"]`` through compress/decompress and verify it.

    Args:
        data: mapping with ``node_config_feat``.
        db: when truthy, print shapes, a small sample and the measured sizes.

    Returns:
        The result of the element-wise equality check (truthy on success).

    Raises:
        AssertionError: if the round trip changes any value, or if the rounded
            size ratio between the int32 original and the packed array is not 6.
    """
    original = data["node_config_feat"].astype(np.int32)
    packed = compress_configs(data["node_config_feat"])
    restored = decompress_configs(packed)

    if db:
        print(original.shape, packed.shape, restored.shape)
        print(original[0, :2], packed[0, :2], restored[0, :2], sep="\n")
        print(get_obj_mbs(original), get_obj_mbs(packed), get_obj_mbs(restored))

    res = (original == restored).all()
    assert res

    size_ratio = get_obj_mbs(original) / get_obj_mbs(packed)
    assert round(size_ratio) == 6

    return res


In [8]:
# Convert every .npz file of the selected collection: pack `node_config_feat`
# with compress_configs and write the result, compressed, to a mirrored
# "<collection>_compressed" directory tree.

# Number of fastest configurations kept for each file of the "valid" split.
VALID_TOP_K = 1000

dst_dir = root / f"{collection}_compressed" / ctype
for split in ["train", "valid", "test"]:
    print("Loading {} data...".format(split))
    split_src_dir = root / collection / ctype / split
    split_dst_dir = dst_dir / split
    split_dst_dir.mkdir(parents=True, exist_ok=True)

    for npz_path in tqdm(list(split_src_dir.glob("*.npz"))):
        out_p = split_dst_dir / npz_path.name

        # Files converted by a previous run are skipped.
        if out_p.exists():
            continue

        # NOTE(review): allow_pickle=True lets a crafted archive run arbitrary
        # code on load; keep it only for trusted files, or drop it if the
        # archives contain no object arrays.
        # The context manager closes the archive once every array is in memory.
        with np.load(str(npz_path), allow_pickle=True) as npz_file:
            data = dict(npz_file)
        # data = prune_graph(data)

        # if split == "train":
        #     # assert test_dedup_configs(data)
        #     data = dedup_configs(data)

        if split == "valid":
            # Keep only the VALID_TOP_K configurations with the smallest runtime.
            best_idx = np.argsort(data["config_runtime"])[:VALID_TOP_K]
            data["node_config_feat"] = data["node_config_feat"][best_idx]
            data["config_runtime"] = data["config_runtime"][best_idx]

        # assert test_compression(data)
        data["node_config_feat"] = compress_configs(data["node_config_feat"])

        # Write to a temporary name and rename afterwards, so that an
        # interrupted run cannot leave a truncated file that the existence
        # check above would then skip. The temporary name keeps the ".npz"
        # suffix because numpy appends it to any file name that lacks it.
        tmp_p = out_p.with_name(out_p.stem + ".tmp.npz")
        # np.savez(tmp_p, **data)
        np.savez_compressed(tmp_p, **data)
        os.replace(tmp_p, out_p)

Loading train data...


  0%|          | 0/198 [00:00<?, ?it/s]

100%|██████████| 198/198 [04:02<00:00,  1.23s/it]


Loading valid data...


100%|██████████| 20/20 [00:24<00:00,  1.22s/it]


Loading test data...


100%|██████████| 17/17 [00:00<00:00, 25.00it/s]


In [None]:
# Round-trip check of the packing on the current `data`, with debug prints enabled.
# NOTE(review): the conversion loop stores the packed array back into
# data["node_config_feat"]; confirm this cell is meant to run on unpacked data.
test_compression(data, db=True)

In [None]:
# ints.reshape(data["node_config_feat"].shape[0], data["node_config_feat"].shape[1], 3).shape
# Shape of the unpacked digits.
# NOTE(review): `ints` is first assigned in a later cell, so this cell fails on a fresh top-to-bottom run.
int_to_vec(ints).shape

In [None]:
# vecs = data["node_config_feat"][0, :, :6].astype(np.int32) + 1
# (vecs == int_to_vec(vec_to_int(vecs))).all()

# vecs = data["node_config_feat"].reshape(-1, 3, 6).reshape(-1, 6).astype(np.int32) + 1



# Manual round trip: shift the features by +1, pack each group of six into one
# integer, unpack again and regroup to 18 values per node.
vecs = data["node_config_feat"].reshape(-1, 6).astype(np.int32) + 1
ints = vec_to_int(vecs)
res = int_to_vec(ints).reshape(data["node_config_feat"].shape[0], -1, 18)

# True when the unpacked digits equal the shifted original everywhere.
(data["node_config_feat"].astype(np.int32) + 1 == res).all()


In [None]:
# Size of `ints` as reported by sys.getsizeof, divided by 1e6.
sys.getsizeof(ints) / 1e6

In [None]:
# Same measurement for an int32 copy of the feature array.
sys.getsizeof(data["node_config_feat"].astype(np.int32)) / 1e6

In [None]:
# data["node_config_feat"][0, 0, :6].tolist()
# sys.getsizeof(data["node_feat"]) / 1e6

# Inline copy of the steps of remove_dupplicated_node_configs, stopping before
# the deletion so that the flagged indices can be inspected.
# NOTE(review): the weights are random and unseeded, and the equality matrix is
# n-by-n in the number of configurations.
reshaped_config_feat = data["node_config_feat"].reshape(data["node_config_feat"].shape[0], -1) + 2 # avoid zeros
positional_array = np.random.random(reshaped_config_feat.shape[1])  # multiply each value by its position to avoid removing permutations by accident
reshaped_values = (reshaped_config_feat * positional_array[None, :]).sum(1)
is_equal_matrix = reshaped_values[None, :] == reshaped_values[:, None] # quadratic matrix of all pairwise equalities
is_equal_matrix = np.tril(is_equal_matrix, -1) # only get diagonal to avoid remove twice
to_remove_ids = np.unique(np.where(is_equal_matrix)[0])

# Display the indices that would be removed.
to_remove_ids

# print("Removing {} duplicated node configs out of {}".format(to_remove_ids.shape[0], data["node_config_feat"].shape[0]))
# data["config_runtime"] = np.delete(data["config_runtime"], to_remove_ids)
# data["node_config_feat"] = np.delete(data["node_config_feat"], to_remove_ids, axis=0)

In [None]:
# Source directory of the last split visited by the conversion loop.
split_src_dir

In [None]:
# Load one training graph of the layout/xla collection for the exploration
# below. The path is built from `root` so that it does not depend on a
# user-specific absolute location.
# NOTE(review): this resolves relative to the working directory, like `root` itself.
path = root / "layout/xla" / "default" / "train" / "ncf.2x2.fp32.npz"
data = np.load(path)

In [None]:
# Names of the arrays stored in the loaded file.
list(data.keys())

In [None]:
# Shape of the configuration feature array.
data["node_config_feat"].shape

In [None]:
# Same selection as the first step of prune_graph: keep the edges with at
# least one endpoint listed in node_config_ids ...
in_edge_index = data["edge_index"][np.isin(data["edge_index"], data["node_config_ids"]).any(1)]

# ... and collect the distinct node ids appearing on those edges.
in_node_ids = np.unique(in_edge_index)

In [None]:
# Number of configurable nodes that appear on none of the kept edges
# (prune_graph asserts that this is 0).
len(set(data["node_config_ids"]) - set(in_node_ids))

In [None]:
# Ad-hoc arithmetic on hard-coded numbers.
# NOTE(review): presumably configuration counts copied from earlier outputs; confirm.
100040 - 99668

In [None]:
# Step-by-step variant of the duplicate search that weights each flattened
# feature by its 1-based position (no random weights, no +2 shift). The
# numbered prints only mark progress through the steps.
# NOTE(review): the equality matrix is n-by-n in the number of configurations.
reshaped_config_feat = data["node_config_feat"].reshape(data["node_config_feat"].shape[0], -1)
print(1)
positional_array = (np.arange(reshaped_config_feat.shape[1]) + 1) # multiply each value by its position to avoid removing permutations by accident
reshaped_values = (reshaped_config_feat * positional_array[None, :]).sum(1)
print(2)
is_equal_matrix = reshaped_values[None, :] == reshaped_values[:, None] # quadratic matrix of all pairwise equalities
print(3)
is_equal_matrix = np.tril(is_equal_matrix, -1) # only get diagonal to avoid remove twice
print(4)
to_remove_ids = np.unique(np.where(is_equal_matrix)[0])
print(5)
print("Removing {} duplicated node configs".format(to_remove_ids.shape[0]))

In [None]:
# Run the de-duplication directly on `data`; the function assigns new arrays into the mapping it is given.
# NOTE(review): `data` comes straight from np.load in this part of the notebook; presumably that is why
# the next call wraps it in dict(...) - confirm this cell still runs.
remove_dupplicated_node_configs(data)

In [None]:
# 100040**2 elements at 4 each, in units of 1e9.
# NOTE(review): presumably the size in GB of a pairwise matrix with 4-byte entries; confirm.
(100040 ** 2) * 4 / 1e9

In [None]:
# Run the de-duplication on a fresh dict built from `data`; the function assigns into that dict, not into `data`.
remove_dupplicated_node_configs(dict(data))

In [None]:
# Shape of the configuration feature array held by `data` at this point.
data["node_config_feat"].shape

In [None]:
# NOTE(review): `x` is first assigned in a later cell, so this cell fails on a fresh top-to-bottom run.
np.delete(x, to_remove_ids)


In [None]:
# Per configuration: number of earlier configurations it was found equal to
# (row sums of the strictly lower-triangular equality matrix).
is_equal_matrix.sum(1)

In [None]:
# Toy input for the duplicate search: m[i, j] is True where x[i] == x[j].
x = np.array([1, 1, 2, 3, 3, 4, 7, 1, 2])
m = x[None, :] == x[:, None]
# remove 1,4, 7, 8

In [None]:
# Display the full pairwise equality matrix of the toy input.
m

In [None]:
# Strictly lower triangle of `m`; the result is only displayed, `m` itself is not reassigned.
np.tril(m, -1)

In [None]:

# Display `m` again.
m

In [None]:
# Indices of the toy elements that repeat an earlier element. Only the strictly
# lower triangle of `m` is used: on the full matrix the diagonal (every element
# equals itself) would flag every index for removal.
to_remove_ids = np.unique(np.where(np.tril(m, -1))[0])

In [None]:
# Toy array with the flagged indices removed.
np.delete(x, to_remove_ids)

In [None]:
# 5304 squared, in units of 1e9.
# NOTE(review): 5304 is hard-coded; presumably a configuration count of some file - confirm.
(5304 ** 2) / 1e9

In [None]:
# Prune the currently loaded graph; the pruned copy is kept in `new_data`.
new_data = prune_graph(data)

In [None]:
# Shape of the edge list of the loaded graph.
data["edge_index"].shape#[3, :]

In [None]:

# NOTE(review): `in_node_feats` is only assigned inside prune_graph, never at notebook level,
# so this cell fails on a fresh top-to-bottom run.
in_node_feats.shape

In [None]:
# Shape of the node feature matrix of the loaded graph.
data["node_feat"].shape

In [None]:
# Distinct values, with their counts, at index 98 of the last axis of the configuration features.
# NOTE(review): decompress_configs regroups to 18 values on the last axis; confirm that index 98 exists there.
np.unique(data["node_config_feat"][:, :, 98], return_counts=True)

In [None]:
# Shape of every array stored in `data`.
{k: data[k].shape for k in data.keys()}

In [None]:
# Per configuration, count the nodes whose features do not sum to -18, then tabulate those counts.
# NOTE(review): presumably a sum of -18 marks a node with all 18 features at -1; confirm.
np.unique((data["node_config_feat"].sum(2) != -18).sum(1), return_counts=True)

In [None]:
# Histogram of the same per-configuration count (nodes whose features do not sum to -18).
sns.histplot((data["node_config_feat"].sum(2) != -18).sum(1))

In [None]:
# Number of nodes at which configuration 0 and configuration 3 differ in at least one feature.
((data["node_config_feat"][0] != data["node_config_feat"][3]).sum(1) > 0).sum()

In [None]:
# 100040**2 elements at 1 each, in units of 1e9.
# NOTE(review): presumably the size in GB of a pairwise matrix with 1-byte entries; confirm.
100040 * 100040 * 1 / 1e9

In [None]:
from tqdm.auto import tqdm

In [None]:
# Shape of a one-configuration slice that keeps the leading axis.
# NOTE(review): `i` is not assigned at notebook level before this cell (only as a loop variable in a later cell).
data["node_config_feat"][i:i+1].shape

In [None]:
# For every configuration: number of nodes at which it differs from configuration `i`.
((data["node_config_feat"][i:i+1] != data["node_config_feat"]).sum(2) > 0).sum(1)

In [None]:
# result[r, c] = number of nodes at which configuration r and configuration c
# differ in at least one feature, for the first `max_size` configurations
# against all configurations, computed `bz` rows at a time.
config_feat = data["node_config_feat"]
n_configs = config_feat.shape[0]  # taken from the data instead of a hard-coded count
max_size = min(100, n_configs)
bz = 100
# int32 accumulator: a uint8 buffer would silently wrap if more than 255 nodes differ.
result = np.zeros((max_size, n_configs), dtype=np.int32)
for i in tqdm(range(0, max_size, bz)):
    stop = min(i + bz, max_size)
    result[i:stop] = ((config_feat[i:stop, None] != config_feat[None, ...]).sum(-1) > 0).sum(-1)

In [None]:
# Per row of `result`: largest number of differing nodes against any configuration.
result.max(1)

In [None]:
# Per row: how many configurations differ in exactly one node.
(result == 1).sum(1)

In [None]:
# Per row: how many configurations differ in no node at all (the comparison of a configuration with itself is included).
(result == 0).sum(1)

In [None]:
# Per row: smallest number of differing nodes.
result.min(1)

In [None]:
# Per row: median number of differing nodes.
np.median(result, 1)

In [None]:
# Per row: percentile of the number of differing nodes, with q = 0.1.
# NOTE(review): np.percentile takes q in percent, so 0.1 is the 0.1th percentile; use 10 if the 10th was intended.
np.percentile(result, 0.1, 1)

In [None]:
# Within configuration 0: number of feature entries, over all nodes, that differ from the features of node 1.
(data["node_config_feat"][0] != data["node_config_feat"][0][1]).sum()

In [None]:
# Distinct values found in column 22 of the node feature matrix.
np.unique(data["node_feat"][:, 22])