In [4]:
import random
import subprocess
import numpy as np
from tqdm import tqdm
import os

def split_meshes(num_meshes, val_ratio=0.025):
    """Randomly partition the mesh ids 1..num_meshes into train and validation lists.

    The ids are shuffled with the global ``random`` generator. The first
    ``int(num_meshes * val_ratio)`` shuffled ids form the validation set and
    the remaining ids form the training set.

    Args:
        num_meshes: number of meshes; ids run from 1 to ``num_meshes`` inclusive.
        val_ratio: fraction of the ids assigned to the validation set.

    Returns:
        ``(train_set, val_set)`` as two lists of ints.
    """
    shuffled_ids = [mesh_id for mesh_id in range(1, num_meshes + 1)]
    random.shuffle(shuffled_ids)
    n_val = int(num_meshes * val_ratio)
    return shuffled_ids[n_val:], shuffled_ids[:n_val]

def sample_mesh_pairs_from_set(mesh_set, num_pairs):
    """Draw ``num_pairs`` distinct ordered pairs of mesh ids from ``mesh_set``.

    Both members of a pair are drawn independently with ``random.choice``, so
    a pair may contain the same mesh twice, and ``(a, b)`` and ``(b, a)`` count
    as different pairs.

    Args:
        mesh_set: one-dimensional sequence (list or array) of mesh ids.
        num_pairs: number of distinct pairs to return.

    Returns:
        A list of ``num_pairs`` unique ``(a, b)`` tuples, in set iteration order.

    Raises:
        ValueError: if ``num_pairs`` exceeds the number of distinct ordered
            pairs that can be formed from ``mesh_set``.
    """
    # Rejection sampling can never finish when more pairs are requested than
    # exist, so fail fast instead of spinning forever.
    distinct_ids = len(set(mesh_set))
    max_pairs = distinct_ids * distinct_ids
    if num_pairs > max_pairs:
        raise ValueError(
            f"cannot sample {num_pairs} distinct pairs from "
            f"{distinct_ids} distinct meshes (at most {max_pairs})"
        )

    pairs = set()
    while len(pairs) < num_pairs:
        a = random.choice(mesh_set)
        b = random.choice(mesh_set)
        pairs.add((a, b))
    return list(pairs)



def sample_sources_and_dests(num_vertices, num_sources, num_dests_per_source):
    """Pick random source vertices and a flat list of destination vertices.

    Sources are drawn without replacement. Destinations are drawn uniformly
    with replacement from all vertices, ``num_dests_per_source`` per source,
    so a destination may repeat or coincide with a source. Both draws use the
    stdlib ``random`` generator, so a single ``random.seed`` call makes the
    whole result reproducible.

    Args:
        num_vertices: vertex ids are taken from ``range(num_vertices)``.
        num_sources: number of distinct source vertices to draw.
        num_dests_per_source: number of destinations drawn for each source.

    Returns:
        ``(sources, dests)``: a list of ``num_sources`` ints and a flat list of
        ``num_sources * num_dests_per_source`` ints. Entries
        ``dests[i * num_dests_per_source:(i + 1) * num_dests_per_source]``
        belong to ``sources[i]``.

    Raises:
        ValueError: if ``num_sources`` exceeds ``num_vertices`` (from
            ``random.sample``).
    """
    sources = random.sample(range(num_vertices), num_sources)

    # Only the number of sources matters here: destinations are sampled
    # independently of the source values.
    total_dests = num_sources * num_dests_per_source
    dests = random.choices(range(num_vertices), k=total_dests)

    return sources, dests


def write_list_to_file(lst, fn):
    """Write ``lst`` to the text file ``fn`` with its length as the first entry.

    The element count is prepended to the values and every entry is written
    with the ``%d`` format via ``np.savetxt``.
    """
    values = np.array(lst)
    count_then_values = np.insert(values, 0, len(values))
    np.savetxt(fn, count_then_values, fmt='%d')

def read_floats(fn):
    """Read a text file of numbers and return all values after the first one.

    The first entry is treated as a header (the files written by
    ``write_list_to_file`` start with an element count) and is dropped.

    Args:
        fn: path of the text file to read.

    Returns:
        A one-dimensional float array; empty when the file holds only the
        header entry.
    """
    # ndmin=1 keeps a single-line file one-dimensional; without it loadtxt
    # returns a 0-d array and the slice below raises IndexError.
    return np.loadtxt(fn, ndmin=1)[1:]

In [None]:
# Seed both the stdlib and the NumPy global generators so that every random
# draw made while building the dataset is reproducible.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

NUM_MESHES = 12000
NUM_PAIRS = 40000
NUM_VERTICES = 3889
NUM_SOURCES = 400
NUM_DESTS_PER_SOURCE = 10

# The train/val split is loaded from disk so that several training sets can be
# built from the same split. To create the split the first time, run:
#     train_set, val_set = split_meshes(NUM_MESHES, val_ratio=0.025)
#     np.save("TrainIndexPairs.npy", train_set)
#     np.save("ValIndexPairs.npy", val_set)
train_set, val_set = np.load("TrainIndexPairs.npy"), np.load("ValIndexPairs.npy")
train_pairs = sample_mesh_pairs_from_set(train_set, num_pairs=NUM_PAIRS)

# Mesh and vertex indices are stored as int16 below; a larger value would
# wrap around silently on assignment, so check the range up front.
INT16_MAX = np.iinfo(np.int16).max
if max(int(np.max(train_set)), NUM_VERTICES - 1) > INT16_MAX:
    raise ValueError("mesh or vertex indices do not fit in int16; widen the index dtype")

# One row per (mesh pair, source, destination) combination.
total = NUM_PAIRS * NUM_SOURCES * NUM_DESTS_PER_SOURCE

# Integer columns use -1 as the "not written" sentinel (valid indices are
# non-negative).
mesh_a = np.full(total, -1, dtype=np.int16)
mesh_b = np.full(total, -1, dtype=np.int16)
sources = np.full(total, -1, dtype=np.int16)
dests   = np.full(total, -1, dtype=np.int16)

# Distance columns use NaN as the "not written" sentinel. float16 halves the
# memory footprint but keeps only about three significant decimal digits.
dist_a  = np.full(total, np.nan, dtype=np.float16)
dist_b  = np.full(total, np.nan, dtype=np.float16)


In [12]:


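# If you run out of RAM, uncomment the lines below and comment out the in-memory
# allocations in the previous cell.
# NOTE: np.memmap opened with mode="w+" starts zero-filled, so fill the sentinels
# afterwards (-1 for the integer arrays, NaN for the distance arrays); otherwise the
# later sentinel-based filtering keeps rows that were never written. The distance
# dtype below is float32, whereas the in-memory arrays use float16.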
# mesh_a = np.memmap("mesh_a.dat", dtype=np.int16, mode="w+", shape=(total,))
# mesh_b = np.memmap("mesh_b.dat", dtype=np.int16, mode="w+", shape=(total,))
# sources = np.memmap("sources.dat", dtype=np.int16, mode="w+", shape=(total,))
# dests   = np.memmap("dests.dat",   dtype=np.int16, mode="w+", shape=(total,))
# dist_a  = np.memmap("dist_a.dat",  dtype=np.float32, mode="w+", shape=(total,))
# dist_b  = np.memmap("dist_b.dat",  dtype=np.float32, mode="w+", shape=(total,))

def _run_dgg(mesh_id, source_file, dest_file, out_file, expected_count):
    """Run DGG_LC.exe on one mesh and return the distances it wrote.

    Returns None when the solver leaves no output file or when the output
    does not contain exactly ``expected_count`` values. The scratch output
    file is removed in every case.
    """
    # Drop any leftover from an interrupted run so that a solver failure
    # cannot be mistaken for a fresh result.
    if os.path.exists(out_file):
        os.remove(out_file)
    subprocess.call([
        "DGG_LC.exe",
        f"DGGData\\smal_{mesh_id}_FD0.0100000000_c20.binary",
        source_file, dest_file, out_file, "dij", "flt",
    ])
    if not os.path.exists(out_file):
        return None
    try:
        distances = read_floats(out_file)
    finally:
        os.remove(out_file)
    # A result of the wrong length would either broadcast silently (length 1)
    # or abort the whole run (any other length) when stored into the block.
    if distances.shape != (expected_count,):
        return None
    return distances


block_size = NUM_SOURCES * NUM_DESTS_PER_SOURCE  # rows written per mesh pair
idx = 0      # next free row in the preallocated output arrays
pid = 0      # suffix of the scratch files; fixed because pairs run one at a time
skipped = 0  # pairs dropped because the solver gave no usable result

source_file = f"source_{pid}.txt"
dest_file = f"dest_{pid}.txt"

for ma, mb in tqdm(train_pairs, desc="mesh‑pairs"):
    # The same sources/destinations are evaluated on both meshes of the pair.
    srcs, dsts = sample_sources_and_dests(NUM_VERTICES, NUM_SOURCES, NUM_DESTS_PER_SOURCE)
    write_list_to_file(srcs, source_file)
    write_list_to_file(dsts, dest_file)

    # Distances on mesh A; skip the pair if the solver fails.
    da = _run_dgg(ma, source_file, dest_file, f"tmp_a_{pid}.txt", block_size)
    if da is None:
        skipped += 1
        continue

    # Distances on mesh B; skip the pair if the solver fails.
    db = _run_dgg(mb, source_file, dest_file, f"tmp_b_{pid}.txt", block_size)
    if db is None:
        skipped += 1
        continue

    # Store the block. Rows are filled contiguously, so rows of skipped pairs
    # stay at their sentinel values at the tail of the arrays.
    sl = slice(idx, idx + block_size)
    mesh_a[sl] = ma
    mesh_b[sl] = mb
    sources[sl] = np.repeat(srcs, NUM_DESTS_PER_SOURCE)
    dests[sl]   = dsts
    dist_a[sl]  = da
    dist_b[sl]  = db

    idx += block_size

print(f"filled {idx} rows; skipped {skipped} of {len(train_pairs)} mesh pairs")

# The arrays are filtered and saved in compressed form in the next cell.



mesh‑pairs: 100%|██████████████████████████████████████████████████████████████| 40000/40000 [7:22:01<00:00,  1.51it/s]


In [13]:
# Integer columns: a value is "set" when it is no longer the -1 sentinel.
is_sources_set = (sources != -1)
is_dests_set = (dests != -1)
is_mesha_set = (mesh_a != -1)
is_meshb_set = (mesh_b != -1)

# Float columns: a value is "set" when it is not NaN.
is_dist_a_set = ~np.isnan(dist_a)
is_dist_b_set = ~np.isnan(dist_b)

# A row is kept only if all six of its fields are set. Leaving out dist_b or
# dests here would let rows with a missing distance or destination reach the
# saved file.
combined_mask = (
    is_sources_set
    & is_dests_set
    & is_mesha_set
    & is_meshb_set
    & is_dist_a_set
    & is_dist_b_set
)

# Filter all arrays with the same mask so the columns stay row-aligned.
filtered_mesh_a = mesh_a[combined_mask]
filtered_mesh_b = mesh_b[combined_mask]
filtered_sources = sources[combined_mask]
filtered_dests = dests[combined_mask]
filtered_dist_a = dist_a[combined_mask]
filtered_dist_b = dist_b[combined_mask]
print(np.sum(combined_mask))
np.savez_compressed(
    "geodesic_data_0.npz",
    mesh_a=filtered_mesh_a,
    mesh_b=filtered_mesh_b,
    source=filtered_sources,
    dest=filtered_dests,
    dist_on_a=filtered_dist_a,
    dist_on_b=filtered_dist_b
)

159976000


In [14]:
# Count the rows for which every one of the six per-field masks is true.
combined_mask = (
    is_sources_set
    & is_dist_a_set
    & is_mesha_set
    & is_meshb_set
    & is_dist_b_set
    & is_dests_set
)
print(np.sum(combined_mask))

159976000
