# Compression notes

* 100m SHA-1 hashes, binary, binary, binary, float
    * Standard: 7.6Gb, written in 23 seconds
    * ZSTD 8: 6.85Gb, written in 3 mins
    * ZSTD 15: 6.85Gb, written in 13 mins

In [1]:
from hashlib import sha1
import numpy as np
import pyarrow as pa
import random

def fast_generate_hashes(n: int = int(2e7)) -> pa.Array:
    """Generate n SHA1 hashes using PyArrow arrays."""
    rand = random.randint(0, int(1e5))
    hashes = [
        sha1((i + rand).to_bytes(8, 'big')).digest() 
        for i in range(int(n))
    ]
    return pa.array(hashes, type=pa.binary())

def fast_sample_pairs(hashes: pa.Array, n: int = int(1e8)) -> pa.Table:
    """Generate hash pairs with random new hashes."""
    hash_count = len(hashes)
    
    # Generate indices
    left = np.random.randint(0, hash_count, n)
    right = np.random.randint(0, hash_count - 1, n)
    right += (right >= left)
    
    # Take values using PyArrow
    left_hashes = hashes.take(pa.array(left))
    right_hashes = hashes.take(pa.array(right))
    
    # Generate probabilities as PyArrow array (between 0.7 and 1.0 to 2 DP)
    probs = pa.array(np.round(0.7 + 0.3 * np.random.random(n), 2), type=pa.float64())
    
    # Generate completely new random hashes instead of combining
    new_hashes = [sha1(i.to_bytes(8, 'big')).digest() 
                  for i in range(n)]
    combined_arr = pa.array(new_hashes, type=pa.binary())
    
    # Create table directly with PyArrow
    return pa.table({
        'hash': combined_arr,
        'left': left_hashes,
        'right': right_hashes,
        'probability': probs
    })


In [2]:
import rustworkx as rx
from matchbox.common.hash import list_to_value_ordered_hash

def to_clusters(results: pa.Table) -> pa.Table:
    """
    Converts probabilities into a list of connected components formed at each threshold.

    Returns:
        Probabilities sorted by threshold descending.
    """
    G = rx.PyGraph()
    added: dict[bytes, int] = {}
    components: dict[str, list] = {"parent": [], "child": [], "threshold": []}

    # Sort probabilities descending and group by probability
    edges_df = results.select(['left', 'right', 'probability']).sort_by([("probability", "descending")])
    
    # Get unique probability thresholds, sorted
    thresholds = pa.compute.unique(edges_df.column('probability'))

    # Process edges grouped by probability threshold
    for prob in thresholds.to_pylist():
        mask = pa.compute.equal(edges_df.column('probability'), prob)
        threshold_edges = edges_df.filter(mask)
        # Get state before adding this batch of edges
        old_components = {frozenset(comp) for comp in rx.connected_components(G)}

        # Add all nodes and edges at this probability threshold
        edge_values = zip(
            threshold_edges.column('left').to_pylist(),
            threshold_edges.column('right').to_pylist()
        )

        for left, right in edge_values:
            for hash_val in (left, right):
                if hash_val not in added:
                    idx = G.add_node(hash_val)
                    added[hash_val] = idx

            G.add_edge(added[left], added[right], None)

        new_components = {frozenset(comp) for comp in rx.connected_components(G)}
        changed_components = new_components - old_components

        # For each changed component, add ALL members at current threshold
        for comp in changed_components:
            children = sorted([G.get_node_data(n) for n in comp])
            parent = list_to_value_ordered_hash(children)

            components["parent"].extend([parent] * len(children))
            components["child"].extend(children)
            components["threshold"].extend([prob] * len(children))

    return pa.Table.from_pydict(components)

In [3]:
import pyarrow as pa
from collections import defaultdict

def _cluster_results_to_hierarchical_pa(
    probabilities: pa.Table,
    clusters: pa.Table,
) -> pa.Table:
    """
    Converts results to a hierarchical structure by processing thresholds sequentially,
    maintaining ultimate parent tracking to avoid graph traversal.
    
    Args:
        probabilities: Original pairwise probabilities containing base components
        clusters: Connected components at each threshold
        
    Returns:
        PyArrow Table with schema:
            parent: binary
            child: binary
            threshold: double
    """
    # Sort thresholds in descending order
    thresholds = pa.compute.unique(clusters['threshold']).sort(order='descending')
    
    # Initialize data structures
    hierarchy: list[tuple[bytes, bytes, float]] = []
    ultimate_parents: dict[bytes, set[bytes]] = defaultdict(set)
    
    # Process each threshold level
    for threshold in thresholds:
        threshold_float = float(threshold.as_py())
        
        # Filter and process pairwise probabilities at this threshold
        prob_mask = pa.compute.equal(probabilities['probability'], threshold)
        current_probs = probabilities.filter(prob_mask)
        
        # Add new pairwise relationships at this threshold
        for batch in current_probs.to_batches():
            parent_array = batch.column('hash')
            left_array = batch.column('left')
            right_array = batch.column('right')
            
            for i in range(len(batch)):
                parent = parent_array[i].as_py()
                left_id = left_array[i].as_py()
                right_id = right_array[i].as_py()
                
                # Add to hierarchy
                hierarchy.extend([
                    (parent, left_id, threshold_float),
                    (parent, right_id, threshold_float)
                ])
                
                # Update ultimate parents
                ultimate_parents[left_id].add(parent)
                ultimate_parents[right_id].add(parent)
        
        # Process clusters at this threshold
        cluster_mask = pa.compute.equal(clusters['threshold'], threshold)
        current_clusters = clusters.filter(cluster_mask)
        
        # Group by parent to process components together
        for batch in current_clusters.to_batches():
            parent_col = batch.column('parent')
            child_col = batch.column('child')
            
            parent_groups: dict[bytes, set[bytes]] = defaultdict(set)
            for i in range(len(batch)):
                parent = parent_col[i].as_py()
                child = child_col[i].as_py()
                parent_groups[parent].add(child)
            
            # Process each component
            for new_parent, children in parent_groups.items():
                if len(children) <= 2:
                    continue  # Skip pairs already handled by pairwise probabilities
                
                # Collect all current ultimate parents for children in this component
                current_ultimate_parents: set[bytes] = set()
                for child in children:
                    current_ultimate_parents.update(ultimate_parents[child])
                
                # Add edges from ultimate parents to new parent
                for up in current_ultimate_parents:
                    hierarchy.append((new_parent, up, threshold_float))
                
                # Update ultimate parents for all children in the component
                for child in children:
                    ultimate_parents[child] = {new_parent}
    
    # Sort hierarchy by threshold (descending), then parent, then child
    hierarchy.sort(key=lambda x: (x[2], x[0], x[1]), reverse=True)
    
    # Convert to PyArrow Table
    return pa.Table.from_arrays(
        [
            pa.array([h[0] for h in hierarchy], type=pa.binary()),
            pa.array([h[1] for h in hierarchy], type=pa.binary()),
            pa.array([h[2] for h in hierarchy], type=pa.float64())
        ],
        names=['parent', 'child', 'threshold']
    )

In [4]:
from typing import Generator

def create_test_tables(
    size: int,
    name: bytes,
    id_gen: Generator[str, None, None]
) -> tuple[pa.Table, pa.Table]:
    """
    Create two Arrow tables: a standard version and an update version with controlled changes.
    The update version maintains some hashes, reassigns some IDs, and adds new entries.
    
    Args:
        size: Number of rows in each table
        name: Dataset name as bytes (will be used as prefix for both versions)
        id_gen: Generator that yields string IDs
        
    Returns:
        Tuple[pa.Table, pa.Table]: Standard and update tables
    """
    # Generate standard table data
    standard_hashes = fast_generate_hashes(size)
    standard_ids = [[str(next(id_gen))] for _ in range(size)]
    
    # Create standard table
    standard_ids_nested = pa.array(standard_ids, type=pa.list_(pa.string()))
    standard_dataset = pa.array([name] * len(standard_hashes), type=pa.binary())
    
    standard_table = pa.Table.from_arrays(
        [standard_hashes, standard_dataset, standard_ids_nested],
        names=['hash', 'dataset', 'id']
    )

    # Calculate sizes for different types of changes
    keep_count = int(size * 0.3)  # 30% remain same
    reuse_count = int(size * 0.2)  # 20% same hash, new ID
    move_count = int(size * 0.2)   # 20% same ID, new hash
    new_count = size - (keep_count + reuse_count + move_count)  # 30% new entries

    # Build update version
    update_hashes = []
    update_ids = []

    # 1. Keep some entries exactly the same
    for i in range(keep_count):
        update_hashes.append(standard_hashes[i])
        update_ids.append(standard_ids[i])
    
    # 2. Same hash, new IDs
    for i in range(keep_count, keep_count + reuse_count):
        update_hashes.append(standard_hashes[i])
        update_ids.append(
            [str(next(id_gen))]
        )
    
    # 3. Same IDs, new hashes
    new_hashes = fast_generate_hashes(move_count)
    for i, hash in enumerate(new_hashes):
        idx = keep_count + reuse_count + i
        update_hashes.append(hash)
        update_ids.append(standard_ids[idx])
    
    # 4. Completely new entries
    final_new_hashes = fast_generate_hashes(new_count)
    for hash in final_new_hashes:
        update_hashes.append(hash)
        update_ids.append(
            [str(next(id_gen))]
        )

    # Create update table
    update_ids_nested = pa.array(update_ids, type=pa.list_(pa.string()))
    update_dataset = pa.array([name] * len(update_hashes), type=pa.binary())

    update_table = pa.Table.from_arrays(
        [pa.array(update_hashes), update_dataset, update_ids_nested],
        names=['hash', 'dataset', 'id']
    )

    return standard_table, update_table

In [5]:
def fast_consistent_pairs(
   standard_left: pa.Array,
   standard_right: pa.Array,
   update_left: pa.Array, 
   update_right: pa.Array,
   n: int = int(1e8)
) -> tuple[pa.Table, pa.Table]:
   """Generate partially consistent hash pairs between standard and update tables."""
   # Calculate sizes and adjust n
   standard_total = len(standard_left) * len(standard_right)
   update_total = len(update_left) * len(update_right)
   n = min(n, min(standard_total, update_total))
   
   # Generate random permutation of just the first n indices instead of all possible indices
   standard_flat = np.random.permutation(n)
   # Scale to full range 
   standard_flat = (standard_flat * (standard_total / n)).astype(np.int64)
   std_left_idx = standard_flat // len(standard_right)
   std_right_idx = standard_flat % len(standard_right)
   
   # For update table, make ~30% of pairs match standard pairs
   match_count = n // 3
   update_flat = np.empty(n, dtype=np.int64)
   update_flat[:match_count] = standard_flat[:match_count]  # Copy matching pairs
   
   # Same optimization for the remaining indices
   remaining = np.random.permutation(n - match_count)
   remaining = (remaining * (update_total / (n - match_count))).astype(np.int64)
   update_flat[match_count:] = remaining
   
   up_left_idx = update_flat // len(update_right)
   up_right_idx = update_flat % len(update_right)
   
   # Generate hashes using original SHA1 approach
   standard_hashes = [
       sha1(i.to_bytes(8, 'big')).digest() 
       for i in range(n)
   ]
   standard_combined = pa.array(standard_hashes, type=pa.binary())
   
   # For update table, reuse hashes for matching pairs
   update_hashes = standard_hashes[:match_count] + [
       sha1((i + n).to_bytes(8, 'big')).digest()
       for i in range(n - match_count)
   ]
   update_combined = pa.array(update_hashes, type=pa.binary())
   
   # Create tables
   standard_table = pa.table({
       'hash': standard_combined,
       'left': standard_left.take(pa.array(std_left_idx)),
       'right': standard_right.take(pa.array(std_right_idx)),
       'probability': pa.array(np.round(0.7 + 0.3 * np.random.random(n), 1), type=pa.float64())
   })

   update_table = pa.table({
       'hash': update_combined,
       'left': update_left.take(pa.array(up_left_idx)),
       'right': update_right.take(pa.array(up_right_idx)),
       'probability': pa.array(np.round(0.7 + 0.3 * np.random.random(n), 1), type=pa.float64())
   })
   
   return standard_table, update_table

In [6]:
def create_cluster_tables(
    hierarchical_results: pa.Table,
    model: bytes,
) -> tuple[pa.Table, pa.Table, pa.Table]:
    """
    Creates three PyArrow tables from hierarchical clustering results:
    - clusters: unique clusters with their metadata
    - contains: parent-child relationships between clusters
    - probabilities: probability scores for each cluster

    Args:
        hierarchical_results: PyArrow Table with columns (parent, child, threshold)
        model: bytes identifier for the model

    Returns:
        Tuple of (clusters, contains, probabilities) PyArrow tables where:
        - clusters: schema (hash: binary, dataset: binary, id: list[string])
        - contains: schema (parent: binary, child: binary)
        - probabilities: schema (model: binary, cluster: binary, probability: double)
    """
    # Extract unique clusters (parents) and create clusters table
    unique_clusters = pa.compute.unique(hierarchical_results['parent'])
    clusters = pa.Table.from_arrays(
        [
            unique_clusters,  # hash
            pa.array([None] * len(unique_clusters), type=pa.binary()),  # dataset
            pa.array([[]] * len(unique_clusters), type=pa.list_(pa.string())),  # id as list[str]
        ],
        names=['hash', 'dataset', 'id']
    )

    # Contains table is just parent-child relationships
    contains = pa.Table.from_arrays(
        [
            hierarchical_results['parent'],
            hierarchical_results['child']
        ],
        names=['parent', 'child']
    )

    # Probabilities table with model reference
    probabilities = pa.Table.from_arrays(
        [
            pa.array([model] * len(hierarchical_results)),
            hierarchical_results['parent'],
            hierarchical_results['threshold']
        ],
        names=['model', 'cluster', 'probability']
    )

    return clusters, contains, probabilities

## Dummy data

Use the functions and make the data.

In [7]:
e = 5
size = "sml"

In [8]:
from itertools import count
from uuid import UUID, uuid4
from typing import Generator

def d1gen() -> Generator[UUID, None, None]:
    """Generate UUIDs for dataset 1."""

    for i in count():
        yield UUID(int=i, version=4)

def d2gen() -> Generator[UUID, None, None]:
    """Generate UUIDs for dataset 2."""
    for _ in count():
        yield uuid4()

d1ids = d1gen()
d2ids = d2gen()

def num(c: int, e: int) -> int:
    return int(c * (10 ** e))

In [9]:
dataset1_standard, dataset1_update = create_test_tables(num(2, e), b'dataset1', d1ids)

dataset1_standard, dataset1_update

(pyarrow.Table
 hash: binary
 dataset: binary
 id: list<item: string>
   child 0, item: string
 ----
 hash: [[B10421434DA41B7C94C47718EFDE3CE6CB1FF122,A15B231090CC4807D51B8052B58FF57E4470191E,D29641E62DAD17B8377E950C61CC9808B2066024,2A43B84231A5A07A7AEDF9CFC9CB2FB52F0BCA79,79BB59CF199E1DFB3A9C74B6ACFA93423B9ABE51,...,713E1EF2E53E791A2619DFCA3D97036D8B28C305,5D3B6381F8A484AADDE45CE36CCDD7A5A6F768EE,CBF03FD01FBAD9B3A3DC3C39D94D5D5BAA5486B6,1B98FEB4EADE4E1E2A6F49B053375870CC5DFD35,9C32D1D9A0229E99683FFE4F7ED5E8BF5A438862]]
 dataset: [[6461746173657431,6461746173657431,6461746173657431,6461746173657431,6461746173657431,...,6461746173657431,6461746173657431,6461746173657431,6461746173657431,6461746173657431]]
 id: [[["00000000-0000-4000-8000-000000000000"],["00000000-0000-4000-8000-000000000001"],...,["00000000-0000-4000-8000-000000030d3e"],["00000000-0000-4000-8000-000000030d3f"]]],
 pyarrow.Table
 hash: binary
 dataset: binary
 id: list<item: string>
   child 0, item: string
 ----
 hash: 

In [10]:
dataset2_standard, dataset2_update = create_test_tables(num(5, e - 1), b'dataset2', d2ids)

dataset2_standard, dataset2_update

(pyarrow.Table
 hash: binary
 dataset: binary
 id: list<item: string>
   child 0, item: string
 ----
 hash: [[51F3669C17B5570DE5187B35B9019960F13A60BF,72B5B923969D62530CF156CCD3795D73F87BECCB,5C48EAD71D5A3057BFBC3F2511F57E831B59AC76,AC30A70215DEFF28FC75EB1CFC755744E117CE1A,DC1602B6D0768EA9B749DAAAEC4EDD1265002271,...,72FB481854F4C2237FA6B331D48C12A213CE44B3,AB66AD8433006A8F6F14029A9305F9D0B46D3186,D8048A51ADC25D94A374284F01C5E476445D240E,9888525ED6FED103445D75A4097158514F4CC502,C9ADD6CC80AFB498F66A8275E1B639DDB815C7FF]]
 dataset: [[6461746173657432,6461746173657432,6461746173657432,6461746173657432,6461746173657432,...,6461746173657432,6461746173657432,6461746173657432,6461746173657432,6461746173657432]]
 id: [[["b4e5e10c-8b7c-4790-8bab-fb5dea9eece3"],["7551a529-2e75-4ed4-9a73-3d75e4fe8c45"],...,["ef605c11-5de8-4fbd-8747-cec2e6a7d7be"],["1a763f37-5018-416f-a85a-21ad749227b7"]]],
 pyarrow.Table
 hash: binary
 dataset: binary
 id: list<item: string>
   child 0, item: string
 ----
 hash: 

In [11]:
probresults_standard, probresults_update = fast_consistent_pairs(
    dataset1_standard['hash'],
    dataset2_standard['hash'],
    dataset1_update['hash'],
    dataset2_update['hash'],
    n=num(1, e + 1)
)

probresults_standard, probresults_update

(pyarrow.Table
 hash: binary
 left: binary
 right: binary
 probability: double
 ----
 hash: [[05FE405753166F125559E7C9AC558654F107C7E9,CB473678976F425D6EC1339838F11011007AD27D,07AAE1B618F604C684EE3189FA1723BEF8656FE4,461D6580E38CCB6DC72699B6C945E53831DCDF03,7F028DDBB42E47AC2CD00E27A37BD191F1C2B925,...,D3425A8AD85C009A7C0207DE64C76CD24A022146,64F3C46ED38031FC857E20E304267CC5E133D292,F5DF0A4361A13D8AF3D6D8382BAE765529C133D7,550E7F8689BFF4E9A8C04DBC7B34684604C8B8C7,A3E65CF5997DDA194FC6B35EF0269CBB2B941EC3]]
 left: [[CF5E9FB3957C4EC962A4103462C15E7F8FC0DF8E,355BEE9C22137718A46B0CF2CA39BCDBB26BEF8F,5BCE428231451E8FFE4362AD657B220207844A3B,F8801D4165D23A5ED8E719862A952F1C135FA1CC,AD28B61ECB101285A7B2E07120581BD7E7AE1CCB,...,A222F9016CDB31E4E95F1AA322CB0937A909D431,9CD5CF0309C0CE03675D1C285EF7DCA1E89DC2EF,7CCA0B89F7F723839BB32D0E29FF5E28A8AA29EC,1F26288DD88FCCA79EE9E8018FB4E33E3F073191,731920CB3B1CC21C1B909179D4C4E6175EC79172]]
 right: [[4F85C373359E57A62BD647D55FC7C8521CBD0EF2,AA7695BD81ACF7

In [12]:
clusresults_standard = to_clusters(probresults_standard)
clusresults_update = to_clusters(probresults_update)

clusresults_standard, clusresults_update

(pyarrow.Table
 parent: binary
 child: binary
 threshold: double
 ----
 parent: [[07F58C1D1D7F7E9B39BB5CD9A65FD254E92AD26E68D29B5B36BA42BD8FE40714,07F58C1D1D7F7E9B39BB5CD9A65FD254E92AD26E68D29B5B36BA42BD8FE40714,07F58C1D1D7F7E9B39BB5CD9A65FD254E92AD26E68D29B5B36BA42BD8FE40714,07F58C1D1D7F7E9B39BB5CD9A65FD254E92AD26E68D29B5B36BA42BD8FE40714,07F58C1D1D7F7E9B39BB5CD9A65FD254E92AD26E68D29B5B36BA42BD8FE40714,...,57CF45A6B8ABC3A40018C3CAB83385F7F3E4EA053888454A6D60224804149075,57CF45A6B8ABC3A40018C3CAB83385F7F3E4EA053888454A6D60224804149075,57CF45A6B8ABC3A40018C3CAB83385F7F3E4EA053888454A6D60224804149075,57CF45A6B8ABC3A40018C3CAB83385F7F3E4EA053888454A6D60224804149075,57CF45A6B8ABC3A40018C3CAB83385F7F3E4EA053888454A6D60224804149075]]
 child: [[00001658DE925885461A13C4E24CD137E744ED1B,00013143B8E9564450AA2AC631F3208C9791B327,00015E864E65931319EEA1EE51760E7CA7B3C915,000197407B326FD40BE3F52D74261B5227CE9BFE,0001D00DED90766C18B4C6622776DDC405BB321C,...,FFFCC3E75539A6352EA516F0DAD6E28A2CCD9900,FF

In [13]:
hierarchy_standard = _cluster_results_to_hierarchical_pa(probresults_standard, clusresults_standard)
hierarchy_update = _cluster_results_to_hierarchical_pa(probresults_update, clusresults_update)

hierarchy_standard, hierarchy_update

(pyarrow.Table
 parent: binary
 child: binary
 threshold: double
 ----
 parent: [[FFFE461EB0EBF8728C2C0D056E9182990F164456,FFFE461EB0EBF8728C2C0D056E9182990F164456,FFFD38D1675EBEC0848FD0B0D8F162F43E178EEC,FFFD38D1675EBEC0848FD0B0D8F162F43E178EEC,FFFC7CECF46E6A00E591F0932DF14E2E7652713B,...,000197407B326FD40BE3F52D74261B5227CE9BFE,00017F1C323AACA322EE44CC134F3DBF4807B1C5,00017F1C323AACA322EE44CC134F3DBF4807B1C5,00015632977C2E0908883990F2DE9151C6B781A4,00015632977C2E0908883990F2DE9151C6B781A4]]
 child: [[E4E1568D63F363CDDAA4EC2D50CDA1868CDDFC5B,202E1AF2B1F8C1491B8B4112000529A6594F3283,BCEA5C3427F09219C43F802511222611E810BBE8,73009B330C21FE56910F11AD18A88DC125547CD1,BCEA5C3427F09219C43F802511222611E810BBE8,...,E4E1568D63F363CDDAA4EC2D50CDA1868CDDFC5B,EC7A80600D3C33268CA4FD6A0E7A593C0BB47D8B,E4E1568D63F363CDDAA4EC2D50CDA1868CDDFC5B,BCEA5C3427F09219C43F802511222611E810BBE8,AF2B4CF3F633F3E5C2282E54C42E4DA9B8BBE184]]
 threshold: [[1,1,1,1,1,...,0.7,0.7,0.7,0.7,0.7]],
 pyarrow.Table
 parent: b

In [14]:
clusters_standard, contains_standard, probabilities_standard = create_cluster_tables(hierarchy_standard, b'model1')

clusters_standard, contains_standard, probabilities_standard

(pyarrow.Table
 hash: binary
 dataset: binary
 id: list<item: string>
   child 0, item: string
 ----
 hash: [[FFFE461EB0EBF8728C2C0D056E9182990F164456,FFFD38D1675EBEC0848FD0B0D8F162F43E178EEC,FFFC7CECF46E6A00E591F0932DF14E2E7652713B,FFFC64F1971D751AF36B7532ED88E3C3771CA478,FFFBAC26F6D4A88C2C522FDDDE3ED34FB6343E67,...,00021FDDFE3897E5DBA2E34EB22404F472531CEA,0001D8BFAFB8F202558801A8AF76639E9F21B3FE,000197407B326FD40BE3F52D74261B5227CE9BFE,00017F1C323AACA322EE44CC134F3DBF4807B1C5,00015632977C2E0908883990F2DE9151C6B781A4]]
 dataset: [[null,null,null,null,null,...,null,null,null,null,null]]
 id: [[[],[],...,[],[]]],
 pyarrow.Table
 parent: binary
 child: binary
 ----
 parent: [[FFFE461EB0EBF8728C2C0D056E9182990F164456,FFFE461EB0EBF8728C2C0D056E9182990F164456,FFFD38D1675EBEC0848FD0B0D8F162F43E178EEC,FFFD38D1675EBEC0848FD0B0D8F162F43E178EEC,FFFC7CECF46E6A00E591F0932DF14E2E7652713B,...,000197407B326FD40BE3F52D74261B5227CE9BFE,00017F1C323AACA322EE44CC134F3DBF4807B1C5,00017F1C323AACA322EE44CC13

In [15]:
clusters_update, contains_update, probabilities_update = create_cluster_tables(hierarchy_update, b'model1')

clusters_update, contains_update, probabilities_update

(pyarrow.Table
 hash: binary
 dataset: binary
 id: list<item: string>
   child 0, item: string
 ----
 hash: [[FFFF7D7EB344D5610CE897F6F3DE9B05A3B1E235,FFFEF59A828A2D28C6565B9C996898710DE0BAA3,FFFE679806033605138F4E4A1B4603937A120B58,FFFE5361A6785092508A12075294B2F81663F8F4,FFFE18EE7325D9E2A0412CB2F44F33D2E198A40E,...,00015E864E65931319EEA1EE51760E7CA7B3C915,0000A02A1DC9D8C923CDA243A943C32AC2165A2F,000045D0E91911FDABC4C793BBB4D8159960E4D7,000041D5A799DF96700CBE89646C241C8B3D7C24,00001658DE925885461A13C4E24CD137E744ED1B]]
 dataset: [[null,null,null,null,null,...,null,null,null,null,null]]
 id: [[[],[],...,[],[]]],
 pyarrow.Table
 parent: binary
 child: binary
 ----
 parent: [[FFFF7D7EB344D5610CE897F6F3DE9B05A3B1E235,FFFF7D7EB344D5610CE897F6F3DE9B05A3B1E235,FFFEF59A828A2D28C6565B9C996898710DE0BAA3,FFFEF59A828A2D28C6565B9C996898710DE0BAA3,FFFE679806033605138F4E4A1B4603937A120B58,...,000045D0E91911FDABC4C793BBB4D8159960E4D7,000041D5A799DF96700CBE89646C241C8B3D7C24,000041D5A799DF96700CBE8964

I've therefore decided to produce three sets of parquets: small, medium and large. I figured this would be useful for you too as you can use small to ensure your plumbing is all set up right before you scale to medium and large.
 
Each set of files includes:

* `clusters_dataset1_standard_{size}`
* `clusters_dataset1_update_{size}`
* `clusters_dataset2_standard_{size}`
* `clusters_dataset2_update_{size}`
* `clusters_clusters_standard_{size}`
* `clusters_clusters_update_{size}`
* `contains_standard_{size}`
* `contains_update_{size}`
* `probabilities_standard_{size}`
* `probabilities_update_{size}`

The first word of these names is the table to insert into. Final word is whether it's the initial or updated insert. The shape of them should be exactly right for the PostgreSQL table with column names matching the ORM, and datatypes in the paired Arrow format, so `BYTEA == pa.binary`, `ARRAY<VARCHAR> == pa.list_(pa.string)`, `FLOAT == pa.double`. I've used snappy compression to match the post-processed parquet format we're likely to use, rather than the hyper-compressed format we upload.
 
Because the `Clusters` table contains both data and clusters, and these files describe a linking process, there's the inserts for each dataset, then the inserts of the clusters that match those datasets.
 
To achieve proportional scaling with the same functions, my counts are in scientific notation where I've parameterised the exponent. In small it's 5, in medium it's 6, and in large it's 7, which I believe gets us to the 100m probabilities scale for the probabilities table (1 ** 10 ^ e + 1 == 1e8 == 100m).
 
Dataset1 is analogous to HMRC exports: there's 2 ** 10 ^ e rows, or 20m in large.
 
Dataset2 is analogous to Companies House: there's 5 ** 10 ^ e - 1 rows, or 5m in large.

In [None]:
from pathlib import Path
import pyarrow.parquet as pq
from pathlib import Path

file_path = Path.cwd()

dir_path = Path.cwd() / size

pq.write_table(dataset1_standard, dir_path / f'clusters_dataset1_standard_{size}.parquet')
pq.write_table(dataset1_update, dir_path / f'clusters_dataset1_update_{size}.parquet')
pq.write_table(dataset2_standard, dir_path / f'clusters_dataset2_standard_{size}.parquet')
pq.write_table(dataset2_update, dir_path / f'clusters_dataset2_update_{size}.parquet')
pq.write_table(clusters_standard, dir_path / f'clusters_clusters_standard_{size}.parquet')
pq.write_table(clusters_update, dir_path / f'clusters_clusters_update_{size}.parquet')
pq.write_table(contains_standard, dir_path / f'contains_standard_{size}.parquet')
pq.write_table(contains_update, dir_path / f'contains_update_{size}.parquet')
pq.write_table(probabilities_standard, dir_path / f'probabilities_standard_{size}.parquet')
pq.write_table(probabilities_update, dir_path / f'probabilities_update_{size}.parquet')

## Compression

In [None]:
arrow_table.schema, f"{arrow_table.nbytes / (1024 * 1024):.2f}MB"

(hash: binary
 left: binary
 right: binary
 probability: double,
 '7653.24MB')

In [None]:
import pyarrow as pa
import pyarrow.compute as pc
import pyarrow.parquet as pq
from pathlib import Path
import hashlib

class HashIndex:
    def __init__(self, hash_type: str = 'sha1', hashes: list[bytes] | pa.Array | None = None):
        """Create a new HashIndex instance.
        
        Args:
            hash_type: Hash algorithm to use (default: 'sha1')
            hashes (Optional): Initial list of hashes to insert (default: None)
        """
        try:
            hash_obj = hashlib.new(hash_type)
            self._hash_size: int = hash_obj.digest_size   # Return the digest size in bytes
        except ValueError as e:
            raise ValueError(f'Unsupported hash type: {hash_type}') from e
        
        # Initialize empty table with correct schema
        self.schema = pa.schema([
            ('hash', pa.binary(self._hash_size)),
            ('id', pa.int32())
        ])
        self.table = pa.Table.from_pydict({
            'hash': [],
            'id': []
        }, schema=self.schema)
        self.next_id: int = 0

        if hashes is not None:
            self.insert_hashes(hashes)

    def __eq__(self, other: 'HashIndex') -> bool:
        """
        Compare this HashIndex with another for equality.
        
        Two HashIndex instances are considered equal if they:
            1. Have the same hash size
            2. Have the same next_id
            3. Have equal tables (same schema and data)
        
        Args:
            other: Another HashIndex instance to compare with
            
        Returns:
            bool: True if the indexes are equal, False otherwise
        """
        if not isinstance(other, HashIndex):
            return False
            
        return (
            self._hash_size == other._hash_size and
            self.next_id == other.next_id and
            self.table.equals(other.table)
        )

    def _list_to_array(self, hashes: list[bytes] | pa.Array) -> pa.Array:
        if isinstance(hashes, list):
            return pa.array(hashes, type=pa.binary(self._hash_size))
        return hashes

    def insert_hashes(self, hashes: list[bytes] | pa.Array) -> pa.Array:
        """
        Insert new hashes and return their indices. For existing hashes, returns their
        current indices. For new hashes, assigns and returns new indices.
        
        Args:
            hashes: Array of SHA-1 hashes to insert
        Returns:
            Array of indices (both existing and newly assigned)
        """
        hashes = self._list_to_array(hashes)
        
        # If table is empty, fast path to create initial table
        if len(self.table) == 0:
            ids = pa.array(range(len(hashes)), type=pa.int32())
            self.table = pa.Table.from_arrays([hashes, ids], schema=self.schema)
            self.next_id = len(hashes)
            # Sort table by hash for future binary searches
            self.table = self.table.sort_by('hash')
            return ids

        # Find existing hashes using binary search
        indices = pc.index_in(hashes, self.table['hash'])
        is_new = pc.is_null(indices)
        new_count = pc.sum(pc.cast(is_new, pa.int32())).as_py()
        
        if new_count > 0:
            # Get the new hashes
            new_hashes = pc.filter(hashes, is_new)
            
            # Pre-allocate new IDs array
            new_ids = pa.array(
                range(self.next_id, self.next_id + new_count), 
                type=pa.int32()
            )
            self.next_id += new_count
            
            # Append in one operation and sort once
            new_table = pa.Table.from_arrays([new_hashes, new_ids], schema=self.schema)
            self.table = pa.concat_tables([self.table, new_table])
            self.table = self.table.sort_by('hash')
            
            # Final lookup to get all IDs in correct order
            indices = pc.index_in(hashes, self.table['hash'])
        
        return pc.take(self.table['id'], indices)

    def get_hashes(self, ids: list[int] | pa.Array) -> pa.Array:
        """
        Look up hashes by their IDs
        
        Args:
            ids: Array of IDs to look up
        Returns:
            Array of corresponding hashes (null for unknown indices)
        """
        if isinstance(ids, list):
            ids = pa.array(ids, type=pa.int32())
        
        positions = pc.index_in(ids, self.table['id'])
        return pc.take(self.table['hash'], positions)

    def get_indices(self, hashes: list[bytes] | pa.Array) -> pa.Array:
        """
        Look up IDs for existing hashes. Returns null for unknown hashes.
        
        Args:
            hashes: Array of hashes to look up
        Returns:
            Array of corresponding IDs (null for unknown hashes)
        """
        hashes = self._list_to_array(hashes)
            
        indices = pc.index_in(hashes, self.table['hash'])
        return pc.take(self.table['id'], indices)
    

    def to_parquet(self, path: str | Path, compression: str = 'zstd') -> None:
        """
        Save the HashIndex to a Parquet file.
        
        Args:
            path: Path to save the Parquet file
            compression: Compression algorithm to use (default: 'zstd')
                Options include: 'none', 'snappy', 'gzip', 'brotli', 'lz4', 'zstd'
        
        Raises:
            IOError: If the file cannot be written
            ValueError: If the compression algorithm is not supported
        """
        path = Path(path)
        path.parent.mkdir(parents=True, exist_ok=True)
        
        metadata = {
            b'next_id': str(self.next_id).encode(),
            b'hash_size': str(self._hash_size).encode()
        }

        existing_metadata = self.table.schema.metadata or {}
        merged_metadata = {**existing_metadata, **metadata}
        
        try:
            pq.write_table(
                self.table.replace_schema_metadata(merged_metadata),
                path,
                compression=compression,
            )
        except Exception as e:
            raise IOError(f"Failed to write Parquet file: {e}") from e

    @classmethod
    def from_parquet(cls, path: str | Path) -> 'HashIndex':
        """
        Load a HashIndex from a Parquet file.
        
        Args:
            path: Path to the Parquet file
        
        Returns:
            HashIndex: New HashIndex instance loaded from the file
        
        Raises:
            IOError: If the file cannot be read or is invalid
            ValueError: If the file format is invalid
        """
        path = Path(path)
        
        try:
            table = pq.read_table(path)
            metadata = table.schema.metadata
            
            if not metadata or b'next_id' not in metadata or b'hash_size' not in metadata:
                raise ValueError("Invalid Parquet file: missing required metadata")
            
            # Create new instance
            instance = cls.__new__(cls)
            
            instance._hash_size = int(metadata[b'hash_size'].decode())
            instance.next_id = int(metadata[b'next_id'].decode())
            
            instance.schema = table.schema
            instance.table = table
            
            return instance
            
        except Exception as e:
            raise IOError(f"Failed to load Parquet file: {e}") from e

In [None]:
test_hashes = fast_generate_hashes(int(2e7))
test_hashes2 = pa.concat_arrays([fast_generate_hashes(int(2e7) / 2), test_hashes[:int(2e7 / 2)]])
hidx_5 = HashIndex(hash_type='sha1', hashes=test_hashes)
test_hashes[:5], test_hashes2[:5]

(<pyarrow.lib.BinaryArray object at 0x118363fa0>
 [
   7BDB9431F1E8DDBF1ACCF691D4B8661CF9B43A25,
   392BCA5B7AFC18C6727FC30EC57B718DDAAFF953,
   F884626B7A54AEFFECB3CA3DDB1493613CC0C7A9,
   C94022C8F551879B31D10F99BCC9A3561BAE7612,
   08458C06AD784FCDC7809825056654AD5F119C4C
 ],
 <pyarrow.lib.BinaryArray object at 0x11f06ada0>
 [
   31E7C33F498E4B7E95B1C52ECACAF348CEFFAE71,
   CC141AB85B40C152864B1A24689072D718332BDD,
   C769376877CFA78C46210E9B91D4446E82A6D7CE,
   9263614A6B6C53967ABA62FA2859F785C58B1016,
   15B8ABB98A8ED7D0F008C1234583DCE9DC34AFFF
 ])

In [None]:
indices = pa.array(random.sample(range(len(test_hashes2)), k=len(test_hashes2)))
test_hashes3 = test_hashes2.take(indices)

In [None]:
hidx_5.insert_hashes(test_hashes3)

<pyarrow.lib.ChunkedArray object at 0x13dc3c8b0>
[
  [
    8997172,
    6063643,
    6260234,
    3255430,
    7485144,
    ...
    9917674,
    5612100,
    2723141,
    201414,
    4680984
  ]
]

In [None]:
test_hashes = fast_generate_hashes(40)
test_hashes2 = fast_generate_hashes(40)
hidx = HashIndex(hash_type='sha1')
test_hashes[:5], test_hashes2[:5]

(<pyarrow.lib.BinaryArray object at 0x1187140a0>
 [
   1F21ABB3185DA8D5340A8298C6CBC9C21F8AF9CD,
   813583A225BA22DE65149EC502FA79A082F93D8C,
   C4587910358DC70CD81B6B106FD724685D8AE971,
   79924525EAF98B48E03E00213E0FFBFD17AD8778,
   5E668D4D878659EE35DC8F3E7EFE5B19D68890FA
 ],
 <pyarrow.lib.BinaryArray object at 0x11ec591e0>
 [
   BDBD93052D2290495857C2A46936C468CA4A7FD6,
   AC0FFE1E2F41A6D12D5F56E32A00F8B48D47EF66,
   A3DD64FC8F69428EA0693218AF31FEA22F8BCDDE,
   0C7FF3A1FB8441369A80059DC2FA08F678A05D86,
   BD939DD6F750A40D0C3CFCB3F1C3B202A8BEADE2
 ])

In [None]:
file_path = Path.cwd()

hidx3 = HashIndex(hash_type='sha1', hashes=fast_generate_hashes(int(2e7)))

hidx3.to_parquet(file_path / 'hash_index.parquet')

# del hidx3

In [None]:
hidx3 = HashIndex.from_parquet(file_path / 'hash_index.parquet')

In [None]:
hidx3.get_hashes([random.randint(0, int(2e7)) for _ in range(int(2e5))])[:5]

<pyarrow.lib.ChunkedArray object at 0x11ec7e6b0>
[
  [
    3F0B311BB3EF9EAEB211378D538A67491E43429F,
    1234FE9CB4C1413559FD09FBBAF512DD26DF7CF6,
    C163CF811B2021CE8D17AFE5720F9DED2D602543,
    D151C016C7C94022B0E8A7826EE698A0B5FB947E,
    30EE3D4D4A6497F0C1A85158F895398C9E901F63
  ]
]

In [None]:
hidx.insert_hashes(test_hashes)[:5], hidx.insert_hashes(test_hashes2)[:5]

(<pyarrow.lib.Int32Array object at 0x11ecc1f60>
 [
   0,
   1,
   2,
   3,
   4
 ],
 <pyarrow.lib.ChunkedArray object at 0x11ef1f600>
 [
   [
     40,
     41,
     42,
     43,
     44
   ]
 ])

In [None]:
h = hidx.get_hashes([0])[0]
i = hidx.get_indices([h])[0]
h2 = hidx.get_hashes([i])[0]

h == h2

True

In [None]:
file_path = Path.cwd()

hidx.to_parquet(file_path / 'hash_index.parquet')
hidx2 = HashIndex.from_parquet(file_path / 'hash_index.parquet')

hidx == hidx2

True

In [None]:
hidx.get_hashes([0, 40, 7, 42, 190])[:5]

<pyarrow.lib.ChunkedArray object at 0x15a223d80>
[
  [
    9E24FC1BC65D8A134990D02A6C4A40E3AAB20FAA,
    09C61C3BE151A112228E61AC7C7C1B81DC625CDF,
    CEED556D7FE741F9BC0FA5A464DEF839989F3E74,
    76506960448B8BB6CB2AF337FBE9852E1ABB1486,
    null
  ]
]

In [None]:
hidx.get_indices(test_hashes[:5]), hidx.get_indices(test_hashes2[:5])

(<pyarrow.lib.ChunkedArray object at 0x120b61e90>
 [
   [
     0,
     1,
     2,
     3,
     4
   ]
 ],
 <pyarrow.lib.ChunkedArray object at 0x15a2d1800>
 [
   [
     40,
     41,
     42,
     43,
     44
   ]
 ])

In [None]:
hashes = fast_generate_hashes(int(2e7))
t1 = fast_sample_pairs(hashes, int(1e8))

In [None]:
def index_probability_table(table: pa.Table) -> tuple[HashIndex, pa.Table]:

    hidx = HashIndex(hash_type='sha1')

    table = table.set_column(
        table.column_names.index("left"),
        "left",
        hidx.insert_hashes(table['left'])
    )
    table = table.set_column(
        table.column_names.index("right"),
        "right",
        hidx.insert_hashes(table['right'])
    )
    table = table.set_column(
        table.column_names.index("hash"),
        "hash",
        hidx.insert_hashes(table['hash'])
    )

    return hidx, table

hidx, t2 = index_probability_table(t1)

f"{t1.nbytes / (1024 * 1024):.2f}MB", f"{hidx.table.nbytes / (1024 * 1024):.2f}MB", f"{t2.nbytes / (1024 * 1024):.2f}MB", f"{(hidx.table.nbytes + t2.nbytes) / (1024 * 1024):.2f}MB"

('7653.24MB', '4135.36MB', '1907.35MB', '6042.71MB')

In [None]:
(15 * 15) / 60

3.75

In [None]:
(4230 / 5) / 60

14.1

5mb/s upload speed -- 25 mins for 7.6Gb, 14min for 4.3Gb

In [None]:
import pyarrow.parquet as pq
from pathlib import Path

file_path = Path.cwd()

pq.write_table(
    t2,
    file_path / 'probabilities_normalised_brot.parquet',
    compression='BROTLI',
    # compression_level=16,
    # use_dictionary=True,
    # write_statistics=True,
    # use_byte_stream_split=True,
    # row_group_size=1048576  # 1MB row groups
)

In [None]:
hidx.to_parquet(file_path / 'hash_index.parquet')

* Impose threshold (don't store below x, default x)
* Can we automate to check rub probabilities?
    * Find any gigantic clusters as a result
    * Run at plausible thresholds?
* Is threshold but need a rule of thumb
    * If dedupe, n^2 - n/2
    * If link, n^2 - n/2 ish
* "How do you know that" is hard in a chain, and people want that

* duckdb from pg OR pg from duckdb -- both available to work with pg
    * Bad for atomic updates

---

100m

ZSTD 10s at 1, 1.86Gb
ZSTD 10s at 4, 1.86Gb 
ZSTD 50s at 15, 1.86Gb
ZSTD 4m at 16, 1.86Gb
ZSTD 7m at 22, 1.52Gb
BROTLI 2m at default, 1.57Gb

Index

Snappy 13s at default, 3.8Gb
BROTLI 6m at default, 2.6Gb
ZSTD 20s at default, 2.75Gb

ZSTD balanced between the two: 4.235Gb (3Gb saving)

--

Work through pg/duckdb idea

- Clusters and contains need appending -- parquet or postgres?
- How is duckdb informed about new parquet
    - Lambda? API?
- Can this perform?

* Duckdb
    * R/W or R mode -- we'd launch in one or tother
        * No lambdas, just run the new command
* Consolidating a dataset (pruning old records) -- like upsert
    * 3 ways
        * duckdb directly, w process. Load two parquets, prune old
        * pandas
        * polars
* (?) married to parquet