# Compression notes

* 100m SHA-1 hashes, binary, binary, binary, float
    * Standard: 7.6Gb, written in 23 seconds
    * ZSTD 8: 6.85Gb, written in 3 mins
    * ZSTD 15: 6.85Gb, written in 13 mins

In [1]:
from hashlib import sha1
import numpy as np
import pyarrow as pa
import random

def fast_generate_hashes(n: int = int(2e7)) -> pa.Array:
    """Generate n SHA1 hashes using PyArrow arrays."""
    rand = random.randint(0, int(1e5))
    hashes = [
        sha1((i + rand).to_bytes(8, 'big')).digest() 
        for i in range(int(n))
    ]
    return pa.array(hashes, type=pa.binary())

def fast_sample_pairs(hashes: pa.Array, n: int = int(1e8)) -> pa.Table:
    """Generate hash pairs with random new hashes."""
    hash_count = len(hashes)
    
    # Generate indices
    left = np.random.randint(0, hash_count, n)
    right = np.random.randint(0, hash_count - 1, n)
    right += (right >= left)
    
    # Take values using PyArrow
    left_hashes = hashes.take(pa.array(left))
    right_hashes = hashes.take(pa.array(right))
    
    # Generate probabilities as PyArrow array (between 0.7 and 1.0 to 2 DP)
    probs = pa.array(np.round(0.7 + 0.3 * np.random.random(n), 2), type=pa.float64())
    
    # Generate completely new random hashes instead of combining
    new_hashes = [sha1(i.to_bytes(8, 'big')).digest() 
                  for i in range(n)]
    combined_arr = pa.array(new_hashes, type=pa.binary())
    
    # Create table directly with PyArrow
    return pa.table({
        'hash': combined_arr,
        'left': left_hashes,
        'right': right_hashes,
        'probability': probs
    })

def fast_sample_pairs_lr(left: pa.Array, right: pa.Array, n: int = int(1e8)) -> pa.Table:
    """Generate hash pairs with random new hashes.
    
    Assumes left and right are the same length.
    """
    # Calculate total size of product space
    hash_count = len(left)
    total_pairs = len(left) * len(right)
    
    # If n is larger than total possible pairs, adjust n
    n = min(n, total_pairs)
    
    # Generate n random indices from the product space
    flat_indices = np.random.choice(total_pairs, size=n, replace=False)
    
    # Convert flat indices back to left and right indices
    left_indices = flat_indices // hash_count
    right_indices = flat_indices % hash_count
    
    # Take values using PyArrow for better performance
    left_hashes = left.take(pa.array(left_indices))
    right_hashes = right.take(pa.array(right_indices))
    
    # Generate probabilities as PyArrow array (between 0.7 and 1.0 to 2 DP)
    probs = pa.array(np.round(0.7 + 0.3 * np.random.random(n), 2), type=pa.float64())
    
    # Generate completely new random hashes instead of combining
    new_hashes = [
        sha1(i.to_bytes(8, 'big')).digest() 
        for i in range(n)
    ]
    combined_arr = pa.array(new_hashes, type=pa.binary())
    
    # Create table directly with PyArrow
    return pa.table({
        'hash': combined_arr,
        'left': left_hashes,
        'right': right_hashes,
        'probability': probs
    })


In [2]:
import rustworkx as rx
from matchbox.common.hash import list_to_value_ordered_hash

def to_clusters(results: pa.Table) -> pa.Table:
    """
    Converts probabilities into a list of connected components formed at each threshold.

    Returns:
        Probabilities sorted by threshold descending.
    """
    G = rx.PyGraph()
    added: dict[bytes, int] = {}
    components: dict[str, list] = {"parent": [], "child": [], "threshold": []}

    # Sort probabilities descending and group by probability
    edges_df = results.select(['left', 'right', 'probability']).sort_by([("probability", "descending")])
    
    # Get unique probability thresholds, sorted
    thresholds = pa.compute.unique(edges_df.column('probability'))

    # Process edges grouped by probability threshold
    for prob in thresholds.to_pylist():
        mask = pa.compute.equal(edges_df.column('probability'), prob)
        threshold_edges = edges_df.filter(mask)
        # Get state before adding this batch of edges
        old_components = {frozenset(comp) for comp in rx.connected_components(G)}

        # Add all nodes and edges at this probability threshold
        edge_values = zip(
            threshold_edges.column('left').to_pylist(),
            threshold_edges.column('right').to_pylist()
        )

        for left, right in edge_values:
            for hash_val in (left, right):
                if hash_val not in added:
                    idx = G.add_node(hash_val)
                    added[hash_val] = idx

            G.add_edge(added[left], added[right], None)

        new_components = {frozenset(comp) for comp in rx.connected_components(G)}
        changed_components = new_components - old_components

        # For each changed component, add ALL members at current threshold
        for comp in changed_components:
            children = sorted([G.get_node_data(n) for n in comp])
            parent = list_to_value_ordered_hash(children)

            components["parent"].extend([parent] * len(children))
            components["child"].extend(children)
            components["threshold"].extend([prob] * len(children))

    return pa.Table.from_pydict(components)

In [3]:
from collections import defaultdict

def _cluster_results_to_hierarchical(
    probabilities: pa.Table,
    clusters: pa.Table,
) -> list[tuple[bytes, bytes, float]]:
    """
    Converts results to a hierarchical structure by building up from base components.

    Args:
        probabilities: Original pairwise probabilities containing base components
        clusters: Connected components at each threshold

    Returns:
        List of (parent, child, threshold) tuples representing the hierarchy
    """
    thresholds = pa.compute.unique(clusters['threshold']).sort(order='descending')

    # Add all clusters corresponding to a simple two-item probability edge

    hierarchy = []
    # Convert to record batches for efficient iteration
    for batch in probabilities.to_batches():
        parent_array = batch.column('hash')
        left_array = batch.column('left')
        right_array = batch.column('right')
        prob_array = batch.column('probability')
        
        for i in range(len(batch)):
            parent = parent_array[i].as_py()
            left_id = left_array[i].as_py()
            right_id = right_array[i].as_py()
            prob = float(prob_array[i].as_py())
            hierarchy.extend(
                [(parent, left_id, prob), (parent, right_id, prob)]
            )

    # Create adjacency structure for quick lookups
    adj_dict: dict[bytes, set[tuple[bytes, float]]] = defaultdict(set)
    for parent, child, prob in hierarchy:
        adj_dict[child].add((parent, prob))

    # Process each threshold level, getting clusters at each threshold
    for threshold in thresholds:
        threshold_float = float(threshold.as_py())

        # Filter clusters at current threshold
        mask = pa.compute.equal(clusters.column('threshold'), threshold)
        current_clusters = clusters.filter(mask)

        # Group by parent
        parent_groups = {}
        for batch in current_clusters.to_batches():
            parent_col = batch.column('parent')
            child_col = batch.column('child')
            for i in range(len(batch)):
                parent = parent_col[i].as_py()
                child = child_col[i].as_py()
                if parent not in parent_groups:
                    parent_groups[parent] = set()
                parent_groups[parent].add(child)

        # Process each component
        for parent, members in parent_groups.items():
            if len(members) <= 2:
                continue

            seen = set(members)
            current = set(members)
            ultimate_parents = set()

            # Keep traversing until we've explored all paths
            while current:
                next_level = set()
                # If any current nodes have no parents above threshold,
                # they are ultimate parents for this threshold
                for node in current:
                    parents = {
                        p for p, prob in adj_dict[node] if prob >= threshold_float
                    }
                    next_parents = parents - seen
                    if not parents:  # No parents = ultimate parent
                        ultimate_parents.add(node)

                    next_level.update(next_parents)
                    seen.update(parents)

                current = next_level

            for up in ultimate_parents:
                hierarchy.append((parent, up, threshold_float))
                adj_dict[up].add((parent, threshold_float))

    return sorted(hierarchy, key=lambda x: (x[2], x[0], x[1]), reverse=True)

In [91]:
from collections import defaultdict
import pyarrow as pa
import pyarrow.compute as pc

def _cluster_results_to_hierarchical_2(
    probabilities: pa.Table,
    clusters: pa.Table,
) -> list[tuple[bytes, bytes, float]]:
    """
    Converts results to a hierarchical structure by building up from base components.
    Optimized to maximize Arrow operations and minimize Python object conversions.

    Args:
        probabilities: Original pairwise probabilities containing base components
        clusters: Connected components at each threshold

    Returns:
        List of (parent, child, threshold) tuples representing the hierarchy
    """
    thresholds = pa.compute.unique(clusters['threshold']).sort(order='descending')
    
    # Initialize hierarchy with base probabilities
    # Use Arrow to create the initial hierarchy structure
    probabilities_hash_array = probabilities['hash'].combine_chunks()
    probabilities_left_array = probabilities['left'].combine_chunks()
    probabilities_right_array = probabilities['right'].combine_chunks()
    probabilities_prob_array = probabilities['probability'].combine_chunks()
    hierarchy_parent = pa.concat_arrays([
        probabilities_hash_array,
        probabilities_hash_array
    ])
    hierarchy_child = pa.concat_arrays([
        probabilities_left_array,
        probabilities_right_array
    ])
    hierarchy_prob = pa.concat_arrays([
        probabilities_prob_array,
        probabilities_prob_array
    ])
    
    # Create initial hierarchy table
    hierarchy_table = pa.Table.from_arrays(
        [hierarchy_parent, hierarchy_child, hierarchy_prob],
        names=['parent', 'child', 'probability']
    )
    
    # Convert to Python objects only once for adjacency structure
    hierarchy = list(zip(
        hierarchy_table['parent'].to_pylist(),
        hierarchy_table['child'].to_pylist(),
        hierarchy_table['probability'].to_pylist()
    ))
    
    # Create adjacency structure for quick lookups
    adj_dict: dict[bytes, set[tuple[bytes, float]]] = defaultdict(set)
    for parent, child, prob in hierarchy:
        adj_dict[child].add((parent, prob))

    # Process each threshold level
    for threshold in thresholds.to_pylist():  # Convert threshold array once
        threshold_float = float(threshold)
        
        # Filter clusters at current threshold using Arrow operations
        current_clusters = clusters.filter(pc.equal(clusters['threshold'], threshold))
        
        # Group by parent using Arrow operations
        parent_groups = defaultdict(set)
        parent_col = current_clusters['parent']
        child_col = current_clusters['child']
        
        # Batch process the grouping
        parent_list = parent_col.to_pylist()
        child_list = child_col.to_pylist()
        for parent, child in zip(parent_list, child_list):
            parent_groups[parent].add(child)

        # Process each component
        for parent, members in parent_groups.items():
            if len(members) <= 2:
                continue

            seen = set(members)
            current = set(members)
            ultimate_parents = set()

            while current:
                next_level = set()
                for node in current:
                    parents = {
                        p for p, prob in adj_dict[node] if prob >= threshold_float
                    }
                    next_parents = parents - seen
                    if not parents:
                        ultimate_parents.add(node)

                    next_level.update(next_parents)
                    seen.update(parents)

                current = next_level

            # Batch add new hierarchy entries
            for up in ultimate_parents:
                hierarchy.append((parent, up, threshold_float))
                adj_dict[up].add((parent, threshold_float))

    return sorted(hierarchy, key=lambda x: (x[2], x[0], x[1]), reverse=True)

In [94]:
from collections import defaultdict
import pyarrow as pa
import pyarrow.compute as pc
import logging
import time
from typing import Any

def setup_logger() -> logging.Logger:
    """Set up a logger with appropriate formatting."""
    logger = logging.getLogger('cluster_profiler')
    if not logger.handlers:  # Avoid adding handlers if they already exist
        handler = logging.StreamHandler()
        formatter = logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
        )
        handler.setFormatter(formatter)
        logger.addHandler(handler)
        logger.setLevel(logging.DEBUG)
    return logger

def log_time(logger: logging.Logger, start_time: float, operation: str) -> None:
    """Log the time taken for an operation."""
    duration = time.time() - start_time
    logger.debug(f"{operation} took {duration:.4f} seconds")

def log_memory(logger: logging.Logger, obj: Any, name: str) -> None:
    """Log memory usage of an object."""
    if hasattr(obj, 'nbytes'):
        size_mb = obj.nbytes / (1024 * 1024)
        logger.debug(f"Memory usage of {name}: {size_mb:.2f} MB")

def _cluster_results_to_hierarchical_3(
    probabilities: pa.Table,
    clusters: pa.Table,
) -> list[tuple[bytes, bytes, float]]:
    """
    Converts results to a hierarchical structure by building up from base components.
    Optimized to maximize Arrow operations and minimize Python object conversions.

    Args:
        probabilities: Original pairwise probabilities containing base components
        clusters: Connected components at each threshold

    Returns:
        List of (parent, child, threshold) tuples representing the hierarchy
    """
    logger = setup_logger()
    total_start_time = time.time()
    
    logger.info("Starting clustering process")
    logger.debug(f"Input probabilities table size: {len(probabilities)}")
    logger.debug(f"Input clusters table size: {len(clusters)}")
    
    # Threshold computation
    thresh_start = time.time()
    thresholds = pa.compute.unique(clusters['threshold']).sort(order='descending')
    log_time(logger, thresh_start, "Threshold computation")
    logger.debug(f"Number of unique thresholds: {len(thresholds)}")
    
    # Initialize hierarchy
    init_start = time.time()
    probabilities_hash_array = probabilities['hash'].combine_chunks()
    probabilities_left_array = probabilities['left'].combine_chunks()
    probabilities_right_array = probabilities['right'].combine_chunks()
    probabilities_prob_array = probabilities['probability'].combine_chunks()
    
    hierarchy_parent = pa.concat_arrays([
        probabilities_hash_array,
        probabilities_hash_array
    ])
    hierarchy_child = pa.concat_arrays([
        probabilities_left_array,
        probabilities_right_array
    ])
    hierarchy_prob = pa.concat_arrays([
        probabilities_prob_array,
        probabilities_prob_array
    ])
    log_time(logger, init_start, "Hierarchy initialization")
    
    # Create initial hierarchy table
    table_start = time.time()
    hierarchy_table = pa.Table.from_arrays(
        [hierarchy_parent, hierarchy_child, hierarchy_prob],
        names=['parent', 'child', 'probability']
    )
    log_time(logger, table_start, "Hierarchy table creation")
    log_memory(logger, hierarchy_table, "hierarchy_table")
    
    # Convert to Python objects
    conv_start = time.time()
    hierarchy = list(zip(
        hierarchy_table['parent'].to_pylist(),
        hierarchy_table['child'].to_pylist(),
        hierarchy_table['probability'].to_pylist()
    ))
    log_time(logger, conv_start, "Python object conversion")
    logger.debug(f"Initial hierarchy size: {len(hierarchy)}")
    
    # Create adjacency structure
    adj_start = time.time()
    adj_dict: dict[bytes, set[tuple[bytes, float]]] = defaultdict(set)
    for parent, child, prob in hierarchy:
        adj_dict[child].add((parent, prob))
    log_time(logger, adj_start, "Adjacency dictionary creation")
    logger.debug(f"Adjacency dictionary size: {len(adj_dict)}")
    
    # Process thresholds
    for threshold in thresholds.to_pylist():
        thresh_iter_start = time.time()
        logger.debug(f"Processing threshold: {threshold}")
        threshold_float = float(threshold)
        
        # Filter clusters
        filter_start = time.time()
        current_clusters = clusters.filter(pc.equal(clusters['threshold'], threshold))
        log_time(logger, filter_start, f"Cluster filtering for threshold {threshold}")
        logger.debug(f"Filtered clusters size: {len(current_clusters)}")
        
        # Group by parent
        group_start = time.time()
        parent_groups = defaultdict(set)
        parent_list = current_clusters['parent'].to_pylist()
        child_list = current_clusters['child'].to_pylist()
        for parent, child in zip(parent_list, child_list):
            parent_groups[parent].add(child)
        log_time(logger, group_start, "Parent grouping")
        logger.debug(f"Number of parent groups: {len(parent_groups)}")
        
        # Process components
        comp_start = time.time()
        components_processed = 0
        for parent, members in parent_groups.items():
            if len(members) <= 2:
                continue
                
            components_processed += 1
            seen = set(members)
            current = set(members)
            ultimate_parents = set()
            
            while current:
                next_level = set()
                for node in current:
                    parents = {
                        p for p, prob in adj_dict[node] if prob >= threshold_float
                    }
                    next_parents = parents - seen
                    if not parents:
                        ultimate_parents.add(node)
                    
                    next_level.update(next_parents)
                    seen.update(parents)
                
                current = next_level
            
            # Add new hierarchy entries
            for up in ultimate_parents:
                hierarchy.append((parent, up, threshold_float))
                adj_dict[up].add((parent, threshold_float))
                
        log_time(logger, comp_start, f"Component processing for threshold {threshold}")
        logger.debug(f"Components processed: {components_processed}")
        log_time(logger, thresh_iter_start, f"Total processing for threshold {threshold}")
    
    # Final sorting
    sort_start = time.time()
    result = sorted(hierarchy, key=lambda x: (x[2], x[0], x[1]), reverse=True)
    log_time(logger, sort_start, "Final sorting")
    logger.debug(f"Final hierarchy size: {len(result)}")
    
    log_time(logger, total_start_time, "Total function execution")
    return result

In [106]:
from collections import defaultdict
import pyarrow as pa
import pyarrow.compute as pc
import logging
import time

def setup_logger() -> logging.Logger:
    """Set up a logger with appropriate formatting."""
    logger = logging.getLogger('cluster_profiler')
    if not logger.handlers:
        handler = logging.StreamHandler()
        formatter = logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
        )
        handler.setFormatter(formatter)
        logger.setLevel(logging.DEBUG)
    return logger

def _cluster_results_to_hierarchical_4(
    probabilities: pa.Table,
    clusters: pa.Table,
) -> list[tuple[bytes, bytes, float]]:
    """
    Converts results to a hierarchical structure using hybrid approach:
    1. Use parent counting to identify potential ultimate parents
    2. Use targeted graph traversal to verify and find missed relationships
    """
    logger = setup_logger()
    total_start_time = time.time()
    
    logger.info("Starting clustering process")
    logger.debug(f"Input probabilities table size: {len(probabilities)}")
    logger.debug(f"Input clusters table size: {len(clusters)}")
    
    # Initialize hierarchy with base probabilities
    init_start = time.time()
    probabilities_hash_array = probabilities['hash'].combine_chunks()
    probabilities_left_array = probabilities['left'].combine_chunks()
    probabilities_right_array = probabilities['right'].combine_chunks()
    probabilities_prob_array = probabilities['probability'].combine_chunks()
    
    hierarchy_parent = pa.concat_arrays([
        probabilities_hash_array,
        probabilities_hash_array
    ])
    hierarchy_child = pa.concat_arrays([
        probabilities_left_array,
        probabilities_right_array
    ])
    hierarchy_prob = pa.concat_arrays([
        probabilities_prob_array,
        probabilities_prob_array
    ])
    
    hierarchy_table = pa.Table.from_arrays(
        [hierarchy_parent, hierarchy_child, hierarchy_prob],
        names=['parent', 'child', 'probability']
    )
    
    # Convert to Python objects
    hierarchy = list(zip(
        hierarchy_table['parent'].to_pylist(),
        hierarchy_table['child'].to_pylist(),
        hierarchy_table['probability'].to_pylist()
    ))
    
    # Create initial parent map
    adj_dict: dict[bytes, set[tuple[bytes, float]]] = defaultdict(set)
    for parent, child, prob in hierarchy:
        adj_dict[child].add((parent, prob))
    
    logger.debug(f"Initialization took {time.time() - init_start:.4f}s")
    
    thresholds = pa.compute.unique(clusters['threshold']).sort(order='descending')
    
    # Process each threshold
    for threshold in thresholds.to_pylist():
        threshold_start = time.time()
        logger.debug(f"Processing threshold: {threshold}")
        threshold_float = float(threshold)
        
        # Filter clusters for this threshold
        current_clusters = clusters.filter(pc.equal(clusters['threshold'], threshold))
        logger.debug(f"Filtered clusters size: {len(current_clusters)}")
        
        # Group by parent
        parent_groups = defaultdict(set)
        parent_list = current_clusters['parent'].to_pylist()
        child_list = current_clusters['child'].to_pylist()
        for parent, child in zip(parent_list, child_list):
            parent_groups[parent].add(child)
        
        comp_start = time.time()
        # Process each component
        for parent, members in parent_groups.items():
            if len(members) <= 2:
                continue
                
            # Count direct parents for each node at this threshold
            parent_count = defaultdict(int)
            for node in members:
                for p, prob in adj_dict[node]:
                    if prob >= threshold_float:
                        parent_count[node] += 1
            
            # Find initial candidates with no direct parents
            candidates = {
                node for node in members 
                if parent_count[node] == 0
            }
            
            # Now do targeted graph traversal from each non-candidate
            # to identify any candidates that have indirect parents
            not_ultimate = set()
            for node in members - candidates:
                if node in not_ultimate:
                    continue
                    
                # Start traversal from this node
                stack = [node]
                seen = {node}
                
                while stack:
                    current = stack.pop()
                    for p, prob in adj_dict[current]:
                        if prob >= threshold_float and p not in seen:
                            if p in candidates:
                                # Found a path to a candidate - it can't be an ultimate parent
                                candidates.remove(p)
                            stack.append(p)
                            seen.add(p)
                            not_ultimate.add(p)
            
            # Add hierarchy entries for true ultimate parents
            for up in candidates:
                hierarchy.append((parent, up, threshold_float))
                adj_dict[up].add((parent, threshold_float))
        
        logger.debug(f"Component processing took {time.time() - comp_start:.4f}s")
        logger.debug(f"Total threshold processing took {time.time() - threshold_start:.4f}s")
    
    sort_start = time.time()
    result = sorted(hierarchy, key=lambda x: (x[2], x[0], x[1]), reverse=True)
    logger.debug(f"Sorting took {time.time() - sort_start:.4f}s")
    logger.debug(f"Final hierarchy size: {len(result)}")
    
    total_time = time.time() - total_start_time
    logger.debug(f"Total execution time: {total_time:.4f}s")
    
    return result

In [None]:
hashes = fast_generate_hashes(int(2e5))
probabilities = fast_sample_pairs(hashes, int(1e6))
clusters = to_clusters(probabilities)
hierarchical = _cluster_results_to_hierarchical(probabilities, clusters)

In [76]:
hierarchical = _cluster_results_to_hierarchical(probabilities, clusters)

In [96]:
logger = logging.getLogger('cluster_profiler')
logger.setLevel(logging.DEBUG)

hierarchical3 = _cluster_results_to_hierarchical_3(probabilities, clusters)

2024-12-10 17:39:35,730 - cluster_profiler - INFO - Starting clustering process
2024-12-10 17:39:35,732 - cluster_profiler - DEBUG - Input probabilities table size: 1000000
2024-12-10 17:39:35,732 - cluster_profiler - DEBUG - Input clusters table size: 5279385
2024-12-10 17:39:35,753 - cluster_profiler - DEBUG - Threshold computation took 0.0204 seconds
2024-12-10 17:39:35,754 - cluster_profiler - DEBUG - Number of unique thresholds: 30
2024-12-10 17:39:35,990 - cluster_profiler - DEBUG - Hierarchy initialization took 0.2352 seconds
2024-12-10 17:39:35,992 - cluster_profiler - DEBUG - Hierarchy table creation took 0.0002 seconds
2024-12-10 17:39:35,993 - cluster_profiler - DEBUG - Memory usage of hierarchy_table: 106.81 MB
2024-12-10 17:39:40,565 - cluster_profiler - DEBUG - Python object conversion took 4.5699 seconds
2024-12-10 17:39:40,566 - cluster_profiler - DEBUG - Initial hierarchy size: 2000000
2024-12-10 17:39:42,735 - cluster_profiler - DEBUG - Adjacency dictionary creation t

In [107]:
logger = logging.getLogger('cluster_profiler')
logger.setLevel(logging.DEBUG)

hierarchical4 = _cluster_results_to_hierarchical_4(probabilities, clusters)

2024-12-10 18:00:07,583 - cluster_profiler - INFO - Starting clustering process
2024-12-10 18:00:07,584 - cluster_profiler - DEBUG - Input probabilities table size: 1000000
2024-12-10 18:00:07,585 - cluster_profiler - DEBUG - Input clusters table size: 5279385
2024-12-10 18:00:14,588 - cluster_profiler - DEBUG - Initialization took 7.0027s
2024-12-10 18:00:14,610 - cluster_profiler - DEBUG - Processing threshold: 1.0
2024-12-10 18:00:14,616 - cluster_profiler - DEBUG - Filtered clusters size: 30689
2024-12-10 18:00:14,762 - cluster_profiler - DEBUG - Component processing took 0.0559s
2024-12-10 18:00:14,763 - cluster_profiler - DEBUG - Total threshold processing took 0.1526s
2024-12-10 18:00:14,763 - cluster_profiler - DEBUG - Processing threshold: 0.99
2024-12-10 18:00:14,770 - cluster_profiler - DEBUG - Filtered clusters size: 63676
2024-12-10 18:00:15,229 - cluster_profiler - DEBUG - Component processing took 0.3114s
2024-12-10 18:00:15,230 - cluster_profiler - DEBUG - Total thresho

KeyboardInterrupt: 

In [77]:
len(hierarchical)

2835379

In [93]:
len(hierarchical2)

2835379

In [100]:
len(hierarchical3)

2835379

In [105]:
len(hierarchical4)

2000000

In [64]:
len(clusters)

5279385

In [66]:
clusters

pyarrow.Table
parent: binary
child: binary
threshold: double
----
parent: [[6810342717C8589102E8A1F469EDF344BBD353D9D1DFB6B547FAC9561849CEDC,6810342717C8589102E8A1F469EDF344BBD353D9D1DFB6B547FAC9561849CEDC,47EF3BF7E7AD94C6AFF6B4832E5A7AD8C4FA1C3FDB60AA5EEB0AC555AB627833,47EF3BF7E7AD94C6AFF6B4832E5A7AD8C4FA1C3FDB60AA5EEB0AC555AB627833,A79E30871550ABAB8638EBDFF3A9E351E8537EBEDFF1EF2CAD18B5CEF4F4E8B9,...,C255483BAAF15A5475EE5DD67A30566E53673969DC2DC6D6689E0487857AF49E,C255483BAAF15A5475EE5DD67A30566E53673969DC2DC6D6689E0487857AF49E,C255483BAAF15A5475EE5DD67A30566E53673969DC2DC6D6689E0487857AF49E,C255483BAAF15A5475EE5DD67A30566E53673969DC2DC6D6689E0487857AF49E,C255483BAAF15A5475EE5DD67A30566E53673969DC2DC6D6689E0487857AF49E]]
child: [[694DCC2AE7D2038E9B3B547A23C398510A2FCB24,BA883EFEED4A93C4739969815E7D23CCC9455202,30C67178B851B0038FC5D5EA72B32032BABD7CBB,774F730EA226553663DF11E721072A6E28873E3F,B4481867BD5583E3C33D83DF4745FC059ECBBB98,...,FFFD0F3371B95C2DC93EBDE701309ECADA1DBE21,FFFDB881D

## Dummy data

* Probabilities standard/update: model, cluster, probability
* Clusters (data) standard/update: hash, dataset, id
* Clusters (cluster) standard/update: hash, dataset (null), id (null)
* Contains standard/update

Or

* clusters_dataset1_standard_sml
* clusters_dataset1_update_sml
* clusters_dataset2_standard_sml
* clusters_dataset2_update_sml
* clusters_clusters_standard_sml
* clusters_clusters_update_sml
* contains_standard_sml
* contains_update_sml
* probabilities_standard_sml
* probabilities_update_sml
* clusters_dataset1_standard_lrg
* clusters_dataset1_update_lrg
* clusters_dataset2_standard_lrg
* clusters_dataset2_update_lrg
* clusters_clusters_standard_lrg
* clusters_clusters_update_lrg
* contains_standard_lrg
* contains_update_lrg
* probabilities_standard_lrg
* probabilities_update_lrg

By

* Generate 20m hashes for dataset1
* Generate 20m pks for dataset1
* Generate 5m hashes for dataset2
* Generate 5m pks for dataset2
* Create dataset1 standard w/ 10m hashes and 180k pks
* Create dataset1 update w/ all hashes and 200k pks

In [4]:
from itertools import count
from uuid import UUID, uuid4
from typing import Generator

def d1gen() -> Generator[UUID, None, None]:
    """Generate UUIDs for dataset 1."""

    for i in count():
        yield UUID(int=i, version=4)

def d2gen() -> Generator[UUID, None, None]:
    """Generate UUIDs for dataset 2."""
    for _ in count():
        yield uuid4()

d1ids = d1gen()
d2ids = d2gen()

In [5]:
e = 5

def num(c: int, e: int) -> int:
    return int(c * (10 ** e))

In [6]:
def create_test_tables(
    size: int,
    name: bytes,
    id_gen: Generator[str, None, None]
) -> tuple[pa.Table, pa.Table]:
    """
    Create two Arrow tables: a standard version and an update version with controlled changes.
    The update version maintains some hashes, reassigns some IDs, and adds new entries.
    
    Args:
        size: Number of rows in each table
        name: Dataset name as bytes (will be used as prefix for both versions)
        id_gen: Generator that yields string IDs
        
    Returns:
        Tuple[pa.Table, pa.Table]: Standard and update tables
    """
    # Generate standard table data
    standard_hashes = fast_generate_hashes(size)
    standard_ids = [[str(next(id_gen))] for _ in range(size)]
    
    # Create standard table
    standard_ids_nested = pa.array(standard_ids, type=pa.list_(pa.string()))
    standard_dataset = pa.array([name] * len(standard_hashes), type=pa.binary())
    
    standard_table = pa.Table.from_arrays(
        [standard_hashes, standard_dataset, standard_ids_nested],
        names=['hash', 'dataset', 'id']
    )

    # Calculate sizes for different types of changes
    keep_count = int(size * 0.3)  # 30% remain same
    reuse_count = int(size * 0.2)  # 20% same hash, new ID
    move_count = int(size * 0.2)   # 20% same ID, new hash
    new_count = size - (keep_count + reuse_count + move_count)  # 30% new entries

    # Build update version
    update_hashes = []
    update_ids = []

    # 1. Keep some entries exactly the same
    for i in range(keep_count):
        update_hashes.append(standard_hashes[i])
        update_ids.append(standard_ids[i])
    
    # 2. Same hash, new IDs
    for i in range(keep_count, keep_count + reuse_count):
        update_hashes.append(standard_hashes[i])
        update_ids.append(
            [str(next(id_gen))]
        )
    
    # 3. Same IDs, new hashes
    new_hashes = fast_generate_hashes(move_count)
    for i, hash in enumerate(new_hashes):
        idx = keep_count + reuse_count + i
        update_hashes.append(hash)
        update_ids.append(standard_ids[idx])
    
    # 4. Completely new entries
    final_new_hashes = fast_generate_hashes(new_count)
    for hash in final_new_hashes:
        update_hashes.append(hash)
        update_ids.append(
            [str(next(id_gen))]
        )

    # Create update table
    update_ids_nested = pa.array(update_ids, type=pa.list_(pa.string()))
    update_dataset = pa.array([name] * len(update_hashes), type=pa.binary())

    update_table = pa.Table.from_arrays(
        [pa.array(update_hashes), update_dataset, update_ids_nested],
        names=['hash', 'dataset', 'id']
    )

    return standard_table, update_table

In [7]:
dataset1_standard, dataset1_update = create_test_tables(num(2, e), b'dataset1', d1ids)

dataset1_standard, dataset1_update

(pyarrow.Table
 hash: binary
 dataset: binary
 id: list<item: string>
   child 0, item: string
 ----
 hash: [[BB8BD9281B7698DA11C6AC58A7528954CCB32AF5,14CA930597D476D7D947BC29FF314ADC8D4B19A4,EFF6E0174705BD6169ABE3325A0D813592F6F04C,5F174867850BCA27C1664AD48A29C25B3D9B5C46,0866C5A7F70E10265E8E701432C389889EDE3A42,...,D0FDB00682A7F63981858B8BFB5674743F2CD1C5,0BA9AA21B31E28D1FEAA557F2B991375565E4066,088D04FCB221AE3BE2070AA143AA856526F44BCD,93ED08CD471857A3DBAA40364B60BA92E1441DE3,62A17D122B9DB5304999934C8180F22B480E30A6]]
 dataset: [[6461746173657431,6461746173657431,6461746173657431,6461746173657431,6461746173657431,...,6461746173657431,6461746173657431,6461746173657431,6461746173657431,6461746173657431]]
 id: [[["00000000-0000-4000-8000-000000000000"],["00000000-0000-4000-8000-000000000001"],...,["00000000-0000-4000-8000-000000030d3e"],["00000000-0000-4000-8000-000000030d3f"]]],
 pyarrow.Table
 hash: binary
 dataset: binary
 id: list<item: string>
   child 0, item: string
 ----
 hash: 

In [8]:
dataset2_standard, dataset2_update = create_test_tables(num(2, e), b'dataset2', d2ids)

dataset2_standard, dataset2_update

(pyarrow.Table
 hash: binary
 dataset: binary
 id: list<item: string>
   child 0, item: string
 ----
 hash: [[121E7D7DFC1F8102E589CDC2D628BF8BA14AE09B,DF6EFB89680F7F2DCA4A8B52A09DE7D073AB93E3,9450D7CB108639ED251E605DFA36F5E799AC7255,41B811585371BD576D93A6679D7B4651134BC94F,3D67BDBA1A20E6BA99712999ACF67705E39DDF9B,...,CBE891D5F3DB086232C306E3A775886119A196E3,CC9CBB781A04633F561B146D93DF9136B4AF793B,04DDAD1680AEAFBD86A54C2C702E57F54B55D994,553A343F82991092B48492AE901CD7698A358A82,386A186890E97B44B1E15CB825558D4B39B5039C]]
 dataset: [[6461746173657432,6461746173657432,6461746173657432,6461746173657432,6461746173657432,...,6461746173657432,6461746173657432,6461746173657432,6461746173657432,6461746173657432]]
 id: [[["ab2e9b34-3656-487d-874b-50be5d5386cc"],["b18f0852-a3c6-484d-9c6b-bae7d989a17f"],...,["d306f395-9255-413d-9fad-5ed0066a60e3"],["41c039bc-5642-4295-b722-3b28a7aa5535"]]],
 pyarrow.Table
 hash: binary
 dataset: binary
 id: list<item: string>
   child 0, item: string
 ----
 hash: 

In [None]:
from time import time

def fast_consistent_pairs(
    standard_left: pa.Array,
    standard_right: pa.Array,
    update_left: pa.Array, 
    update_right: pa.Array,
    n: int = int(1e8)
) -> tuple[pa.Table, pa.Table]:
    """Generate partially consistent hash pairs between standard and update tables."""
    t0 = time()
    
    # Calculate sizes and adjust n
    standard_total = len(standard_left) * len(standard_right)
    update_total = len(update_left) * len(update_right)
    n = min(n, min(standard_total, update_total))
    print(f"Setup time: {time() - t0:.3f}s")
    
    t1 = time()
    # Generate random permutation of just the first n indices instead of all possible indices
    standard_flat = np.random.permutation(n)
    # Scale to full range
    standard_flat = (standard_flat * (standard_total / n)).astype(np.int64)
    std_left_idx = standard_flat // len(standard_right)
    std_right_idx = standard_flat % len(standard_right)
    print(f"Standard index generation: {time() - t1:.3f}s")
    
    t2 = time()
    # For update table, make ~30% of pairs match standard pairs
    match_count = n // 3
    update_flat = np.empty(n, dtype=np.int64)
    update_flat[:match_count] = standard_flat[:match_count]  # Copy matching pairs
    
    # Same optimization for the remaining indices
    remaining = np.random.permutation(n - match_count)
    remaining = (remaining * (update_total / (n - match_count))).astype(np.int64)
    update_flat[match_count:] = remaining
    
    up_left_idx = update_flat // len(update_right)
    up_right_idx = update_flat % len(update_right)
    print(f"Update index generation: {time() - t2:.3f}s")
    
    t3 = time()
    # Generate hashes using original SHA1 approach
    standard_hashes = [
        sha1(i.to_bytes(8, 'big')).digest() 
        for i in range(n)
    ]
    standard_combined = pa.array(standard_hashes, type=pa.binary())
    print(f"Standard hash generation: {time() - t3:.3f}s")
    
    t4 = time()
    # For update table, reuse hashes for matching pairs
    update_hashes = standard_hashes[:match_count] + [
        sha1((i + n).to_bytes(8, 'big')).digest()
        for i in range(n - match_count)
    ]
    update_combined = pa.array(update_hashes, type=pa.binary())
    print(f"Update hash generation: {time() - t4:.3f}s")
    
    t5 = time()
    # Create tables
    standard_table = pa.table({
        'hash': standard_combined,
        'left': standard_left.take(pa.array(std_left_idx)),
        'right': standard_right.take(pa.array(std_right_idx)),
        'probability': pa.array(np.round(0.7 + 0.3 * np.random.random(n), 2), type=pa.float64())
    })
    print(f"Standard table creation: {time() - t5:.3f}s")

    t6 = time()
    update_table = pa.table({
        'hash': update_combined,
        'left': update_left.take(pa.array(up_left_idx)),
        'right': update_right.take(pa.array(up_right_idx)),
        'probability': pa.array(np.round(0.7 + 0.3 * np.random.random(n), 2), type=pa.float64())
    })
    print(f"Update table creation: {time() - t6:.3f}s")
    
    print(f"Total time: {time() - t0:.3f}s")
    return standard_table, update_table

probabilities_standard, probabilities_update = fast_consistent_pairs(
    dataset1_standard.column('hash'),
    dataset2_standard.column('hash'),
    dataset1_update.column('hash'),
    dataset2_update.column('hash'),
    n=1000
)

probabilities_standard, probabilities_update

Setup time: 0.000s
Standard index generation: 0.003s


In [226]:
dataset2_hashes = fast_generate_hashes(num(5, e - 1))
dataset2_bytes = b"dataset2"
dataset2_ids = [[str(next(d2ids))] for _ in range(num(5, e - 1))]

dataset2_ids_nested = pa.array(dataset2_ids, type=pa.list_(pa.string()))
dataset2_dataset = pa.array([dataset2_bytes] * len(dataset2_hashes), type=pa.binary())

dataset_2 = pa.Table.from_arrays(
    [dataset2_hashes, dataset2_dataset, dataset2_ids_nested],
    names=['hash', 'dataset', 'id']
)

dataset_2

pyarrow.Table
hash: binary
dataset: binary
id: list<item: string>
  child 0, item: string
----
hash: [[EE3F87AB6C85B96F9ECAA4102770DCD9DBC691EE,21A1F8D571F9FFC475C541610426418EDF5CCC45,1DC323B8253379511A1E5C26FE717A532FD2554F,64475A83F4E9DC9A7D6580B086414123D33C7699,6E37DB26F6309EA19AE5AA3B7582EF4525E3B1E1,...,8943E7E226DBA6E74B4D334D4DA3C6EADEAD2205,AC90B261C80108D9494A18D72FDCCD93C3DDA748,0EF069ADC5AA576D3E21B334C11B4D322E9FAF11,D0398ED473DA018B088A5CE6757156A9EDB4E504,903B543FA6EE6314E45A45641CEE52FCDDFEE2F6]]
dataset: [[6461746173657432,6461746173657432,6461746173657432,6461746173657432,6461746173657432,...,6461746173657432,6461746173657432,6461746173657432,6461746173657432,6461746173657432]]
id: [[["39bba755-998a-4ceb-9422-11b2ccd26804"],["4ff8dc4b-9406-4d85-9abb-6dcf3378f29e"],...,["43ca0a07-da74-44df-bab3-4d685c2e17be"],["c44079c3-5d53-4f2f-8b32-342c49e8e2dd"]]]

In [None]:
hashes = fast_generate_hashes(int(2e5))
probabilities = fast_sample_pairs_lr(hashes, int(1e6))
clusters = to_clusters(probabilities)
hierarchical = _cluster_results_to_hierarchical(probabilities, clusters)

## Compression

In [None]:
arrow_table.schema, f"{arrow_table.nbytes / (1024 * 1024):.2f}MB"

(hash: binary
 left: binary
 right: binary
 probability: double,
 '7653.24MB')

In [None]:
import pyarrow as pa
import pyarrow.compute as pc
import pyarrow.parquet as pq
from pathlib import Path
import hashlib

class HashIndex:
    def __init__(self, hash_type: str = 'sha1', hashes: list[bytes] | pa.Array | None = None):
        """Create a new HashIndex instance.
        
        Args:
            hash_type: Hash algorithm to use (default: 'sha1')
            hashes (Optional): Initial list of hashes to insert (default: None)
        """
        try:
            hash_obj = hashlib.new(hash_type)
            self._hash_size: int = hash_obj.digest_size   # Return the digest size in bytes
        except ValueError as e:
            raise ValueError(f'Unsupported hash type: {hash_type}') from e
        
        # Initialize empty table with correct schema
        self.schema = pa.schema([
            ('hash', pa.binary(self._hash_size)),
            ('id', pa.int32())
        ])
        self.table = pa.Table.from_pydict({
            'hash': [],
            'id': []
        }, schema=self.schema)
        self.next_id: int = 0

        if hashes is not None:
            self.insert_hashes(hashes)

    def __eq__(self, other: 'HashIndex') -> bool:
        """
        Compare this HashIndex with another for equality.
        
        Two HashIndex instances are considered equal if they:
            1. Have the same hash size
            2. Have the same next_id
            3. Have equal tables (same schema and data)
        
        Args:
            other: Another HashIndex instance to compare with
            
        Returns:
            bool: True if the indexes are equal, False otherwise
        """
        if not isinstance(other, HashIndex):
            return False
            
        return (
            self._hash_size == other._hash_size and
            self.next_id == other.next_id and
            self.table.equals(other.table)
        )

    def _list_to_array(self, hashes: list[bytes] | pa.Array) -> pa.Array:
        if isinstance(hashes, list):
            return pa.array(hashes, type=pa.binary(self._hash_size))
        return hashes

    def insert_hashes(self, hashes: list[bytes] | pa.Array) -> pa.Array:
        """
        Insert new hashes and return their indices. For existing hashes, returns their
        current indices. For new hashes, assigns and returns new indices.
        
        Args:
            hashes: Array of SHA-1 hashes to insert
        Returns:
            Array of indices (both existing and newly assigned)
        """
        hashes = self._list_to_array(hashes)
        
        # If table is empty, fast path to create initial table
        if len(self.table) == 0:
            ids = pa.array(range(len(hashes)), type=pa.int32())
            self.table = pa.Table.from_arrays([hashes, ids], schema=self.schema)
            self.next_id = len(hashes)
            # Sort table by hash for future binary searches
            self.table = self.table.sort_by('hash')
            return ids

        # Find existing hashes using binary search
        indices = pc.index_in(hashes, self.table['hash'])
        is_new = pc.is_null(indices)
        new_count = pc.sum(pc.cast(is_new, pa.int32())).as_py()
        
        if new_count > 0:
            # Get the new hashes
            new_hashes = pc.filter(hashes, is_new)
            
            # Pre-allocate new IDs array
            new_ids = pa.array(
                range(self.next_id, self.next_id + new_count), 
                type=pa.int32()
            )
            self.next_id += new_count
            
            # Append in one operation and sort once
            new_table = pa.Table.from_arrays([new_hashes, new_ids], schema=self.schema)
            self.table = pa.concat_tables([self.table, new_table])
            self.table = self.table.sort_by('hash')
            
            # Final lookup to get all IDs in correct order
            indices = pc.index_in(hashes, self.table['hash'])
        
        return pc.take(self.table['id'], indices)

    def get_hashes(self, ids: list[int] | pa.Array) -> pa.Array:
        """
        Look up hashes by their IDs
        
        Args:
            ids: Array of IDs to look up
        Returns:
            Array of corresponding hashes (null for unknown indices)
        """
        if isinstance(ids, list):
            ids = pa.array(ids, type=pa.int32())
        
        positions = pc.index_in(ids, self.table['id'])
        return pc.take(self.table['hash'], positions)

    def get_indices(self, hashes: list[bytes] | pa.Array) -> pa.Array:
        """
        Look up IDs for existing hashes. Returns null for unknown hashes.
        
        Args:
            hashes: Array of hashes to look up
        Returns:
            Array of corresponding IDs (null for unknown hashes)
        """
        hashes = self._list_to_array(hashes)
            
        indices = pc.index_in(hashes, self.table['hash'])
        return pc.take(self.table['id'], indices)
    

    def to_parquet(self, path: str | Path, compression: str = 'zstd') -> None:
        """
        Save the HashIndex to a Parquet file.
        
        Args:
            path: Path to save the Parquet file
            compression: Compression algorithm to use (default: 'zstd')
                Options include: 'none', 'snappy', 'gzip', 'brotli', 'lz4', 'zstd'
        
        Raises:
            IOError: If the file cannot be written
            ValueError: If the compression algorithm is not supported
        """
        path = Path(path)
        path.parent.mkdir(parents=True, exist_ok=True)
        
        metadata = {
            b'next_id': str(self.next_id).encode(),
            b'hash_size': str(self._hash_size).encode()
        }

        existing_metadata = self.table.schema.metadata or {}
        merged_metadata = {**existing_metadata, **metadata}
        
        try:
            pq.write_table(
                self.table.replace_schema_metadata(merged_metadata),
                path,
                compression=compression,
            )
        except Exception as e:
            raise IOError(f"Failed to write Parquet file: {e}") from e

    @classmethod
    def from_parquet(cls, path: str | Path) -> 'HashIndex':
        """
        Load a HashIndex from a Parquet file.
        
        Args:
            path: Path to the Parquet file
        
        Returns:
            HashIndex: New HashIndex instance loaded from the file
        
        Raises:
            IOError: If the file cannot be read or is invalid
            ValueError: If the file format is invalid
        """
        path = Path(path)
        
        try:
            table = pq.read_table(path)
            metadata = table.schema.metadata
            
            if not metadata or b'next_id' not in metadata or b'hash_size' not in metadata:
                raise ValueError("Invalid Parquet file: missing required metadata")
            
            # Create new instance
            instance = cls.__new__(cls)
            
            instance._hash_size = int(metadata[b'hash_size'].decode())
            instance.next_id = int(metadata[b'next_id'].decode())
            
            instance.schema = table.schema
            instance.table = table
            
            return instance
            
        except Exception as e:
            raise IOError(f"Failed to load Parquet file: {e}") from e

In [None]:
test_hashes = fast_generate_hashes(int(2e7))
test_hashes2 = pa.concat_arrays([fast_generate_hashes(int(2e7) / 2), test_hashes[:int(2e7 / 2)]])
hidx_5 = HashIndex(hash_type='sha1', hashes=test_hashes)
test_hashes[:5], test_hashes2[:5]

(<pyarrow.lib.BinaryArray object at 0x118363fa0>
 [
   7BDB9431F1E8DDBF1ACCF691D4B8661CF9B43A25,
   392BCA5B7AFC18C6727FC30EC57B718DDAAFF953,
   F884626B7A54AEFFECB3CA3DDB1493613CC0C7A9,
   C94022C8F551879B31D10F99BCC9A3561BAE7612,
   08458C06AD784FCDC7809825056654AD5F119C4C
 ],
 <pyarrow.lib.BinaryArray object at 0x11f06ada0>
 [
   31E7C33F498E4B7E95B1C52ECACAF348CEFFAE71,
   CC141AB85B40C152864B1A24689072D718332BDD,
   C769376877CFA78C46210E9B91D4446E82A6D7CE,
   9263614A6B6C53967ABA62FA2859F785C58B1016,
   15B8ABB98A8ED7D0F008C1234583DCE9DC34AFFF
 ])

In [None]:
indices = pa.array(random.sample(range(len(test_hashes2)), k=len(test_hashes2)))
test_hashes3 = test_hashes2.take(indices)

In [None]:
hidx_5.insert_hashes(test_hashes3)

<pyarrow.lib.ChunkedArray object at 0x13dc3c8b0>
[
  [
    8997172,
    6063643,
    6260234,
    3255430,
    7485144,
    ...
    9917674,
    5612100,
    2723141,
    201414,
    4680984
  ]
]

In [None]:
test_hashes = fast_generate_hashes(40)
test_hashes2 = fast_generate_hashes(40)
hidx = HashIndex(hash_type='sha1')
test_hashes[:5], test_hashes2[:5]

(<pyarrow.lib.BinaryArray object at 0x1187140a0>
 [
   1F21ABB3185DA8D5340A8298C6CBC9C21F8AF9CD,
   813583A225BA22DE65149EC502FA79A082F93D8C,
   C4587910358DC70CD81B6B106FD724685D8AE971,
   79924525EAF98B48E03E00213E0FFBFD17AD8778,
   5E668D4D878659EE35DC8F3E7EFE5B19D68890FA
 ],
 <pyarrow.lib.BinaryArray object at 0x11ec591e0>
 [
   BDBD93052D2290495857C2A46936C468CA4A7FD6,
   AC0FFE1E2F41A6D12D5F56E32A00F8B48D47EF66,
   A3DD64FC8F69428EA0693218AF31FEA22F8BCDDE,
   0C7FF3A1FB8441369A80059DC2FA08F678A05D86,
   BD939DD6F750A40D0C3CFCB3F1C3B202A8BEADE2
 ])

In [None]:
file_path = Path.cwd()

hidx3 = HashIndex(hash_type='sha1', hashes=fast_generate_hashes(int(2e7)))

hidx3.to_parquet(file_path / 'hash_index.parquet')

# del hidx3

In [None]:
hidx3 = HashIndex.from_parquet(file_path / 'hash_index.parquet')

In [None]:
hidx3.get_hashes([random.randint(0, int(2e7)) for _ in range(int(2e5))])[:5]

<pyarrow.lib.ChunkedArray object at 0x11ec7e6b0>
[
  [
    3F0B311BB3EF9EAEB211378D538A67491E43429F,
    1234FE9CB4C1413559FD09FBBAF512DD26DF7CF6,
    C163CF811B2021CE8D17AFE5720F9DED2D602543,
    D151C016C7C94022B0E8A7826EE698A0B5FB947E,
    30EE3D4D4A6497F0C1A85158F895398C9E901F63
  ]
]

In [None]:
hidx.insert_hashes(test_hashes)[:5], hidx.insert_hashes(test_hashes2)[:5]

(<pyarrow.lib.Int32Array object at 0x11ecc1f60>
 [
   0,
   1,
   2,
   3,
   4
 ],
 <pyarrow.lib.ChunkedArray object at 0x11ef1f600>
 [
   [
     40,
     41,
     42,
     43,
     44
   ]
 ])

In [None]:
h = hidx.get_hashes([0])[0]
i = hidx.get_indices([h])[0]
h2 = hidx.get_hashes([i])[0]

h == h2

True

In [None]:
file_path = Path.cwd()

hidx.to_parquet(file_path / 'hash_index.parquet')
hidx2 = HashIndex.from_parquet(file_path / 'hash_index.parquet')

hidx == hidx2

True

In [None]:
hidx.get_hashes([0, 40, 7, 42, 190])[:5]

<pyarrow.lib.ChunkedArray object at 0x15a223d80>
[
  [
    9E24FC1BC65D8A134990D02A6C4A40E3AAB20FAA,
    09C61C3BE151A112228E61AC7C7C1B81DC625CDF,
    CEED556D7FE741F9BC0FA5A464DEF839989F3E74,
    76506960448B8BB6CB2AF337FBE9852E1ABB1486,
    null
  ]
]

In [None]:
hidx.get_indices(test_hashes[:5]), hidx.get_indices(test_hashes2[:5])

(<pyarrow.lib.ChunkedArray object at 0x120b61e90>
 [
   [
     0,
     1,
     2,
     3,
     4
   ]
 ],
 <pyarrow.lib.ChunkedArray object at 0x15a2d1800>
 [
   [
     40,
     41,
     42,
     43,
     44
   ]
 ])

In [None]:
hashes = fast_generate_hashes(int(2e7))
t1 = fast_sample_pairs(hashes, int(1e8))

In [None]:
def index_probability_table(table: pa.Table) -> tuple[HashIndex, pa.Table]:

    hidx = HashIndex(hash_type='sha1')

    table = table.set_column(
        table.column_names.index("left"),
        "left",
        hidx.insert_hashes(table['left'])
    )
    table = table.set_column(
        table.column_names.index("right"),
        "right",
        hidx.insert_hashes(table['right'])
    )
    table = table.set_column(
        table.column_names.index("hash"),
        "hash",
        hidx.insert_hashes(table['hash'])
    )

    return hidx, table

hidx, t2 = index_probability_table(t1)

f"{t1.nbytes / (1024 * 1024):.2f}MB", f"{hidx.table.nbytes / (1024 * 1024):.2f}MB", f"{t2.nbytes / (1024 * 1024):.2f}MB", f"{(hidx.table.nbytes + t2.nbytes) / (1024 * 1024):.2f}MB"

('7653.24MB', '4135.36MB', '1907.35MB', '6042.71MB')

In [None]:
(15 * 15) / 60

3.75

In [None]:
(4230 / 5) / 60

14.1

5mb/s upload speed -- 25 mins for 7.6Gb, 14min for 4.3Gb

In [None]:
import pyarrow.parquet as pq
from pathlib import Path

file_path = Path.cwd()

pq.write_table(
    t2,
    file_path / 'probabilities_normalised_brot.parquet',
    compression='BROTLI',
    # compression_level=16,
    # use_dictionary=True,
    # write_statistics=True,
    # use_byte_stream_split=True,
    # row_group_size=1048576  # 1MB row groups
)

In [None]:
hidx.to_parquet(file_path / 'hash_index.parquet')

* Impose threshold (don't store below x, default x)
* Can we automate to check rub probabilities?
    * Find any gigantic clusters as a result
    * Run at plausible thresholds?
* Is threshold but need a rule of thumb
    * If dedupe, n^2 - n/2
    * If link, n^2 - n/2 ish
* "How do you know that" is hard in a chain, and people want that

* duckdb from pg OR pg from duckdb -- both available to work with pg
    * Bad for atomic updates

---

100m

ZSTD 10s at 1, 1.86Gb
ZSTD 10s at 4, 1.86Gb 
ZSTD 50s at 15, 1.86Gb
ZSTD 4m at 16, 1.86Gb
ZSTD 7m at 22, 1.52Gb
BROTLI 2m at default, 1.57Gb

Index

Snappy 13s at default, 3.8Gb
BROTLI 6m at default, 2.6Gb
ZSTD 20s at default, 2.75Gb

ZSTD balanced between the two: 4.235Gb (3Gb saving)

--

Work through pg/duckdb idea

- Clusters and contains need appending -- parquet or postgres?
- How is duckdb informed about new parquet
    - Lambda? API?
- Can this perform?

* Duckdb
    * R/W or R mode -- we'd launch in one or tother
        * No lambdas, just run the new command
* Consolidating a dataset (pruning old records) -- like upsert
    * 3 ways
        * duckdb directly, w process. Load two parquets, prune old
        * pandas
        * polars
* (?) married to parquet