In [1]:
from hashlib import sha1
from typing import List, Optional

import numpy as np
import pyarrow as pa

def fast_generate_hashes(n: int = int(2e7)) -> pa.Array:
    """Build a PyArrow binary array of SHA1 digests for the integers 0..n-1.

    Each integer is encoded as 8 big-endian bytes before being hashed, so the
    i-th element of the result is ``sha1(i.to_bytes(8, 'big')).digest()``.

    Args:
        n: Number of digests to produce; coerced with ``int()``.

    Returns:
        A ``pa.binary()`` array with one digest per integer.
    """
    total = int(n)
    digests = []
    for value in range(total):
        encoded = value.to_bytes(8, 'big')
        digests.append(sha1(encoded).digest())
    return pa.array(digests, type=pa.binary())

def fast_sample_pairs(
    hashes: pa.Array,
    n: int = int(1e8),
    seed: Optional[int] = None,
) -> pa.Table:
    """Sample n random pairs of pool hashes and label each pair with a new hash.

    Args:
        hashes: Pool of hashes to draw from; must hold at least two entries so
            that the two sides of a pair can come from different positions.
        n: Number of pairs (rows) to generate; coerced with ``int()``.
        seed: Optional seed for a private ``np.random.RandomState``. When
            ``None`` (the default) the global ``np.random`` state is used, so
            existing callers, including those relying on ``np.random.seed``,
            see the same draw order as before.

    Returns:
        A table with four columns:
        ``hash`` is the SHA1 digest of an 8-byte big-endian counter,
        ``left`` and ``right`` are entries of ``hashes`` taken at two different
        positions, and ``probability`` is a float64 drawn uniformly from [0, 1).

    Raises:
        ValueError: If ``hashes`` contains fewer than two entries.
    """
    n = int(n)
    hash_count = len(hashes)
    if hash_count < 2:
        # Without this guard randint fails with an opaque "low >= high" error.
        raise ValueError(
            f"need at least 2 hashes to sample distinct pairs, got {hash_count}"
        )

    rng = np.random if seed is None else np.random.RandomState(seed)

    # Draw `right` from one fewer slot, then shift every value at or above
    # `left` up by one: this yields a uniform index that never equals `left`.
    left = rng.randint(0, hash_count, n)
    right = rng.randint(0, hash_count - 1, n)
    right += (right >= left)

    # Gather the pool entries with PyArrow
    left_hashes = hashes.take(pa.array(left))
    right_hashes = hashes.take(pa.array(right))

    # One probability per pair
    probs = pa.array(rng.random_sample(n), type=pa.float64())

    # Start the counter at hash_count: a pool built by hashing 0..hash_count-1
    # (as fast_generate_hashes does) would otherwise reappear verbatim in the
    # 'hash' column, so the "new" hashes would not be new at all.
    new_hashes = [sha1(i.to_bytes(8, 'big')).digest()
                  for i in range(hash_count, hash_count + n)]
    combined_arr = pa.array(new_hashes, type=pa.binary())

    # Assemble the table directly from the Arrow arrays
    return pa.table({
        'hash': combined_arr,
        'left': left_hashes,
        'right': right_hashes,
        'probability': probs
    })


In [4]:
# Build a pool of 20 million hashes, then sample 100 million pairs from it.
hashes = fast_generate_hashes(n=20_000_000)
arrow_table = fast_sample_pairs(hashes, n=100_000_000)

In [5]:
# Display the schema next to the in-memory size (bytes divided by 1024**2).
arrow_table.schema, f"{arrow_table.nbytes / 1024 ** 2:.2f}MB"

(hash: binary
 left: binary
 right: binary
 probability: double,
 '7653.24MB')

In [8]:
import pyarrow.parquet as pq
from pathlib import Path

# Persist the table as 'probabilities.parquet' in the current working directory.
file_path = Path.cwd().joinpath('probabilities.parquet')
pq.write_table(arrow_table, file_path)