In [1]:
import numpy as np
from collections import Counter
from tqdm import tqdm

In [2]:
def load_binary_file(file):
    """
    Load a numpy array from a binary file
    """
    with open(file, 'rb') as f:
        packed_data = f.read()
    data = np.frombuffer(packed_data, dtype=np.uint64)
    # return np.unpackbits(data, axis=-1, bitorder='little')
    return data

In [3]:
all_binary = load_binary_file('output_set.bin')

In [5]:
print(len(all_binary))
print(all_binary)

60801408
[ 9516115917310651001  9516115917312745613  9516115917717502581 ...
 17725741141956831395 17725741141956835459 17725741142021845635]


In [64]:
def hamming_distance(a, b):
    """Calculate the Hamming distance using bit_count() (Python 3.10+)."""
    return (a ^ b).bit_count()

def hamming_distances_vectorized(point, lst):
    # Convert inputs to NumPy arrays
    # point = np.uint64(point)
    # lst = np.array(lst, dtype=np.uint64)
    
    # XOR the point with the list
    xor_result = point ^ lst  # XOR result is still uint64
    
    # Convert uint64 to uint8 view (8 bytes per uint64)
    xor_bytes = xor_result.view(np.uint8)  # Interpret each uint64 as 8 uint8 values
    
    # Unpack bits and count the number of 1s for each uint64
    unpacked_bits = np.unpackbits(xor_bytes, axis=0).reshape(len(lst), 64)
    distances = unpacked_bits.sum(axis=1)
    
    return distances

def compute_centroid(cluster):
    """
    Compute the centroid (bitwise majority) of a cluster of binary codes.
    
    Args:
        cluster (list of int): List of 64-bit binary codes as integers.
        
    Returns:
        int: Centroid as a 64-bit binary code.
    """
    if not cluster:
        return None
    
    # Convert cluster to an array of bits
    cluster_bits = np.array([list(map(int, bin(code)[2:].zfill(64))) for code in cluster])
    
    # Compute bitwise majority
    majority_bits = (cluster_bits.sum(axis=0) >= (len(cluster) / 2)).astype(int)
    
    # Convert majority bits back to integer
    centroid = int("".join(map(str, majority_bits)), 2)
    return centroid

def kmeans_hamming(dataset, k, max_iters=100):
    """
    Perform K-means clustering using Hamming distance for 64-bit binary codes.
    
    Args:
        dataset (list of int): List of 64-bit binary codes as integers.
        k (int): Number of clusters.
        max_iters (int): Maximum number of iterations.
        
    Returns:
        list: Cluster assignments for each data point.
        list: Final centroids for each cluster.
    """
    n = len(dataset)
    
    # Step 1: Initialize centroids (randomly select k points from the dataset)
    centroids = np.random.choice(dataset, k, replace=False)
    
    # Step 2: Iterate until convergence or max iterations
    for iteration in range(max_iters):
        print(f"Iteration {iteration + 1}")
        # Assignment step: Assign each point to the nearest centroid
        clusters = [[] for _ in range(k)]
        for point in tqdm(dataset):
            # distances = [hamming_distance(point, centroid) for centroid in centroids]
            distances = hamming_distances_vectorized(point, np.array(centroids))
            cluster_idx = np.argmin(distances)
            clusters[cluster_idx].append(point)
        
        # Update step: Compute new centroids
        new_centroids = []
        for cluster in clusters:
            if cluster:
                new_centroids.append(compute_centroid(cluster))
            else:
                # If a cluster is empty, reinitialize its centroid randomly
                new_centroids.append(np.random.choice(dataset))
        
        # Check for convergence (centroids do not change)
        if np.array_equal(centroids, new_centroids):
            break
        centroids = new_centroids
    
    # Final assignment
    cluster_assignments = [None] * n
    for cluster_idx, cluster in enumerate(clusters):
        for point in cluster:
            cluster_assignments[dataset.index(point)] = cluster_idx
    
    return cluster_assignments, centroids

In [66]:
# Perform K-means clustering
k = 2000  # Number of clusters
cluster_assignments, centroids = kmeans_hamming(all_binary, k)

Iteration 1


  0%|          | 66922/60801408 [00:05<1:21:37, 12401.03it/s]


KeyboardInterrupt: 

In [67]:
import numpy as np
from tqdm import tqdm
from multiprocessing import Pool, cpu_count

def hamming_distance(a, b):
    """Calculate the Hamming distance using bit_count() (Python 3.10+)."""
    return (a ^ b).bit_count()


def compute_centroid(cluster):
    """
    Compute the centroid (bitwise majority) of a cluster of binary codes.
    
    Args:
        cluster (list of int): List of 64-bit binary codes as integers.
        
    Returns:
        int: Centroid as a 64-bit binary code.
    """
    if not cluster:
        return None
    
    # Convert cluster to an array of bits
    cluster_bits = np.array([list(map(int, bin(code)[2:].zfill(64))) for code in cluster])
    
    # Compute bitwise majority
    majority_bits = (cluster_bits.sum(axis=0) >= (len(cluster) / 2)).astype(int)
    
    # Convert majority bits back to integer
    centroid = int("".join(map(str, majority_bits)), 2)
    return centroid


def compute_hamming_distances(args):
    """
    Compute the Hamming distances between a point and all centroids.
    
    Args:
        args (tuple): (point, centroids)
        
    Returns:
        tuple: (point, index of closest centroid, distance to closest centroid)
    """
    point, centroids = args
    distances = [(point ^ centroid).bit_count() for centroid in centroids]
    cluster_idx = np.argmin(distances)
    return point, cluster_idx


def kmeans_hamming(dataset, k, max_iters=100):
    """
    Perform K-means clustering using Hamming distance for 64-bit binary codes.
    
    Args:
        dataset (list of int): List of 64-bit binary codes as integers.
        k (int): Number of clusters.
        max_iters (int): Maximum number of iterations.
        
    Returns:
        list: Cluster assignments for each data point.
        list: Final centroids for each cluster.
    """
    n = len(dataset)
    
    # Step 1: Initialize centroids (randomly select k points from the dataset)
    centroids = np.random.choice(dataset, k, replace=False)
    cluster_assignments = [None] * n
    
    # Step 2: Iterate until convergence or max iterations
    for iteration in range(max_iters):
        print(f"Iteration {iteration + 1}")
        
        # Assignment step: Assign each point to the nearest centroid
        clusters = [[] for _ in range(k)]
        
        # Use multiprocessing to compute distances in parallel
        with Pool(cpu_count()) as pool:
            results = list(tqdm(pool.imap(compute_hamming_distances, [(point, centroids) for point in dataset]), total=n))
        
        # Update cluster assignments based on results
        for point, cluster_idx in results:
            clusters[cluster_idx].append(point)
        
        # Update step: Compute new centroids
        new_centroids = []
        for cluster in clusters:
            if cluster:
                new_centroids.append(compute_centroid(cluster))
            else:
                # If a cluster is empty, reinitialize its centroid randomly
                new_centroids.append(np.random.choice(dataset))
        
        # Check for convergence (centroids do not change)
        if np.array_equal(centroids, new_centroids):
            break
        centroids = new_centroids
    
    # Final assignment
    for cluster_idx, cluster in enumerate(clusters):
        for point in cluster:
            cluster_assignments[dataset.index(point)] = cluster_idx
    
    return cluster_assignments, centroids

In [70]:
# Perform K-means clustering
k = 20000  # Number of clusters
cluster_assignments, centroids = kmeans_hamming(all_binary, k)

Iteration 1


  0%|          | 73681/60801408 [00:12<2:55:45, 5758.50it/s]


KeyboardInterrupt: 

In [None]:
# Print results
print("Cluster Assignments:", cluster_assignments)
print("Centroids:")
for centroid in centroids:
    print(bin(centroid))

In [55]:
from torch import tensor
target = tensor([[0, 1], [1, 1]])
preds = tensor([[0, 1], [0, 1]])
hamming_distance = HammingDistance(task="multilabel", num_labels=2)
hamming_distance(preds, target)


NameError: name 'HammingDistance' is not defined

In [58]:
import numpy as np

def hamming_distance_numpy_optimized(point, lst):
    # Convert inputs to NumPy arrays
    point_array = np.uint64(point)
    lst_array = np.array(lst, dtype=np.uint64)
    
    # XOR the point with the list
    xor_result = point_array ^ lst_array  # XOR result is still uint64
    
    # Convert uint64 to uint8 view (8 bytes per uint64)
    xor_bytes = xor_result.view(np.uint8)  # Interpret each uint64 as 8 uint8 values
    
    # Unpack bits and count the number of 1s for each uint64
    unpacked_bits = np.unpackbits(xor_bytes, axis=0).reshape(len(lst), 64)
    distances = unpacked_bits.sum(axis=1)
    
    return distances

# Example usage
point = 0b101010  # uint64 point
lst = [0b111000, 0b101010, 0b000111]  # List of uint64 values
distances = hamming_distance_numpy_optimized(point, lst)
print(distances)  # Output: [3, 0, 6]

[2 0 4]


In [71]:
from sklearn.cluster import MiniBatchKMeans
import numpy as np

# Generate some sample data
X = np.array([[1, 2], [1, 4], [1, 0],
              [10, 2], [10, 4], [10, 0]])

# Initialize and fit Mini-Batch K-Means
n_clusters = 2  # Number of clusters
mbk = MiniBatchKMeans(n_clusters=n_clusters, random_state=42, batch_size=3)
mbk.fit(X)

# Get the cluster labels for each data point
labels = mbk.predict(X)

# Print the labels
print("Cluster labels:", labels)

# Print the cluster centroids
print("Centroids:", mbk.cluster_centers_)

  super()._check_params_vs_input(X, default_n_init=3)


Cluster labels: [0 0 0 1 1 1]
Centroids: [[ 1.          2.6       ]
 [10.          1.76190476]]


In [6]:
import numpy as np
from tqdm import tqdm
from multiprocessing import Pool, cpu_count
import random

def hamming_distance(a, b):
    """Calculate the Hamming distance using bit_count() (Python 3.10+)."""
    return (a ^ b).bit_count()

def hamming_distances_vectorized(point, lst):
    # Convert inputs to NumPy arrays
    # point = np.uint64(point)
    # lst = np.array(lst, dtype=np.uint64)
    
    # XOR the point with the list
    xor_result = point ^ lst  # XOR result is still uint64
    
    # Convert uint64 to uint8 view (8 bytes per uint64)
    xor_bytes = xor_result.view(np.uint8)  # Interpret each uint64 as 8 uint8 values
    
    # Unpack bits and count the number of 1s for each uint64
    unpacked_bits = np.unpackbits(xor_bytes, axis=0).reshape(len(lst), 64)
    distances = unpacked_bits.sum(axis=1)
    
    return distances


def compute_centroid(cluster):
    """
    Compute the centroid (bitwise majority) of a cluster of binary codes.
    
    Args:
        cluster (list of int): List of 64-bit binary codes as integers.
        
    Returns:
        int: Centroid as a 64-bit binary code.
    """
    if not cluster:
        return None
    
    # Convert cluster to an array of bits
    cluster_bits = np.array([list(map(int, bin(code)[2:].zfill(64))) for code in cluster])
    
    # Compute bitwise majority
    majority_bits = (cluster_bits.sum(axis=0) >= (len(cluster) / 2)).astype(int)
    
    # Convert majority bits back to integer
    centroid = int("".join(map(str, majority_bits)), 2)
    return centroid


def compute_hamming_distances(args):
    """
    Compute the Hamming distances between a point and all centroids.
    
    Args:
        args (tuple): (point, centroids)
        
    Returns:
        tuple: (point, index of closest centroid, distance to closest centroid)
    """
    point, centroids = args
    # distances = [(point ^ centroid).bit_count() for centroid in centroids]
    distances = hamming_distances_vectorized(point, np.array(centroids))
    cluster_idx = np.argmin(distances)
    return point, cluster_idx


def mini_batch_kmeans_hamming(dataset, k, batch_size=1000, max_iters=10):
    """
    Perform Mini-Batch K-means clustering using Hamming distance for 64-bit binary codes.
    
    Args:
        dataset (list of int): List of 64-bit binary codes as integers.
        k (int): Number of clusters.
        batch_size (int): Size of each mini-batch.
        max_iters (int): Maximum number of iterations.
        
    Returns:
        list: Cluster assignments for each data point.
        list: Final centroids for each cluster.
    """
    n = len(dataset)
    
    # Step 1: Initialize centroids (randomly select k points from the dataset)
    centroids = np.random.choice(dataset, k, replace=False)
    
    # Initialize cluster assignments
    cluster_assignments = [None] * n
    cluster_counts = [0] * k  # Track number of points assigned to each cluster
    
    # Step 2: Iterate until convergence or max iterations
    for iteration in range(max_iters):
        print(f"Iteration {iteration + 1}")
        
        # Step 2.1: Sample a mini-batch from the dataset
        # mini_batch = random.sample(dataset, batch_size)
        # Step 2.1: Sample a mini-batch from the dataset
        mini_batch = random.sample(list(dataset), batch_size)
        
        # Step 2.2: Assign points in the mini-batch to the nearest centroid
        clusters = [[] for _ in range(k)]
        
        # Use multiprocessing to compute distances in parallel
        with Pool(cpu_count()) as pool:
            results = list(pool.imap(compute_hamming_distances, [(point, centroids) for point in mini_batch]))
        
        # Update cluster assignments and counts based on mini-batch results
        for point, cluster_idx in results:
            clusters[cluster_idx].append(point)
            cluster_counts[cluster_idx] += 1
        
        # Step 2.3: Update centroids incrementally using the mini-batch
        new_centroids = centroids.copy()
        for cluster_idx, cluster in enumerate(clusters):
            if cluster:
                cluster_centroid = compute_centroid(cluster)
                
                # Incrementally update centroid using a weighted average
                if cluster_counts[cluster_idx] > 0:
                    new_centroids[cluster_idx] = (
                        (cluster_counts[cluster_idx] - len(cluster)) * centroids[cluster_idx] + cluster_centroid
                    ) // cluster_counts[cluster_idx]
        
        # Check for convergence (if centroids do not change)
        if np.array_equal(centroids, new_centroids):
            print("Convergence reached.")
            break
        
        centroids = new_centroids
    
    # Final assignment: Assign all points in the dataset to the nearest centroid
    with Pool(cpu_count()) as pool:
        final_results = list(tqdm(pool.imap(compute_hamming_distances, [(point, centroids) for point in dataset]), total=n))
    
    for i, (point, cluster_idx) in enumerate(final_results):
        cluster_assignments[i] = cluster_idx  # Assign cluster index directly by position
        
    # Save the cluster assignments and centroids
    np.save("cluster_assignments.npy", cluster_assignments)
    np.save("centroids.npy", centroids)
    
    return cluster_assignments, centroids

In [None]:
# Perform K-means clustering
k = 20000  # Number of clusters
cluster_assignments, centroids = mini_batch_kmeans_hamming(all_binary, k, batch_size=10000, max_iters=100)

Iteration 1
Iteration 2
Iteration 3
Iteration 4
Iteration 5
Iteration 6
Iteration 7
Iteration 8
Iteration 9
Iteration 10
Iteration 11
Iteration 12
Iteration 13
Iteration 14
Iteration 15
Iteration 16
Iteration 17
Iteration 18
Iteration 19
Iteration 20
Iteration 21
Iteration 22
Iteration 23
Iteration 24
Iteration 25
Iteration 26
Iteration 27
Iteration 28
Iteration 29
Iteration 30
Iteration 31
Iteration 32
Iteration 33
Iteration 34
Iteration 35
Iteration 36
Iteration 37
Iteration 38
Iteration 39
Iteration 40
Iteration 41
Iteration 42
Iteration 43
Iteration 44
Iteration 45
Iteration 46
Iteration 47
Iteration 48
Iteration 49
Iteration 50
Iteration 51
Iteration 52
Iteration 53
Iteration 54
Iteration 55
Iteration 56
Iteration 57
Iteration 58
Iteration 59
Iteration 60
Iteration 61
Iteration 62
Iteration 63
Iteration 64
Iteration 65
Iteration 66
Iteration 67
Iteration 68
Iteration 69
Iteration 70
Iteration 71
Iteration 72
Iteration 73
Iteration 74
Iteration 75
Iteration 76
Iteration 77
Iteratio

100%|██████████| 60801408/60801408 [3:01:21<00:00, 5587.49it/s]  


: 

In [8]:
cluster_assignments

[2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 312,
 2,
 1446,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 1446,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 312,
 312,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 1446,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 330,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 330,
 2,
 2,
 2,
 2,
 2,
 312,
 2,
 330,
 2,
 2,
 1446,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 312,
 312,
 2,
 2,
 2,
 2,
 2,
 330,
 2,
 312,
 312,
 2,
 330,
 2,
 2,
 312,
 2,
 1743,
 2,
 2,
 2,
 2,
 312,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 312,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 312,
 1646,
 1646,
 2,
 2,
 709,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 1621,


In [9]:
centroids

array([ 704902902040065408, 1189589327514942464,  288381041079352576, ...,
       4758102010793744384, 5709797372109057024, 6344124661326496768],
      dtype=uint64)

In [11]:
# Check average Hamming distance to centroids
avg_distances = []
for i, point in enumerate(all_binary[:40000]):
    centroid = centroids[cluster_assignments[i]]
    avg_distances.append(hamming_distance(point, centroid))
    
print("Average Hamming distance to centroids:", np.mean(avg_distances))


Average Hamming distance to centroids: 21.129975


In [13]:
# Check minimum Hamming distance to centroids
min_distances = []
for i, point in enumerate(all_binary[:40000]):
    centroid = centroids[cluster_assignments[i]]
    min_distances.append(hamming_distance(point, centroid))

print("Minimum Hamming distance to centroids:", np.min(min_distances))

Minimum Hamming distance to centroids: 13


In [None]:
# Pairwise Hamming distances between centroids
pairwise_distances = []
for i, centroid1 in enumerate(centroids):
    for j, centroid2 in enumerate(centroids):
        if i < j:
            pairwise_distances.append(hamming_distance(centroid1, centroid2))