In [24]:
import numpy as np
import time

def compare_and_reassign(Assignments, Distances, I, D):
    """Pick the candidate points whose proposed distance beats the stored one.

    Only row 0 of ``Distances`` is consulted. ``Assignments`` is accepted but
    never read by this function.

    Parameters
    ----------
    Assignments : unused
        Kept in the signature for the caller's convenience.
    Distances : array
        Indexed as ``Distances[0, I]`` to look up each candidate's stored
        distance.
    I : integer array
        Indices of the candidate points.
    D : array
        Proposed distances, compared elementwise against ``Distances[0, I]``.

    Returns
    -------
    tuple
        ``(reassignments, d_reassignments)``: the entries of ``I`` whose
        proposed distance is strictly smaller than the stored one, and the
        matching entries of ``D``.
    """
    # Stored distance of every candidate, gathered from the first row.
    stored = Distances[0, I]

    # A candidate qualifies only on a strictly smaller proposed distance;
    # ties keep the current assignment.
    improved = D < stored

    # Boolean-mask indexing yields the winning indices and their distances.
    return I[improved], D[improved]

# Benchmarking function
def benchmark(N, m, seed=None):
    """Time one call of ``compare_and_reassign`` on randomly generated inputs.

    Only the call itself is timed; generating the inputs is excluded.

    Parameters
    ----------
    N : int
        Number of points, i.e. the length of the single generated
        ``Distances`` / ``Assignments`` row.
    m : int
        Number of candidate points drawn for the reassignment check.
    seed : int or None, optional
        Seed for the generated inputs. With the default ``None`` the inputs
        are drawn from NumPy's global random state (the previous behaviour,
        so an earlier ``np.random.seed(...)`` still applies). Passing a value
        makes the generated inputs reproducible from call to call.

    Returns
    -------
    tuple
        ``(reassignments, d_reassignments, elapsed)`` where ``elapsed`` is the
        difference between two ``time.perf_counter()`` readings (seconds)
        taken immediately before and after the call.
    """
    # The np.random module and a RandomState instance expose the same
    # randint/rand API, so one code path serves both the seeded and the
    # unseeded case.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Generate random inputs
    Assignments = rng.randint(0, 10, size=(1, N))  # Example random assignments (centroid IDs)
    Distances = rng.rand(1, N)  # Random distances of points to their current centroids
    I = rng.randint(0, N, size=(1, m))  # Random indices of m points to check (may repeat)
    D = rng.rand(1, m)  # New distances for the selected m points

    # Time only the comparison/reassignment step
    start_time = time.perf_counter()
    reassignments, d_reassignments = compare_and_reassign(Assignments, Distances, I, D)
    end_time = time.perf_counter()

    # Return the results and the time taken
    return reassignments, d_reassignments, end_time - start_time



In [57]:
# Example usage: run the single-row benchmark once and report what it found.
N = 1_000_000  # Total number of points
m = 10_000     # Number of points to check for reassignment

reassignments, d_reassignments, time_taken = benchmark(N, m)

# Report the reassigned indices, their new distances, and the elapsed time
# (perf_counter seconds scaled to milliseconds).
for report_line in (
    f"Reassignments: {reassignments}",
    f"New distances for reassigned points: {d_reassignments}",
    f"Time taken: {time_taken * 1000} ms",
):
    print(report_line)


Reassignments: [ 25077 721555 645097 ... 276466 785713 710240]
New distances for reassigned points: [0.67339574 0.01503862 0.17511773 ... 0.60708491 0.65357686 0.2381887 ]
Time taken: 0.16450000111944973 ms


In [100]:
import numpy as np
import time

def compare_and_reassign(Assignments, Distances, I, D, j, N, m):
    """Collect, row by row, the candidates whose new distance beats the stored one.

    For each row ``r`` in ``range(j)``, ``I[r]`` holds indices into
    ``Distances[r]`` and ``D[r]`` holds the proposed distance for each of
    those indices. A candidate is kept when
    ``D[r][k] < Distances[r][I[r][k]]`` (strictly smaller).

    Parameters
    ----------
    Assignments : unused
        Accepted for call compatibility; never read here.
    Distances : 2-D array
        Stored distances; row ``r`` is indexed with ``I[r]``.
    I : 2-D integer array
        Candidate indices per row.
    D : 2-D array
        Proposed distances per row, aligned with ``I``.
    j : int
        Number of leading rows to process.
    N, m : unused
        Accepted for call compatibility; never read here.

    Returns
    -------
    tuple
        ``(reassignments, d_reassignments, c_reassignments)``: three 1-D
        arrays of equal length, ordered by row and then by position within
        the row. They hold the winning indices, their new distances, and the
        row each one came from. All three are empty when ``j <= 0``.

    Notes
    -----
    Repeated indices within ``I[r]`` are not de-duplicated, so a point may
    appear more than once in the result. This definition shadows the
    four-argument ``compare_and_reassign`` defined earlier in the notebook.
    """
    # Per-row partial results, concatenated at the end
    reassignments = []
    d_reassignments = []
    c_reassignments = []

    # Python loop over rows; the comparison inside each row is vectorized
    for row_idx in range(j):
        # Get the relevant distances and indices for the current row
        current_distances = Distances[row_idx]
        current_indices = I[row_idx]
        new_distances = D[row_idx]

        # Create a mask of where the new distance is strictly smaller
        reassignment_mask = new_distances < current_distances[current_indices]

        # Get the reassigned points and their new distances
        reassigned_points = current_indices[reassignment_mask]
        new_dists = new_distances[reassignment_mask]

        # Store the results, tagging every reassigned point with its row
        reassignments.append(reassigned_points)
        d_reassignments.append(new_dists)
        c_reassignments.append(np.full_like(reassigned_points, row_idx, dtype=int))

    if not reassignments:
        # np.concatenate raises ValueError on an empty sequence, so with no
        # rows to process return empty arrays of the matching dtypes instead.
        return (
            np.empty(0, dtype=getattr(I, "dtype", np.intp)),
            np.empty(0, dtype=getattr(D, "dtype", float)),
            np.empty(0, dtype=int),
        )

    # Flatten the per-row pieces into single 1-D arrays
    reassignments = np.concatenate(reassignments)
    d_reassignments = np.concatenate(d_reassignments)
    c_reassignments = np.concatenate(c_reassignments)

    return reassignments, d_reassignments, c_reassignments

# Benchmarking function
def benchmark(N, m, j, seed=None):
    """Time one call of the multi-row ``compare_and_reassign`` on random inputs.

    Only the call itself is timed; generating the inputs is excluded.

    Parameters
    ----------
    N : int
        Number of points per row (columns of ``Distances`` / ``Assignments``).
    m : int
        Number of candidate points drawn per row.
    j : int
        Number of rows to generate and process.
    seed : int or None, optional
        Seed for the generated inputs. With the default ``None`` the inputs
        are drawn from NumPy's global random state (the previous behaviour,
        so an earlier ``np.random.seed(...)`` still applies). Passing a value
        makes the generated inputs reproducible from call to call.

    Returns
    -------
    tuple
        ``(reassignments, d_reassignments, c_reassignments, elapsed)`` where
        ``elapsed`` is the difference between two ``time.perf_counter()``
        readings (seconds) taken immediately before and after the call.
    """
    # The np.random module and a RandomState instance expose the same
    # randint/rand API, so one code path serves both the seeded and the
    # unseeded case.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Generate random inputs for j rows of assignments and distances
    Assignments = rng.randint(0, 10, size=(j, N))  # Random assignments for j rows
    Distances = rng.rand(j, N)  # Random distances to centroids for j rows
    I = rng.randint(0, N, size=(j, m))  # Random indices to check (m per row, may repeat)
    D = rng.rand(j, m)  # New distances for m points per row

    # Time only the comparison/reassignment step
    start_time = time.perf_counter()
    reassignments, d_reassignments, c_reassignments = compare_and_reassign(Assignments, Distances, I, D, j, N, m)
    end_time = time.perf_counter()

    # Return the results and the time taken
    return reassignments, d_reassignments, c_reassignments, end_time - start_time

# Example usage: run the multi-row benchmark once and report the elapsed time.
# N = total number of points, m = points checked per row, j = number of rows.
N, m, j = 1_000_000, 100, 16

outcome = benchmark(N, m, j)
reassignments, d_reassignments, c_reassignments, time_taken = outcome

# The result arrays stay available in the variables above but are not printed;
# only the timing (perf_counter seconds scaled to milliseconds) is shown.
print(f"Time taken: {time_taken * 1000} ms")


Time taken: 0.16404900088673458 ms
