In [1]:
import numpy as np
import warnings
import numba
import time
from sklearn.cluster import KMeans
from numba import cuda
import argparse

In [5]:
@cuda.jit
def assign_labels(data, centroids, labels):
    idx = cuda.grid(1)
    if idx < data.shape[0]:
        min_dist = np.inf
        for i in range(centroids.shape[0]):
            dist = 0.0
            for j in range(data.shape[1]):
                temp = data[idx, j] - centroids[i, j]
                dist += temp * temp
            if dist < min_dist:
                min_dist = dist
                labels[idx] = i

@cuda.jit
def update_centroids(data, centroids, labels, counts):
    idx = cuda.grid(1)
    if idx < data.shape[0]:
        label = labels[idx]
        for j in range(data.shape[1]):
            cuda.atomic.add(centroids, (label, j), data[idx, j])
        cuda.atomic.add(counts, label, 1)

@cuda.jit
def finalize_centroids(centroids, counts):
    idx = cuda.grid(1)
    num_centroids = centroids.shape[0]
    num_features = centroids.shape[1]
    total_elements = num_centroids * num_features
    
    if idx < total_elements:
        centroid_idx = idx // num_features  # Determine which centroid
        feature_idx = idx % num_features    # Determine which feature of the centroid
        
        if counts[centroid_idx] > 0:
            centroids[centroid_idx, feature_idx] /= counts[centroid_idx]

@cuda.jit(fastmath=True)
def fill_zeros(arr):
    idx = cuda.grid(1)
    if idx < arr.size:
        arr[idx] = 0

def gpu_kmeans(data, k=3, max_iter=300):
    start_time=time.time()
    
    threads_per_block = 128
    num_points = data.shape[0]
    blocks_per_grid_points = (num_points + threads_per_block - 1) // threads_per_block
    
    blocks_per_grid_centroids = (k * data.shape[1] + threads_per_block - 1) // threads_per_block
    blocks_per_grid_counts = (k + threads_per_block - 1) // threads_per_block

    data_device = cuda.to_device(data)
    centroids = np.random.rand(k, data.shape[1]).astype(np.float32)
    centroids_device = cuda.to_device(centroids)
    labels = np.zeros(num_points, dtype=np.int32)
    labels_device = cuda.to_device(labels)
    counts = np.zeros(k, dtype=np.int32)
    counts_device = cuda.to_device(counts)
    cuda.synchronize()
    kernel_start_time=time.time()
    for _ in range(max_iter):
        assign_labels[blocks_per_grid_points, threads_per_block](data_device, centroids_device, labels_device)
        fill_zeros[blocks_per_grid_centroids, threads_per_block](centroids_device.ravel())
        fill_zeros[blocks_per_grid_counts, threads_per_block](counts_device)
        update_centroids[blocks_per_grid_points, threads_per_block](data_device, centroids_device, labels_device, counts_device)
        finalize_centroids[blocks_per_grid_centroids, threads_per_block](centroids_device, counts_device)
    cuda.synchronize()
    kernel_end_time=time.time()
    kernel_time=kernel_end_time-kernel_start_time
        
    centroids = centroids_device.copy_to_host()
    labels = labels_device.copy_to_host()
    end_time=time.time()
    gpu_time=end_time-start_time
    return centroids, labels, gpu_time, kernel_time

def cpu_kmeans(data, k=5, max_iter=300):
    start_time = time.time()
    kmeans = KMeans(n_clusters=k, max_iter=max_iter, n_init=10)
    kmeans.fit(data)
    centroids = kmeans.cluster_centers_
    labels = kmeans.labels_
    end_time = time.time()
    cpu_time = end_time-start_time
    return centroids, labels, cpu_time

In [6]:
if __name__ == "__main__":
    warnings.filterwarnings('ignore', category=numba.NumbaPerformanceWarning)
        
    np.random.seed(42)
    data = np.random.rand(10000, 2000).astype(np.float32)
    cpu_centroids, cpu_labels, cpu_time = cpu_kmeans(data, k=5, max_iter=300)
    gpu_centroids, gpu_labels, gpu_time, kernel_time = gpu_kmeans(data, k=5, max_iter=300)
    print(f"CPU time: {cpu_time}s, GPU time: {gpu_time}s, Kernel time: {kernel_time}s")
    print("CPU results:")
    print("CPU Centroids:", cpu_centroids)
    print("CPU Labels:", cpu_labels)
    print("GPU results:")
    print("GPU Centroids:", gpu_centroids)
    print("GPU Labels:", gpu_labels)

CPU time: 4.744860410690308s, GPU time: 1.7323057651519775s, Kernel time: 0.9856407642364502s
CPU results:
CPU Centroids: [[0.49292195 0.5009731  0.5079429  ... 0.48896664 0.50180995 0.48507613]
 [0.49806246 0.49994147 0.4973255  ... 0.49169034 0.50716573 0.50768673]
 [0.52181137 0.51935315 0.515785   ... 0.46911243 0.48909786 0.5068382 ]
 [0.50181496 0.48508802 0.49627584 ... 0.5098018  0.51103806 0.5166171 ]
 [0.49398053 0.49230936 0.5117607  ... 0.5252925  0.4955201  0.4885505 ]]
CPU Labels: [0 1 1 ... 1 4 0]
GPU results:
GPU Centroids: [[0.495935   0.47987485 0.50574625 ... 0.49643004 0.5170477  0.48738566]
 [0.5176254  0.5220244  0.49564996 ... 0.48860392 0.49921325 0.5110142 ]
 [0.4826264  0.49215248 0.52044976 ... 0.49595448 0.48970884 0.51041704]
 [0.49717364 0.5054106  0.49757862 ... 0.48821878 0.4842077  0.49909568]
 [0.5034721  0.48910856 0.51189625 ... 0.5291474  0.5185405  0.48453608]]
GPU Labels: [3 2 3 ... 3 1 0]


In [None]:
print(centroids)

In [31]:
print(labels)

[3 8 9 ... 8 9 1]
