In [1]:
import numpy as np

In [8]:
def initialize_random_centroid(K, X):
    """Pick K samples of X at random to serve as the initial centroids.

    Parameters
    ----------
    K : int
        Number of centroids to create.
    X : array-like of shape (m, n)
        Data set with one sample per row.

    Returns
    -------
    numpy.ndarray of shape (K, n), dtype float
        Initial centroids; every row is a copy of a row of X.

    Notes
    -----
    Rows are drawn without replacement whenever K <= m, so the same sample is
    never picked twice. Two identical centroids would leave one cluster empty
    after the first assignment step. Only when more centroids than samples are
    requested (K > m) does the draw fall back to sampling with replacement.
    """
    m, n = np.shape(X)
    # replace=False guarantees K different row indices; it is only relaxed
    # when there are not enough rows to choose from.
    chosen_rows = np.random.choice(m, size=K, replace=K > m)
    r = np.empty((K, n))
    r[:] = np.asarray(X)[chosen_rows]
    return r

def euclidean_distance(x1, x2):
    """Return the Euclidean (L2) distance separating the vectors x1 and x2."""
    delta = x1 - x2
    sum_of_squares = np.sum(np.square(delta))
    return np.sqrt(sum_of_squares)

def closest_centroid(x, r, K):
    """Return the index of the centroid nearest to the vector x.

    The first K rows of r are compared against x using euclidean_distance;
    the position of the smallest distance is returned.
    """
    per_centroid = np.fromiter(
        (euclidean_distance(r[k], x) for k in range(K)),
        dtype=float,
    )
    # argmin yields the position of the smallest distance
    return np.argmin(per_centroid)

def create_clusters(r, K, X):
    """Assign every sample of X to its nearest centroid.

    Parameters
    ----------
    r : array-like of shape (>=K, n)
        Centroids; only the first K rows are used.
    K : int
        Number of centroids to consider.
    X : array-like of shape (m, n)
        Data set with one sample per row.

    Returns
    -------
    numpy.ndarray of shape (m,), integer dtype
        For each sample, the index (0 .. K-1) of the closest centroid. When
        several centroids are equally close, the lowest index is returned.
    """
    samples = np.asarray(X, dtype=float)
    centroids = np.asarray(r, dtype=float)[:K]
    m, _ = np.shape(samples)
    if m == 0:
        # nothing to assign; avoid reducing over an empty axis
        return np.empty(0, dtype=int)
    # Broadcast (m, 1, n) against (1, K, n) to get all sample-centroid offsets
    # at once instead of looping over samples in Python.
    offsets = samples[:, np.newaxis, :] - centroids[np.newaxis, :, :]
    # Squared distances are sufficient: the square root is monotonic, so it
    # does not change which centroid is the closest.
    squared_distances = np.sum(offsets * offsets, axis=2)
    # Labels are indices, so they are returned as integers (usable directly
    # for indexing) rather than floats.
    return np.argmin(squared_distances, axis=1)

def compute_means(cluster_idx, K, X):
    """Compute the new centroid of every cluster as the mean of its members.

    Parameters
    ----------
    cluster_idx : array-like of shape (m,)
        Cluster index assigned to each sample of X.
    K : int
        Number of clusters.
    X : array-like of shape (m, n)
        Data set with one sample per row.

    Returns
    -------
    numpy.ndarray of shape (K, n), dtype float
        Row i is the mean of the samples labelled i. A cluster that received
        no samples is re-seeded with a randomly chosen sample of X instead of
        a NaN mean. If X has no samples at all, every centroid is NaN.
    """
    samples = np.asarray(X)
    labels = np.asarray(cluster_idx)
    m, n = np.shape(samples)
    if m == 0:
        # no data to average and nothing to re-seed from
        return np.full((K, n), np.nan)
    r = np.empty((K, n))
    for i in range(K):
        points = samples[labels == i]  # gather points for the cluster i
        if len(points) == 0:
            # The mean of an empty set is NaN, and a NaN centroid makes every
            # later distance NaN. Restart this cluster from a random sample.
            r[i] = samples[np.random.choice(m)]
        else:
            r[i] = np.mean(points, axis=0)  # axis=0 averages across points
    return r

In [9]:
def run_Kmeans(K, X, max_iterations = 500):
    """Run the K-means algorithm and return the cluster index of every sample.

    Parameters
    ----------
    K : int
        Number of clusters.
    X : array-like of shape (m, n)
        Data set with one sample per row.
    max_iterations : int, optional
        Upper bound on the number of assign/update rounds (default 500).

    Returns
    -------
    numpy.ndarray of shape (m,)
        Cluster index of each sample, as produced by create_clusters in the
        last assignment step that was executed. If max_iterations is zero or
        negative, the samples are assigned once to the initial centroids.

    Notes
    -----
    The initial centroids are printed to standard output.
    """
    # initialize random centroids
    r = initialize_random_centroid(K, X)
    print(f"initial centroids: {r}")
    clusters = None
    # loop until max_iterations is reached or the centroids stop moving
    for _ in range(max_iterations):
        # create clusters by assigning the samples to the closest centroids
        clusters = create_clusters(r, K, X)
        previous_centroids = r
        # compute means of the clusters and assign to centroids
        r = compute_means(clusters, K, X)
        # converged: the update step left every centroid exactly where it was
        if not (previous_centroids - r).any():
            return clusters
    if clusters is None:
        # The loop body never ran (max_iterations <= 0); still return a valid
        # labelling instead of failing on an unbound variable.
        clusters = create_clusters(r, K, X)
    return clusters

In [10]:
from sklearn import datasets
# Fix the seeds so that Restart & Run All reproduces the same blobs and the
# same initial centroids; both are otherwise drawn at random on every run.
SEED = 42
N_CLUSTERS = 3  # make_blobs generates 3 blob centres by default
np.random.seed(SEED)
# creating a dataset for clustering
X, y = datasets.make_blobs(random_state=SEED)
y_preds = run_Kmeans(N_CLUSTERS, X)

initial centroids: [[  7.49442745   7.35933359]
 [ -8.67763982  -0.77524342]
 [ -0.40570764 -10.49166661]]


### Ví dụ khi không dùng được K-means