### Import Libraries

In [2]:
import numpy as np
import pandas as pd
from numpy import random

### Compute squared Euclidean distance between two points

In [24]:
# Input
# point1: first point (indexable sequence of coordinates)
# point2: second point; its length decides how many coordinates are compared
# Output
# squared Euclidean distance between point1 and point2 (no square root is taken)
def compute_distance(point1, point2):
    """Return the squared Euclidean distance between point1 and point2."""
    # Use the builtin sum() over a generator instead of a local variable
    # named `sum`, which shadowed the builtin inside this function.
    return sum((point1[i] - point2[i]) ** 2 for i in range(len(point2)))

### Compute each data point's distance to its nearest center

In [38]:
# Input
# centers: existing centroids
# data: numpy array containing all data points (one point per row)
# Output
# list with one entry per row of data: the smallest value returned by
# compute_distance between that row and any of the centers
# (-1 for every row when centers is empty)
def assign_centers(centers, data):
    """Return, for every row of data, its distance to the closest center."""
    nearest = []
    for row in range(data.shape[0]):
        sample = data[row]
        # -1 marks "no center examined yet".
        best = -1
        for k in range(len(centers)):
            candidate = compute_distance(sample, centers[k])
            if best == -1 or candidate < best:
                best = candidate
        nearest.append(best)
    return nearest


### Compute new centroid

In [31]:
# Input
# centers: existing centroids
# data: numpy array containing all data points
# Output
# the row of data whose value from assign_centers is the largest
# (the first such row when several tie)
def get_point(centers, data):
    """Return the data point that assign_centers scores as farthest from the centers."""
    nearest = assign_centers(centers, data)
    best_index = -1
    best_distance = -1
    # Strict '>' keeps the earliest row when several share the maximum.
    for idx, dist in enumerate(nearest):
        if dist > best_distance:
            best_index, best_distance = idx, dist
    return data[best_index]

### Driver function to generate centers

In [36]:
# Input
# K: Number of clusters
# data: numpy array containing all data points (one point per row)
# Output
# centers : list of numpy arrays (rows of data) chosen as initial centers;
#           the first is a uniformly random row, each further one comes
#           from get_point
# Raises
# ValueError if data has no rows
def kmeanspp(K, data):
    """Choose initial cluster centers from the rows of data."""
    num_of_rows = data.shape[0]
    if num_of_rows == 0:
        raise ValueError("data must contain at least one point")
    # numpy's randint(high) draws from the half-open range [0, high), so the
    # upper bound must be num_of_rows itself. Passing num_of_rows - 1 would
    # exclude the last row and fail outright on a single-row dataset.
    first_point_index = random.randint(num_of_rows)
    centers = [data[first_point_index]]
    # One center is already chosen, so K - 1 more are requested.
    for _ in range(1, K):
        centers.append(get_point(centers, data))
    return centers


### Tests

In [39]:
# Smoke test: seven 2-D points lying on the line y = x, split into three clusters.
data = np.array(
    [
        [-2, -2],
        [5, 5],
        [1, 1],
        [2, 2],
        [4, 4],
        [8, 8],
        [-1, -1],
    ]
)
K = 3
centers = kmeanspp(K, data)
# The first center is random, so the printed result can differ between runs.
print(centers)

First index: 4
[array([4, 4]), array([-2, -2]), array([8, 8])]
