In [1]:
import pandas as pd
import numpy as np
import random as rd
import matplotlib.pyplot as plt
import seaborn as sns
from copy import deepcopy

In [2]:
# Seed the global NumPy RNG so the two synthetic blobs are identical on every
# Restart & Run All.
np.random.seed(42)

# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and the
# old names were later removed; use whichever name this installation provides.
whitegrid_style = next(
    (name for name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
     if name in plt.style.available),
    'default',
)
plt.style.use(whitegrid_style)

# Two 2-D Gaussian blobs of 100 points each, shifted to (1, 1) and (6, 4).
ds_1 = np.random.randn(100, 2) + np.array([1, 1])
ds_2 = np.random.randn(100, 2) + np.array([6, 4])

# Stack both blobs into one 200-row frame with integer column labels 0 and 1.
data = pd.DataFrame(np.concatenate((ds_1, ds_2), axis=0))

plt.scatter(data[0], data[1], s=20)

In [3]:
def euclidean_distance(x1, x2):
    """Return the Euclidean distance between ``x1`` and ``x2`` along axis 1.

    The element-wise difference ``x1 - x2`` is squared, summed over axis 1 and
    square-rooted, so the inputs must produce a difference with at least two
    dimensions (NumPy broadcasting applies to the subtraction).
    """
    delta = x1 - x2
    squared_total = np.sum(np.square(delta), axis=1)
    return np.sqrt(squared_total)

In [4]:
def custom_KMeans(k, dataset, n_iter=100, distance='euclidean'):
    """Cluster the rows of ``dataset`` into ``k`` groups with k-means.

    Parameters
    ----------
    k : int
        Number of clusters. ``k`` rows are drawn at random from ``dataset``
        (``DataFrame.sample``) and used as the starting centroids.
    dataset : pandas.DataFrame
        One observation per row, one numeric feature per column. Any number
        of feature columns is accepted.
    n_iter : int, default 100
        Maximum number of assign/update rounds. Iteration stops early as soon
        as an update leaves every centroid unchanged, because further rounds
        would reproduce the same result.
    distance : {'euclidean', 'manhattan'}, default 'euclidean'
        Metric used to assign each point to its nearest centroid. The update
        step uses the cluster mean for both metrics.

    Returns
    -------
    output : dict
        Maps each cluster label ``1..k`` to a float array of shape
        ``(n_members, n_features)`` holding that cluster's rows in dataset
        order. Empty dict when ``n_iter`` is 0.
    centroids : numpy.ndarray
        Float array of shape ``(k, n_features)`` with the final centroids;
        row ``j`` belongs to cluster label ``j + 1``.

    Raises
    ------
    ValueError
        If ``distance`` is not one of the supported metric names.
    """
    if distance not in ('euclidean', 'manhattan'):
        raise ValueError(
            "distance must be 'euclidean' or 'manhattan', got {!r}".format(distance)
        )

    # Work in float so that mean updates are not truncated for integer data.
    centroids = dataset.sample(n=k).to_numpy(dtype=float)
    points = dataset.to_numpy(dtype=float)

    output = {}

    for _ in range(n_iter):
        # Broadcast to (n_points, k, n_features): offset of every point from
        # every centroid, computed in one shot instead of column by column.
        offsets = points[:, np.newaxis, :] - centroids[np.newaxis, :, :]
        if distance == 'manhattan':
            dist_matrix = np.abs(offsets).sum(axis=2)
        else:
            dist_matrix = np.sqrt((offsets ** 2).sum(axis=2))

        # Labels are 1-based to match the keys of the returned dict.
        labels = np.argmin(dist_matrix, axis=1) + 1

        # Boolean masks keep the rows of each cluster in dataset order.
        output = {label: points[labels == label] for label in range(1, k + 1)}

        updated = centroids.copy()
        for label, members in output.items():
            # An empty cluster keeps its previous centroid rather than
            # becoming NaN (mean of zero rows).
            if len(members):
                updated[label - 1] = members.mean(axis=0)

        if np.array_equal(updated, centroids):
            break
        centroids = updated

    return output, centroids

In [5]:
def visualize(k, clusters, centroids, xlabel=None, ylabel=None):
    """Scatter-plot every cluster and overlay the centroids as black crosses.

    Parameters
    ----------
    k : int
        Number of clusters; ``clusters`` is read at keys ``1..k``.
    clusters : dict
        Maps each label ``1..k`` to a 2-D array whose first two columns are
        plotted as x and y.
    centroids : numpy.ndarray
        2-D array whose first two columns give the centroid coordinates.
    xlabel, ylabel : str, optional
        Axis labels. When omitted, the corresponding axis label is left
        untouched.
    """
    # A dedicated loop name avoids shadowing the ``k`` parameter.
    for label in range(1, k + 1):
        members = clusters[label]
        sns.scatterplot(x=members[:, 0], y=members[:, 1], s=50)

    sns.scatterplot(x=centroids[:, 0], y=centroids[:, 1], s=100, color='black', marker='x', linewidth=1.2)
    if xlabel is not None:
        plt.xlabel(xlabel)
    if ylabel is not None:
        plt.ylabel(ylabel)
    plt.show()

k = 6
# Cluster with the default (Euclidean) metric, passing only positional
# arguments the function signature accepts.
clusters, centroids = custom_KMeans(k, data)

# Axis labels name the two columns of `data` that are plotted.
visualize(k, clusters, centroids, 'Feature 0', 'Feature 1')