In [None]:
# Import necessary libraries
from copy import deepcopy
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from matplotlib import pyplot as plt
import cv2

In [None]:
# Three ground-truth centers; a good clustering should recover centroids close to these.
center_1, center_2, center_3 = np.array([1, 1]), np.array([5, 5]), np.array([8, 1])

# Draw 200 normally distributed 2-D samples (scaled by `sigma`) around each center.
# The generator is consumed in order, so the draws happen as data_1, data_2, data_3.
sigma = 1
data_1, data_2, data_3 = (
    sigma * np.random.randn(200, 2) + center
    for center in (center_1, center_2, center_3)
)

# Stack the three blobs into a single array and shuffle its rows in place so the
# samples are no longer grouped by the center they were generated from.
data = np.concatenate((data_1, data_2, data_3), axis=0)
np.random.shuffle(data)

plt.scatter(x=data[:, 0], y=data[:, 1], s=7)

In [None]:
# Euclidean distance calculator
def dist(a, b, ax=1):
    """Return the Euclidean (L2) norm of ``a - b``.

    Both operands are passed through ``np.asarray`` first, so plain Python
    lists and tuples are accepted as well as NumPy arrays. The subtraction
    follows the usual NumPy broadcasting rules.

    Parameters
    ----------
    a, b : array-like
        Operands whose difference is measured; their shapes must broadcast.
    ax : int, tuple of int or None, default 1
        Axis forwarded to ``np.linalg.norm``. With the default of 1, one
        distance is returned per row of the broadcast difference; with
        ``None`` a single norm over all elements is returned.

    Returns
    -------
    float or ndarray
        The norm(s) of ``a - b`` along ``ax``.
    """
    difference = np.asarray(a) - np.asarray(b)
    return np.linalg.norm(difference, axis=ax)

In [None]:
# Problem dimensions: k clusters, n training samples, c features per sample
k = 3
n, c = data.shape

# Draw the k starting centers from a normal distribution that has the per-feature
# mean and standard deviation of the data, so they are on the same scale as the data
mean = data.mean(axis=0)
std = data.std(axis=0)
centers = mean + std * np.random.randn(k, c)
print(centers)

In [None]:
# Raw samples (dark dots) with the randomly initialised centroids overlaid (green stars)
plt.scatter(x=data[:, 0], y=data[:, 1], s=7, c='#050505')
plt.scatter(x=centers[:, 0], y=centers[:, 1], s=200, c='g', marker='*')

In [None]:
# Centroids of the previous iteration, used to measure how far the centroids moved
centers_old = np.zeros(centers.shape)
# Cluster labels (0, 1, 2, ..., k-1), one per data point
clusters = np.zeros(len(data))
# Error func. - distance between the new centroids and the old centroids
error = dist(centers, centers_old, None)
# Loop until the centroids stop moving (error becomes zero)
while error != 0:
    # Assignment step: (n_points, k) matrix of point-to-centroid distances, then
    # label each point with the index of its closest centroid
    distances = dist(data[:, np.newaxis, :], centers[np.newaxis, :, :], 2)
    clusters[:] = np.argmin(distances, axis=1)
    # Store the old centroid values before they are overwritten in place
    centers_old = centers.copy()
    # Update step: move every centroid to the mean of the points assigned to it
    for i in range(k):
        points = data[clusters == i]
        if len(points) > 0:
            centers[i] = points.mean(axis=0)
        else:
            # The mean of an empty cluster is NaN, which would turn `error` into NaN
            # and keep `error != 0` true forever. Reset the centroid to the origin
            # instead, the same rule kmeans() applies to empty clusters.
            centers[i] = 0
    error = dist(centers, centers_old, None)

In [None]:
# One colour per cluster; cycled if there are more clusters than colours
colors = ['r', 'g', 'b', 'y', 'c', 'm']
fig, ax = plt.subplots()
for i in range(k):
    # Boolean-mask selection keeps two columns even when the cluster is empty, so the
    # column indexing below cannot fail the way it does on an empty 1-D np.array([])
    points = data[clusters == i]
    ax.scatter(points[:, 0], points[:, 1], s=7, c=colors[i % len(colors)])
# Final centroids drawn on top as dark stars
ax.scatter(centers[:, 0], centers[:, 1], marker='*', s=200, c='#050505')

In [None]:
# Reconstruct every point as the centroid of the cluster it was assigned to
# (labels are stored as floats, hence the cast before indexing)
recon_data = centers[clusters.astype(int)]
# Reconstruction error: sum of squared differences between reconstruction and data
recon_error = np.sum(np.square(recon_data - data))

print(recon_error)

In [None]:
def kmeans(k, data):
    """Cluster ``data`` into ``k`` groups with the k-means iteration.

    The ``k`` starting centers are drawn from a normal distribution with the
    per-feature mean and standard deviation of ``data`` using the global
    ``np.random`` state, so results differ between calls unless the caller
    seeds that state. The assignment/update steps are repeated until the
    centers stop moving.

    Parameters
    ----------
    k : int
        Number of clusters; must be at least 1.
    data : array-like of shape (n_samples, n_features)
        Samples to cluster. Converted to a float array; all values must be
        finite.

    Returns
    -------
    recon_data : ndarray of shape (n_samples, n_features)
        Each sample replaced by the center of the cluster it was assigned to.
    recon_error : float
        Sum of squared differences between ``recon_data`` and ``data``.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1 or ``data`` contains NaN or infinite values.
    """
    data = np.asarray(data, dtype=float)
    if k < 1:
        raise ValueError("k must be at least 1, got " + str(k))
    # Non-finite values propagate into the centers as NaN; the center movement then
    # never compares equal to zero and the loop below would not terminate.
    if not np.isfinite(data).all():
        raise ValueError("data must contain only finite values")

    # Number of features in the data
    n_features = data.shape[1]
    if data.shape[0] == 0:
        # Nothing to cluster: empty reconstruction with zero error
        return np.empty((0, n_features)), 0.0

    # Generate random centers from the mean and spread of the data so that they
    # start on the same scale as the data
    mean = np.mean(data, axis=0)
    std = np.std(data, axis=0)
    centers = np.random.randn(k, n_features) * std + mean

    # Centers of the previous iteration, used to detect convergence
    centers_old = np.zeros(centers.shape)
    # Cluster labels (0, 1, 2, ..., k-1), one per sample
    labels = np.zeros(data.shape[0], dtype=int)
    # Error func. - distance between the new centers and the old centers
    error = np.linalg.norm(centers - centers_old)
    # Loop until the centers stop moving
    while error != 0:
        # Assignment step: (n_samples, k) distance matrix, nearest center per sample
        distances = np.linalg.norm(
            data[:, np.newaxis, :] - centers[np.newaxis, :, :], axis=2
        )
        labels = np.argmin(distances, axis=1)
        # Keep the old centers before updating them in place
        centers_old = centers.copy()
        # Update step: each center becomes the mean of its assigned samples
        for i in range(k):
            members = data[labels == i]
            if len(members) > 0:
                centers[i] = members.mean(axis=0)
            else:
                # An empty cluster has no mean; reset its center to the origin
                centers[i] = 0
        error = np.linalg.norm(centers - centers_old)

    # Calculate reconstruction data and error
    recon_data = centers[labels]
    recon_error = np.sum(np.square(recon_data - data))

    return recon_data, recon_error

# Fit k-means for K = 1, ..., 5 on the data and record each reconstruction error
error = np.zeros(5)
for i in range(len(error)):
    error[i] = kmeans(k=i + 1, data=data)[1]
    print("".join(["Error for K = ", str(i + 1), " : ", str(error[i])]))

# Reconstruction error as a function of the number of clusters
plt.plot(np.arange(1, len(error) + 1), error)

In [None]:
# Image compression
import cv2
from mpl_toolkits.mplot3d import Axes3D

# read image
img = cv2.imread('test_image.jpg')
# cv2.imread reports a missing or unreadable file by returning None instead of
# raising; fail here with a clear message rather than inside cv2.resize
if img is None:
    raise FileNotFoundError("Could not read image 'test_image.jpg'")
img = cv2.resize(img, (100, 100))

# convert from BGR (OpenCV's channel order) to RGB
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)


In [None]:
# Cluster the pixel colours for K = 1, ..., 5, save each colour-quantised image
# and record its reconstruction error
data = img.reshape(-1, 3)  # one row per pixel, one column per colour channel
error = np.zeros(5)
for i in range(5):
    recon_data, error[i] = kmeans(i+1, data)
    # Cluster centers are floats: round and clip them back to valid 8-bit pixel values
    recon_img = np.clip(np.rint(recon_data.reshape(img.shape)), 0, 255).astype(np.uint8)
    # img was converted to RGB after loading, but cv2.imwrite expects BGR channel
    # order; convert back so red and blue are not swapped in the saved file
    cv2.imwrite('recon_img' + str(i+1) + '.jpg', cv2.cvtColor(recon_img, cv2.COLOR_RGB2BGR))
    print("Error for K = " + str(i+1) + " : " + str(error[i]))

plt.plot(np.arange(1, 6), error)