# <div style="text-align: right"> Kmeans from scratch. </div>

---

<div style="text-align: right"> Geoff Counihan - Oct 9, 2017 </div>

### Notes

---

Unclear what the difference between sklearn's implementation and mine is. 
    
I've run into the issue of having equidistant points from centroids. Though its rare in high dimensional datasets it happens quite frequently using the iris dataset. I'm uncertain how to best go about asigning these points to the lowest index centoid with the numpy implementation I have. 

Why are my values didferent than sklearns?

__Additions__: Create a visualization

In [605]:
from sklearn.datasets import load_iris
import numpy as np
import pandas as pd

In [634]:
iris = load_iris()
X = iris.data[:,:2]
y = iris.target

In [635]:
Xy = np.column_stack((X,y))
Xy_point = Xy[:3]
print(Xy_point)

[[ 5.1  3.5  0. ]
 [ 4.9  3.   0. ]
 [ 4.7  3.2  0. ]]


In [606]:
Xy = pd.read_csv('./sonar.all-data.csv',header=None)
Xy[60] = Xy[60].map({'R':0,'M':1})
X = np.array(Xy.iloc[:,:-1])
y = np.array(Xy.iloc[:,-1])
Xy = np.array(Xy)

__Eucledean distance__ - is defined by the square root of the sum of squared differences between two arrays of numbers. 

In [636]:
def euclidean(a, b):
    return np.sqrt(((a-b)**2).sum(axis=0))

In [637]:
a = np.array([0,0,0,0])
b = np.array([2,2,2,2])

distance3d = euclidean(a,b)
print('2d Distance: {}'.format(distance3d))

c = np.array([0,0])
d = np.array([2,2])

distance2d = euclidean(c,d)
print('3d Distance: {}'.format(distance2d))

2d Distance: 4.0
3d Distance: 2.8284271247461903


### Other components

---

__Centroids__ - number of centroids to use

In [544]:
a = X[np.random.randint(Xy.shape[0])]

In [545]:
b = X[np.random.randint(Xy.shape[0])]

In [748]:
def init_centroids(X,num_centroids=3):
    feat_range = {}
    centroids = {}
    for k in range(num_centroids):
        centroids[k] = X[np.random.randint(Xy.shape[0])]
    return centroids

In [749]:
centroids = init_centroids(X)

In [647]:
def dist_from_centroid(centroids):
    dist_from = np.array([]).reshape(0,X.shape[0])
    for k in centroids.keys():
        dist = np.array([])
        for i in range(X.shape[0]):
            dist = np.append(dist,euclidean(centroids[k],X[i]))
        dist_from = np.vstack((dist_from,dist))
    return dist_from

In [648]:
dist_from = dist_from_centroid(centroids)

In [732]:
def label(dist_from):
     return np.argwhere(dist_from == np.min(dist_from,axis=0))

In [733]:
labels = label(dist_from)

In [763]:
for k in centroids.keys():
    print(X[labels[:,0] == k].mean(axis=0))

[ 5.11296296  3.38333333]
[ 5.9         2.76595745]
[ 6.59387755  2.96734694]


In [764]:
def recenter(centroids):
    for k in centroids.keys():
        centroids_hist[k] = centroids[k]
        centroids[k] = X[labels[:,0] == k].mean(axis=0)
    return centroids

In [765]:
centroids = recenter(centroids)

In [769]:
def stop(centroid_hist, centroids, iterations):
    if iterations > max_iterations:
        return True
    for k in centroids.keys():
        #print(k)
        if np.array_equal(centroids_hist[k], centroids[k]):
            return True
        #print('through')
    return False

In [778]:
iterations = 0
max_iterations = 1000
centroid_hist = {}
while not stop(centroid_hist, centroids, iterations):
    #print(iterations)
    dist_from = dist_from_centroid(centroids)
    labels = label(dist_from)
    centroids = recenter(centroids)
    iterations += 1

In [770]:
centroids

{0: array([ 5.11296296,  3.38333333]),
 1: array([ 5.9       ,  2.76595745]),
 2: array([ 6.59387755,  2.96734694])}

In [784]:
class kmeans():
    def __init__(self, num_centroids=4, max_iter=1000):
        self.num_centroids = num_centroids
        self.max_iter = max_iter
        self.metric = metric
        
    def euclidean(self, a, b):
        return np.sqrt(((a-b)**2).sum(axis=0))
    
    def manhattan(self, a, b):
        return np.abs((a-b).sum(axis=0))
        
    def init_centroids(self):
        centroids = {}
        for k in range(self.num_centroids):
            centroids[k] = self.X[np.random.randint(self.X.shape[0])]
        return centroids

    def dist_from_centroid(self):
        self.dist_from = np.array([]).reshape(0,self.X.shape[0])
        if self.metric == 'euclidean':
            for k in self.centroids.keys():
                dist = np.array([])
                for i in range(self.X.shape[0]):
                    dist = np.append(dist,self.euclidean(self.centroids[k],self.X[i]))
                self.dist_from = np.vstack((self.dist_from,dist))
            
    def label(self):
        self.labels = np.argwhere(self.dist_from == np.min(self.dist_from,axis=0))
        if self.labels.shape[0] != self.X.shape[0]:
            idx = np.where(np.unique(self.labels[:,1], return_counts=True)[1] == 2)
            duplicate_asignments = np.where(self.labels[:,1] == idx)[1]
            for i, dup in enumerate(duplicate_asignments):
                if i == 0:
                    self.labels = np.delete(self.labels,dup,axis=0)
        #print(self.labels)
            
    def recenter(self):
        for k in self.centroids.keys():
            self.centroids_hist[k] = self.centroids[k]
            self.centroids[k] = self.X[self.labels[:,0] == k].mean(axis=0)
            
    def stop(self):
        if self.iterations > self.max_iter:
            return True
        for k in centroids.keys():
            if np.array_equal(km.centroids_hist[k], km.centroids[k]):
                return True
        return False
    
    def fit(self, X):
        self.X = X
        self.centroids = self.init_centroids()
        self.centroids_hist = self.init_centroids()
        self.iterations = 0
        
        #while not self.stop():#self.iterations < self.max_iter:
        while self.iterations < self.max_iter:
            self.dist_from_centroid()
            self.label()
            self.recenter()
            self.iterations += 1
            

### Test.

---

In [780]:
from sklearn.model_selection import train_test_split

iris = load_iris()
X = iris.data[:,:2]
y = iris.target

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=35)

In [782]:
km = kmeans(num_centroids=3,max_iter=1000,metric='euclidean')
km.fit(X)

In [783]:
km.centroids

{0: array([ 5.11296296,  3.38333333]),
 1: array([ 5.9       ,  2.76595745]),
 2: array([ 6.59387755,  2.96734694])}

In [775]:
km.labels

array([[  0,   0],
       [  0,   1],
       [  0,   2],
       [  0,   3],
       [  0,   4],
       [  0,   5],
       [  0,   6],
       [  0,   7],
       [  0,   8],
       [  0,   9],
       [  0,  10],
       [  0,  11],
       [  0,  12],
       [  0,  13],
       [  0,  14],
       [  0,  15],
       [  0,  16],
       [  0,  17],
       [  0,  18],
       [  0,  19],
       [  0,  20],
       [  0,  21],
       [  0,  22],
       [  0,  23],
       [  0,  24],
       [  0,  25],
       [  0,  26],
       [  0,  27],
       [  0,  28],
       [  0,  29],
       [  0,  30],
       [  0,  31],
       [  0,  32],
       [  0,  33],
       [  0,  34],
       [  0,  35],
       [  0,  36],
       [  0,  37],
       [  0,  38],
       [  0,  39],
       [  0,  40],
       [  0,  41],
       [  0,  42],
       [  0,  43],
       [  0,  44],
       [  0,  45],
       [  0,  46],
       [  0,  47],
       [  0,  48],
       [  0,  49],
       [  0,  57],
       [  0,  84],
       [  0,

### Compare performance

---

In [771]:
from sklearn.cluster import k_means

In [772]:
k_means(X, n_clusters=3, max_iter=1000)

(array([[ 5.006     ,  3.418     ],
        [ 6.81276596,  3.07446809],
        [ 5.77358491,  2.69245283]]),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 1, 1, 1, 2, 1, 2, 1, 2, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2,
        2, 2, 2, 2, 2, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 2,
        1, 1, 1, 1, 2, 1, 2, 1, 2, 1, 1, 2, 2, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1,
        2, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 2], dtype=int32),
 37.12370212765957)