In [280]:
import numpy as np
from sklearn.datasets import load_iris
# Load the Iris dataset and pull out the feature matrix (X) and label vector (Y).
data = load_iris()
X = data['data']
Y = data['target']

In [281]:
from sklearn.model_selection import train_test_split
# Hold out 10% of the samples for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.1,
    random_state=42,
)

In [282]:
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training split only, then apply that same transform
# to both splits so no test-set statistics leak into the scaling.
scaler = StandardScaler().fit(x_train)
x_train, x_test = scaler.transform(x_train), scaler.transform(x_test)

KNN

In [283]:
def index_k_nearest_point(x_train, x, k):
    """Return the indices of the k training points closest to ``x``.

    Distances are Euclidean. The result is ordered from nearest to farthest,
    and points at exactly the same distance are ordered by their position in
    ``x_train``.

    Parameters
    ----------
    x_train : 2-D array-like, one row per training point.
    x : 1-D array-like, the query point (same number of features as a row).
    k : int, number of neighbours wanted. If ``x_train`` has fewer than ``k``
        rows, all of them are returned.

    Returns
    -------
    list of int
        Row indices into ``x_train``; each index appears at most once.
    """
    # One vectorised pass: distance from the query to every training row.
    dist = np.linalg.norm(np.asarray(x_train) - np.asarray(x), axis=1)
    # Sorting indices (instead of sorting values and searching for them again)
    # keeps the true nearest neighbour at position 0 and guarantees that tied
    # distances never yield a duplicated index. "stable" makes ties
    # deterministic (lower row index first).
    order = np.argsort(dist, kind="stable")
    return order[:k].tolist()

In [284]:
def major_voting(y_train, index):
    """Return the most frequent label among ``y_train[i]`` for ``i`` in ``index``.

    When several labels share the highest count, the one that entered the
    tally last (i.e. whose first occurrence comes latest in ``index``) is
    returned. An empty ``index`` raises ``ValueError``.
    """
    tally = {}
    for position in index:
        label = y_train[position]
        tally[label] = tally.get(label, 0) + 1
    best_count = max(tally.values())
    # Dicts preserve insertion order, so the last entry of this list is the
    # tied label that was seen latest.
    winners = [label for label, count in tally.items() if count == best_count]
    return winners[-1]

In [285]:
def evaluateKNN_single(k, x_train, y_train, data):
    """Predict the label of one sample ``data`` with k-nearest-neighbours.

    Looks up the neighbour indices in ``x_train`` via
    ``index_k_nearest_point`` and lets ``major_voting`` pick the label from
    ``y_train``.
    """
    return major_voting(y_train, index_k_nearest_point(x_train, data, k))

In [286]:
def evaluateKNN(k, x_train=x_train, y_train=y_train, x_test=x_test, y_test=y_test):
    """Classify every test sample with k-NN, print the accuracy, return the hit count.

    The default arguments are bound to the notebook's scaled train/test
    splits as they exist when this cell is executed.
    """
    correct = sum(
        evaluateKNN_single(k, x_train, y_train, sample) == label
        for sample, label in zip(x_test, y_test)
    )
    print(f'Test accuracy with k={k}: {correct/len(y_test)*100:.4f}% ({correct}/{len(y_test)})')
    # The raw count (not the percentage) is returned so it can be checked
    # against the reference solution.
    return correct

In [287]:
# Check: 5-NN must classify every held-out sample correctly.
assert len(y_test) == evaluateKNN(5), "Incorrect accuracy for 5-NN!"

Test accuracy with k=5: 100.0000% (15/15)


In [288]:
# Check: 1-NN must classify every held-out sample correctly.
assert len(y_test) == evaluateKNN(1), "Incorrect accuracy for 1-NN!"

Test accuracy with k=1: 100.0000% (15/15)


K-Means

In [303]:
def get_cluster_classification(x_data, centroids):
    """Assign every row of ``x_data`` to its nearest centroid.

    Returns a float array with one entry per row holding the index (into
    ``centroids``) of the centroid at the smallest Euclidean distance; on a
    tie the lowest index wins.
    """
    nearest = [
        np.argmin(np.linalg.norm(point - centroids, axis=1))
        for point in x_data
    ]
    # Labels are stored as floats, matching what downstream code compares against.
    return np.array(nearest, dtype=float)

In [290]:
def update_centroids(x_data, clusters, k):
    """Recompute each of the ``k`` centroids as the mean of its assigned rows.

    ``clusters[i]`` is the cluster label of ``x_data[i]``. Returns a
    ``(k, n_features)`` array whose row ``j`` is the mean of the rows labelled
    ``j``.
    """
    n_features = x_data.shape[1]
    centers = np.zeros((k, n_features))
    for label in range(k):
        members = x_data[clusters == label]
        centers[label, :] = np.mean(members, axis=0)
    return centers

In [291]:
def kmeans(x_train, k, step):
    """Run Lloyd's K-means with the first ``k`` samples as initial centroids.

    Parameters
    ----------
    x_train : 2-D ndarray, one row per sample.
    k : int, number of clusters (between 1 and the number of samples).
    step : int, maximum number of assignment/update iterations.

    Returns
    -------
    ndarray of shape ``(k, n_features)`` with the final centroids.

    Raises
    ------
    ValueError
        If ``k`` is not between 1 and the number of samples.
    """
    if not 1 <= k <= x_train.shape[0]:
        raise ValueError(
            f"k must be between 1 and the number of samples ({x_train.shape[0]}), got {k}"
        )
    # Seed with the first k rows (previously hard-coded to 3 regardless of k).
    # Copy so the returned array is never a view into the caller's data.
    centroids = x_train[:k].copy()
    # None means "no previous assignment yet". An all-zeros placeholder would
    # look like convergence whenever the first assignment puts every sample in
    # cluster 0, and the loop would stop before updating the centroids.
    clusters = None
    for _ in range(step):
        old_clusters = clusters
        clusters = get_cluster_classification(x_train, centroids)
        if old_clusters is not None and np.array_equal(clusters, old_clusters):
            # Assignments did not change, so the centroids are stable.
            print('break')
            break
        centroids = update_centroids(x_train, clusters, k)
    return centroids

In [292]:
# Run K-means for at most 10 iterations and compare against the reference centroids.
centroids = kmeans(x_train, k=3, step=10)
assert np.allclose(
    centroids,
    [
        [-1.02028733,  0.90854287, -1.32521428, -1.27540932],
        [ 0.99363929,  0.01896468,  0.90355632,  0.92076921],
        [-0.22539812, -1.02749927,  0.23322382,  0.15491878],
    ],
), "Incorrect centroids for K-means!"

break


K-Means++

In [293]:
def get_clusters(x_train, centroids, n):
    """Label each sample with its nearest centroid among the first ``n`` rows.

    Only ``centroids[0:n]`` take part, so rows of ``centroids`` that have not
    been filled in yet are ignored. Returns a float array of labels, one per
    row of ``x_train``.
    """
    active = centroids[0:n]
    labels = np.zeros(x_train.shape[0])
    for row, point in enumerate(x_train):
        labels[row] = np.linalg.norm(point - active, axis=1).argmin()
    return labels

In [294]:
def get_dist_from_nearest_centroid(x_train, clusters, centroids):
    """Return each sample's Euclidean distance to the centroid it is labelled with.

    ``clusters[i]`` (cast to int) selects the row of ``centroids`` that
    ``x_train[i]`` is measured against. The result has one distance per sample.
    """
    n_points = x_train.shape[0]
    return np.array(
        [
            np.linalg.norm(x_train[row] - centroids[int(clusters[row])])
            for row in range(n_points)
        ],
        dtype=float,
    )

In [295]:
def get_4th_furthest_point(dist):
    """Return the 4th largest value in ``dist``.

    Despite the name, this is a distance value, not a point or an index.
    Raises ``IndexError`` when ``dist`` has fewer than four entries.
    """
    descending = list(dist)
    descending.sort(reverse=True)
    return descending[3]

In [296]:
def init_centroids(x_train, k):
    """Pick ``k`` initial centroids with a deterministic K-means++-style rule.

    The first centroid is ``x_train[3]``. Each further centroid is the sample
    whose distance to its nearest already-chosen centroid is the 4th largest.

    Parameters
    ----------
    x_train : 2-D ndarray, one row per sample.
    k : int, number of centroids to pick.

    Returns
    -------
    ndarray of shape ``(k, n_features)``.
    """
    centroids = np.zeros((k, x_train.shape[1]))
    centroids[0] = x_train[3]
    for n in range(1, k):
        # Only the first n rows of `centroids` are filled in at this point.
        clusters = get_clusters(x_train, centroids, n)
        dist = get_dist_from_nearest_centroid(x_train, clusters, centroids)
        value = get_4th_furthest_point(dist)
        # Several samples can sit at exactly this distance (e.g. duplicated
        # rows). A boolean mask would then select more than one row and the
        # assignment below would fail to broadcast, so take the first match.
        first_match = np.flatnonzero(dist == value)[0]
        centroids[n] = x_train[first_match]
    return centroids

In [306]:
def kmeanspp(x_train, k, step):
    """Run K-means starting from the centroids chosen by ``init_centroids``.

    Parameters
    ----------
    x_train : 2-D ndarray, one row per sample.
    k : int, number of clusters.
    step : int, maximum number of assignment/update iterations.

    Returns
    -------
    ndarray of shape ``(k, n_features)`` with the final centroids.

    The initial centroids are printed before the iterations start.
    """
    centroids = init_centroids(x_train, k)
    print('init_centroids')
    print(centroids)
    # None means "no previous assignment yet". An all-zeros placeholder would
    # look like convergence whenever the first assignment puts every sample in
    # cluster 0, and the loop would stop before updating the centroids.
    clusters = None
    for _ in range(step):
        old_clusters = clusters
        clusters = get_cluster_classification(x_train, centroids)
        if old_clusters is not None and np.array_equal(clusters, old_clusters):
            # Assignments did not change, so the centroids are stable.
            print('break')
            break
        centroids = update_centroids(x_train, clusters, k)
    return centroids

In [308]:
# Run K-means++ for at most 10 iterations and compare against the reference centroids.
centroidspp = kmeanspp(x_train, k=3, step=10)
assert np.allclose(
    centroidspp,
    [
        [-0.0118057 , -0.87997489,  0.36942197,  0.30573876],
        [ 1.15200055,  0.18878042,  0.98903982,  1.01136932],
        [-1.03358934,  0.84835232, -1.32732076, -1.27380566],
    ],
), "Incorrect centroids for K-means++!"

init_centroids
[[-0.16055757 -0.59185214  0.4040297   0.11432871]
 [ 2.4886423   1.70412599  1.48580024  1.03775287]
 [-0.04013939  2.16332162 -1.47483493 -1.3367664 ]]
break
