<a href="https://colab.research.google.com/github/vhnowf/machine-learning-course/blob/master/week4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [363]:
# Load the Iris dataset bundled with scikit-learn and pull out the
# feature matrix (X) and the class labels (Y).
from sklearn.datasets import load_iris
data = load_iris()
X = data['data']
Y = data['target']

In [364]:
# Hold out 10% of the samples for testing and keep the remaining 90% for training.
# The random state is pinned so that the split is the same on every run.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.1,
    random_state=42,
)



In [365]:
# Z-score normalization.
# The scaler is fitted on the training set only; the statistics it learned there
# (the training set's mu and sigma) are then applied to both the training and
# the test set, so nothing from the test set leaks into the preprocessing.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(x_train)
x_train, x_test = scaler.transform(x_train), scaler.transform(x_test)


In [366]:
import numpy as np
from collections import Counter
# Remember, no training is needed for KNN!
def euclidean_distance(a,b):
  '''
  Euclidean (L2) distance between `a` and `b`.

  The difference `a - b` is squared element-wise, summed over all
  elements, and the square root of that sum is returned.
  '''
  delta = a - b
  squared_sum = np.sum(delta ** 2)
  return np.sqrt(squared_sum)
def evaluateKNN_single(k, x_train, y_train, data):
    '''
    Evaluate the classification for `data` with k-nearest neighbor
    given training set (x_train, y_train).

    Note that this function takes in one input instead of the whole
    testing set.

    Input:
        k      : hyperparameter for KNN (number of neighbors, must be >= 1)
        x_train: features of training set (n x d array-like)
        y_train: labels of training set (length n); labels keep their own
                 type, so integer and string labels are both supported
        data   : features of the data point to be evaluated (length d)
    Output:
        Classification of the input data point: the most common label among
        the k nearest training points. When several labels share the highest
        vote count, the one whose closest neighbor is nearest wins.
    Raises:
        ValueError: if k < 1, if the training set is empty, or if x_train and
                    y_train do not contain the same number of samples.
    '''
    if k < 1:
        raise ValueError(f'k must be at least 1, got {k}')

    x_train = np.asarray(x_train, dtype=float)
    labels = np.asarray(y_train)
    if x_train.shape[0] == 0:
        raise ValueError('the training set is empty')
    if x_train.shape[0] != labels.shape[0]:
        raise ValueError('x_train and y_train must contain the same number of samples')

    # Euclidean distance from `data` to every training point in one vectorized call.
    distances = np.linalg.norm(x_train - np.asarray(data, dtype=float), axis=1)

    # Sort (distance, label) pairs so that equal distances are ordered by label;
    # the labels are kept as plain Python objects instead of being coerced to
    # floats inside a mixed array.
    neighbours = sorted(zip(distances.tolist(), labels.tolist()))[:k]

    # Counter remembers first-seen order, so on a tie in vote count
    # most_common(1) returns the label that was encountered first (the nearest).
    votes = Counter(label for _, label in neighbours)
    return votes.most_common(1)[0][0]

  
    


In [367]:
# Evaluation code for the whole dataset
def evaluateKNN(k, x_train=x_train, y_train=y_train, x_test=x_test, y_test=y_test):
    '''
    Classify every test sample with k-nearest neighbor and report the accuracy.

    The dataset arguments default to the module-level arrays as they were
    when this function was defined.

    Input:
        k      : hyperparameter for KNN
        x_train: features of training set
        y_train: labels of training set
        x_test : features of testing set
        y_test : labels of testing set
    Output:
        The number of correctly classified test samples (the accuracy
        summary is printed as a side effect).
    '''
    correct = 0
    for sample, expected in zip(x_test, y_test):
        # a matching prediction adds one to the tally, a mismatch adds zero
        correct += evaluateKNN_single(k, x_train, y_train, sample) == expected
    print(f'Test accuracy with k={k}: {correct/len(y_test)*100:.4f}% ({correct}/{len(y_test)})')
    # return the number of correct evaluations for us to check with the solution
    return correct

In [368]:
# Score the classifier with k=2, passing every dataset explicitly,
# and show the returned number of correct predictions.
label = evaluateKNN(
    2,
    x_train=x_train,
    y_train=y_train,
    x_test=x_test,
    y_test=y_test,
)
print(label)

Test accuracy with k=2: 93.3333% (14/15)
14


In [369]:
# and let's see how good is our model with k=5
# The check expects every test sample to be classified correctly.
assert evaluateKNN(5) == len(y_test), "Incorrect accuracy for 5-NN!"

Test accuracy with k=5: 100.0000% (15/15)


In [371]:
# and let's see how good is our model with k=1
# The check expects exactly one misclassified test sample.
assert evaluateKNN(1) == len(y_test) - 1, "Incorrect accuracy for 1 -NN!"

Test accuracy with k=1: 93.3333% (14/15)


In [377]:
def get_cluster_classification(x_data, centroids):
    '''
    A helper function that you will need later.
    Classifies the points to their nearest cluster.

    Input:
        x_data   : the data points (n x d array-like)
        centroids: the cluster centroids (k x d array-like)
    Output:
        An integer array of length n holding, for each data point, the index
        of the centroid it is nearest to (Euclidean distance). If a point is
        equally close to several centroids the lowest index is used. An empty
        `x_data` gives an empty integer array.
    '''
    x_data = np.asarray(x_data, dtype=float)
    centroids = np.asarray(centroids, dtype=float)

    # Broadcast (n, 1, d) against (1, k, d) to get every point-to-centroid
    # offset at once instead of looping over the points in Python.
    offsets = x_data[:, np.newaxis, :] - centroids[np.newaxis, :, :]
    distances = np.linalg.norm(offsets, axis=2)  # shape (n, k)

    # argmin returns the first minimum, i.e. the lowest centroid index on ties.
    return np.argmin(distances, axis=1)

In [379]:
def kmeans(x_train, k, step, centroids = None):
    '''
    An implementation of K-means clustering.

    Input:
        x_train  : training dataset (n x d array-like)
        k        : number of clusters
        step     : maximum number of recalibration steps
        centroids: optional initial centroids (a k x d matrix); when omitted,
                   the first k training points are used
    Output:
        The centroids of the clusters (a k x d matrix)
    Raises:
        ValueError: if the number of initial centroids is not k.
    '''
    x_train = np.asarray(x_train, dtype=float)
    if centroids is None:
        centroids = x_train[:k]
    # Work on a private copy so the result never aliases x_train or the
    # caller's initial centroids.
    centroids = np.array(centroids, dtype=float)
    if centroids.shape[0] != k:
        raise ValueError(f'expected {k} initial centroids, got {centroids.shape[0]}')

    for _ in range(step):
        clusters = get_cluster_classification(x_train, centroids)

        # Start from the current centroids so that a cluster which received no
        # points keeps its position instead of turning into NaN (the mean of
        # an empty selection).
        new_centroids = centroids.copy()
        for i in range(k):
            members = x_train[clusters == i]
            if len(members) > 0:
                new_centroids[i] = np.mean(members, axis=0)

        # Stop early once a recalibration step no longer moves the centroids.
        if np.allclose(new_centroids, centroids):
            break
        centroids = new_centroids
    return centroids


In [385]:
# we know that there are three classes
# Run K-means with its default initialization (the first k training points)
# for at most 10 steps and compare the result with the reference centroids.
centroids = kmeans(x_train, k=3, step=10)
assert np.allclose(centroids, np.array([
    [-1.02028733,  0.90854287, -1.32521428, -1.27540932],
    [ 0.99363929,  0.01896468,  0.90355632,  0.92076921],
    [-0.22539812, -1.02749927,  0.23322382,  0.15491878]
])), "Incorrect centroids for K-means!"

In [390]:
def kmeanspp(x_train, k, step):
    '''
    K-means clustering with a spread-out choice of initial centroids.

    The seeding is deterministic: the first centroid is the training point
    at index 3, and every following centroid is the training point that is
    the 4th farthest (Euclidean distance) from the previously chosen one.
    NOTE(review): this is not the randomized seeding usually called
    K-means++; presumably it follows the criteria given in the course
    material - confirm.

    Input:
        k      : number of clusters
        x_train: training dataset
        step   : number of recalibration steps
    Output:
        The centroids of the clusters (a k x d matrix)
    '''
    # pick the k starting centroids, each one far away from the previous pick
    seeds = [x_train[3]]
    for _ in range(k - 1):
        gaps = np.linalg.norm(x_train - seeds[-1], axis=1)
        far_to_near_rank = np.argsort(gaps)
        seeds.append(x_train[far_to_near_rank[-4]])

    # from here on the procedure is plain K-means started from those seeds
    return kmeans(x_train, k, step, np.array(seeds))

In [391]:
# now check if you did it correctly.
# Run kmeanspp for at most 10 steps and compare the result with the
# reference centroids.
centroidspp = kmeanspp(x_train, k=3, step=10)
assert np.allclose(centroidspp, np.array([
    [-0.0118057 , -0.87997489,  0.36942197,  0.30573876],
    [ 1.15200055,  0.18878042,  0.98903982,  1.01136932],
    [-1.03358934,  0.84835232, -1.32732076, -1.27380566]
])), "Incorrect centroids for K-means++!"
