<a href="https://colab.research.google.com/github/zhuzz12/cvdl/blob/main/DL_Lab2_KMeans_KNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import matplotlib.pyplot as plt

# unused but required import for doing 3d projections with matplotlib < 3.2
import mpl_toolkits.mplot3d  # noqa: F401

from sklearn import datasets
from sklearn.decomposition import PCA
import pandas as pd
import numpy as np

In [None]:
# Load the Iris dataset bundled with scikit-learn; the cells below read its
# 'data', 'target' and 'feature_names' entries.
iris = datasets.load_iris()

In [None]:
# Assemble a single table: the feature columns followed by the class id,
# stored under the column name 'target'.
column_names = iris['feature_names'] + ['target']
values = np.column_stack((iris['data'], iris['target']))
data = pd.DataFrame(data=values, columns=column_names)

In [None]:
data

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0.0
1,4.9,3.0,1.4,0.2,0.0
2,4.7,3.2,1.3,0.2,0.0
3,4.6,3.1,1.5,0.2,0.0
4,5.0,3.6,1.4,0.2,0.0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2.0
146,6.3,2.5,5.0,1.9,2.0
147,6.5,3.0,5.2,2.0,2.0
148,6.2,3.4,5.4,2.3,2.0


In [None]:
data['target'].unique()

array([0., 1., 2.])

In [None]:
# Split the table into a feature matrix and a label vector (NumPy arrays),
# then report the shape of each.
features = data.drop(columns='target')
X_train = features.to_numpy()
Y_train = data['target'].to_numpy()
for arr in (X_train, Y_train):
    print(arr.shape)

(150, 4)
(150,)


In [None]:
class KMeans():
    """Minimal k-means clustering (Lloyd's algorithm) on NumPy arrays.

    Parameters
    ----------
    k : int
        Number of clusters.
    max_iter : int, default 1000
        Upper bound on the number of assign/update iterations.
    random_state : int, default 42
        Seed used when picking the initial centroids.

    Attributes set by ``fit``
    -------------------------
    X : the training data passed to ``fit``.
    centroids : array of shape (k, n_features) with the final centroids.
    labels : array of shape (n_samples,) with the cluster id of each sample.
    """

    def __init__(self, k, max_iter=1000, random_state=42):
        self.k = k
        self.max_iter = max_iter
        self.random_state = random_state

    def initialize_centroids(self, X):
        """Pick ``k`` distinct rows of ``X`` as starting centroids.

        A private, seeded generator is used so the choice depends only on
        ``random_state`` and not on the global NumPy RNG state.
        """
        rng = np.random.RandomState(self.random_state)
        random_idx = rng.permutation(X.shape[0])
        return X[random_idx[:self.k]]

    def compute_centroids(self, X, labels, previous=None):
        """Return the mean of the samples assigned to each cluster.

        Parameters
        ----------
        X : array of shape (n_samples, n_features)
        labels : array of shape (n_samples,) with cluster ids in ``[0, k)``
        previous : array of shape (k, n_features), optional
            Centroids from the previous iteration. A cluster that received
            no samples keeps its previous centroid; without ``previous`` its
            row is filled with NaN.

        Returns
        -------
        Array of shape (k, n_features).
        """
        centroids = np.zeros((self.k, X.shape[1]))
        for k in range(self.k):
            members = X[labels == k, :]
            if members.shape[0] > 0:
                centroids[k, :] = np.mean(members, axis=0)
            elif previous is not None:
                # Empty cluster: averaging nothing would yield NaN, so keep
                # the centroid where it was.
                centroids[k, :] = previous[k, :]
            else:
                centroids[k, :] = np.nan
        return centroids

    def compute_distance(self, X, centroids):
        """Return squared Euclidean distances, shape (n_samples, k)."""
        distance = np.zeros((X.shape[0], self.k))
        for k in range(self.k):
            row_norm = np.linalg.norm(X - centroids[k, :], axis=1)
            distance[:, k] = np.square(row_norm)
        return distance

    def find_closest_cluster(self, distance):
        """Return, for every row of ``distance``, the index of its minimum."""
        return np.argmin(distance, axis=1)

    def fit(self, X):
        """Cluster ``X``; results are stored in ``centroids`` and ``labels``.

        Iterates until the centroids stop changing or ``max_iter`` is hit.
        """
        self.X = X
        self.centroids = self.initialize_centroids(self.X)
        for _ in range(self.max_iter):
            old_centroids = self.centroids
            distance = self.compute_distance(self.X, old_centroids)
            self.labels = self.find_closest_cluster(distance)
            self.centroids = self.compute_centroids(
                self.X, self.labels, previous=old_centroids
            )
            # Unchanged centroids are a fixed point: every further iteration
            # would reproduce the same labels and centroids.
            if np.array_equal(old_centroids, self.centroids):
                break

    def classify(self, X):
        """Return the index of the nearest fitted centroid for each row of ``X``."""
        distance = self.compute_distance(X, self.centroids)
        return self.find_closest_cluster(distance)

In [None]:
# Fit a three-cluster k-means model on the full feature matrix.
kmeans = KMeans(k=3)
kmeans.fit(X_train)


In [None]:
# Assign every training sample to its nearest learned centroid.
# NOTE(review): the returned values are cluster indices; nothing ties a given
# index to the class label of the same number in Y_train.
a = kmeans.classify(X_train)

In [None]:
len(a)

150

In [None]:
Y_train

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2.,
       2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
       2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
       2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.])

In [None]:
a

array([1, 2, 2, 2, 1, 1, 2, 1, 2, 2, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1,
       2, 1, 1, 2, 1, 1, 1, 2, 2, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1, 2, 2, 1,
       1, 2, 1, 2, 1, 2, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# K-means cluster ids are an arbitrary permutation of the class ids, so
# relabel each cluster with the majority true class among its members before
# comparing against Y_train. Assumes labels are non-negative integer values.
cluster_to_class = {
    cluster: np.bincount(Y_train[a == cluster].astype(int)).argmax()
    for cluster in np.unique(a)
}
aligned = np.array([cluster_to_class[cluster] for cluster in a])
accuracy_score(Y_train, aligned)

0.8866666666666667

In [None]:
class KNN():
    """Brute-force k-nearest-neighbours classifier with Euclidean distance.

    Parameters
    ----------
    k : int
        Number of neighbours that vote for each prediction.
    random_state : int, default 42
        Stored on the instance; not used by any method of this class.
    """

    def __init__(self, k, random_state=42):
        self.k = k
        self.random_state = random_state

    def compute_distance(self, x1, x2):
        """Return the row-wise Euclidean distance between ``x1`` and ``x2``.

        ``x1`` is 2-D; ``x2`` is either 2-D of the same shape or a single
        1-D row that is broadcast against every row of ``x1``.
        """
        distance = np.linalg.norm(x1 - x2, axis=1)
        return distance

    def predict(self, X, Y, X_test, *, exclude_self=True):
        """Predict a label for every row of ``X_test`` by majority vote.

        Parameters
        ----------
        X : array of shape (n_train, n_features)
            Training samples.
        Y : array of shape (n_train,)
            Training labels (any values ``np.unique`` can sort).
        X_test : array of shape (n_test, n_features)
            Query samples.
        exclude_self : bool, default True
            When True, the single closest training sample is dropped before
            voting. That is the right choice when the queries are themselves
            rows of ``X`` (their own zero-distance match would otherwise
            vote). Pass False when predicting unseen samples so the true
            nearest neighbour is kept. The default preserves the historical
            behaviour of this method.

        Returns
        -------
        Array of shape (n_test,) holding the winning label for each query,
        taken from ``Y`` (so it has ``Y``'s dtype). Ties go to the smallest
        label.
        """
        X = np.asarray(X)
        Y = np.asarray(Y)
        # Vote over the labels that actually occur in Y; the number of
        # neighbours (self.k) is unrelated to the number of classes.
        classes = np.unique(Y)
        start = 1 if exclude_self else 0

        preds = []
        for row in X_test:
            distances = self.compute_distance(X, row)
            nearest = np.argsort(distances)[start:start + self.k]
            neighbour_labels = Y[nearest]
            votes = np.array([(neighbour_labels == c).sum() for c in classes])
            preds.append(classes[np.argmax(votes)])
        return np.array(preds)

In [None]:
# Nearest-neighbour classifier that lets five neighbours vote.
clf = KNN(k=5)

In [None]:
# Sanity check: row-wise Euclidean distance between two 2x4 arrays
# (first rows differ, second rows are identical).
clf.compute_distance(np.array([[2, 4, 3, 4], [1, 2, 3, 4]]), np.array([[1, 2, 3, 4], [1, 2, 3, 4]]))

array([2.23606798, 0.        ])

In [None]:
# Predict labels for the training samples themselves.
# NOTE(review): KNN.predict drops the single closest training point per query,
# which here is presumably the query's own zero-distance match (or an exact
# duplicate of it), so this is an in-sample, leave-one-out style evaluation.
knn_preds = clf.predict(X_train,Y_train, X_train)

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

In [None]:
accuracy_score(Y_train, knn_preds)

0.9666666666666667

In [None]:
precision_score(Y_train, knn_preds, average ='weighted')

0.9667867146858743

In [None]:
recall_score(Y_train, knn_preds, average ='weighted')

0.9666666666666667

In [None]:
knn_preds

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])