In [47]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

In [48]:
# Load the Iris bundle once, then pull out the feature matrix and label vector.
data = load_iris()
X, y = data["data"], data["target"]

In [49]:
# Sanity check: feature-matrix shape on the first line, label-vector shape on the second.
print(X.shape, y.shape, sep="\n")

(150, 4)
(150,)


In [50]:
# Hold out 33% of the samples for evaluation. The fixed seed makes the shuffled
# split reproducible; no stratification is requested.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42, shuffle=True, stratify=None
)

In [51]:
# Shapes of the training pair, then of the held-out pair.
print(f"{X_train.shape} {y_train.shape}")
print(f"{X_test.shape} {y_test.shape}")

(100, 4) (100,)
(50, 4) (50,)


In [52]:
import numpy as np
import numpy.typing as npt
from typing import Self
from functools import cached_property

In [62]:
class KNeighborsClassifier:
    def __init__(
        self,
        n_neighbors: int = 5,
        weights: str = "uniform",
        p: int = 2,
        metric: str = "minkowski",
    ):
        self.n_neighbors = n_neighbors
        self.weights = weights
        self.p = p
        self.metric = metric
        return

    def fit(
        self,
        X_train: npt.NDArray[np.float64],
        y_train: npt.NDArray[np.int64],
    ) -> Self:
        self.X_train = X_train
        self.y_train = y_train
        return self

    def __minkowski_distance(
        self,
        X_test: npt.NDArray[np.float64],
    ) -> npt.NDArray[np.float64]:
        absolute_dimension_wise_differences = np.abs(
            self.X_train - X_test[:, np.newaxis, :]
        )
        return np.power(
            np.sum(
                np.power(absolute_dimension_wise_differences, self.n_neighbors), axis=-1
            ),
            1 / self.n_neighbors,
        )

    def __cosine_distance(
        self,
        X_test,
    ) -> npt.NDArray[np.float64]:
        dimension_wise_products = self.X_train * X_test[:, np.newaxis, :]
        dot_products = np.sum(dimension_wise_products, axis=-1)

        norms_of_train = np.sqrt(np.sum(np.square(self.X_train), axis=-1))
        norms_of_test = np.sqrt(np.sum(np.square(X_test), axis=-1))

        cosine_similarity = dot_products / norms_of_train / norms_of_test.reshape(-1, 1)
        return 1 - cosine_similarity

    def __plurality_voting(
        self,
        classes_of_nearest_neighbors: npt.NDArray[np.int64],
        distances_of_nearest_neighbors: npt.NDArray[np.float64],
    ) -> npt.NDArray[np.int64]:
        weights_of_nearest_neighbors = np.ones_like(
            distances_of_nearest_neighbors, dtype=np.float64
        )
        if self.weights == "distance":
            weights_of_nearest_neighbors = 1.0 / distances_of_nearest_neighbors

        y_pred = []
        for classes, weights in zip(
            classes_of_nearest_neighbors, weights_of_nearest_neighbors
        ):
            weighted_count = dict.fromkeys(classes, 0)
            for class_, weight in zip(classes, weights):
                weighted_count[class_] += weight

            prediction = max(weighted_count, key=weighted_count.get)
            y_pred.append(prediction)

        return np.array(y_pred)

    def predict(
        self,
        X_test: npt.NDArray[np.float64],
    ) -> npt.NDArray[np.int64]:
        distance_metrics = {
            "minkowski": self.__minkowski_distance,
            "cosine": self.__cosine_distance,
        }
        distance_metric = distance_metrics.get(self.metric)
        distances = distance_metric(X_test)

        indices_of_nearest_neighbors = np.argpartition(
            a=distances,
            kth=self.n_neighbors,
            axis=-1,
        )[:, : self.n_neighbors]
        classes_of_nearest_neighbors = self.y_train[indices_of_nearest_neighbors]
        distances_of_nearest_neighbors = distances[
            np.arange(distances.shape[0]).reshape(-1, 1), indices_of_nearest_neighbors
        ]

        delta = 1e-10
        distances_of_nearest_neighbors += delta

        return self.__plurality_voting(
            classes_of_nearest_neighbors,
            distances_of_nearest_neighbors,
        )

    def score(
        self,
        X_test: npt.NDArray[np.float64],
        y_test: npt.NDArray[np.int64],
    ) -> float:
        y_pred = self.predict(X_test)
        is_correct = y_test == y_pred

        accuracy_score = is_correct.sum() / is_correct.size
        return accuracy_score.item()

In [63]:
# Hyper-parameters shared by both classifiers compared in this notebook.
params = dict(
    n_neighbors=3,
    weights="distance",
    p=1,
    metric="cosine",
)

In [64]:
# Build the hand-written classifier from the shared hyper-parameters and store the
# training split on it; fit() returns the instance, which the cell displays.
clf = KNeighborsClassifier(**params)
clf.fit(X_train, y_train)

<__main__.KNeighborsClassifier at 0x7091ab70b620>

In [65]:
# Accuracy of the hand-written classifier on the held-out split.
clf.score(X_test, y_test)

0.98

In [66]:
from sklearn.neighbors import KNeighborsClassifier as KNN

In [67]:
# Reference model: scikit-learn's KNeighborsClassifier (imported as KNN) with the
# same hyper-parameters, fitted on the same training split.
clf2 = KNN(**params)
clf2.fit(X_train, y_train)

In [68]:
# Score of the scikit-learn reference model on the same held-out split.
clf2.score(X_test, y_test)

0.98

In [69]:
%%timeit
# Time one full scoring pass (predict + accuracy) of the hand-written classifier.
clf.score(X_test, y_test)

215 μs ± 932 ns per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [70]:
%%timeit
# Time the same scoring pass for the scikit-learn reference model.
clf2.score(X_test, y_test)

480 μs ± 3.47 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
