# AI300
### kNN

In [None]:
from typing import Literal
import numpy as np

class MyKNN():
  """Brute-force k-nearest-neighbours model for classification or regression.

  The training data is simply memorised by `fit`. `predict` computes the
  distance from every test point to every training point, keeps the
  `num_neighbors` closest training points, and aggregates their targets
  (majority vote or mean).
  """

  def __init__(self, num_neighbors, norm=2):
    """Store the hyper-parameters.

    Args:
      num_neighbors: Number of nearest training points used per prediction.
      norm: Order of the vector norm used as the distance; forwarded as
        `ord` to `np.linalg.norm` (2 = Euclidean, 1 = Manhattan, ...).
    """
    self.num_neighbors = num_neighbors
    self.norm = norm
    self.X_train = None
    self.y_train = None

  def fit(self, X_train, y_train):
    """Memorise the training set.

    Args:
      X_train: Array-like of shape (n_train, n_features).
      y_train: Array-like of targets whose first axis has length n_train.
    """
    # Coerce to ndarrays so that fancy indexing in `predict` also works when
    # plain Python lists are passed in.
    self.X_train = np.asarray(X_train)
    self.y_train = np.asarray(y_train)

  def predict(self, X_test,
              mode: Literal["classification", "regression"] = "classification"):
    """Predict targets for `X_test` from the stored training data.

    Args:
      X_test: Array-like of shape (n_test, n_features).
      mode: "classification" returns the most frequent label among the
        nearest neighbours (ties go to the smallest label); "regression"
        returns the mean of the neighbours' targets.

    Returns:
      Array of predictions with one entry per row of `X_test`.

    Raises:
      ValueError: If `mode` is not "classification" or "regression".
      RuntimeError: If called before `fit`.
    """
    # Validate cheap preconditions before doing the O(n_test * n_train) work.
    if mode not in ("classification", "regression"):
      raise ValueError(f"mode must be one of ['classification', 'regression']. "
                       f"Received: {mode}")
    if self.X_train is None or self.y_train is None:
      raise RuntimeError("fit must be called before predict.")

    X_test = np.asarray(X_test)

    # Broadcast (n_test, 1, d) against (n_train, d) -> (n_test, n_train, d),
    # then reduce the feature axis to get an (n_test, n_train) distance matrix.
    # Uses the data stored by `fit`, not whatever `X_train` exists globally.
    distance_matrix = np.linalg.norm(X_test[:, None, :] - self.X_train,
                                     ord=self.norm, axis=-1)
    nearest_k_indices = np.argsort(distance_matrix, axis=1)[:, :self.num_neighbors]

    if mode == "regression":
      return np.mean(self.y_train[nearest_k_indices], axis=1)

    # Classification: vote on integer label codes rather than on the raw
    # labels, so that np.bincount works for any label type (negative ints,
    # floats, strings). np.unique returns sorted classes, hence argmax keeps
    # the "smallest label wins a tie" behaviour of bincount on raw labels.
    classes, codes = np.unique(self.y_train, return_inverse=True)
    codes = codes.reshape(self.y_train.shape)
    nearest_k_codes = codes[nearest_k_indices]
    winning_codes = np.apply_along_axis(
        lambda x: np.argmax(np.bincount(x)),
        axis=1, arr=nearest_k_codes
    )
    return classes[winning_codes]

In [None]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Sanity check on a classification task: compare MyKNN with scikit-learn's
# KNeighborsClassifier on the iris dataset (70% train / 30% test split).
iris = load_iris()
X = iris["data"]
y = iris["target"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, random_state=42
)

# Reference predictions from scikit-learn.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

# Predictions from the hand-written model with the same k.
my_knn = MyKNN(num_neighbors=5)
my_knn.fit(X_train, y_train)
my_y_pred = my_knn.predict(X_test)

# Element-wise agreement between the two label vectors.
diffs = my_y_pred == y_pred
print(diffs)  # all True means both models predict the same labels

[ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True]


In [None]:
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor

# Sanity check on a regression task: compare MyKNN with scikit-learn's
# KNeighborsRegressor on the diabetes dataset (70% train / 30% test split).
diabetes = load_diabetes()
X, y = diabetes["data"], diabetes["target"]
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=42)

my_knn = MyKNN(num_neighbors=5)
my_knn.fit(X_train, y_train)
my_y_pred = my_knn.predict(X_test, mode="regression")

knn = KNeighborsRegressor(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

# The predictions are floating-point means, so compare with a tolerance:
# exact `==` can report False for values that differ only by rounding.
diffs = np.isclose(my_y_pred, y_pred)
print(diffs) # All true so my predictions and Sklearn predictions are same

[ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True]


I'm not completely sure how to work out questions #4 and #5, so I was wondering how I am supposed to go about solving them. Would it be possible to receive some explanation of these questions, or topics to review? Thanks!

In this problem, we have to find an expression for $y^{(n)}$ as $N \rightarrow \infty$. Because $f(x)>0$ for $x\in [0,1]$, we can compute the PDF, which would be of the form $\int_0^{x_1} x f(x) dx$, where $x_1$ is the threshold that tells us what percent of the $N$ data points to use in the kNN model. We can plug in known values to get $y^{(n)}=\int_0^{x_1}g(x)\frac{f(x)}{\epsilon}dx = \frac{1}{\epsilon}\int_0^{x_1}g(x)f(x)dx$. So our formula as $N\rightarrow\infty$ may look something like this.

Part two, I think, would be looking for the error, or the distance between the predicted value and the ground truth value. So it would be $g(0)-\frac{1}{\epsilon}\int_0^{x_1}g(x)f(x)dx$ using the expression from part 1.