# KNN+CV

In [4]:
from sklearn.neighbors import NearestNeighbors
import numpy as np
import pandas as pd
# Load the homework dataset: one row per point with an ID, two numeric
# features (x1, x2) and a class label y.
df = pd.read_csv(filepath_or_buffer="HW3_data/cv+knn.csv")

# The table is only a handful of rows, so show it in full.
print(df)

   ID    x1   x2      y
0   1 -2.00  4.0   star
1   2 -6.48  5.0  spade
2   3  0.93 -2.0   star
3   4  0.20  2.0  spade
4   5  1.69  2.0   star
5   6 -5.85  4.0   star
6   7  3.00  0.0  spade
7   8 -0.36  8.0  spade
8   9 -1.68  3.0  spade
9  10  0.00  0.0   star


In [5]:
# Feature matrix (one row per point, columns x1 and x2) and label vector.
# `.to_numpy()` is used instead of `.values` because it always yields a plain
# NumPy array; `.values` can return a pandas ExtensionArray for extension
# dtypes (e.g. string columns), which the later indexing/comparison code
# should not have to care about.
X = df[['x1', 'x2']].to_numpy()
y = df['y'].to_numpy()

# (a) What is the leave-one-out cross-validation error of 1NN on this dataset?

In [17]:
from sklearn.model_selection import LeaveOneOut
from sklearn.neighbors import KNeighborsClassifier

# Leave-one-out CV for 1NN: every point is held out once and classified by
# its single nearest neighbor among the remaining points.
loo = LeaveOneOut()
errors = 0

for train_index, test_index in loo.split(X):
    # Under leave-one-out, test_index selects exactly one row.
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]

    knn = KNeighborsClassifier(n_neighbors=1)
    y_prediction = knn.fit(X_train, y_train).predict(X_test)

    # Count a miss when the held-out point's predicted label is wrong.
    errors += int(y_prediction[0] != y_test[0])

# Error rate = misclassified hold-outs / number of hold-outs (one per sample).
loo_error_rate = errors / X.shape[0]
print(f"LOOCV 1NN Error: {loo_error_rate:.3f}")
#https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html

LOOCV 1NN Error: 0.900


# (b) What are the 3 nearest neighbors of data points 2 and 8, respectively?

In [20]:
# Dataset IDs as an array, used to translate between an ID and its row position.
ids = df['ID'].values

# Ask for 4 neighbors: the model is queried with its own training points, so
# each point's result row also contains the point itself at distance 0,
# leaving 3 genuine neighbors.
nbrs = NearestNeighbors(n_neighbors=4, algorithm='ball_tree', metric='euclidean')
nbrs.fit(X)
distance, indices = nbrs.kneighbors(X)

#https://scikit-learn.org/stable/modules/neighbors.html

def three_nbrs_for_id(target_id):
    """Return the 3 nearest neighbors of the data point whose ID is `target_id`.

    Reads the module-level arrays `ids`, `indices`, `distance` and `y`.
    `indices` / `distance` are expected to hold, for each row position, the
    neighbor positions and distances obtained by querying the fitted model
    with the training points themselves (so a row normally also contains the
    query point).

    Parameters
    ----------
    target_id : value comparable with the entries of `ids`
        ID of the point to look up (first match is used).

    Returns
    -------
    list of tuple
        Up to 3 `(neighbor_id, distance, label)` tuples, nearest first,
        never including the query point itself.

    Raises
    ------
    IndexError
        If no data point has the given ID.
    """
    matches = np.where(ids == target_id)[0]
    if matches.size == 0:
        raise IndexError(f"no data point with ID {target_id!r}")
    i = matches[0]

    # Drop the query point by index rather than by position: with duplicate
    # coordinates another point can tie at distance 0 and be listed first, so
    # blindly skipping column 0 would discard a real neighbor and report the
    # point as its own neighbor.
    not_self = indices[i] != i
    neighbors_indices = indices[i][not_self][:3]
    neighbor_distances = distance[i][not_self][:3]
    return list(zip(ids[neighbors_indices], neighbor_distances, y[neighbors_indices]))

# Look up the neighbors for the two IDs the question asks about.
nbrs_2, nbrs_8 = (three_nbrs_for_id(query_id) for query_id in (2, 8))

# Each entry is (neighbor ID, Euclidean distance, neighbor label), nearest first.
print("3 nearest neighbors for ID=2:", nbrs_2)
print("3 nearest neighbors for ID=8:", nbrs_8)

3 nearest neighbors for ID=2: [(np.int64(6), np.float64(1.1819052415485773), 'star'), (np.int64(1), np.float64(4.590250537824706), 'star'), (np.int64(9), np.float64(5.2), 'spade')]
3 nearest neighbors for ID=8: [(np.int64(1), np.float64(4.323147001895725), 'star'), (np.int64(9), np.float64(5.17130544446951), 'spade'), (np.int64(4), np.float64(6.026076667285275), 'spade')]


# (c) What is the 3-fold cross-validation error of 3NN on this dataset?
For the i-th fold (i = 1, 2, 3), the test set consists of all data points whose ID satisfies (ID mod 3 = i-1).

In [23]:
# Manual 3-fold CV for 3NN: fold i (i = 1, 2, 3) tests on the rows whose ID
# leaves remainder i - 1 when divided by 3 and trains on all other rows.
errors = 0
total = 0

for i in range(1, 4):
    test_mask = np.equal(ids % 3, i - 1)
    train_mask = np.logical_not(test_mask)

    X_train, y_train = X[train_mask], y[train_mask]
    X_test, y_test = X[test_mask], y[test_mask]

    knn3 = KNeighborsClassifier(n_neighbors=3, algorithm='ball_tree', metric='euclidean')
    y_pred = knn3.fit(X_train, y_train).predict(X_test)

    # Pool misclassifications and test-set sizes across folds so the final
    # rate is errors over all tested points (folds differ in size).
    fold_error = (y_pred != y_test).sum()
    errors += fold_error
    total += y_test.shape[0]

CrossValidation3fold_error_rate = errors / total
print(f"3-folded cross validation Error: {CrossValidation3fold_error_rate:.3f}")

3-folded cross validation Error: 0.700


# (d) Based on the results of (a) and (c), can we determine which is a better classifier, 1NN or 3NN? Why?

On this dataset, the leave-one-out cross-validation error of 1NN is 0.90, whereas the 3-fold cross-validation error of 3NN is 0.70, so 3NN achieves the lower error. Because 3NN takes a majority vote over three neighbors, it is less sensitive to noise and outliers and typically has lower variance, which can yield better generalization on small samples. However, we cannot conclude from (a) and (c) alone that 3NN is the better classifier: the two results come from different cross-validation protocols (different training-set sizes and different splits), so the comparison is not fair, and with only 10 points both error estimates are very noisy. For a fair assessment, we should evaluate both 1NN and 3NN under the same cross-validation scheme with identical folds.