# ML CUP - k-NN
In this notebook we study the application of k-nearest-neighbours (k-NN) regression to the ML CUP.
In particular we focus on the effect of the **hyperparameter k**.

In [None]:
import numpy as np
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score, KFold
from itertools import product
import matplotlib.pyplot as plt

In [None]:
# Font sizes (in points) shared by every figure in this notebook.
SMALL_SIZE = 16
MEDIUM_SIZE = 18
BIGGER_SIZE = 20

plt.rc('font', size=SMALL_SIZE)           # default text size
# Axes title and x/y labels both use the medium size; setting them in a
# single call avoids one setting silently overriding another.
plt.rc('axes', titlesize=MEDIUM_SIZE, labelsize=MEDIUM_SIZE)
plt.rc('xtick', labelsize=MEDIUM_SIZE)    # x tick labels
plt.rc('ytick', labelsize=MEDIUM_SIZE)    # y tick labels
plt.rc('legend', fontsize=MEDIUM_SIZE)    # legend entries
plt.rc('figure', titlesize=BIGGER_SIZE)   # figure-level title (suptitle)

In [None]:
def mean_euclidean_error(vectors1, vectors2):
    """
    Compute the mean Euclidean error (MEE) between two sets of 3D vectors.

    Parameters:
    - vectors1: array-like of shape (N, 3) representing the first set of 3D vectors
    - vectors2: array-like of shape (N, 3) representing the second set of 3D vectors

    Returns:
    - mean_error: Mean over the N rows of the Euclidean distance between
      corresponding vectors

    Raises:
    - ValueError: if the inputs are not 2-D, have different shapes, or do not
      have exactly 3 columns
    """
    # Accept any array-like (lists, tuples, ...) and not only NumPy arrays
    vectors1 = np.asarray(vectors1)
    vectors2 = np.asarray(vectors2)

    # The ndim check comes first so that 1-D inputs raise the documented
    # ValueError instead of an IndexError when reading shape[1]
    if (
        vectors1.ndim != 2
        or vectors1.shape != vectors2.shape
        or vectors1.shape[1] != 3
    ):
        raise ValueError("Input arrays must be of shape (N, 3)")

    # Row-wise Euclidean distance, then average over the rows
    euclidean_distance = np.linalg.norm(vectors1 - vectors2, axis=1)
    return np.mean(euclidean_distance)

In [None]:
def plot_mean_std(x,mee,std, label, color):
    """
    Plot mean Euclidean error (MEE) and standard deviation.

    Parameters:
    - x (array-like): An array-like object containing the values of the hyperparameter.
    - mee (array-like): An array-like object containing the mean Euclidean error for each hyperparameter value.
    - std (array-like): An array-like object containing the standard deviation of the Euclidean error for each hyperparameter value.
    - label (str): Label for the x-axis, typically representing the name of the hyperparameter.
    - color (str): Color code (name or hexadecimal) for the plot line and shaded area.

    The function plots a line for the MEE, shades the area between MEE ± standard deviation
    and shows the figure.
    """
    # Convert up front so that plain Python lists also support the
    # element-wise mee - std / mee + std arithmetic below
    mee = np.asarray(mee)
    std = np.asarray(std)

    plt.figure(figsize=(10, 9))
    # Raw string: "\p" is not a valid Python escape sequence, so the mathtext
    # "$\pm$" must not go through escape processing
    plt.plot(
        x,
        mee,
        label=r'MEE $\pm$ std. (results of k-fold cross validation)',
        color=color,
        linewidth=1,
    )
    plt.fill_between(x, mee - std, mee + std, color=color, alpha=0.3)

    plt.xlabel(f'{label} values')
    plt.ylabel('MEE')
    plt.legend()

    plt.show()


# Load the dataset

In [None]:
from sklearn.model_selection import train_test_split

# Read the comma-separated ML-CUP training file into one numeric array
dataset = np.loadtxt('ML-CUP23-TR.csv', delimiter=',')

# Columns 1-10 are the inputs (X) and columns 11-13 the targets (y);
# column 0 is not used
X, y = dataset[:, 1:11], dataset[:, 11:14]

# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Wrap MEE as a scikit-learn scorer. A lower MEE is better, so
# greater_is_better=False is passed and the scorer reports the negated error.
from sklearn.metrics import make_scorer

mean_euclidean_scorer = make_scorer(
    mean_euclidean_error,
    greater_is_better=False,
)

def perform_grid_search_kfold(ks,weightss,ps, k_folds, X, y):
    """
    Perform grid search to find the best hyperparameters for K-Nearest Neighbors Regressor
    using K-Fold cross-validation based on Mean Euclidean Error (MEE).

    Iterate over all combinations of the specified hyperparameters, evaluate each one
    with K-Fold cross-validation, and keep the combination with the lowest mean MEE
    (the first one encountered wins in case of ties).

    Parameters:
    - ks (iterable of int): Values to try for the 'n_neighbors' hyperparameter in KNN.
    - weightss (iterable of str): Weight options (e.g., 'uniform', 'distance') to evaluate in KNN.
    - ps (iterable of int): Values to try for the 'p' hyperparameter in KNN, the power parameter of the Minkowski metric.
    - k_folds (int): The number of folds to use in K-Fold cross-validation.
    - X (array-like): The feature dataset used for cross-validation and for the final fit.
    - y (array-like): The target dataset used for cross-validation and for the final fit.

    Returns:
    - final_model (KNeighborsRegressor): KNN regressor with the best hyperparameters, fitted on X and y.

    Raises:
    - ValueError: if no hyperparameter combination could be selected (e.g. an empty grid).

    The function prints the MEE for each combination of hyperparameters during the search
    and, at the end, the best combination together with its MEE.
    """
    # Best hyperparameters found so far and their cross-validated MEE
    best_k = None
    best_weight = None
    best_p = None
    best_score = float('inf')

    # Shuffled K-Fold with a fixed seed so every combination sees the same folds
    kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)

    for k, weights, p in product(ks, weightss, ps):
        model = KNeighborsRegressor(n_neighbors=k, weights=weights, p=p)
        # mean_euclidean_scorer is built with greater_is_better=False, so the
        # fold scores are negated MEEs; flip the sign to get the MEE back
        scores = cross_val_score(model, X, y, cv=kf, scoring=mean_euclidean_scorer)
        mean_score = -scores.mean()
        print(f"k={k}, w={weights}, p={p}, Mean Euclidean Error: {mean_score}")

        if mean_score < best_score:
            best_k = k
            best_weight = weights
            best_p = p
            best_score = mean_score

    # Fail clearly instead of hitting a NameError / invalid estimator below
    if best_k is None:
        raise ValueError("No hyperparameter combination could be selected: the grid is empty or every score is NaN")

    print(f"The best k is {best_k} {best_weight} {best_p}  with a MEE of {best_score}")

    # Train the final model on the data passed in by the caller, not on globals
    final_model = KNeighborsRegressor(n_neighbors=best_k, weights=best_weight, p=best_p)
    final_model.fit(X, y)

    return final_model

# Perform the grid search

In [None]:
# Hyperparameter grid: number of neighbours, weighting scheme and Minkowski power
ks = range(1, 31)
weightss = ['uniform', 'distance']
ps = range(1, 10)

# 3-fold cross-validated search, run on the training split only
perform_grid_search_kfold(
    ks=ks,
    weightss=weightss,
    ps=ps,
    k_folds=3,
    X=X_train,
    y=y_train,
)

# Study the effect of K
while keeping the other hyperparameters fixed (weights='distance', p=1)

In [None]:
# Cross-validated MEE (mean and standard deviation) for every k tried
mees_mean = []
mees_sd = []
# Best k and its corresponding score
best_k = None
best_score = float('inf')

k_values = range(1, 31)

# K-Fold cross-validation (shuffled, fixed seed so all k values see the same folds)
kf = KFold(n_splits=3, shuffle=True, random_state=42)

# Trying different k values with weights='distance' and p=1 kept fixed
for k in k_values:
    model = KNeighborsRegressor(n_neighbors=k, weights='distance',p = 1)
    # mean_euclidean_scorer is built with greater_is_better=False, so the
    # fold scores are negated MEEs
    scores = cross_val_score(model, X_train, y_train, cv=kf, scoring=mean_euclidean_scorer)
    mean_score = -scores.mean()  # Convert back to a positive MEE
    score_sd = scores.std()      # The spread is unaffected by the sign flip
    mees_mean.append(mean_score)
    mees_sd.append(score_sd)
    print(f"k={k} Mean Euclidean Error: {mean_score}")

    if mean_score < best_score:
        best_k = k
        best_score = mean_score
        best_score_sd = score_sd

print(f"The best k is {best_k}  with a MEE of {best_score:.4} +- {best_score_sd:.4}")

# Train the final model with the best k on the whole training split
final_model = KNeighborsRegressor(n_neighbors=best_k, weights='distance',p = 1)
final_model.fit(X_train, y_train)

# MEE as a function of k, with a +/- one standard deviation band
plot_mean_std(k_values,np.array(mees_mean),np.array(mees_sd),label='k', color='purple')