In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import sklearn.metrics as metrics
from sklearn.neighbors import KNeighborsRegressor

In [None]:
# Load the pre-split feature/target CSV files, in the same order as the
# names on the left-hand side (train features, test features, train targets,
# test targets).
X_train, X_test, y_train, y_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "sets/X_train.csv",
        "sets/X_test.csv",
        "sets/y_train.csv",
        "sets/y_test.csv",
    )
)

### KNN Regression

In order to find the best value for $k$, we will train a separate model, and make predictions with it, for each
$$
    k = 1, 2, \ldots, 15.
$$
For each model we will then score the predictions and choose the $k$ with the best score.

In [None]:
# Train one scikit-learn KNN regressor per candidate number of neighbors.
# The fitted models are stored in a dict keyed by their 'k' value.
knn_models = {}

# We will analyze models with 'k' ranging from 1 to max_k (inclusive)
max_k = 15

for k in range(1, max_k + 1):
    print(f"Training KNN model with n_neighbors = {k}")
    knn_models[k] = KNeighborsRegressor(n_neighbors=k)
    knn_models[k].fit(X_train, y_train)


After the training is complete we will test all the $15$ models, and score their predictions using the *Root Mean Squared Error* (**RMSE**) and the *R2* scores.

The best model will be the one with the *lowest* **RMSE** score and the *highest* **R2** score.

In [None]:
# Predictions and scores of every trained model, keyed by its 'k' value
y_predictions = {}
RMSE_knn = {}
R2_knn = {}

for k, model in knn_models.items():
    print(f"Making predictions for KNN model with n_neighbors = {k}")
    y_predictions[k] = model.predict(X_test)

    # Calculate the scores for the predictions.
    # RMSE is computed as the square root of each target's MSE, averaged over
    # the targets. This is what `mean_squared_error(..., squared=False)` used
    # to return, but it avoids the `squared` keyword, which scikit-learn
    # deprecated in 1.4 and scheduled for removal in 1.6.
    mse_per_target = metrics.mean_squared_error(
        y_test, y_predictions[k], multioutput="raw_values"
    )
    RMSE_knn[k] = (mse_per_target ** 0.5).mean()
    R2_knn[k] = metrics.r2_score(y_test, y_predictions[k])

# The best model is the one with the lowest RMSE
best_k = min(RMSE_knn, key=RMSE_knn.get)
print(f"Best value for 'k' {best_k}")

The best model is the one with $k = 5$.

Let's plot the scores calculated in the previous step for each model, showing the corresponding $k$ value.

In [None]:
# Two side-by-side panels: RMSE on the left, R2 on the right,
# both plotted against the number of neighbors.
figure, axis = plt.subplots(nrows=1, ncols=2, figsize=(15, 4))

KNN_y_label = "Number of neighbors (k)"  # used as the x-axis label of both panels
KNN_X = RMSE_knn.keys()

score_panels = zip(axis, (RMSE_knn, R2_knn), ("RMSE Score", "R2 Score"))
for panel, scores, score_label in score_panels:
    panel.plot(KNN_X, scores.values())
    panel.set_xlabel(KNN_y_label)
    panel.set_ylabel(score_label)

plt.show()

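We note that the *minimum* of the **RMSE** plot occurs at the same $k$ as the *maximum* of the **R2** plot.
This is because, on a fixed test set, $R^2 = 1 - \mathrm{RMSE}^2 / \sigma_y^2$ (where $\sigma_y^2$ is the variance of the test targets), so **R2** is a strictly decreasing function of **RMSE**: the lower the error, the higher the **R2** score.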

The analysis was conducted with $k=1,\ldots,15$.
Intuitively, analysing models with $k$ larger than $15$ would be useless, since they would just keep *underfitting* the data and therefore obtain worse scores.

In [None]:

# Report both scores of the best model, one per line
print(
    f"RMSE: {RMSE_knn[best_k]}",
    f"R2: {R2_knn[best_k]}",
    sep="\n",
)