In [30]:
# Import packages and data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the iris dataset into a DataFrame
from sklearn.datasets import load_iris
iris = load_iris()
# Feature matrix with named columns, plus the class labels as a trailing 'target' column
df = pd.DataFrame(data=iris.data, columns=iris.feature_names).assign(target=iris.target)
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [31]:
# === 1. Preprocess data ===

# Features & target
# Every column except the label is a predictor; the label column is kept separately
X = df.drop(columns="target")
y = df.loc[:, "target"]


# Train/Test split
from sklearn.model_selection import train_test_split
# Hold out a stratified test partition; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,         # keep the class proportions of y in both partitions
    shuffle=True,       # shuffle rows before splitting
    random_state=412,   # seed for the shuffle
    test_size=0.2,      # fraction of rows held out for testing
)

In [None]:
# === 2. Model training, hyperparameter tuning, and evaluation ===

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Find the best KNN hyperparameters (k, weighting scheme, distance metric) using cross-validation
def find_best_knn(X_train, y_train, X_test, y_test,
                  cv=5, plot=False, param_grid=None):
    """
    Tune a scaled k-nearest-neighbours classifier with a cross-validated
    grid search, then report its accuracy on a held-out test set.

    A ``StandardScaler`` -> ``KNeighborsClassifier`` pipeline is handed to
    ``GridSearchCV``, which is fitted on the training data only. The test
    data is used solely for the final prediction and the printed metrics.

    Parameters
    ----------
    X_train, y_train
        Training features and labels passed to ``GridSearchCV.fit``.
    X_test, y_test
        Held-out features and labels used for the final evaluation.
    cv : default 5
        Forwarded unchanged to ``GridSearchCV(cv=...)``.
    plot : bool, default False
        When true, additionally prints the confusion matrix and the
        classification report. Despite the name, no figure is drawn;
        the output is text only.
    param_grid : dict or list of dict, optional
        Search grid forwarded to ``GridSearchCV``; keys must address the
        pipeline steps (``'scaler__...'`` / ``'knn__...'``). When omitted,
        the grid covers ``n_neighbors`` 1-20, ``uniform``/``distance``
        weights and ``euclidean``/``manhattan`` metrics.

    Returns
    -------
    tuple
        ``(best_params, best_model)``: the winning parameter dict and the
        corresponding pipeline (``grid.best_estimator_``).

    Side effects
    ------------
    Always prints the test accuracy to stdout.
    """

    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('knn', KNeighborsClassifier())
    ])

    # Fall back to the built-in grid only when the caller did not supply one.
    if param_grid is None:
        param_grid = {
            'knn__n_neighbors': range(1, 21),
            'knn__weights': ['uniform', 'distance'],
            'knn__metric': ['euclidean', 'manhattan']
        }

    grid = GridSearchCV(
        pipeline,
        param_grid=param_grid,
        cv=cv,
        scoring="accuracy"
    )
    grid.fit(X_train, y_train)

    best_model = grid.best_estimator_

    # The test set is touched only here, after model selection has finished.
    y_pred_test = best_model.predict(X_test)
    print(f"Test Accuracy: {accuracy_score(y_test, y_pred_test)}\n")

    if plot:
        cm = confusion_matrix(y_test, y_pred_test)
        print("Confusion Matrix:\n", cm)
        print("\nClassification Report:\n", classification_report(y_test, y_pred_test))

    return grid.best_params_, best_model


In [None]:
# === 3. Run evaluation ===

# Tune on the training split, then print the full text report for the test split.
# Left as the cell's last expression so the (best_params, best_model) tuple is displayed.
find_best_knn(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    cv=5,
    plot=True,
)

Test Accuracy: 0.9666666666666667

Confusion Matrix:
 [[10  0  0]
 [ 0  9  1]
 [ 0  0 10]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      0.90      0.95        10
           2       0.91      1.00      0.95        10

    accuracy                           0.97        30
   macro avg       0.97      0.97      0.97        30
weighted avg       0.97      0.97      0.97        30



({'knn__metric': 'euclidean',
  'knn__n_neighbors': 13,
  'knn__weights': 'distance'},
 Pipeline(steps=[('scaler', StandardScaler()),
                 ('knn',
                  KNeighborsClassifier(metric='euclidean', n_neighbors=13,
                                       weights='distance'))]))