In [1]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

In [2]:
import pandas as pd
%store -r dataset_scaled_2_pca
%store -r dataset_scaled_3_pca

dataset_scaled_2_pca = dataset_scaled_2_pca
dataset_scaled_3_pca = dataset_scaled_3_pca

# KNN Clustering with PCA
## Naive KNN for reference below

In [9]:
def train_test(dataset):
    X_train, X_test, y_train, y_test = train_test_split(
        dataset.iloc[:,:dataset.shape[1]-1],
        dataset.iloc[:,-1],
        test_size=0.2,
        stratify=dataset.iloc[:,-1],
        random_state=42)

    return X_train, X_test, y_train, y_test

def classification_reporting(tuning, model, y_test, y_pred, X_train):
    report = classification_report(y_test, y_pred, output_dict = True)
    print('{} {} performance using {} features:\n'.format(tuning, model, X_train.shape[1]),
          report['macro avg'])

In [10]:
def knn_clustering(dataset):
    X_train, X_test, y_train, y_test = train_test(dataset)

    model = KNeighborsClassifier()
    model.fit(X_train, y_train)
    y_pred= model.predict(X_test)

    classification_reporting('Naive', 'KNN', y_test, y_pred, X_train)

In [11]:
knn_clustering(dataset_scaled_2_pca)
knn_clustering(dataset_scaled_3_pca)

Naive KNN performance using 2 features:
 {'precision': 0.5364493519603178, 'recall': 0.53176913577543, 'f1-score': 0.5314632132591448, 'support': 2000}
Naive KNN performance using 3 features:
 {'precision': 0.5986700067182135, 'recall': 0.5927594711317699, 'f1-score': 0.5944997294049733, 'support': 2000}


## Hyperparameter Tuning

In [15]:
X_train2, X_test2, y_train2, y_test2 = train_test(dataset_scaled_2_pca)
X_train3, X_test3, y_train3, y_test3 = train_test(dataset_scaled_3_pca)

In [24]:
def knn_hyp_tuning(X_train, X_test, y_train, y_test):

    n_neighbors = [int(x) for x in np.linspace(start = 1, stop = 100, num = 50)]
    weights = ['uniform','distance']
    metric = ['euclidean','manhattan','chebyshev', 'minkowski']

    random_grid = {
        'n_neighbors': n_neighbors,
        'weights': weights,
        'metric': metric}

    knn = KNeighborsClassifier()
    knn_random = RandomizedSearchCV(estimator = knn,
                                    random_state = 42,
                                    n_jobs = -1,
                                    param_distributions = random_grid,
                                    n_iter = 100,
                                    cv=5,
                                    verbose = 0)

    knn_random.fit(X_train, y_train)
    knn_random.best_params_
    y_pred = knn_random.predict(X_test)

    classification_reporting('Tuned', 'KNN', y_test, y_pred, X_train)

In [27]:
knn_hyp_tuning(X_train2, X_test2, y_train2, y_test2)
knn_hyp_tuning(X_train3, X_test3, y_train3, y_test3)

Tuned KNN performance using 2 features:
 {'precision': 0.5934402742884913, 'recall': 0.591770534323274, 'f1-score': 0.5914655926539869, 'support': 2000}
Tuned KNN performance using 3 features:
 {'precision': 0.6282273869371974, 'recall': 0.6266104955441463, 'f1-score': 0.6262741481813052, 'support': 2000}


# Hierarchical Clustering