In [4]:
from ipynb.fs.defs.MLCP_EDA_1 import data_importing, replace_numbers_with_letters

from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
import numpy as np

In [2]:
import pandas as pd
# Restore the PCA-reduced datasets persisted with IPython's %store magic
# (presumably saved by an earlier notebook of this project -- TODO confirm which one).
%store -r dataset_scaled_2_pca
%store -r dataset_scaled_3_pca

# PCA-reduced datasets with 2 and 3 feature columns respectively.
# NOTE(review): these self-assignments leave the values unchanged; presumably
# they only serve to fail early with a NameError when the restore above did
# not define the names.
dataset_scaled_2_pca = dataset_scaled_2_pca
dataset_scaled_3_pca = dataset_scaled_3_pca

# KNN Classification with the original dataset
## Naive KNN

In [3]:
# Load the raw data through the helper imported from the EDA notebook
# (presumably verbose=False suppresses its diagnostic output -- TODO confirm).
data = data_importing(verbose=False)

# Build a single table by placing data[1] and the converted data[-1] side by
# side; the printed sample below shows the feature columns followed by 'label'.
dataset = pd.concat(
    objs=[data[1], replace_numbers_with_letters(data[-1], mode='label')],
    axis='columns',
)

# Show the first record transposed, so each column is printed on its own line
print('Sample dataset:\n', dataset.head(n=1).transpose())

Sample dataset:
                           0
j_zlogz           -3.488422
j_c1_b0_mmdt       0.471864
j_c1_b1_mmdt       0.058537
j_c1_b2_mmdt       0.014659
j_c2_b1_mmdt       0.109481
j_c2_b2_mmdt       0.025472
j_d2_b1_mmdt       1.870292
j_d2_b2_mmdt       1.737580
j_d2_a1_b1_mmdt    1.870292
j_d2_a1_b2_mmdt    0.211365
j_m2_b1_mmdt       0.089198
j_m2_b2_mmdt       0.032309
j_n2_b1_mmdt       0.331585
j_n2_b2_mmdt       0.155772
j_mass_mmdt      141.861038
j_multiplicity    85.000000
label              0.000000


In [4]:
# Define train/test dataset splitting
def train_test(dataset, test_size=0.2, random_state=42):
    """Split a dataset into stratified train and test partitions.

    The last column of ``dataset`` is used as the target and every other
    column as a feature. The split is stratified on the target, so both
    partitions keep the class proportions of the full dataset.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Table whose last column holds the class label.
    test_size : float or int, default=0.2
        Size of the test partition, forwarded to ``train_test_split``.
    random_state : int, default=42
        Seed forwarded to ``train_test_split`` so the split is reproducible.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    features = dataset.iloc[:, :-1]   # everything except the last column
    target = dataset.iloc[:, -1]      # last column = label

    X_train, X_test, y_train, y_test = train_test_split(
        features,
        target,
        test_size=test_size,
        stratify=target,
        random_state=random_state)

    return X_train, X_test, y_train, y_test

# Benchmark printer: the "macro avg" precision, recall and f1 are the reference figures
def classification_reporting(tuning, model, y_test, y_pred, X_train):
    """Print the macro-averaged metrics of a set of predictions.

    Parameters
    ----------
    tuning : str
        Label describing the tuning stage (first word of the printed header).
    model : str
        Label naming the model (second word of the printed header).
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted labels.
    X_train : array-like with a ``shape`` attribute
        Only ``X_train.shape[1]`` is read, to state the feature count.

    Returns
    -------
    None
        The header and the 'macro avg' entry of the report are printed.
    """
    macro_avg = classification_report(y_test, y_pred, output_dict=True)['macro avg']
    n_features = X_train.shape[1]
    header = '{} {} performance using {} features:\n'.format(tuning, model, n_features)
    print(header, macro_avg)

In [5]:
# Baseline ("Naive") KNN: default hyperparameters, no tuning
def knn_clustering(X_train, X_test, y_train, y_test):
    """Fit a default ``KNeighborsClassifier`` and print its test performance.

    The classifier is trained on ``(X_train, y_train)``, used to predict
    ``X_test``, and the predictions are handed to ``classification_reporting``
    under the labels 'Naive' and 'KNN'. Nothing is returned.
    """
    classifier = KNeighborsClassifier().fit(X_train, y_train)
    predictions = classifier.predict(X_test)

    classification_reporting('Naive', 'KNN', y_test, predictions, X_train)

In [6]:
# Split the full dataset (16 features plus the label column) into stratified train/test sets
X_train_full, X_test_full, y_train_full, y_test_full = train_test(dataset)

In [7]:
# Evaluate the Naive (default-hyperparameter) KNN on the full feature set;
# the printed macro-average metrics are the baseline for the tuned model.
knn_clustering(X_train_full, X_test_full, y_train_full, y_test_full)

Naive KNN performance using 16 features:
 {'precision': 0.6969268006940664, 'recall': 0.688817863291366, 'f1-score': 0.6909659932950201, 'support': 2000}


## KNN Hyperparameter Tuning

In [8]:
# Finding better hyperparameters
def knn_hyp_tuning(X_train, X_test, y_train, y_test, n_iter=100, cv=5, random_state=42):
    """Tune a KNN classifier with a randomized search and print its test performance.

    A ``RandomizedSearchCV`` samples ``n_iter`` hyperparameter combinations,
    cross-validates each one on the training data, and the best estimator
    (refit by the search) predicts ``X_test``. The selected hyperparameters
    and the macro-average metrics are printed.

    Parameters
    ----------
    X_train, y_train : array-like
        Training features and labels used by the cross-validated search.
    X_test, y_test : array-like
        Held-out features and labels used for the final report.
    n_iter : int, default=100
        Number of hyperparameter combinations sampled by the search.
    cv : int, default=5
        Number of cross-validation folds.
    random_state : int, default=42
        Seed of the random sampling, for reproducible searches.

    Returns
    -------
    None
    """
    # 50 evenly spaced values between 1 and 100, truncated to integers
    n_neighbors = [int(x) for x in np.linspace(start=1, stop=100, num=50)]
    weights = ['uniform', 'distance']
    # NOTE: with KNeighborsClassifier's default p=2, 'minkowski' is the same
    # distance as 'euclidean'; both are kept so the sampled candidates (and
    # therefore the results for a given seed) stay unchanged.
    metric = ['euclidean', 'manhattan', 'chebyshev', 'minkowski']

    # The random_grid contains all the possible values of the parameters listed above
    random_grid = {
        'n_neighbors': n_neighbors,
        'weights': weights,
        'metric': metric}

    # RandomizedSearchCV cross-validates the model on random combinations of
    # hyperparameters. No `scoring` is given, so candidates are ranked with the
    # estimator's own score (accuracy for classifiers), not the macro-average
    # metrics printed at the end.
    knn_random = RandomizedSearchCV(estimator=KNeighborsClassifier(),
                                    random_state=random_state,
                                    n_jobs=-1,
                                    param_distributions=random_grid,
                                    n_iter=n_iter,
                                    cv=cv,
                                    verbose=0)

    knn_random.fit(X_train, y_train)

    # A bare `knn_random.best_params_` expression displays nothing inside a
    # function, so the selected hyperparameters are printed explicitly.
    print('Best hyperparameters:', knn_random.best_params_)

    y_pred = knn_random.predict(X_test)

    # Print the performances
    classification_reporting('Tuned', 'KNN', y_test, y_pred, X_train)

In [9]:
# Tune KNN on the full feature set and print its held-out macro-average metrics
knn_hyp_tuning(X_train_full, X_test_full, y_train_full, y_test_full)

Tuned KNN performance using 16 features:
 {'precision': 0.7221803123366086, 'recall': 0.7114613379414695, 'f1-score': 0.7139247216652634, 'support': 2000}


# KNN Classification with PCA-reduced datasets
## Naive KNN

In [10]:
# Split the 2- and 3-feature PCA datasets with the same helper used for the full dataset.
# train_test takes the last column as the label -- assumes the PCA tables keep
# the label in their last column (TODO confirm where they are built).
X_train2, X_test2, y_train2, y_test2 = train_test(dataset_scaled_2_pca)
X_train3, X_test3, y_train3, y_test3 = train_test(dataset_scaled_3_pca)

In [11]:
# Naive (default-hyperparameter) KNN on the 2-feature and 3-feature PCA datasets
knn_clustering(X_train2, X_test2, y_train2, y_test2)
knn_clustering(X_train3, X_test3, y_train3, y_test3)

Naive KNN performance using 2 features:
 {'precision': 0.5364493519603178, 'recall': 0.53176913577543, 'f1-score': 0.5314632132591448, 'support': 2000}
Naive KNN performance using 3 features:
 {'precision': 0.5986700067182135, 'recall': 0.5927594711317699, 'f1-score': 0.5944997294049733, 'support': 2000}


## KNN Hyperparameter Tuning

In [12]:
# Randomized hyperparameter search on the 2-feature and 3-feature PCA datasets
knn_hyp_tuning(X_train2, X_test2, y_train2, y_test2)
knn_hyp_tuning(X_train3, X_test3, y_train3, y_test3)

Tuned KNN performance using 2 features:
 {'precision': 0.5934402742884913, 'recall': 0.591770534323274, 'f1-score': 0.5914655926539869, 'support': 2000}
Tuned KNN performance using 3 features:
 {'precision': 0.6282273869371974, 'recall': 0.6266104955441463, 'f1-score': 0.6262741481813052, 'support': 2000}


# Hierarchical Clustering