In [1]:
#!pip install git+https://github.com/hyperopt/hyperopt-sklearn

from hpsklearn import HyperoptEstimator, any_classifier, svc
from hyperopt import tpe

from ipynb.fs.defs.functions import data_importing, replace_numbers_with_letters, train_test, classification_reporting

from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report, accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

import pandas as pd
import numpy as np

In [2]:
%store -r dataset_scaled_2_pca
%store -r dataset_scaled_3_pca

# Import dataset PCA'd featuring 2 and 3 columns
dataset_scaled_2_pca = dataset_scaled_2_pca
dataset_scaled_3_pca = dataset_scaled_3_pca

# KNN Classification with the original dataset
## Naive KNN

In [3]:
# Load the data through the project helper with verbose output disabled.
# NOTE(review): `data` is indexed below as data[1] and data[-1]; presumably
# these are the feature table and the labels respectively — confirm against
# `data_importing`.
data = data_importing(verbose = False)
# Concatenate data[1] with the transformed data[-1] column-wise (axis = 1)
# to obtain a single table.
dataset = pd.concat([data[1], replace_numbers_with_letters(data[-1], mode = 'label')], axis = 1)

# Print the first row, transposed so that each column appears on its own line
print('Sample dataset:\n', dataset.head(1).T)

Sample dataset:
                           0
j_zlogz           -3.488422
j_c1_b0_mmdt       0.471864
j_c1_b1_mmdt       0.058537
j_c1_b2_mmdt       0.014659
j_c2_b1_mmdt       0.109481
j_c2_b2_mmdt       0.025472
j_d2_b1_mmdt       1.870292
j_d2_b2_mmdt       1.737580
j_d2_a1_b1_mmdt    1.870292
j_d2_a1_b2_mmdt    0.211365
j_m2_b1_mmdt       0.089198
j_m2_b2_mmdt       0.032309
j_n2_b1_mmdt       0.331585
j_n2_b2_mmdt       0.155772
j_mass_mmdt      141.861038
j_multiplicity    85.000000
label              0.000000


In [4]:
# Baseline ("naive") KNN classifier: library defaults, no tuning
def knn_classification(X_train, X_test, y_train, y_test):
    """Fit a default KNeighborsClassifier and report its test-set performance.

    Parameters
    ----------
    X_train, y_train
        Training features and labels used to fit the classifier.
    X_test, y_test
        Held-out features to predict on and the matching true labels.

    Returns
    -------
    None
        Results are handed to `classification_reporting` with the tags
        'Naive' and 'KNN'; nothing is returned to the caller.
    """
    baseline_knn = KNeighborsClassifier().fit(X_train, y_train)
    predictions = baseline_knn.predict(X_test)

    # X_train is forwarded to the reporter as well (presumably so it can
    # state the number of features used — confirm in classification_reporting).
    classification_reporting('Naive', 'KNN', y_test, predictions, X_train)

In [5]:
# Split the full dataset (16 features plus the label column) with the project's
# `train_test` helper, requesting scaling via `scaling = True`.
X_train_full, X_test_full, y_train_full, y_test_full = train_test(dataset, scaling = True)

In [6]:
# Baseline (untuned) KNN evaluated on the full feature set
knn_classification(
    X_train=X_train_full,
    X_test=X_test_full,
    y_train=y_train_full,
    y_test=y_test_full,
)

Naive KNN performance using 16 features:
 {'precision': 0.7002880677152326, 'recall': 0.6930851052345895, 'f1-score': 0.6948513026716446, 'support': 2000}
Accuracy score: 0.6935


## KNN Hyperparameter Tuning

In [7]:
# Finding better hyperparameters for KNN
def knn_hyp_tuning(X_train, X_test, y_train, y_test):
    """Tune a KNN classifier with a randomized cross-validated search and report it.

    A `RandomizedSearchCV` samples 100 hyperparameter combinations
    (5-fold cross-validation, fixed `random_state` of 42, all CPU cores) over
    the number of neighbours, the weighting scheme and the distance metric.
    The best parameters found are printed, then the search object predicts on
    the test set and the result is passed to `classification_reporting`.

    Parameters
    ----------
    X_train, y_train
        Training features and labels; the cross-validated search runs on these.
    X_test, y_test
        Held-out features to predict on and the matching true labels.

    Returns
    -------
    None
        The best parameters and the performance report are printed.
    """

    # 50 evenly spaced values between 1 and 100, truncated to integers
    n_neighbors = [int(x) for x in np.linspace(start = 1, stop = 100, num = 50)]
    weights = ['uniform','distance']
    # NOTE: with KNeighborsClassifier's default p=2, 'minkowski' is equivalent
    # to 'euclidean'. Both are kept so the sampled candidates (and therefore
    # the reported results) stay the same for the fixed random_state.
    metric = ['euclidean','manhattan','chebyshev', 'minkowski']

    # The random_grid contains all the possible combinations of the parameters listed above
    random_grid = {
        'n_neighbors': n_neighbors,
        'weights': weights,
        'metric': metric}

    # RandomizedSearchCV cross-validates the model using different random combinations of hyperparameters
    knn = KNeighborsClassifier()
    knn_random = RandomizedSearchCV(estimator = knn,
                                    random_state = 42,
                                    n_jobs = -1,
                                    param_distributions = random_grid,
                                    n_iter = 100,
                                    cv=5,
                                    verbose = 0)

    knn_random.fit(X_train, y_train)

    # A bare `knn_random.best_params_` expression is discarded inside a
    # function, so print the selected hyperparameters explicitly.
    print('Best KNN hyperparameters:', knn_random.best_params_)

    # predict() delegates to the best estimator found by the search
    y_pred = knn_random.predict(X_test)

    # Print the performances
    classification_reporting('Tuned', 'KNN', y_test, y_pred, X_train)

In [8]:
# Tuned KNN evaluated on the full feature set
knn_hyp_tuning(
    X_train=X_train_full,
    X_test=X_test_full,
    y_train=y_train_full,
    y_test=y_test_full,
)

Tuned KNN performance using 16 features:
 {'precision': 0.7216023554926478, 'recall': 0.7147253123286589, 'f1-score': 0.7159633710309441, 'support': 2000}
Accuracy score: 0.715


# KNN with PCA
## Naive KNN

In [9]:
# Split the two PCA-reduced datasets (2 and 3 columns) with `train_test`.
# NOTE(review): unlike the full-dataset split, no `scaling` argument is passed
# here; the `_scaled_` naming suggests scaling already happened upstream — confirm.
X_train2, X_test2, y_train2, y_test2 = train_test(dataset_scaled_2_pca)
X_train3, X_test3, y_train3, y_test3 = train_test(dataset_scaled_3_pca)

In [10]:
# Baseline KNN on the 2-column PCA dataset
knn_classification(X_train=X_train2, X_test=X_test2, y_train=y_train2, y_test=y_test2)
# Baseline KNN on the 3-column PCA dataset
knn_classification(X_train=X_train3, X_test=X_test3, y_train=y_train3, y_test=y_test3)

Naive KNN performance using 2 features:
 {'precision': 0.5364493519603178, 'recall': 0.53176913577543, 'f1-score': 0.5314632132591448, 'support': 2000}
Accuracy score: 0.5325
Naive KNN performance using 3 features:
 {'precision': 0.5986700067182135, 'recall': 0.5927594711317699, 'f1-score': 0.5944997294049733, 'support': 2000}
Accuracy score: 0.5935


## KNN Hyperparameter Tuning

In [11]:
# Tuned KNN on the 2-column PCA dataset
knn_hyp_tuning(X_train=X_train2, X_test=X_test2, y_train=y_train2, y_test=y_test2)
# Tuned KNN on the 3-column PCA dataset
knn_hyp_tuning(X_train=X_train3, X_test=X_test3, y_train=y_train3, y_test=y_test3)

Tuned KNN performance using 2 features:
 {'precision': 0.5934402742884913, 'recall': 0.591770534323274, 'f1-score': 0.5914655926539869, 'support': 2000}
Accuracy score: 0.5925
Tuned KNN performance using 3 features:
 {'precision': 0.6282273869371974, 'recall': 0.6266104955441463, 'f1-score': 0.6262741481813052, 'support': 2000}
Accuracy score: 0.6275


# Support Vector Classification (SVC)
## Naive SVC

In [16]:
# Baseline ("naive") SVC: library defaults, no tuning
def svc_classification(X_train, X_test, y_train, y_test):
    """Fit a default SVC and report its test-set performance.

    Parameters
    ----------
    X_train, y_train
        Training features and labels used to fit the classifier.
    X_test, y_test
        Held-out features to predict on and the matching true labels.

    Returns
    -------
    None
        Results are handed to `classification_reporting` with the tags
        'Naive' and 'SVC'; nothing is returned to the caller.
    """
    baseline_svc = SVC().fit(X_train, y_train)
    predictions = baseline_svc.predict(X_test)

    classification_reporting('Naive', 'SVC', y_test, predictions, X_train)

# Baseline (untuned) SVC evaluated on the full feature set
svc_classification(
    X_train=X_train_full,
    X_test=X_test_full,
    y_train=y_train_full,
    y_test=y_test_full,
)

Naive SVC performance using 16 features:
 {'precision': 0.7284261234655312, 'recall': 0.7194220211924094, 'f1-score': 0.7213806303164567, 'support': 2000}
Accuracy score: 0.7195


## SVC Hyperparameter Tuning

In [20]:
# Finding better hyperparameters for SVC
def svc_hyp_tuning(X_train, X_test, y_train, y_test):
    """Tune an SVC with hyperopt-sklearn (TPE search) and report test performance.

    A `HyperoptEstimator` restricted to the `svc` search space runs 50
    evaluations, the fitted estimator predicts on the test set, the best model
    found is printed, and the predictions are passed to
    `classification_reporting`.

    Parameters
    ----------
    X_train, y_train
        Training features and labels used by the hyperparameter search.
    X_test, y_test
        Held-out features to predict on and the matching true labels.

    Returns
    -------
    None
        The best model and the performance report are printed.
    """

    # RandomizedSearchCV was too slow here, so an optimization algorithm is
    # used to find better hyperparameters. `algo` has to be passed explicitly:
    # when it is omitted HyperoptEstimator falls back to plain random search
    # (hyperopt.rand.suggest) instead of the TPE optimizer.
    model = HyperoptEstimator(classifier=svc('mySVC'), algo=tpe.suggest, max_evals=50)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Show the selected learner together with its preprocessing steps
    print(model.best_model())

    # Print the performances
    classification_reporting('Tuned', 'SVC', y_test, y_pred, X_train)

In [21]:
# Tuned SVC evaluated on the full feature set
svc_hyp_tuning(
    X_train=X_train_full,
    X_test=X_test_full,
    y_train=y_train_full,
    y_test=y_test_full,
)

100%|██████████| 1/1 [00:02<00:00,  2.07s/trial, best loss: 0.35187500000000005]
100%|██████████| 2/2 [00:01<00:00,  1.88s/trial, best loss: 0.35187500000000005]
100%|██████████| 3/3 [00:00<00:00,  1.17trial/s, best loss: 0.35187500000000005]
100%|██████████| 4/4 [00:00<00:00,  1.00trial/s, best loss: 0.35187500000000005]
100%|██████████| 5/5 [00:03<00:00,  3.43s/trial, best loss: 0.35187500000000005]
100%|██████████| 6/6 [00:00<00:00,  1.19trial/s, best loss: 0.35]
100%|██████████| 7/7 [00:02<00:00,  2.37s/trial, best loss: 0.35]
100%|██████████| 8/8 [00:01<00:00,  1.10s/trial, best loss: 0.35]
100%|██████████| 9/9 [00:01<00:00,  1.91s/trial, best loss: 0.30374999999999996]
100%|██████████| 10/10 [00:01<00:00,  1.45s/trial, best loss: 0.30374999999999996]
100%|██████████| 11/11 [00:00<00:00,  1.00trial/s, best loss: 0.30374999999999996]
100%|██████████| 12/12 [00:01<00:00,  1.49s/trial, best loss: 0.30374999999999996]
100%|██████████| 13/13 [00:01<00:00,  1.30s/trial, best loss: 0.303

  "X does not have valid feature names, but"


{'learner': SVC(C=0.8590922561372307, coef0=0.45656864791382556, degree=5, kernel='poly',
    random_state=2, shrinking=False, tol=0.003368807392590179), 'preprocs': (Normalizer(norm='l1'),), 'ex_preprocs': ()}
Tuned SVC performance using 16 features:
 {'precision': 0.7524338514603489, 'recall': 0.7426881196085372, 'f1-score': 0.7449769831172937, 'support': 2000}
Accuracy score: 0.743


# Neural Network (in another notebook)