In [1]:
!pip install git+https://github.com/hyperopt/hyperopt-sklearn

from hpsklearn import HyperoptEstimator, any_classifier, svc
from hyperopt import tpe

from ipynb.fs.defs.MLCP_EDA_1 import data_importing, replace_numbers_with_letters

from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

import pandas as pd
import numpy as np

Collecting git+https://github.com/hyperopt/hyperopt-sklearn
  Cloning https://github.com/hyperopt/hyperopt-sklearn to /private/var/folders/9g/430d659n24x98lk_m6dccs1r0000gp/T/pip-req-build-pinwklbc
  Running command git clone --filter=blob:none -q https://github.com/hyperopt/hyperopt-sklearn /private/var/folders/9g/430d659n24x98lk_m6dccs1r0000gp/T/pip-req-build-pinwklbc
  Resolved https://github.com/hyperopt/hyperopt-sklearn to commit 4b3f6fde3a1ded2e71e8373d52c1b51a0239ef91
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
You should consider upgrading via the '/Users/Vincenzo/PycharmProjects/scratches/venv/bin/python -m pip install --upgrade pip' command.[0m


In [2]:
%store -r dataset_scaled_2_pca
%store -r dataset_scaled_3_pca

# Import dataset PCA'd featuring 2 and 3 columns
dataset_scaled_2_pca = dataset_scaled_2_pca
dataset_scaled_3_pca = dataset_scaled_3_pca

# KNN Classification with original dataset
## Naive KNN

In [3]:
data = data_importing(verbose=False)

# Put data[1] and the converted data[-1] side by side as columns
# (presumably features and labels, judging by the sample output - confirm against MLCP_EDA_1)
dataset = pd.concat(
    objs=[data[1], replace_numbers_with_letters(data[-1], mode='label')],
    axis=1,
)

# Show the first row, transposed so every column name is readable
print('Sample dataset:\n', dataset.iloc[:1].T)

Sample dataset:
                           0
j_zlogz           -3.488422
j_c1_b0_mmdt       0.471864
j_c1_b1_mmdt       0.058537
j_c1_b2_mmdt       0.014659
j_c2_b1_mmdt       0.109481
j_c2_b2_mmdt       0.025472
j_d2_b1_mmdt       1.870292
j_d2_b2_mmdt       1.737580
j_d2_a1_b1_mmdt    1.870292
j_d2_a1_b2_mmdt    0.211365
j_m2_b1_mmdt       0.089198
j_m2_b2_mmdt       0.032309
j_n2_b1_mmdt       0.331585
j_n2_b2_mmdt       0.155772
j_mass_mmdt      141.861038
j_multiplicity    85.000000
label              0.000000


In [4]:
# Train/test splitting helper shared by every experiment below
def train_test(dataset):
    """Split ``dataset`` into stratified train and test parts.

    Every column except the last is used as the feature matrix; the last
    column is the target and also drives the stratification. 20% of the rows
    go to the test set and the shuffle is fixed with ``random_state=42``.

    Returns the tuple ``(X_train, X_test, y_train, y_test)``.
    """
    features = dataset.iloc[:, :-1]
    target = dataset.iloc[:, -1]

    return tuple(train_test_split(
        features,
        target,
        test_size=0.2,
        stratify=target,
        random_state=42))

# Reporting helper: the "macro avg" precision, recall and f1 serve as the benchmark
def classification_reporting(tuning, model, y_test, y_pred, X_train):
    """Print the macro-averaged metrics and the accuracy of a set of predictions.

    ``tuning`` and ``model`` are labels used only in the printed headline;
    ``X_train`` is used only for its column count (``X_train.shape[1]``).
    Nothing is returned.
    """
    macro_avg = classification_report(y_test, y_pred, output_dict=True)['macro avg']
    headline = '{} {} performance using {} features:\n'.format(tuning, model, X_train.shape[1])

    print(headline, macro_avg)
    print('Accuracy score:', accuracy_score(y_test, y_pred))

In [5]:
# Naive (default-hyperparameter) KNN classification
def knn_classification(X_train, X_test, y_train, y_test):
    """Fit a default ``KNeighborsClassifier`` on the training split and report its test performance."""
    predictions = KNeighborsClassifier().fit(X_train, y_train).predict(X_test)

    classification_reporting('Naive', 'KNN', y_test, predictions, X_train)

In [6]:
# Split the original (non-PCA) dataset into train and test parts
X_train_full, X_test_full, y_train_full, y_test_full = train_test(dataset=dataset)

In [7]:
# Baseline: default KNN on the original (non-PCA) split
knn_classification(
    X_train=X_train_full,
    X_test=X_test_full,
    y_train=y_train_full,
    y_test=y_test_full,
)

Naive KNN performance using 16 features:
 {'precision': 0.6969268006940664, 'recall': 0.688817863291366, 'f1-score': 0.6909659932950201, 'support': 2000}
Accuracy score: 0.689


## KNN Hyperparameter Tuning

In [8]:
# Finding better hyperparameters
def knn_hyp_tuning(X_train, X_test, y_train, y_test):
    """Tune a KNN classifier with a randomized, cross-validated search and report it.

    The search draws 100 random combinations from the candidate grid below,
    scores each with 5-fold cross-validation on the training split, prints the
    winning combination, and then reports the performance of the search's
    predictions on the test split. Nothing is returned.
    """
    # Candidate values for each hyperparameter; the search samples combinations
    # from these lists rather than trying every one of them.
    random_grid = {
        'n_neighbors': [int(x) for x in np.linspace(start=1, stop=100, num=50)],
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan', 'chebyshev', 'minkowski']}

    # RandomizedSearchCV cross-validates the model on random combinations of the grid
    knn_random = RandomizedSearchCV(estimator=KNeighborsClassifier(),
                                    random_state=42,
                                    n_jobs=-1,
                                    param_distributions=random_grid,
                                    n_iter=100,
                                    cv=5,
                                    verbose=0)

    knn_random.fit(X_train, y_train)

    # Previously a bare `knn_random.best_params_` expression, which shows nothing
    # when evaluated inside a function; print it so the chosen setup is visible.
    print('Best parameters:', knn_random.best_params_)

    y_pred = knn_random.predict(X_test)

    # Print the performances
    classification_reporting('Tuned', 'KNN', y_test, y_pred, X_train)

In [9]:
# Tuned KNN on the original (non-PCA) split
knn_hyp_tuning(
    X_train=X_train_full,
    X_test=X_test_full,
    y_train=y_train_full,
    y_test=y_test_full,
)

Tuned KNN performance using 16 features:
 {'precision': 0.7221803123366086, 'recall': 0.7114613379414695, 'f1-score': 0.7139247216652634, 'support': 2000}
Accuracy score: 0.7115


# KNN with PCA
## Naive KNN

In [10]:
# Split each PCA-reduced dataset with the same helper used for the original data
X_train2, X_test2, y_train2, y_test2 = train_test(dataset=dataset_scaled_2_pca)
X_train3, X_test3, y_train3, y_test3 = train_test(dataset=dataset_scaled_3_pca)

In [11]:
# Default KNN on the 2-column, then the 3-column PCA split
knn_classification(X_train=X_train2, X_test=X_test2, y_train=y_train2, y_test=y_test2)
knn_classification(X_train=X_train3, X_test=X_test3, y_train=y_train3, y_test=y_test3)

Naive KNN performance using 2 features:
 {'precision': 0.5364493519603178, 'recall': 0.53176913577543, 'f1-score': 0.5314632132591448, 'support': 2000}
Accuracy score: 0.5325
Naive KNN performance using 3 features:
 {'precision': 0.5986700067182135, 'recall': 0.5927594711317699, 'f1-score': 0.5944997294049733, 'support': 2000}
Accuracy score: 0.5935


## KNN Hyperparameter Tuning

In [12]:
# Tuned KNN on the 2-column, then the 3-column PCA split
knn_hyp_tuning(X_train=X_train2, X_test=X_test2, y_train=y_train2, y_test=y_test2)
knn_hyp_tuning(X_train=X_train3, X_test=X_test3, y_train=y_train3, y_test=y_test3)

Tuned KNN performance using 2 features:
 {'precision': 0.5934402742884913, 'recall': 0.591770534323274, 'f1-score': 0.5914655926539869, 'support': 2000}
Accuracy score: 0.5925
Tuned KNN performance using 3 features:
 {'precision': 0.6282273869371974, 'recall': 0.6266104955441463, 'f1-score': 0.6262741481813052, 'support': 2000}
Accuracy score: 0.6275


# Support Vector Classification (SVC)
## Naive SVC

In [13]:
# Naive (default-hyperparameter) SVC classification
def svc_classification(X_train, X_test, y_train, y_test):
    """Fit a default ``SVC`` on the training split and report its test performance."""
    classifier = SVC()
    classifier.fit(X_train, y_train)

    classification_reporting('Naive', 'SVC', y_test, classifier.predict(X_test), X_train)

# Baseline: default SVC on the original (non-PCA) split
svc_classification(
    X_train=X_train_full,
    X_test=X_test_full,
    y_train=y_train_full,
    y_test=y_test_full,
)

Naive SVC performance using 16 features:
 {'precision': 0.7075778304684717, 'recall': 0.7022382855550579, 'f1-score': 0.7031409236781301, 'support': 2000}
Accuracy score: 0.7025


## Hyperparameter tuning

In [14]:
# Finding better hyperparameters
def svc_hyp_tuning(X_train, X_test, y_train, y_test):
    """Tune an SVC with hyperopt-sklearn and report its test performance.

    Fits a ``HyperoptEstimator`` restricted to the ``svc`` search space on
    ``(X_train, y_train)``, prints the best model it found, and reports the
    metrics of its predictions for ``X_test`` against ``y_test``.
    Nothing is returned.
    """
    # RandomizedSearchCV was too slow here, so an optimization algorithm
    # searches for better hyperparameters instead.
    model = HyperoptEstimator(classifier=svc('mySVC'))

    # Fit and predict on the arguments. The previous body read the notebook-level
    # X_train_full / y_train_full / X_test_full globals, so any other split passed
    # in was silently ignored while still being used for the report.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    print(model.best_model())

    # Print the performances
    classification_reporting('Tuned', 'SVC', y_test, y_pred, X_train)

In [15]:
# Tuned SVC on the original (non-PCA) split
svc_hyp_tuning(
    X_train=X_train_full,
    X_test=X_test_full,
    y_train=y_train_full,
    y_test=y_test_full,
)

100%|██████████| 1/1 [00:01<00:00,  1.95s/trial, best loss: 0.485]
100%|██████████| 2/2 [00:01<00:00,  1.18s/trial, best loss: 0.485]
100%|██████████| 3/3 [00:02<00:00,  2.10s/trial, best loss: 0.46187500000000004]
100%|██████████| 4/4 [00:01<00:00,  1.86s/trial, best loss: 0.27875000000000005]
100%|██████████| 5/5 [00:01<00:00,  1.83s/trial, best loss: 0.27875000000000005]
100%|██████████| 6/6 [00:00<00:00,  1.10trial/s, best loss: 0.27875000000000005]
100%|██████████| 7/7 [00:08<00:00,  8.16s/trial, best loss: 0.27249999999999996]
100%|██████████| 8/8 [00:01<00:00,  1.79s/trial, best loss: 0.27249999999999996]
100%|██████████| 9/9 [00:01<00:00,  1.84s/trial, best loss: 0.27249999999999996]
100%|██████████| 10/10 [00:01<00:00,  1.23s/trial, best loss: 0.27249999999999996]


  "X does not have valid feature names, but"


{'learner': SVC(C=1.0374953640580185, coef0=0.76811069777233, degree=5, kernel='poly',
    random_state=2, tol=0.00036433643603499526), 'preprocs': (MinMaxScaler(clip=True, feature_range=(0.0, 1.0)),), 'ex_preprocs': ()}
Tuned SVC performance using 16 features:
 {'precision': 0.7470467719551952, 'recall': 0.7373904802778443, 'f1-score': 0.739672745612378, 'support': 2000}
Accuracy score: 0.7375


# Neural Network (in another notebook)