In [39]:
import numpy as np
import pandas as pd
from dataLoad import PulsarData
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from bayes_opt import BayesianOptimization
from sklearn.model_selection import cross_val_score
import shap

## Classification of pulsar data using the sklearn neural network method



In [9]:
features_data = PulsarData('HTRU_2').features
targets_data = PulsarData('HTRU_2').targets

Shuffle and split the data into training and test groups with 3:1 split

In [13]:
train_features_data, test_features_data, train_targets_data, test_targets_data  =  train_test_split( features_data, 
                                                        targets_data, test_size=0.25, random_state=42)

Bayesian optimisation and cross validation of hyperparameters using Simone's Troels example code.

In [26]:
def sklNN_CrossValidation(hidden_layer_sizes, learning_rate_init, data, targets):
    """Cross validation.
       Fits a NN with the given paramaters to the target 
       given data, calculated a CV accuracy score and returns the mean.
       The goal is to find combinations
       that maximize the accuracy
    """
    
    estimator = MLPClassifier(hidden_layer_sizes=hidden_layer_sizes, learning_rate_init=learning_rate_init, random_state=0)
    
    cval = cross_val_score(estimator, data, targets, scoring='accuracy', cv=5)
    
    return cval.mean()

In [27]:
def optimize_sklNN(data, targets, pars, n_iter=5):
    """Apply Bayesian Optimization to NN parameters."""
    
    def crossval_wrapper(hidden_layer_sizes, learning_rate_init):
        """Wrapper of NNe cross validation. 
           hidden_layer_sizes
           is cast to integer before we pass them along.
        """
        return sklNN_CrossValidation(hidden_layer_sizes=int(hidden_layer_sizes), 
                                            learning_rate_init=learning_rate_init, 
                                            data=data, 
                                            targets=targets)

    optimizer = BayesianOptimization(f=crossval_wrapper, 
                                     pbounds=pars, 
                                     random_state=42, 
                                     verbose=2)
    optimizer.maximize(init_points=4, n_iter=n_iter)

    return optimizer

In [28]:
parameters_BayesianOptimization = {"hidden_layer_sizes": (1, 500), 
                                   "learning_rate_init": (0.0001, 1)
                                  }

BayesianOptimization = optimize_sklNN(train_features_data, 
                                             train_targets_data, 
                                             parameters_BayesianOptimization, 
                                             n_iter=5)
print(BayesianOptimization.max)

|   iter    |  target   | hidden... | learni... |
-------------------------------------------------
| [0m 1       [0m | [0m 0.9081  [0m | [0m 187.9   [0m | [0m 0.9507  [0m |
| [95m 2       [0m | [95m 0.9569  [0m | [95m 366.3   [0m | [95m 0.5987  [0m |
| [95m 3       [0m | [95m 0.9727  [0m | [95m 78.85   [0m | [95m 0.1561  [0m |
| [0m 4       [0m | [0m 0.9081  [0m | [0m 29.98   [0m | [0m 0.8662  [0m |
| [95m 5       [0m | [95m 0.9761  [0m | [95m 500.0   [0m | [95m 0.0001  [0m |
| [0m 6       [0m | [0m 0.9755  [0m | [0m 496.5   [0m | [0m 0.1832  [0m |
| [0m 7       [0m | [0m 0.9755  [0m | [0m 428.1   [0m | [0m 0.01407 [0m |
| [0m 8       [0m | [0m 0.9081  [0m | [0m 114.0   [0m | [0m 0.9893  [0m |
| [95m 9       [0m | [95m 0.9765  [0m | [95m 274.3   [0m | [95m 0.008437[0m |
{'target': 0.9765330380459971, 'params': {'hidden_layer_sizes': 274.3146737438626, 'learning_rate_init': 0.00843679678392597}}


Cross-validation on result:

In [30]:
clf = MLPClassifier(hidden_layer_sizes=int(BayesianOptimization.max['params']['hidden_layer_sizes']), 
                                 learning_rate_init=BayesianOptimization.max['params']['learning_rate_init'],
                                 random_state=0)
scores = cross_val_score(clf, features_data, targets_data, cv=5) 
print(f"{scores.mean():.4f} accuracy with a standard deviation of {scores.std():.4f}")

0.9766 accuracy with a standard deviation of 0.0023


Comparing with baseline:

In [31]:
print(f"{scores.mean()-PulsarData('HTRU_2').baseline:.4f} improvement with a standard deviation of {scores.std():.4f}")

0.0682 improvement with a standard deviation of 0.0023


Fit a gradient boosting classifier with hyperparameters optimised by Gaussian Process Optimisation above

In [33]:
clf_fit = clf.fit(train_features_data, train_targets_data)


Compare the classified data to the test set - returns percentage match

In [35]:
clf_fit.score(test_features_data, test_targets_data)

0.9776536312849162

SHAP-values:

In [41]:
shap_values = shap.TreeExplainer(clf_fit).shap_values(train_features_data)
shap.summary_plot(shap_values, train_features_data, plot_type="bar")

Exception: Model type not yet supported by TreeExplainer: <class 'sklearn.neural_network._multilayer_perceptron.MLPClassifier'>