In [1]:
import numpy as np
import pandas as pd
from dataLoad import PulsarData
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from bayes_opt import BayesianOptimization
from sklearn.model_selection import cross_val_score
import shap
import tensorflow

## Classification of pulsar data using the sklearn neural network method



In [2]:
features_data = PulsarData('HTRU_2').features
targets_data = PulsarData('HTRU_2').targets

Shuffle and split the data into training and test groups with 3:1 split

In [3]:
train_features_data, test_features_data, train_targets_data, test_targets_data  =  train_test_split( features_data, 
                                                        targets_data, test_size=0.25, random_state=42)

Bayesian optimisation and cross validation of hyperparameters using Simone's Troels example code.

In [4]:
def sklNN_CrossValidation(hidden_layer_sizes, learning_rate_init, data, targets):
    """Cross validation.
       Fits a NN with the given paramaters to the target 
       given data, calculated a CV accuracy score and returns the mean.
       The goal is to find combinations
       that maximize the accuracy
    """
    
    estimator = MLPClassifier(hidden_layer_sizes=hidden_layer_sizes, learning_rate_init=learning_rate_init, random_state=0)
    
    cval = cross_val_score(estimator, data, targets, scoring='accuracy', cv=5)
    
    return cval.mean()

In [5]:
def optimize_sklNN(data, targets, pars, n_iter=5):
    """Apply Bayesian Optimization to NN parameters."""
    
    def crossval_wrapper(hidden_layer_sizes, learning_rate_init):
        """Wrapper of NNe cross validation. 
           hidden_layer_sizes
           is cast to integer before we pass them along.
        """
        return sklNN_CrossValidation(hidden_layer_sizes=int(hidden_layer_sizes), 
                                            learning_rate_init=learning_rate_init, 
                                            data=data, 
                                            targets=targets)

    optimizer = BayesianOptimization(f=crossval_wrapper, 
                                     pbounds=pars, 
                                     random_state=42, 
                                     verbose=2)
    optimizer.maximize(init_points=4, n_iter=n_iter)

    return optimizer

In [6]:
parameters_BayesianOptimization = {"hidden_layer_sizes": (1, 500), 
                                   "learning_rate_init": (0.0001, 1)
                                  }

BayesianOptimization = optimize_sklNN(train_features_data, 
                                             train_targets_data, 
                                             parameters_BayesianOptimization, 
                                             n_iter=5)
print(BayesianOptimization.max)

|   iter    |  target   | hidden... | learni... |
-------------------------------------------------
| [0m 1       [0m | [0m 0.9766  [0m | [0m 187.9   [0m | [0m 0.9507  [0m |
| [0m 2       [0m | [0m 0.9762  [0m | [0m 366.3   [0m | [0m 0.5987  [0m |
| [95m 3       [0m | [95m 0.9794  [0m | [95m 78.85   [0m | [95m 0.1561  [0m |
| [0m 4       [0m | [0m 0.9758  [0m | [0m 29.98   [0m | [0m 0.8662  [0m |
| [0m 5       [0m | [0m 0.9791  [0m | [0m 1.026   [0m | [0m 0.9539  [0m |
| [0m 6       [0m | [0m 0.9779  [0m | [0m 83.83   [0m | [0m 0.6154  [0m |
| [0m 7       [0m | [0m 0.9779  [0m | [0m 1.004   [0m | [0m 0.2526  [0m |
| [0m 8       [0m | [0m 0.9791  [0m | [0m 1.075   [0m | [0m 0.7396  [0m |
| [0m 9       [0m | [0m 0.977   [0m | [0m 1.015   [0m | [0m 0.4597  [0m |
{'target': 0.9794381492366655, 'params': {'hidden_layer_sizes': 78.85330158077582, 'learning_rate_init': 0.15607892088416903}}


Cross-validation on result:

In [7]:
clf = MLPClassifier(hidden_layer_sizes=int(BayesianOptimization.max['params']['hidden_layer_sizes']), 
                                 learning_rate_init=BayesianOptimization.max['params']['learning_rate_init'],
                                 random_state=0)
scores = cross_val_score(clf, features_data, targets_data, cv=5, scoring='f1') 
print(f"{scores.mean():.4f} accuracy with a standard deviation of {scores.std():.4f}")

0.8698 accuracy with a standard deviation of 0.0186


Comparing with baseline:

In [8]:
print(f"{scores.mean()-PulsarData('HTRU_2').baseline:.4f} improvement with a standard deviation of {scores.std():.4f}")

-0.0386 improvement with a standard deviation of 0.0186


Fit a gradient boosting classifier with hyperparameters optimised by Gaussian Process Optimisation above

In [9]:
clf_fit = clf.fit(train_features_data, train_targets_data)


Compare the classified data to the test set - returns percentage match

In [10]:
clf_fit.score(test_features_data, test_targets_data)

0.9781005586592179

In [11]:
explainer = shap.DeepExplainer(clf_fit, train_features_data).shap_values(train_features_data)
shap.summary_plot(shap_values, train_features_data, plot_type="bar")

Using TensorFlow backend.
keras is no longer supported, please use tf.keras instead.


AssertionError: <class 'sklearn.neural_network._multilayer_perceptron.MLPClassifier'> is not currently a supported model type!