In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from dataLoad import PulsarData
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.model_selection import train_test_split
from bayes_opt import BayesianOptimization
from sklearn.model_selection import cross_val_score
from tqdm import tqdm
import shap
from sklearn.gaussian_process.kernels import RBF, DotProduct, Matern, RationalQuadratic, WhiteKernel

In [2]:
# For pretty plotting
plt.style.use('seaborn-paper')
plt.rcParams["font.family"] = "serif"

Load in the data:

In [3]:
raw_features = PulsarData('HTRU_2').features
raw_targets = PulsarData('HTRU_2').targets

Calculate the percentage of each class:

In [4]:
print(f'The data set contains {PulsarData("HTRU_2").baseline*100:.1f} % pulsars of class 0 and {(1-PulsarData("HTRU_2").baseline)*100:.1f} % pulsars of class 1')

The data set contains 90.8 % pulsars of class 0 and 9.2 % pulsars of class 1


In [5]:
train_features_data, test_features_data, train_targets_data, test_targets_data  =  train_test_split(raw_features, 
                                                        raw_targets, test_size=0.1, random_state=42)

Cross validation:

In [6]:
kernel_list = [1*RBF(), 1*DotProduct(), 1*Matern(), 1*RationalQuadratic(), 1*WhiteKernel()]
for k in kernel_list:
    print(str(k))    
    gpc = GaussianProcessClassifier(kernel = k, random_state=42)
    scores = cross_val_score(gpc, test_features_data, test_targets_data, cv=2, scoring='f1')
    print(f"{scores.mean():.4f} accuracy with a standard deviation of {scores.std():.4f}")

1**2 * RBF(length_scale=1)
0.8902 accuracy with a standard deviation of 0.0152
1**2 * DotProduct(sigma_0=1)
0.8736 accuracy with a standard deviation of 0.0243
1**2 * Matern(length_scale=1, nu=1.5)
0.8872 accuracy with a standard deviation of 0.0182
1**2 * RationalQuadratic(alpha=1, length_scale=1)
0.8872 accuracy with a standard deviation of 0.0182
1**2 * WhiteKernel(noise_level=1)
0.0000 accuracy with a standard deviation of 0.0000


In [12]:
model = GaussianProcessClassifier(kernel = 1*RBF(), random_state=42)
model.fit(test_features_data, test_targets_data)
explainer = shap.KernelExplainer(model.predict_proba, shap.sample(test_features_data,100), link="logit")
shap_values = explainer.shap_values(test_features_data, nsamples=100)
shap.plots.bar(shap_values)

  0%|          | 0/1790 [00:00<?, ?it/s]

AssertionError: You must pass an Explanation object, Cohorts object, or dictionary to bar plot!

Alternative with train and test:

In [35]:
%matplotlib
new_shape = np.vstack(np.array(shap_values))
shap.summary_plot(new_shape, test_features_data, plot_type="bar", show='False')
plt.savefig('plots/GPC_SHAP.pdf', bbox_inches='tight')

Using matplotlib backend: Qt5Agg


In [7]:
train_features_data, test_features_data, train_targets_data, test_targets_data  =  train_test_split( raw_features, 
                                                        raw_targets, test_size=0.25, random_state=42)

In [8]:
#gpc = GaussianProcessClassifier(random_state=42).fit(train_features_data, train_targets_data)

In [9]:
#print(gpc.score(test_features_data, test_targets_data))

SHAP-values

In [10]:
#shap_values = shap.KernelExplainer(gpc, raw_features).shap_values(raw_features)
#shap.summary_plot(shap_values, raw_features, plot_type="bar")