In [1]:
import pandas as pd

import numpy as np
import six
import sys
sys.modules['sklearn.externals.six'] = six
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import _tree
import random

from skopt.space import Real, Categorical, Integer
random.seed(42)

In [2]:
def _sample_search_space(param_space, categorical_columns, num_samples = 1000):
    """
        Uniformly sample the search space for generating explanation dataset. Maintains the original order of search space.
        Sensitive to order: place categoricals first and everything else later.

        Args:
        - param_space (param space): parameter space
        - categorical_columns (list): list of categorical variables
        - num_samples (int): number of generated samples

        Returns:
        - new_data (numpy array): the augmented dataset
        """
    samples = []
    
    for _ in range(num_samples):
        sample = [param.rvs() for param in param_space]
        samples.append(sample)
    
    # Create a DataFrame with the samples
    param_names = [param.name for param in param_space]
    df = pd.DataFrame(samples, columns=param_names)
    
    # One-hot encode categorical variables and add dynamic prefixes
    encoded_columns = []
    for col in categorical_columns:
        prefix = f'categorical_{col}'
        encoded = pd.get_dummies(df[col], prefix=prefix).astype(float)
        encoded_columns.extend(encoded.columns)
        df = pd.concat([df.drop(col, axis=1), encoded], axis=1)
    
    # Rearrange columns: categorical columns in front, followed by non-categorical columns
    ordered_columns = []
    for col in param_space:
        if col.name in categorical_columns:
            ordered_columns.extend([col_name for col_name in df.columns if col_name.startswith(f'categorical_{col.name}')])
        elif col.name in df.columns:
            ordered_columns.append(col.name)
    
    # Now you have the DataFrame with categorical variables in the front and non-categorical variables in their original order
    df = df[ordered_columns]
    
    # Convert numerical variables from lists to floats
    for col in param_names:
        if col not in categorical_columns:
            df[col] = df[col]
    df = df.to_numpy().astype(np.float32)
    return df

In [3]:
df = pd.read_csv('cifar10_xception_25.csv')
df = df.drop(columns=['training_epoch_accuracy','training_epoch_loss','validation_epoch_loss','validation_evaluation_accuracy_vs_iterations','validation_evaluation_loss_vs_iterations'])

In [4]:
ye = df['validation_epoch_accuracy']
y = ye.to_numpy()

Xe = df.drop(columns='validation_epoch_accuracy')
dum1 = pd.get_dummies(Xe['conv2d_num_filters'],prefix = 'categorical_conv2dnumfilters')
dum2 = pd.get_dummies(Xe['activation'],prefix = 'categorical_activation')
dum3 = pd.get_dummies(Xe['pooling'],prefix = 'categorical_pooling')
dum4 = pd.get_dummies(Xe['kernel_size'],prefix = 'categorical_kernelsize')
dum5 = pd.get_dummies(Xe['dense_use_bn'],prefix = 'categorical_denseusebn')
Xe = Xe.drop(columns = ['conv2d_num_filters','activation','pooling','kernel_size','dense_use_bn'])
Xe = pd.concat([dum1, dum2, dum3, dum4, dum5, Xe], axis=1)

param_space = [
    Categorical(categories=['64','32', '128'], name='conv2d_num_filters'),
    Categorical(categories=['relu', 'selu'], name='activation'),
    Categorical(categories=['flatten', 'avg', 'max'], name='pooling'),
    Categorical(categories=['5','3'], name='kernel_size'),
    Categorical(categories=['1', '0'], name='dense_use_bn'),
    Integer(low=128, high=768, name='sep_num_filters'),
    Integer(low=0, high=1, name='dropout_rate'),
    Integer(low=1, high=5, name='num_dense_layers'),
    Integer(low=1, high=8, name='num_residual_blocks'),
    Integer(low=0, high=1, name='learning_rate'),
    Integer(low=2, high=4, name='initial_strides'),
]

# Define the categorical columns
categorical_columns = ['conv2d_num_filters','activation','pooling','kernel_size','dense_use_bn']

print (Xe.iloc[np.argmin(ye.values)])
print (np.max(ye.values))

categorical_conv2dnumfilters_32       1.0000
categorical_conv2dnumfilters_64       0.0000
categorical_conv2dnumfilters_128      0.0000
categorical_activation_relu           1.0000
categorical_activation_selu           0.0000
categorical_pooling_avg               0.0000
categorical_pooling_flatten           1.0000
categorical_pooling_max               0.0000
categorical_kernelsize_3              1.0000
categorical_kernelsize_5              0.0000
categorical_denseusebn_0              1.0000
categorical_denseusebn_1              0.0000
sep_num_filters                     128.0000
dropout_rate                          0.1000
num_dense_layers                      3.0000
num_residual_blocks                   5.0000
learning_rate                         0.0001
initial_strides                       2.0000
Name: 3, dtype: float64
0.777999997


In [5]:
import GPy

def _train_gp(x_train, y_train):
    """
        Train a GPR model from Gpy library.

        Args:
        - X_train (numpy array): input data
        - Y_train (numpy array): ground truth data
        - n_samples (int): number of times to sample the posterior of the trained gp
        - sample_points (numpy array): new input X locations where the posterior would be sampled

        Returns:
        - model (trained gp model): trained gp model
        - mean_y (numpy array): mean of the Y from the sampled posterior
        - std_y (numpy array): std dev of the Y from the sampled posterior
        - sum_log_likelihoods (numpy array): returns the likelihood of observing
          each input point given the trained model
    """
    kernel = GPy.kern.RBF(input_dim=x_train.shape[1])
    model = GPy.models.GPRegression(x_train, y_train, kernel)
    model.optimize()
    return model

def augment_with_noise(data, n_samples=1000):
    """
    Add noise to the input data to create new points and generate a new dataset
    with the desired number of samples.

    Args:
    - data (numpy array): the input data
    - num_samples (int): the number of samples to generate
    - noise_factor (float): the magnitude of the noise to add to the data (default: 0.01)

    Returns:
    - new_data (numpy array): the augmented dataset
    """
    n_points, n_dims = data.shape
    new_data = np.empty((n_samples, n_dims))  # Create a new array for the augmented data

    for i in range(n_samples):
        noise = np.random.uniform(low=-0.01, high=0.01, size=n_dims)
        new_data[i] = data[np.random.randint(n_points)] + noise

    return new_data
    
model = _train_gp(Xe.to_numpy(), y.reshape(-1, 1))
test_set = _sample_search_space(param_space, categorical_columns, num_samples = 100)
test_set = augment_with_noise(test_set)
mean_y, var_y = model.predict_noiseless(test_set, full_cov=False)
flattened_mean = [item for sublist in mean_y for item in sublist]
flattened_var = [item for sublist in var_y for item in sublist]
std = np.sqrt(flattened_var)

samp = _sample_search_space(param_space, categorical_columns, num_samples = 1000)
samp = augment_with_noise(samp)
yee, var = model.predict_noiseless(samp, full_cov=False)
yee = np.squeeze(yee, axis=1)

In [8]:
from rulekit import RuleKit
from rulekit.regression import RuleRegressor
from rulekit.params import Measures
RuleKit.init()

reg = RuleRegressor(
    induction_measure=Measures.C2,
    pruning_measure=Measures.C2,
    voting_measure=Measures.C2,
)
reg.fit(samp, yee)
predictions = reg.predict(test_set)

ru = []
for rule in reg.model.rules:
    ru.append(str(rule))
    print (rule)

print(len(ru))

from itertools import combinations

# Given rule list in the new representation
rule_list = ru

# Function to extract features from a rule
def extract_features(rule):
    # Extract only the part before "THEN" and split by "AND"
    features = set(rule.split('IF')[1].split(" THEN")[0].split(" AND"))
    return features

# Function to calculate Jaccard similarity between two rules
def jaccard_similarity(rule1, rule2):
    features1 = extract_features(rule1)
  
    features2 = extract_features(rule2)
   
    intersection = len(features1.intersection(features2))
    union = len(features1.union(features2))
    
    if union == 0:
        return 0.0  # Handle the case where both sets are empty
    else:
        return intersection / union

# Calculate Jaccard similarity between all pairs of rules and collect them
similarities = []
for rule1, rule2 in combinations(rule_list, 2):
    similarity = jaccard_similarity(rule1, rule2)
    similarities.append(similarity)

# Calculate the mean Jaccard similarity for the entire rule set
mean_similarity = sum(similarities) / len(similarities)

print(f"Mean Jaccard Similarity for the Rule Set: {mean_similarity:.2f}")

## Define your bounds
lower_bounds = [flattened_mean[i] - 0.5*flattened_mean[i]  for i in range(len(flattened_mean))]
upper_bounds = [flattened_mean[i] + 0.5*flattened_mean[i]  for i in range(len(flattened_mean))]

# Initialize a count for predictions within bounds
count_within_bounds = sum(1 for prediction, lower_bound, upper_bound in zip(predictions, lower_bounds, upper_bounds) if lower_bound <= prediction <= upper_bound)

# Calculate the percentage within bounds
percentage_within_bounds = (count_within_bounds / len(predictions)) 

# Print the result
print(f"Percentage of predictions within bounds: {percentage_within_bounds}%")

IF att18 = <2.00, inf) AND att10 = (-inf, 0.0074) AND att13 = (-inf, 138.99) THEN label = {0.29} [0.28,0.31]
IF att1 = (-inf, 1.00) AND att13 = (-inf, 138.99) AND att4 = <0.99, inf) THEN label = {0.26} [0.24,0.27]
IF att13 = <761.00, inf) THEN label = {0.69} [0.67,0.71]
IF att2 = (-inf, 0.99) AND att13 = (-inf, 140.50) AND att3 = <-0.0024, inf) THEN label = {0.26} [0.22,0.30]
IF att16 = (-inf, 3.01) AND att13 = <664.50, 677.00) AND att3 = <-0.0046, inf) THEN label = {0.036} [0.035,0.037]
IF att16 = (-inf, 3.01) AND att1 = <6.7E-5, 0.99) AND att13 = <675.00, 722.00) AND att3 = <-0.0081, 1.01) THEN label = {2.4E-4} [-0.0011,0.0016]
IF att18 = (-inf, 4.01) AND att16 = (-inf, 3.01) AND att7 = <-0.0079, inf) AND att2 = (-inf, 0.50) AND att1 = <-0.0014, inf) AND att13 = <675.00, 728.50) AND att4 = (-inf, 1.01) AND att3 = <-0.0035, 1.01) THEN label = {1.2E-4} [-0.0013,0.0015]
IF att6 = (-inf, 0.51) AND att16 = (-inf, 3.01) AND att2 = <-0.0055, inf) AND att1 = <-0.0057, inf) AND att13 = <675.0

In [6]:
clf = DecisionTreeRegressor(random_state=0)
clf.fit(samp, yee)
pred = clf.predict(test_set)


# Define your bounds
lower_bounds = [flattened_mean[i] - 0.5*flattened_mean[i]  for i in range(len(flattened_mean))]
upper_bounds = [flattened_mean[i] + 0.5*flattened_mean[i]  for i in range(len(flattened_mean))]

# Initialize a count for predictions within bounds
pr_count_within_bounds = sum(1 for prediction, lower_bound, upper_bound in zip(pred, lower_bounds, upper_bounds) if lower_bound <= prediction <= upper_bound)

# Calculate the percentage within bounds
percentage_within_bounds = (pr_count_within_bounds / len(pred)) 

# Print the result
print(f"Percentage of predictions within bounds: {percentage_within_bounds}%")

def get_rules(tree, feature_names, class_names):
    tree_ = tree.tree_
    feature_name = [
        feature_names[i] if i != _tree.TREE_UNDEFINED else "undefined!"
        for i in tree_.feature
    ]

    paths = []
    path = []
    
    def recurse(node, path, paths):
        
        if tree_.feature[node] != _tree.TREE_UNDEFINED:
            name = feature_name[node]
            threshold = tree_.threshold[node]
            p1, p2 = list(path), list(path)
            p1 += [f"({name} <= {np.round(threshold, 3)})"]
            recurse(tree_.children_left[node], p1, paths)
            p2 += [f"({name} > {np.round(threshold, 3)})"]
            recurse(tree_.children_right[node], p2, paths)
        else:
            path += [(tree_.value[node], tree_.n_node_samples[node])]
            paths += [path]
            
    recurse(0, path, paths)

    # sort by samples count
    samples_count = [p[-1][1] for p in paths]
    ii = list(np.argsort(samples_count))
    paths = [paths[i] for i in reversed(ii)]
    
    rules = []
    for path in paths:
        rule = "if "
        
        for p in path[:-1]:
            if rule != "if ":
                rule += " and "
            rule += str(p)
        #rule += " then "
        if class_names is None:
            s = 0
            #rule += "response: "+str(np.round(path[-1][0][0][0],3))
        else:
            classes = path[-1][0][0]
            l = np.argmax(classes)
            rule += f"class: {class_names[l]} (proba: {np.round(100.0*classes[l]/np.sum(classes),2)}%)"
        #rule += f" | based on {path[-1][1]:,} samples"
        rules += [rule]
        
    return rules
rules = get_rules(clf, Xe.columns, None)
for r in rules:
    print(r)
len(rules)

Percentage of predictions within bounds: 0.871%
if (sep_num_filters > 231.996) and (sep_num_filters > 273.504) and (sep_num_filters > 355.508) and (sep_num_filters > 403.5) and (sep_num_filters <= 752.5) and (sep_num_filters <= 664.497) and (sep_num_filters <= 621.0) and (sep_num_filters > 488.508) and (sep_num_filters > 532.498) and (sep_num_filters <= 613.501) and (sep_num_filters > 540.5) and (sep_num_filters <= 608.002) and (sep_num_filters <= 601.498) and (sep_num_filters > 547.999) and (sep_num_filters <= 596.999) and (sep_num_filters > 554.497) and (sep_num_filters <= 592.5) and (sep_num_filters > 559.502) and (sep_num_filters <= 589.005) and (categorical_pooling_avg <= 1.002) and (sep_num_filters <= 584.503) and (sep_num_filters > 567.499) and (sep_num_filters <= 582.5) and (num_dense_layers <= 2.0) and (categorical_activation_selu <= 0.006) and (categorical_conv2dnumfilters_32 <= 0.508) and (categorical_pooling_avg <= 0.004)
if (sep_num_filters > 231.996) and (sep_num_filters 

969

In [7]:
# from itertools import combinations

# # Given rule list
# rule_list = rules

# # Function to extract features from a rule
# def extract_features(rule):
#     features = set(rule.split("and"))
#     return features

# # Function to calculate Jaccard similarity between two rules
# def jaccard_similarity(rule1, rule2):
#     features1 = extract_features(rule1)
#     features2 = extract_features(rule2)
#     intersection = len(features1.intersection(features2))
#     union = len(features1.union(features2))
    
#     if union == 0:
#         return 0.0  # Handle the case where both sets are empty
#     else:
#         return intersection / union

# # Calculate Jaccard similarity between all pairs of rules and collect them
# similarities = []
# for rule1, rule2 in combinations(rule_list, 2):
#     similarity = jaccard_similarity(rule1, rule2)
#     similarities.append(similarity)

# # Calculate the mean Jaccard similarity for the entire rule set
# mean_similarity = sum(similarities) / len(similarities)

# print(f"Mean Jaccard Similarity for the Rule Set: {mean_similarity:.2f}")

In [7]:
from skrules import SkopeRules
from sklearn.preprocessing import KBinsDiscretizer
yee = yee/np.max(yee)

# transform the dataset with KBinsDiscretizer
enc = KBinsDiscretizer(n_bins=2, encode="ordinal", strategy='kmeans')
Y_binned = enc.fit_transform(yee.reshape(-1, 1))

clf1 = SkopeRules(
    max_depth_duplication=3, max_depth=3, max_features=0.5,
    max_samples_features=0.5, random_state=0, n_estimators=20,
    feature_names=Xe.columns, recall_min=0.04, precision_min=0.6)

clf1.fit(samp, Y_binned)

p = clf.predict(test_set)


# Define your bounds
lower_bounds = [flattened_mean[i] - 0.5*flattened_mean[i]  for i in range(len(flattened_mean))]
upper_bounds = [flattened_mean[i] + 0.5*flattened_mean[i]  for i in range(len(flattened_mean))]

# Initialize a count for predictions within bounds
clf_count_within_bounds = sum(1 for prediction, lower_bound, upper_bound in zip(p, lower_bounds, upper_bounds) if lower_bound <= prediction <= upper_bound)

# Calculate the percentage within bounds
percentage_within_bounds = (clf_count_within_bounds / len(p)) 

# Print the result
print(f"Percentage of predictions within bounds: {percentage_within_bounds}%")

print(len(clf1.rules_))
print(clf1.rules_)



Percentage of predictions within bounds: 0.871%
9
[('sep_num_filters <= 273.5037078857422 and sep_num_filters > 238.99710083007812', (1.0, 0.23626373626373626, 2)), ('sep_num_filters <= 273.5037078857422 and sep_num_filters > 239.50214385986328 and num_residual_blocks <= 8.006944179534912', (1.0, 0.22162162162162163, 2)), ('sep_num_filters <= 270.4953155517578 and sep_num_filters > 239.00221252441406 and categorical_activation_relu <= 0.002524379175156355', (1.0, 0.11475409836065574, 2)), ('sep_num_filters <= 275.505126953125 and sep_num_filters > 239.00221252441406 and categorical_activation_relu > 0.002524379175156355', (1.0, 0.1092896174863388, 2)), ('sep_num_filters > 751.498779296875 and categorical_activation_relu > 0.0027288567507639527', (1.0, 0.05913978494623656, 2)), ('categorical_conv2dnumfilters_64 > 1.004474401473999 and dropout_rate > 0.9948501884937286 and categorical_conv2dnumfilters_128 <= 0.00019133463138132356', (0.8333333333333334, 0.05434782608695652, 2)), ('sep_nu