In [1]:
import pandas as pd

import numpy as np
import six
import sys
sys.modules['sklearn.externals.six'] = six
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import _tree
import random

from skopt.space import Real, Categorical, Integer
# Seed every RNG this notebook draws from. Seeding `random` alone is not
# enough: the noise augmentation below uses `np.random.*`, and library
# samplers that are not given a random_state presumably fall back to NumPy's
# global generator as well.
random.seed(42)
np.random.seed(42)

In [2]:
def _sample_search_space(param_space, categorical_columns, num_samples = 1000):
    """
        Uniformly sample the search space for generating explanation dataset. Maintains the original order of search space.
        Sensitive to order: place categoricals first and everything else later.

        Args:
        - param_space (param space): parameter space
        - categorical_columns (list): list of categorical variables
        - num_samples (int): number of generated samples

        Returns:
        - new_data (numpy array): the augmented dataset
        """
    samples = []
    
    for _ in range(num_samples):
        sample = [param.rvs() for param in param_space]
        samples.append(sample)
    
    # Create a DataFrame with the samples
    param_names = [param.name for param in param_space]
    df = pd.DataFrame(samples, columns=param_names)
    
    # One-hot encode categorical variables and add dynamic prefixes
    encoded_columns = []
    for col in categorical_columns:
        prefix = f'categorical_{col}'
        encoded = pd.get_dummies(df[col], prefix=prefix).astype(float)
        encoded_columns.extend(encoded.columns)
        df = pd.concat([df.drop(col, axis=1), encoded], axis=1)
    
    # Rearrange columns: categorical columns in front, followed by non-categorical columns
    ordered_columns = []
    for col in param_space:
        if col.name in categorical_columns:
            ordered_columns.extend([col_name for col_name in df.columns if col_name.startswith(f'categorical_{col.name}')])
        elif col.name in df.columns:
            ordered_columns.append(col.name)
    
    # Now you have the DataFrame with categorical variables in the front and non-categorical variables in their original order
    df = df[ordered_columns]
    
    # Convert numerical variables from lists to floats
    for col in param_names:
        if col not in categorical_columns:
            df[col] = df[col]
    df = df.to_numpy().astype(np.float32)
    return df

In [3]:
# Load the CSV and drop the metric columns that are not used afterwards;
# `validation_epoch_accuracy` is kept because the next cell uses it as target.
df = pd.read_csv('mnist_resnet_25.csv').drop(
    columns=[
        'training_epoch_accuracy',
        'training_epoch_loss',
        'validation_epoch_loss',
        'validation_evaluation_accuracy_vs_iterations',
        'validation_evaluation_loss_vs_iterations',
    ]
)

In [4]:
# Target: the validation accuracy column.
ye = df['validation_epoch_accuracy']
y = ye.to_numpy()

# Features: one-hot encode the categorical hyper-parameters and place them in
# front of the remaining columns. `dtype=float` keeps the frame purely numeric;
# without it newer pandas versions emit bool dummies and `Xe.to_numpy()` turns
# into an object array once mixed with the float columns.
Xe = df.drop(columns='validation_epoch_accuracy')
dummy_prefixes = {
    'conv3_depth': 'categorical_conv3depth',
    'optimizer': 'categorical_optimizer',
    'conv4_depth': 'categorical_conv4depth',
    'version': 'categorical_version',
    'pooling': 'categorical_pooling',
}
encoded_blocks = [
    pd.get_dummies(Xe[column], prefix=prefix, dtype=float)
    for column, prefix in dummy_prefixes.items()
]
Xe = pd.concat(encoded_blocks + [Xe.drop(columns=list(dummy_prefixes))], axis=1)


# Search space sampled later to build the explanation dataset.
# The depth categories are ints, not strings: one-hot columns are emitted in
# sorted label order, and the recorded `Xe` columns run 6, 23, 36 (numeric
# sort). String labels would sort as '23', '36', '6' and silently permute the
# conv4_depth columns between the training matrix and the sampled matrix.
# NOTE(review): `Integer(0, 1)` can only ever sample a learning rate of 0 or 1,
# while the data holds fractional learning rates — a `Real` dimension looks
# intended. Confirm the sampler copes with what `Real.rvs()` returns before
# switching.
param_space = [
    Categorical(categories=[4, 8], name='conv3_depth'),
    Categorical(categories=['adam', 'sgd', 'rmsprop'], name='optimizer'),
    Categorical(categories=[6, 23, 36], name='conv4_depth'),
    Categorical(categories=['v1','v2', 'next'], name='version'),
    Categorical(categories=['avg', 'max'], name='pooling'),
    Integer(low=0, high=1, name='learning_rate')
]

# Define the categorical columns
categorical_columns = ['conv3_depth','optimizer','conv4_depth','version','pooling']

# NOTE(review): the first line shows the configuration with the LOWEST accuracy
# (argmin) while the second prints the HIGHEST accuracy — confirm that pairing
# is intended.
print (Xe.iloc[np.argmin(ye.values)])
print (np.max(ye.values))

categorical_conv3depth_4         0.0
categorical_conv3depth_8         1.0
categorical_optimizer_adam       1.0
categorical_optimizer_rmsprop    0.0
categorical_optimizer_sgd        0.0
categorical_conv4depth_6         0.0
categorical_conv4depth_23        0.0
categorical_conv4depth_36        1.0
categorical_version_next         0.0
categorical_version_v1           1.0
categorical_version_v2           0.0
categorical_pooling_avg          1.0
categorical_pooling_max          0.0
learning_rate                    0.1
Name: 3, dtype: float64
0.989333332


In [5]:
import GPy

def _train_gp(x_train, y_train):
    """
        Fit a GPy Gaussian-process regressor with an RBF kernel.

        Args:
        - x_train (numpy array): training inputs; the size of its second
          dimension is used as the kernel's input_dim
        - y_train (numpy array): training targets

        Returns:
        - model: the GPy GPRegression model after `optimize()` has been called
    """
    n_features = x_train.shape[1]
    gp = GPy.models.GPRegression(x_train, y_train, GPy.kern.RBF(input_dim=n_features))
    gp.optimize()
    return gp

def augment_with_noise(data, n_samples=1000, noise_factor=0.01):
    """
    Resample rows of `data` with replacement and jitter them with uniform noise.

    Every output row is a randomly chosen input row plus independent noise
    drawn from U(-noise_factor, +noise_factor) for each column. The noise is
    applied to all columns alike. Draws come from NumPy's global RNG, one noise
    vector followed by one row index per output row.

    Args:
    - data (numpy array): 2-D input data, one row per point
    - n_samples (int): number of rows to generate (default: 1000)
    - noise_factor (float): half-width of the uniform noise (default: 0.01)

    Returns:
    - new_data (numpy array): array of shape (n_samples, data.shape[1])

    Raises:
    - ValueError: if rows are requested but `data` has no rows to draw from
    """
    n_points, n_dims = data.shape
    if n_points == 0 and n_samples > 0:
        raise ValueError("augment_with_noise needs at least one input row to resample from")

    new_data = np.empty((n_samples, n_dims))  # filled row by row below

    for i in range(n_samples):
        noise = np.random.uniform(low=-noise_factor, high=noise_factor, size=n_dims)
        new_data[i] = data[np.random.randint(n_points)] + noise

    return new_data
    
# Fit the GP on the encoded configurations; the target is passed as a column.
model = _train_gp(Xe.to_numpy(), y.reshape(-1, 1))

# Evaluation points: draw 100 configurations, then resample them with noise
# (augment_with_noise is called with its default sample count).
test_set = augment_with_noise(
    _sample_search_space(param_space, categorical_columns, num_samples = 100)
)
mean_y, var_y = model.predict_noiseless(test_set, full_cov=False)
# Flatten the per-point predictions into plain Python lists.
flattened_mean = [value for row in mean_y for value in row]
flattened_var = [value for row in var_y for value in row]
std = np.sqrt(flattened_var)

# Explanation dataset: 1000 configurations, resampled with noise, labelled with
# the GP's noiseless mean prediction.
samp = augment_with_noise(
    _sample_search_space(param_space, categorical_columns, num_samples = 1000)
)
yee, var = model.predict_noiseless(samp, full_cov=False)
yee = np.squeeze(yee, axis=1)

In [6]:
from rulekit import RuleKit
from rulekit.regression import RuleRegressor
from rulekit.params import Measures
# Initialise RuleKit before constructing the regressor.
RuleKit.init()

# The same C2 measure is used for induction, pruning and voting.
reg = RuleRegressor(
    induction_measure=Measures.C2,
    pruning_measure=Measures.C2,
    voting_measure=Measures.C2,
)
reg.fit(samp, yee)
predictions = reg.predict(test_set)

# Keep the textual form of every induced rule and echo each one.
ru = [str(rule) for rule in reg.model.rules]
for rule_text in ru:
    print(rule_text)

print(len(ru))

from itertools import combinations

# Rule list to analyse: an alias for `ru`, the list of rule strings
# ("IF <cond> AND <cond> ... THEN <conclusion>") built earlier in this cell.
rule_list = ru

# Function to extract features from a rule
def extract_features(rule):
    """
    Return the set of premise conditions of an "IF ... THEN ..." rule string.

    The premise is the text between the first "IF" and the first " THEN"; it
    is split on " AND" and every condition is stripped of surrounding
    whitespace. Only the first "IF" is treated as the keyword, so attribute
    names that themselves contain "IF" (e.g. "DIFF") are left intact.

    Args:
    - rule (str): rule text

    Returns:
    - set of condition strings; empty if the rule has no "IF" part
    """
    _, keyword, remainder = rule.partition('IF')
    if not keyword:
        # No premise at all: report "no conditions" instead of raising.
        return set()
    premise = remainder.split(" THEN", 1)[0]
    return {cond.strip() for cond in premise.split(" AND") if cond.strip()}

# Function to calculate Jaccard similarity between two rules
def jaccard_similarity(rule1, rule2):
    """
    Jaccard index of the condition sets of two rules.

    Both rules are passed through `extract_features`; the result is
    |A & B| / |A | B|, or 0.0 when both condition sets are empty.
    """
    left = extract_features(rule1)
    right = extract_features(rule2)

    combined = left | right
    if not combined:
        # Neither rule has any condition.
        return 0.0
    return len(left & right) / len(combined)

# Calculate Jaccard similarity between all pairs of rules and collect them
similarities = [
    jaccard_similarity(rule1, rule2)
    for rule1, rule2 in combinations(rule_list, 2)
]

# Calculate the mean Jaccard similarity for the entire rule set.
# With fewer than two rules there are no pairs, so the mean is undefined;
# report NaN instead of failing with ZeroDivisionError.
mean_similarity = sum(similarities) / len(similarities) if similarities else float('nan')

print(f"Mean Jaccard Similarity for the Rule Set: {mean_similarity:.2f}")

## Define your bounds
# A prediction is "within bounds" when it lies within +/-50% of the GP mean
# computed for the same test point.
lower_bounds = [mean - 0.5*mean for mean in flattened_mean]
upper_bounds = [mean + 0.5*mean for mean in flattened_mean]
# Count the predictions that fall inside their interval
count_within_bounds = sum(
    1
    for prediction, lower_bound, upper_bound in zip(predictions, lower_bounds, upper_bounds)
    if lower_bound <= prediction <= upper_bound
)

# Share of predictions within bounds, as a fraction between 0 and 1
percentage_within_bounds = (count_within_bounds / len(predictions))

# Print the result. The `%` format spec multiplies by 100, so a fraction of
# 0.972 is shown as 97.2% rather than the misleading "0.972%".
print(f"Percentage of predictions within bounds: {percentage_within_bounds:.1%}")

IF att10 = <1.01, inf) THEN label = {0.84} [0.69,0.99]
IF att9 = <0.0063, inf) AND att5 = <-0.0086, inf) AND att14 = <-0.0068, inf) AND att7 = <-0.0079, 0.0052) AND att2 = <-0.0081, 1.00) AND att11 = <-0.0082, inf) AND att1 = <0.0029, 1.00) AND att4 = (-inf, 0.008) AND att3 = <0.99, inf) AND att12 = <-0.0019, inf) THEN label = {0.12} [0.11,0.13]
IF att6 = <-0.0054, inf) AND att8 = <0.99, 1.00) AND att7 = <-0.0063, 0.0082) AND att11 = <-0.0073, 1.00) AND att2 = <-0.0076, 1.00) AND att1 = (-inf, 1.00) AND att4 = (-inf, 0.0073) AND att3 = <0.99, inf) THEN label = {0.33} [0.29,0.37]
IF att9 = <0.99, inf) AND att6 = (-inf, 1.00) AND att14 = <-0.0041, inf) AND att7 = <-0.0063, 0.0088) AND att11 = <-0.0095, inf) AND att2 = <-0.0067, 1.01) AND att1 = <-0.0028, 1.00) AND att4 = <-0.0082, inf) AND att3 = <0.99, inf) THEN label = {0.15} [0.13,0.18]
IF att6 = (-inf, 1.01) AND att14 = <-0.0043, inf) AND att8 = (-inf, 1.00) AND att7 = <-0.0055, 0.0084) AND att11 = <-0.0095, inf) AND att2 = <-0.0074,

In [6]:
# Fit a regression tree on the GP-labelled explanation dataset and predict the
# evaluation points.
clf = DecisionTreeRegressor(random_state=0)
clf.fit(samp, yee)
pred = clf.predict(test_set)


# Define your bounds
# A prediction is "within bounds" when it lies within +/-50% of the GP mean
# computed for the same test point.
lower_bounds = [mean - 0.5*mean for mean in flattened_mean]
upper_bounds = [mean + 0.5*mean for mean in flattened_mean]

# Count the predictions that fall inside their interval
pr_count_within_bounds = sum(
    1
    for prediction, lower_bound, upper_bound in zip(pred, lower_bounds, upper_bounds)
    if lower_bound <= prediction <= upper_bound
)

# Share of predictions within bounds, as a fraction between 0 and 1
percentage_within_bounds = (pr_count_within_bounds / len(pred))

# Print the result. The `%` format spec multiplies by 100, so a fraction of
# 0.972 is shown as 97.2% rather than the misleading "0.972%".
print(f"Percentage of predictions within bounds: {percentage_within_bounds:.1%}")

def get_rules(tree, feature_names, class_names):
    """
    Turn every root-to-leaf path of a fitted decision tree into a text rule.

    Args:
    - tree: fitted estimator exposing the low-level `tree_` structure
    - feature_names: sequence indexed by the feature ids stored in the tree
    - class_names: sequence of class labels, or None

    Returns:
    - list of str, ordered by the number of samples in the leaf (largest
      first). Each rule is "if " followed by the path conditions joined with
      " and ". When `class_names` is given, the majority class and its share
      are appended directly after the conditions; when it is None the rule
      carries no conclusion.
    """
    tree_ = tree.tree_

    # Depth-first walk with an explicit stack. The right child is pushed first
    # so that the left subtree is expanded first, i.e. leaves are collected
    # left to right.
    leaf_paths = []
    pending = [(0, [])]
    while pending:
        node, conditions = pending.pop()
        if tree_.feature[node] == _tree.TREE_UNDEFINED:
            # Leaf: close the path with (node value, sample count).
            leaf = (tree_.value[node], tree_.n_node_samples[node])
            leaf_paths.append(conditions + [leaf])
            continue
        name = feature_names[tree_.feature[node]]
        cut = np.round(tree_.threshold[node], 3)
        pending.append((tree_.children_right[node], conditions + [f"({name} > {cut})"]))
        pending.append((tree_.children_left[node], conditions + [f"({name} <= {cut})"]))

    # sort by samples count, largest leaf first
    order = list(np.argsort([leaf_path[-1][1] for leaf_path in leaf_paths]))
    leaf_paths = [leaf_paths[i] for i in order[::-1]]

    rules = []
    for leaf_path in leaf_paths:
        *conditions, (value, _n_samples) = leaf_path
        rule = "if " + " and ".join(str(cond) for cond in conditions)
        if class_names is not None:
            classes = value[0]
            best = np.argmax(classes)
            rule += f"class: {class_names[best]} (proba: {np.round(100.0*classes[best]/np.sum(classes),2)}%)"
        rules.append(rule)

    return rules
# Render the fitted tree as text rules (one per leaf), labelled with the
# column names of the training frame `Xe`.
# NOTE(review): `clf` was fit on `samp`, so these labels are only right if the
# columns of `samp` are laid out in the same order as `Xe.columns` — confirm.
rules = get_rules(clf, Xe.columns, None)
for r in rules:
    print(r)
# Last expression of the cell: displays the number of extracted rules.
len(rules)

Percentage of predictions within bounds: 0.972%
if (categorical_optimizer_adam > 0.5) and (categorical_conv4depth_23 > 0.99) and (categorical_version_v2 > 0.992) and (learning_rate > 0.007) and (categorical_conv3depth_4 > 0.993) and (categorical_conv3depth_4 > 0.996) and (categorical_pooling_avg > 1.002)
if (categorical_optimizer_adam <= 0.5) and (learning_rate > 0.01) and (categorical_conv4depth_23 <= 0.009) and (categorical_conv3depth_8 <= 0.991) and (categorical_version_v2 <= 0.01) and (categorical_version_next <= 1.003) and (categorical_optimizer_sgd <= 1.001) and (categorical_conv4depth_36 <= 0.993) and (categorical_version_next > 0.994) and (categorical_conv4depth_6 > 0.997) and (categorical_optimizer_sgd > 0.499) and (categorical_conv4depth_36 <= 0.0)
if (categorical_optimizer_adam <= 0.5) and (learning_rate > 0.01) and (categorical_conv4depth_23 <= 0.009) and (categorical_conv3depth_8 <= 0.991) and (categorical_version_v2 <= 0.01) and (categorical_version_next <= 1.003) and (ca

1000

In [7]:
# from itertools import combinations

# # Given rule list
# rule_list = rules

# # Function to extract features from a rule
# def extract_features(rule):
#     features = set(rule.split("and"))
#     return features

# # Function to calculate Jaccard similarity between two rules
# def jaccard_similarity(rule1, rule2):
#     features1 = extract_features(rule1)
#     features2 = extract_features(rule2)
#     intersection = len(features1.intersection(features2))
#     union = len(features1.union(features2))
    
#     if union == 0:
#         return 0.0  # Handle the case where both sets are empty
#     else:
#         return intersection / union

# # Calculate Jaccard similarity between all pairs of rules and collect them
# similarities = []
# for rule1, rule2 in combinations(rule_list, 2):
#     similarity = jaccard_similarity(rule1, rule2)
#     similarities.append(similarity)

# # Calculate the mean Jaccard similarity for the entire rule set
# mean_similarity = sum(similarities) / len(similarities)

# print(f"Mean Jaccard Similarity for the Rule Set: {mean_similarity:.2f}")

In [7]:
from skrules import SkopeRules
from sklearn.preprocessing import KBinsDiscretizer
# Rescale the GP-predicted targets by their maximum.
# NOTE(review): this overwrites `yee`, so cells run after this one see the
# rescaled values.
yee = yee/np.max(yee)

# transform the dataset with KBinsDiscretizer (two ordinal bins via k-means)
enc = KBinsDiscretizer(n_bins=2, encode="ordinal", strategy='kmeans')
Y_binned = enc.fit_transform(yee.reshape(-1, 1))

clf1 = SkopeRules(
    max_depth_duplication=3, max_depth=3, max_features=0.5,
    max_samples_features=0.5, random_state=0, n_estimators=20,
    feature_names=Xe.columns, recall_min=0.04, precision_min=0.6)

# `Y_binned` is a column vector; pass a flat label vector to `fit`, which is
# the target shape scikit-learn style estimators expect.
clf1.fit(samp, Y_binned.ravel())

# NOTE(review): this predicts with `clf` (the decision tree fitted in the
# previous cell), not with `clf1`, so the figure printed below is the tree's
# again. `clf1` was trained on the binned labels, so its predictions would not
# be on the same scale as the bounds — decide how SkopeRules should be scored
# before changing this line.
p = clf.predict(test_set)


# Define your bounds
# A prediction is "within bounds" when it lies within +/-50% of the GP mean
# computed for the same test point.
lower_bounds = [mean - 0.5*mean for mean in flattened_mean]
upper_bounds = [mean + 0.5*mean for mean in flattened_mean]

# Count the predictions that fall inside their interval
clf_count_within_bounds = sum(
    1
    for prediction, lower_bound, upper_bound in zip(p, lower_bounds, upper_bounds)
    if lower_bound <= prediction <= upper_bound
)

# Share of predictions within bounds, as a fraction between 0 and 1
percentage_within_bounds = (clf_count_within_bounds / len(p))

# Print the result. The `%` format spec multiplies by 100, so a fraction of
# 0.972 is shown as 97.2% rather than the misleading "0.972%".
print(f"Percentage of predictions within bounds: {percentage_within_bounds:.1%}")

print(len(clf1.rules_))
print(clf1.rules_)



Percentage of predictions within bounds: 0.972%
19
[('categorical_conv3depth_8 <= 1.009646713733673 and categorical_optimizer_adam <= 0.5000374629162252 and categorical_version_v1 <= 1.0092248320579529', (0.9866666666666667, 0.8794567062818336, 2)), ('categorical_optimizer_adam <= 0.500089259352535 and categorical_conv4depth_36 <= 0.9917858242988586', (1.0, 0.6488294314381271, 2)), ('categorical_pooling_avg <= 0.9902397990226746 and categorical_optimizer_adam <= 0.500089259352535', (1.0, 0.5, 2)), ('categorical_optimizer_sgd > 0.9900659322738647 and categorical_conv4depth_6 > -0.009492157492786646 and categorical_conv4depth_23 > -0.009982461109757423', (0.9885496183206107, 0.4397283531409168, 2)), ('categorical_optimizer_rmsprop <= 0.009539392776787281 and categorical_optimizer_sgd > 0.009854401927441359 and categorical_conv4depth_23 > -0.009982461109757423', (0.9847328244274809, 0.431438127090301, 2)), ('categorical_optimizer_rmsprop > 0.9931654334068298 and categorical_optimizer_sgd 