In [1]:
from sklearn.model_selection import train_test_split, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pandas as pd
import numpy as np
import os
from imblearn.over_sampling import SMOTE
from sample import create_sampler
from tree_extractor import path_extractor
from model_extractor_maxnum import Extractor
import pickle

random_state = 24  # seed used for KFold, SMOTE and every forest except abalone (which pins 10)


class ExpModel:
    """Load a tabular dataset, cross-validate a random forest on it and expose its rules.

    Typical use: construct, call ``init()`` once to load the data and build the
    folds, then ``train()`` / ``evaluate()`` for the current fold and move on
    with ``next_fold()`` while ``has_next_fold()`` is true.
    """

    def __init__(self, dataset, model):
        """Store the dataset/model names and reset the per-fold metric history.

        Args:
            dataset: Dataset key understood by ``init()`` ('breast_cancer',
                'abalone', 'bankruptcy', 'diabetes', 'german_credit', 'wine').
            model: Model key; ``init()`` only configures 'RF'.
        """
        self.dataset = dataset
        self.model = model
        self.n_splits = 4
        self._accuracy = []
        self._precision = []
        self._recall = []
        self._f1_score = []

    def init(self):
        """Load the dataset, pick the forest hyper-parameters and build the K-fold splits.

        Sets ``target``, ``data_table``, ``X``, ``y``, ``parameters``, ``splits``
        and ``fold`` (reset to 0). When ``self.model`` is not 'RF' nothing is
        loaded and the instance is left untouched.

        Raises:
            ValueError: if ``self.model`` is 'RF' but ``self.dataset`` is not a
                known dataset key (previously this surfaced as an
                ``UnboundLocalError``).
        """
        if self.model != 'RF':
            return

        loaders = {
            'breast_cancer': self._load_breast_cancer,
            'abalone': self._load_abalone,
            'bankruptcy': self._load_bankruptcy,
            'diabetes': self._load_diabetes,
            'german_credit': self._load_german_credit,
            'wine': self._load_wine,
        }
        if self.dataset not in loaders:
            raise ValueError(
                'Unknown dataset %r; expected one of %s' % (self.dataset, sorted(loaders))
            )

        data_table, X, y, parameters = loaders[self.dataset]()
        self.data_table = data_table
        self.X = X
        self.y = y
        self.parameters = parameters

        kf = KFold(n_splits=self.n_splits, random_state=random_state, shuffle=True)
        # Materialise the (train_index, test_index) pairs so folds can be revisited by position.
        self.splits = list(kf.split(X))
        self.fold = 0

    def _split_xy(self, data_table):
        """Return (X, y) numpy arrays, using ``self.target`` as the label column."""
        X = data_table.drop(self.target, axis=1).values
        y = data_table[self.target].values
        return X, y

    def _load_breast_cancer(self):
        """Load data/cancer.csv without its 'id' column; the label is 'diagnosis'."""
        self.target = 'diagnosis'
        parameters = {
            'n_estimators': 200,
            'max_depth': 10,
            'random_state': random_state,
            'max_features': None,
        }
        data_table = pd.read_csv('data/cancer.csv')
        data_table = data_table.drop(['id'], axis=1)
        X, y = self._split_xy(data_table)
        return data_table, X, y, parameters

    def _load_abalone(self):
        """Load data/abalone.csv; 'Rings' is binarised as 0 when <= 7, else 1."""
        self.target = 'Rings'
        parameters = {
            'n_estimators': 80,
            'random_state': 10,
            # 'auto' was removed for forests in scikit-learn 1.3; for a classifier it meant 'sqrt'.
            'max_features': 'sqrt',
            'oob_score': True,
            'min_samples_split': 9,
            'min_samples_leaf': 5,
        }
        data_table = pd.read_csv('data/abalone.csv')
        X, y = self._split_xy(data_table)
        y = np.array([0 if v <= 7 else 1 for v in y])
        return data_table, X, y, parameters

    def _load_bankruptcy(self):
        """Load data/bank.csv; the label is 'Bankrupt?'."""
        self.target = 'Bankrupt?'
        parameters = {
            'n_estimators': 150,
            'max_depth': 15,
            'random_state': random_state,
        }
        data_table = pd.read_csv('data/bank.csv')
        X, y = self._split_xy(data_table)
        return data_table, X, y, parameters

    def _load_diabetes(self):
        """Load data/diabetes.csv; the label is 'class'."""
        self.target = 'class'
        parameters = {
            'n_estimators': 100,
            'max_depth': 10,
            'random_state': random_state,
            'max_features': None,
        }
        data_table = pd.read_csv('data/diabetes.csv')
        X, y = self._split_xy(data_table)
        return data_table, X, y, parameters

    def _load_german_credit(self):
        """Load data/german.csv, one-hot encode its qualitative columns; the label is 'credit_risk'."""
        self.target = 'credit_risk'
        parameters = {
            'random_state': random_state,
            'max_depth': 12,
            'n_estimators': 150,
            'max_leaf_nodes': 100,
            'min_samples_split': 6,
            'min_samples_leaf': 3,
            'bootstrap': True,
        }

        data_table = pd.read_csv('data/german.csv')
        qualitative_features = [
            'credit_history', 'purpose', 'other_debtors',
            'property', 'other_installment_plans',
            'housing', 'job', 'people_liable', 'telephone',
            'foreign_worker', 'number_credits',
        ]
        for feature in qualitative_features:
            # np.unique already returns the distinct values in sorted order.
            unique_values = np.unique(data_table[feature].values)
            zero_based = int(unique_values[0]) == 0
            for value in unique_values:
                # Indicator columns are named '<feature> - <k>' with k starting at 0;
                # codes that start above 0 are shifted down by one.
                suffix = str(value) if zero_based else str(int(value) - 1)
                data_table[feature + ' - ' + suffix] = data_table[feature].values == value
        # Collapse personal_status_sex to a 0/1 flag: 1 where the original code is 3.
        data_table['personal_status_sex'] = 1 * (data_table['personal_status_sex'].values == 3)
        print(data_table['personal_status_sex'].values)
        # The raw qualitative columns are replaced by their indicator columns.
        data_table = data_table.drop(qualitative_features, axis=1)
        X, y = self._split_xy(data_table)
        return data_table, X, y, parameters

    def _load_wine(self):
        """Load data/wine.csv; 'quality' is binarised as 0 when < 6, else 1."""
        self.target = 'quality'
        parameters = {
            'n_estimators': 150,
            'max_depth': 13,
            'random_state': random_state,
            # 'auto' was removed for forests in scikit-learn 1.3; for a classifier it meant 'sqrt'.
            'max_features': 'sqrt',
            'oob_score': True,
        }
        data_table = pd.read_csv('data/wine.csv')
        X, y = self._split_xy(data_table)
        y = np.array([0 if v < 6 else 1 for v in y])
        return data_table, X, y, parameters

    def has_next_fold(self):
        """Return True while ``self.fold`` still points at an existing split."""
        return self.fold < len(self.splits)

    def next_fold(self):
        """Advance to the next cross-validation fold."""
        self.fold += 1

    def train(self):
        """Fit a random forest on the current fold and predict its test part.

        The training part is resampled with SMOTE before fitting; the test part
        is left as is. Sets ``X_train``, ``y_train``, ``X_test``, ``y_test``,
        ``clf``, ``y_pred`` and ``features`` (column names without the target).
        """
        sm = SMOTE(random_state=random_state)
        train_index, test_index = self.splits[self.fold]
        X_train, y_train = sm.fit_resample(self.X[train_index], self.y[train_index])
        self.X_train = X_train
        self.y_train = y_train
        self.X_test = self.X[test_index]
        self.y_test = self.y[test_index]

        clf = RandomForestClassifier(**self.parameters)
        clf.fit(X_train, y_train)
        self.clf = clf
        self.y_pred = clf.predict(self.X_test)
        self.features = self.data_table.drop(self.target, axis=1).columns.to_list()

    def evaluate(self):
        """Print accuracy/precision/recall/F1 for the current fold and append them to the history."""
        _accuracy = accuracy_score(self.y_test, self.y_pred)
        _precision = precision_score(self.y_test, self.y_pred)
        _recall = recall_score(self.y_test, self.y_pred)
        _f1_score = f1_score(self.y_test, self.y_pred)
        print('Accuracy Score is', _accuracy)
        print('Precision is', _precision)
        print('Recall is', _recall)
        print('F1 Score is', _f1_score)
        self._accuracy.append(_accuracy)
        self._precision.append(_precision)
        self._recall.append(_recall)
        self._f1_score.append(_f1_score)

    def summary(self):
        """Return the mean (accuracy, precision, recall, f1) over all evaluated folds as floats."""
        return (
            float(np.mean(self._accuracy)),
            float(np.mean(self._precision)),
            float(np.mean(self._recall)),
            float(np.mean(self._f1_score)),
        )

    def oversampling(self, rate = 2):
        """Build a sampler over ``X_train`` and call it with ``len(X_train) * rate``.

        Object-dtype columns are flagged categorical, all others continuous; no
        column is flagged integer. The return value is whatever the sampler
        produced by ``create_sampler`` returns (presumably synthetic samples).
        """
        is_continuous = []
        is_categorical = []
        is_integer = []

        for feature in self.data_table.columns:
            if feature == self.target:
                continue
            categorical = self.data_table[feature].dtype == 'O'
            is_continuous.append(not categorical)
            is_categorical.append(categorical)
            is_integer.append(False)
        sampler = create_sampler(self.X_train, is_continuous, is_categorical, is_integer)
        return sampler(len(self.X_train) * rate)

    def generate_paths(self):
        """Extract decision paths from the fitted model, print how many there are and return them."""
        if self.model == 'RF':
            paths = path_extractor(self.clf, 'random forest', (self.X_train, self.y_train))
        else:
            paths = path_extractor(self.clf, 'lightgbm')
        print('num of paths', len(paths))
        return paths


In [2]:
# Experiment configuration: which dataset to load and which model family to fit.
dataset, model = 'german_credit', 'RF'

# Load the data, fit the forest on the first fold, extract its decision
# paths, then report the fold's metrics.
exp = ExpModel(dataset=dataset, model=model)
exp.init()
exp.train()
paths = exp.generate_paths()
exp.evaluate()


[0 1 0 1 1 1 1 1 0 0 1 0 0 1 0 1 1 0 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 0 1
 1 1 0 0 0 0 1 1 0 1 0 0 1 0 1 1 1 1 0 1 0 1 1 1 0 1 1 1 0 1 1 0 1 1 1 1 1
 1 1 1 1 0 1 0 1 1 1 1 1 1 0 0 0 1 0 0 1 1 1 0 0 1 0 1 0 1 0 0 1 0 0 0 0 0
 0 0 0 1 1 1 0 1 1 0 1 0 0 1 0 0 1 0 1 1 1 1 0 0 0 1 1 0 1 0 1 0 1 0 1 1 0
 1 1 0 0 1 1 0 1 0 1 0 1 0 0 0 1 0 1 0 1 0 0 1 0 0 1 1 0 1 1 1 1 0 0 1 0 0
 1 1 1 1 0 1 0 0 1 1 1 1 0 1 0 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 1 1 1
 1 1 1 0 1 1 0 1 1 0 1 0 0 0 1 1 0 1 0 1 0 1 1 0 1 1 0 1 1 1 1 1 1 1 0 0 1
 1 1 0 0 1 0 1 1 1 0 1 1 0 0 1 1 0 0 1 0 0 0 0 0 0 0 0 1 1 1 0 1 0 1 0 0 0
 0 1 0 0 0 1 0 1 0 0 1 1 0 1 1 1 0 1 1 1 1 0 1 1 1 0 0 0 1 0 1 1 1 1 0 0 0
 0 0 1 1 1 1 1 1 0 1 1 0 1 1 1 1 1 0 0 0 0 1 1 1 1 0 0 1 0 0 0 1 1 0 1 0 0
 0 1 0 0 1 1 1 1 1 1 0 0 1 0 1 1 0 1 1 0 1 1 0 1 1 0 1 1 0 1 0 1 0 0 1 1 1
 0 0 0 0 1 1 1 1 1 0 1 1 1 0 0 1 0 1 1 1 0 0 0 0 1 1 0 1 1 0 0 1 0 0 1 1 0
 1 1 1 0 1 0 1 1 1 0 1 1 1 1 1 0 0 1 0 1 0 0 1 0 1 1 1 1 1 0 1 0 1 1 0 0 1
 0 1 1 0 0 0 0 0 1 1 1 1 

In [4]:

from tree_extractor import assign_samples

# Annotate every path using the full dataset (presumably this is what fills in
# 'coverage' -- confirm in tree_extractor), then keep only paths whose
# coverage is positive.
assign_samples(paths, (exp.X, exp.y))
paths = list(filter(lambda candidate: candidate['coverage'] > 0, paths))
print(len(paths))

14174


In [5]:
# Every path starts at level 0; name2path lets us find a path dict by its name.
last_paths = paths
for path in paths:
    path['level'] = 0
name2path = {path['name']: path for path in paths}

# Number of rules to keep at each successive level (each level selects from the previous one).
params = [1000, 80]
level_info = {}

for level, n in enumerate(params):
    next_level = level + 1
    tau = (n / 80) ** 0.65
    # The extractor is fitted against the forest's own predictions on the training data.
    ex = Extractor(last_paths, exp.X_train, exp.clf.predict(exp.X_train))
    w, _, fidelity_train = ex.extract(n, tau)
    # Positions (within last_paths) of the paths that received a non-zero weight.
    (idx,) = np.nonzero(w)

    accuracy_train = ex.evaluate(w, exp.X_train, exp.y_train)
    accuracy_test = ex.evaluate(w, exp.X_test, exp.y_test)
    fidelity_test = ex.evaluate(w, exp.X_test, exp.clf.predict(exp.X_test))
    print(level, n, 'accuracy_train', accuracy_train, 'accuracy_test', accuracy_test, 'fidelity_test', fidelity_test)

    level_info[next_level] = {
        'fidelity_test': fidelity_test,
        'accuracy_test': accuracy_test,
    }

    # Promote the surviving paths and make them the candidate pool for the next level.
    curr_paths = [last_paths[i] for i in idx]
    for kept in curr_paths:
        name2path[kept['name']]['level'] = next_level
    last_paths = curr_paths
    print(len(last_paths))

total path weight:  0.9999999999999876
0 1000 accuracy_train 0.9521072796934866 accuracy_test 0.816 fidelity_test 0.96
1000
total path weight:  0.9999999999999996
1 80 accuracy_train 0.9032567049808429 accuracy_test 0.756 fidelity_test 0.844
80


In [9]:
import shap

# SHAP explanation of the fitted forest over the whole dataset; the trailing
# index keeps only the first slice along the last axis (presumably class 0 -- confirm).
explainer = shap.Explainer(exp.clf)
shap_values = explainer(exp.X)[:, :, 0]
new_shaps = []

# Column names in model-input order (everything except the target).
features = [column for column in exp.data_table.columns if column != exp.target]

# Map each original feature name to the model-input column indices that encode it.
# Indicator columns are named '<feature> - <k>'; their index is stored at position
# k of the list, and -1 marks positions for which no column exists.
new_feature = {}
for index, feature in enumerate(features):
    if ' - ' not in feature:
        new_feature[feature] = [index]
        continue
    name, p = feature.split(' - ')
    p = int(p)
    slots = new_feature.setdefault(name, [])
    slots.extend([-1] * (p + 1 - len(slots)))
    slots[p] = index

In [10]:
# Build one metadata record per original feature, plus lookups from model-input
# column index to (record position, category position) and to feature kind.
# Everything this cell fills is reset here so re-running it does not append
# duplicate entries (new_shaps used to keep growing on each re-run).
new_shaps = []
features = []
feature_index = {}
feature_type = {}

# Integer-coded columns that are treated as categories rather than numbers.
ordinal_keys = ['status', 'savings', 'employment_duration', 'installment_rate', 'present_residence', 'personal_status_sex']

for key in new_feature:
    column_ids = new_feature[key]
    if len(column_ids) == 1:
        # The feature is a single model-input column.
        i = column_ids[0]
        column_values = exp.data_table[key].values
        if key in ordinal_keys:
            min_value = min(column_values)
            max_value = max(column_values)
            # np.unique returns the distinct values already sorted.
            unique_values = np.unique(column_values)
            features.append({
                "name": key,
                "range": [0, max_value - min_value + 1],
                "values": unique_values.tolist(),
                "min": min_value,
                "importance": exp.clf.feature_importances_[i],
                "dtype": "categoric",
            })
            feature_type[i] = "categoric"
        else:
            features.append({
                "name": key,
                "range": [min(column_values), max(column_values)],
                "importance": exp.clf.feature_importances_[i],
                "dtype": "numeric",
            })
            feature_type[i] = "numeric"
        shaps = shap_values[:, i]
        feature_index[i] = [len(features) - 1, 0]
    else:
        # The feature is spread over several indicator columns: merge them back
        # into one categorical record and add up their importances / SHAP values.
        feature_idxs = [i for i in column_ids if i != -1]
        features.append({
            "name": key,
            "range": [0, len(column_ids)],
            "values": column_ids,
            "min": 0,
            "importance": sum([exp.clf.feature_importances_[i] for i in feature_idxs]),
            "dtype": "categoric",
        })
        shaps = shap_values[:, feature_idxs[0]]
        for i in feature_idxs[1:]:
            shaps = shaps + shap_values[:, i]

        for position, i in enumerate(column_ids):
            if i != -1:
                feature_index[i] = [len(features) - 1, position]
                feature_type[i] = "categoric"
    new_shaps.append(shaps)

In [11]:
# Translate each path's conditions from model-input columns (path['range']) to
# the merged feature records in `features` (path['new_range']), then assemble
# the structure that gets exported.

# Integer-coded columns handled as categories; must match the feature-metadata cell.
ordinal_keys = ['status', 'savings', 'employment_duration', 'installment_rate', 'present_residence', 'personal_status_sex']

for path in paths:
    # Paths already converted on a previous run carry represent == False and are skipped.
    if not path.get('represent', True):
        continue
    new_range = {}
    for index in path['range']:
        # i: position of the merged feature record, j: category position within it.
        i, j = feature_index[index]
        feature_info = features[i]
        key = feature_info['name']
        if feature_type[index] == 'numeric':
            # Work on a copy so the raw thresholds in path['range'] are not rewritten
            # (in-place edits made a second run shift the bounds again).
            r = list(path['range'][index])
            if exp.data_table[key].dtype == np.int64:
                if r[0] < 0:
                    r[0] = 0
                if r[1] > feature_info['range'][1]:
                    r[1] = feature_info['range'][1]
                # Was features[index]: `index` is a model-input column, `i` is the record position.
                if feature_info['range'][0] > 0:
                    if r[0] < int(r[0]) + 1e-7:
                        r[0] = int(r[0]) - 1
                    else:
                        r[0] = int(r[0])
                    if r[1] > int(r[1]) + 1e-7:
                        r[1] = int(r[1])
                else:
                    if r[0] > int(r[0]) + 1e-7:
                        r[0] = int(r[0]) + 0.5
                    if r[1] > int(r[1]) + 1e-7:
                        r[1] = int(r[1]) + 0.5
            new_range[i] = r
        elif key in ordinal_keys:
            # Integer-coded category: mark every code that lies inside the interval.
            size = feature_info['range'][1]
            min_value = feature_info['min']
            bounds = path['range'][index]
            new_range[i] = [
                1 if bounds[0] <= code + min_value <= bounds[1] else 0
                for code in range(size)
            ]
        else:
            # Indicator column of a merged categorical feature. Start from "every
            # category allowed" and narrow it with each condition, so several
            # conditions on the same feature are all honoured (previously only
            # the first one encountered was applied).
            size = feature_info['range'][1]
            allowed = new_range.setdefault(i, [1] * size)
            bounds = path['range'][index]
            if bounds[0] <= 1 and 1 <= bounds[1]:
                # The indicator must be 1: only category j remains possible.
                for k in range(size):
                    if k != j:
                        allowed[k] = 0
            else:
                # The indicator must be 0: category j is excluded.
                allowed[j] = 0
    path['new_range'] = new_range
    path['represent'] = False

# `idx` from the extraction loop indexes the previous level's candidate list,
# not `paths`, so paths[i] picked unrelated rules. After that loop `last_paths`
# holds exactly the finally selected path dicts (the same objects as in `paths`).
for selected_path in last_paths:
    selected_path['represent'] = True

output_data = {
    'paths': paths,
    'features': features,
    'selected': [selected_path['name'] for selected_path in last_paths],
    'shap_values': new_shaps,
    'model_info': {
        'accuracy': exp._accuracy[-1],
        'info': level_info,
        'num_of_rules': len(paths),
        'dataset': 'German Credit',
        'model': 'Random Forest',
    }
}

In [112]:
import os
import pickle

# NOTE(review): the execution counts suggest the cell that renames 'new_range'
# to 'range' was run before this dump -- confirm the intended order.
output_path = 'output/german0117_2.pkl'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
# The context manager closes (and flushes) the file even if pickling fails.
with open(output_path, 'wb') as output_file:
    pickle.dump(output_data, output_file)

In [110]:
# Replace each path's raw 'range' with the converted 'new_range'.
# Guarded so that running the cell a second time is a no-op instead of a KeyError.
for path in paths:
    if 'new_range' in path:
        path['range'] = path.pop('new_range')

In [17]:
# Inspect the merged feature metadata records.
print(features)

[{'name': 'status', 'range': [0, 4], 'values': [1, 2, 3, 4], 'min': 1, 'importance': 0.15251070283754695, 'dtype': 'categoric'}, {'name': 'duration', 'range': [4, 72], 'importance': 0.05695306807074818, 'dtype': 'numeric'}, {'name': 'amount', 'range': [250, 18424], 'importance': 0.06248818859485609, 'dtype': 'numeric'}, {'name': 'savings', 'range': [0, 5], 'values': [1, 2, 3, 4, 5], 'min': 1, 'importance': 0.04699644446291882, 'dtype': 'categoric'}, {'name': 'employment_duration', 'range': [0, 5], 'values': [1, 2, 3, 4, 5], 'min': 1, 'importance': 0.04186566532332252, 'dtype': 'categoric'}, {'name': 'installment_rate', 'range': [0, 4], 'values': [1, 2, 3, 4], 'min': 1, 'importance': 0.03927210304662811, 'dtype': 'categoric'}, {'name': 'personal_status_sex', 'range': [0, 2], 'values': [0, 1], 'min': 0, 'importance': 0.028227516841829833, 'dtype': 'categoric'}, {'name': 'present_residence', 'range': [0, 4], 'values': [1, 2, 3, 4], 'min': 1, 'importance': 0.03166133959410306, 'dtype': 'ca

In [15]:
# Paths that constrain merged feature 19. Once 'new_range' has been moved into
# 'range' by the renaming cell, the converted mapping lives under 'range', so
# fall back to it instead of raising KeyError.
lp = [p for p in paths if 19 in p.get('new_range', p['range'])]


In [16]:
# Display the first matching path.
lp[0]

{'range': {3: [-100000000000000.0, 3.9953023195266724],
  33: [0.9958832561969757, 100000000000000.0],
  6: [-100000000000000.0, 0.034650105983018875],
  30: [-100000000000000.0, 0.013599947094917297],
  27: [0.965991199016571, 100000000000000.0],
  16: [-100000000000000.0, 0.22568358480930328],
  7: [1.5, 100000000000000.0],
  46: [-100000000000000.0, 0.5],
  5: [3.2413134574890137, 100000000000000.0],
  26: [-100000000000000.0, 0.7586865127086639],
  39: [0.5, 100000000000000.0],
  48: [-100000000000000.0, 0.5]},
 'value': 0.0625,
 'weight': 1,
 'tree_index': 0,
 'rule_index': 45,
 'name': 'r0_45',
 'confidence': 0.75,
 'sample': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
 

In [None]:
# Show the SHAP explanation of the first sample.
print(shap_values[0])

.values =
array([[ 1.09110125e-01, -1.09110125e-01],
       [ 3.01264698e-03, -3.01264698e-03],
       [-1.24874332e-01,  1.24874332e-01],
       [ 2.10891519e-03, -2.10891519e-03],
       [-1.99799022e-02,  1.99799022e-02],
       [ 3.40283799e-02, -3.40283799e-02],
       [ 3.85628726e-02, -3.85628726e-02],
       [ 5.60976830e-03, -5.60976830e-03],
       [ 3.36153339e-02, -3.36153339e-02],
       [ 3.85308858e-03, -3.85308858e-03],
       [-2.39386825e-02,  2.39386825e-02],
       [ 7.02421386e-03, -7.02421386e-03],
       [ 1.96232689e-02, -1.96232689e-02],
       [-1.74483971e-02,  1.74483971e-02],
       [ 6.65478065e-02, -6.65478065e-02],
       [ 6.73321595e-03, -6.73321595e-03],
       [ 1.58162875e-03, -1.58162875e-03],
       [-6.93273863e-05,  6.93273863e-05],
       [ 5.12965848e-03, -5.12965848e-03],
       [ 2.51444887e-03, -2.51444887e-03]])

.base_values =
array([0.50028956, 0.49971044])

.data =
array([   1,   18,    4,    2, 1049,    1,    2,    4,    2,    1,    4,

In [None]:
# Keep the paths whose fidelity exceeds 0.75, ordered by decreasing coverage.
# NOTE(review): `all_paths` is not defined by any cell shown above -- confirm where it comes from.
left_paths = sorted(
    (candidate for candidate in all_paths if candidate['fidelity'] > 0.75),
    key=lambda candidate: -candidate['coverage'],
)

In [None]:
from sample import create_sampler

# Build a sampler over the current training fold, mirroring ExpModel.oversampling.
# This cell used the globals `data_table` / `X_train`, which no cell defines, and
# skipped a stale target name ('Creditability'); it now reads everything from `exp`.
is_continuous = []
is_categorical = []
is_integer = []

for feature in exp.data_table.columns:
    if feature == exp.target:
        # The label column is not part of X_train.
        continue
    categorical = exp.data_table[feature].dtype == 'O'
    is_continuous.append(not categorical)
    is_categorical.append(categorical)
    is_integer.append(False)
sampler = create_sampler(exp.X_train, is_continuous, is_categorical, is_integer)
# To draw samples, call the sampler with a count, e.g. sampler(len(exp.X_train) * 2).

0.76


In [17]:
# List every column of the processed table together with its dtype.
for column_name, column_dtype in exp.data_table.dtypes.items():
    print(column_name, column_dtype)

credit_risk int64
status int64
duration int64
amount int64
savings int64
employment_duration int64
installment_rate int64
personal_status_sex int64
present_residence int64
age int64
credit_history - 0 bool
credit_history - 1 bool
credit_history - 2 bool
credit_history - 3 bool
credit_history - 4 bool
purpose - 0 bool
purpose - 1 bool
purpose - 2 bool
purpose - 3 bool
purpose - 4 bool
purpose - 5 bool
purpose - 6 bool
purpose - 8 bool
purpose - 9 bool
purpose - 10 bool
other_debtors - 0 bool
other_debtors - 1 bool
other_debtors - 2 bool
property - 0 bool
property - 1 bool
property - 2 bool
property - 3 bool
other_installment_plans - 0 bool
other_installment_plans - 1 bool
other_installment_plans - 2 bool
housing - 0 bool
housing - 1 bool
housing - 2 bool
job - 0 bool
job - 1 bool
job - 2 bool
job - 3 bool
people_liable - 0 bool
people_liable - 1 bool
telephone - 0 bool
telephone - 1 bool
foreign_worker - 0 bool
foreign_worker - 1 bool
number_credits - 0 bool
number_credits - 1 bool
numb

In [19]:
# Check the largest value of the recoded personal_status_sex column.
print(exp.data_table['personal_status_sex'].max())

1
