In [18]:
from sklearn.model_selection import train_test_split, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pandas as pd
import numpy as np
import os
from imblearn.over_sampling import SMOTE
from sample import create_sampler
from tree_extractor import path_extractor
from model_extractor_maxnum import Extractor
import pickle

# Seed shared by the K-fold split, SMOTE resampling and most of the
# random-forest configurations defined in ExpModel below.
random_state = 126

class ExpModel:
    """K-fold random-forest experiment on one of several named CSV datasets.

    Typical use: construct, call ``init()`` to load the data and build the
    folds, then for each fold call ``train()`` and ``evaluate()`` (advancing
    with ``next_fold()``), and finally ``summary()`` for the mean metrics.
    """

    def __init__(self, dataset, model):
        """Store the experiment configuration; no data is loaded here.

        Args:
            dataset: Dataset key understood by ``init()``: 'breast_cancer',
                'abalone', 'bankruptcy', 'diabetes', 'german_credit' or 'wine'.
            model: Model key; only 'RF' is set up by ``init()``.
        """
        self.dataset = dataset
        self.model = model
        self.n_splits = 4
        # Per-fold metric history, appended to by evaluate().
        self._accuracy = []
        self._precision = []
        self._recall = []
        self._f1_score = []

    @staticmethod
    def _one_hot_encode(data_table, columns):
        """Replace each column in ``columns`` by one boolean indicator column per value.

        Indicator columns are named ``'<column> - <k>'``. When the smallest
        value of a column is not 0 the suffix is ``int(value) - 1`` so that
        suffixes start at 0. Returns the table without the original columns.
        """
        for feature in columns:
            column_values = data_table[feature].values
            # np.unique already returns the distinct values in ascending order.
            unique_values = np.unique(column_values)
            zero_based = int(unique_values[0]) == 0
            for value in unique_values:
                suffix = str(value) if zero_based else str(int(value) - 1)
                data_table[feature + ' - ' + suffix] = column_values == value
        return data_table.drop(columns, axis=1)

    def init(self):
        """Load the dataset, pick hyper-parameters and precompute the K-fold splits.

        Sets ``target``, ``data_table``, ``X``, ``y``, ``parameters``,
        ``splits`` and ``fold``. Does nothing when ``self.model`` is not 'RF'.

        Raises:
            ValueError: if ``self.dataset`` is not one of the known dataset keys
                (previously this surfaced as an UnboundLocalError).
        """
        if self.model != 'RF':
            # Only the random-forest setup is implemented here.
            return

        if self.dataset == 'breast_cancer':
            self.target = 'diagnosis'
            parameters = {
                'n_estimators': 200,
                'max_depth': 10,
                'random_state': random_state,
                'max_features': None,
            }
            data_table = pd.read_csv('data/cancer.csv')
            data_table = data_table.drop(['id'], axis=1)
            X = data_table.drop(self.target, axis=1).values
            y = data_table[self.target].values
        elif self.dataset == 'abalone':
            self.target = 'Rings'
            parameters = {
                'n_estimators': 80,
                # 'max_depth': 30,
                'random_state': 10,
                # 'auto' was removed in scikit-learn 1.3; for classifiers it
                # was an alias of 'sqrt', so the model is unchanged.
                'max_features': 'sqrt',
                'oob_score': True,
                'min_samples_split': 9,
                'min_samples_leaf': 5,
            }
            data_table = pd.read_csv('data/abalone.csv')
            X = data_table.drop(self.target, axis=1).values
            y = data_table[self.target].values
            # Binarise the target: at most 7 rings -> 0, otherwise 1.
            y = np.array([0 if v <= 7 else 1 for v in y])
        elif self.dataset == 'bankruptcy':
            self.target = 'Bankrupt?'
            parameters = {
                'n_estimators': 150,
                'max_depth': 15,
                'random_state': random_state,
            }
            data_table = pd.read_csv('data/bank.csv')
            X = data_table.drop(self.target, axis=1).values
            y = data_table[self.target].values
        elif self.dataset == 'diabetes':
            self.target = 'class'
            parameters = {
                'n_estimators': 100,
                'max_depth': 10,
                'random_state': random_state,
                'max_features': None,
            }
            data_table = pd.read_csv('data/diabetes.csv')
            X = data_table.drop(self.target, axis=1).values
            y = data_table[self.target].values
        elif self.dataset == 'german_credit':
            self.target = 'Creditability'
            parameters = {
                'random_state': random_state,
                'max_depth': 12,
                'n_estimators': 150,
                'max_leaf_nodes': 100,
                'min_samples_split': 6,
                'min_samples_leaf': 3,
                'bootstrap': True,
            }
            data_table = pd.read_csv('data/german.csv')
            qualitative_features = [
                'Account Balance',
                'Payment Status of Previous Credit',
                'Purpose',
                'Value Savings/Stocks',
                'Length of current employment',
                'Sex & Marital Status',
                'Guarantors',
                'Most valuable available asset',
                'Other installment plans',
                'Type of apartment',
                'Occupation',
                'Telephone',
                'Foreign Worker',
            ]
            # Coded categorical columns become '<name> - <k>' indicator columns.
            data_table = self._one_hot_encode(data_table, qualitative_features)
            X = data_table.drop(self.target, axis=1).values
            y = data_table[self.target].values
        elif self.dataset == 'wine':
            self.target = 'quality'
            parameters = {
                'n_estimators': 150,
                'max_depth': 13,
                'random_state': random_state,
                # See the abalone branch: 'sqrt' is what 'auto' used to mean.
                'max_features': 'sqrt',
                'oob_score': True,
            }
            data_table = pd.read_csv('data/wine.csv')
            X = data_table.drop(self.target, axis=1).values
            y = data_table[self.target].values
            # Binarise the target: quality below 6 -> 0, otherwise 1.
            y = np.array([0 if v < 6 else 1 for v in y])
        else:
            raise ValueError('Unknown dataset: %r' % (self.dataset,))

        self.data_table = data_table
        self.X = X
        self.y = y
        self.parameters = parameters

        # Materialise the folds once so train() can index them by self.fold.
        kf = KFold(n_splits=self.n_splits, random_state=random_state, shuffle=True)
        self.splits = list(kf.split(X))
        self.fold = 0

    def has_next_fold(self):
        """Return True while ``self.fold`` still indexes an entry of ``self.splits``."""
        return self.fold < len(self.splits)

    def next_fold(self):
        """Advance to the next fold; the caller must re-run ``train()`` afterwards."""
        self.fold += 1

    def train(self):
        """Fit a random forest on the current fold and predict its test part.

        The training part is resampled with SMOTE before fitting; the test
        part is left untouched. Sets ``X_train``, ``y_train``, ``X_test``,
        ``y_test``, ``clf``, ``y_pred`` and ``features``.
        """
        train_index, test_index = self.splits[self.fold]
        X_train = self.X[train_index]
        y_train = self.y[train_index]
        X_test = self.X[test_index]
        y_test = self.y[test_index]

        sm = SMOTE(random_state=random_state)
        X_train, y_train = sm.fit_resample(X_train, y_train)

        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test

        clf = RandomForestClassifier(**self.parameters)
        clf.fit(X_train, y_train)
        self.clf = clf
        self.y_pred = clf.predict(X_test)
        # Column names in the same order as the columns of X.
        self.features = self.data_table.drop(self.target, axis=1).columns.to_list()

    def evaluate(self):
        """Print and record accuracy, precision, recall and F1 for the current fold."""
        _accuracy = accuracy_score(self.y_test, self.y_pred)
        _precision = precision_score(self.y_test, self.y_pred)
        _recall = recall_score(self.y_test, self.y_pred)
        _f1_score = f1_score(self.y_test, self.y_pred)
        print('Accuracy Score is', _accuracy)
        print('Precision is', _precision)
        print('Recall is', _recall)
        print('F1 Score is', _f1_score)
        self._accuracy.append(_accuracy)
        self._precision.append(_precision)
        self._recall.append(_recall)
        self._f1_score.append(_f1_score)

    def summary(self):
        """Return the mean (accuracy, precision, recall, f1) over all evaluated folds."""
        return (
            float(np.mean(self._accuracy)),
            float(np.mean(self._precision)),
            float(np.mean(self._recall)),
            float(np.mean(self._f1_score)),
        )

    def oversampling(self, rate = 2):
        """Draw ``len(X_train) * rate`` synthetic samples from a sampler fitted on X_train.

        Columns with object dtype are flagged categorical, all others
        continuous; no column is flagged integer.
        """
        is_continuous = []
        is_categorical = []
        is_integer = []

        for feature in self.data_table.columns:
            if feature == self.target:
                continue
            categorical = self.data_table[feature].dtype == 'O'
            is_continuous.append(not categorical)
            is_categorical.append(categorical)
            is_integer.append(False)
        sampler = create_sampler(self.X_train, is_continuous, is_categorical, is_integer)
        return sampler(len(self.X_train) * rate)

    def generate_paths(self):
        """Extract decision paths from the fitted model, print their count and return them."""
        if self.model == 'RF':
            paths = path_extractor(self.clf, 'random forest', (self.X_train, self.y_train))
        else:
            paths = path_extractor(self.clf, 'lightgbm')
        print('num of paths', len(paths))
        return paths


In [19]:
from tree_extractor import assign_samples

# Experiment configuration for this run.
dataset = 'german_credit'
model = 'RF'

# Load the data, fit the first fold, extract the forest's paths and score it.
exp = ExpModel(dataset, model)
exp.init()
exp.train()
paths = exp.generate_paths()
exp.evaluate()

# assign_samples is called on the full dataset (not only the training fold);
# presumably it fills in each path's 'coverage' - confirm in tree_extractor.
assign_samples(paths, (exp.X, exp.y))
# Keep only the paths with positive coverage.
paths = list(filter(lambda path: path['coverage'] > 0, paths))

num of paths 14554
Accuracy Score is 0.828
Precision is 0.8578680203045685
Recall is 0.9184782608695652
F1 Score is 0.8871391076115485


In [20]:
# Index the paths by name and start every path at level 0; a path's level is
# raised to k when it is selected in extraction round k.
name2path = {path['name']: path for path in paths}
for path in paths:
    path['level'] = 0

# Number of paths to keep in each successive round.
params = [1000, 80]
level_info = {}
last_paths = paths

for level, n in enumerate(params):
    next_level = level + 1
    # NOTE(review): the meaning of tau is defined by Extractor.extract (not shown here).
    tau = (n / 80) ** 0.5
    # The extractor is fitted against the forest's own training predictions.
    ex = Extractor(last_paths, exp.X_train, exp.clf.predict(exp.X_train))
    w, _, fidelity_train = ex.extract(n, tau)
    # Positions (within last_paths) of the paths that received non-zero weight.
    (idx,) = np.nonzero(w)

    accuracy_train = ex.evaluate(w, exp.X_train, exp.y_train)
    accuracy_test = ex.evaluate(w, exp.X_test, exp.y_test)
    fidelity_test = ex.evaluate(w, exp.X_test, exp.clf.predict(exp.X_test))
    print(level, n, 'accuracy_train', accuracy_train, 'accuracy_test', accuracy_test, 'fidelity_test', fidelity_test)

    level_info[next_level] = dict(
        fidelity_test=fidelity_test,
        accuracy_test=accuracy_test,
    )

    # The survivors of this round become the candidates of the next one.
    curr_paths = [last_paths[i] for i in idx]
    for selected in curr_paths:
        name2path[selected['name']]['level'] = next_level
    last_paths = curr_paths
    print(len(last_paths))

total path weight:  0.9999999999999902
0 1000 accuracy_train 0.9467054263565892 accuracy_test 0.804 fidelity_test 0.96
1000
total path weight:  0.9999999999999996
1 80 accuracy_train 0.8982558139534884 accuracy_test 0.76 fidelity_test 0.9
80


In [21]:
import shap

# SHAP explanation of the fitted forest over the whole dataset; the last axis
# is indexed at 0 to keep a single output's contributions.
explainer = shap.Explainer(exp.clf)
shap_values = explainer(exp.X)
shap_values = shap_values[:,:,0]
new_shaps = []

# Map each original feature name to the model-column indices that encode it.
# One-hot columns are named '<feature> - <k>'; slot k of the list holds that
# column's index and -1 marks a value with no column.
new_feature = {}
features = [feature for feature in exp.data_table.columns if feature != exp.target]
for index, feature in enumerate(features):
    if ' - ' in feature:
        # Split on the LAST separator so a base name that itself contains
        # ' - ' does not break the unpacking.
        name, p = feature.rsplit(' - ', 1)
        p = int(p)
        slots = new_feature.setdefault(name, [])
        if p >= len(slots):
            slots.extend([-1] * (p + 1 - len(slots)))
        slots[p] = index
    else:
        new_feature[feature] = [index]

In [22]:
# Build one record per original feature (one-hot groups merged back together)
# plus lookup tables from model-column index to merged-feature position.
features = []
feature_index = {}
feature_type = {}
# Reset here as well: this cell appends to new_shaps, so without the reset a
# re-run of the cell would duplicate every entry.
new_shaps = []
for key in new_feature:
    if len(new_feature[key]) == 1:
        # Plain column: range and importance come straight from that column.
        i = new_feature[key][0]
        features.append({
            "name": key,
            "range": [min(exp.data_table[key].values), max(exp.data_table[key].values)],
            "importance": exp.clf.feature_importances_[i],
            "dtype": "numeric",
        })
        shaps = shap_values[:, i]
        feature_index[i] = len(features) - 1
        feature_type[i] = "numeric"
    else:
        # One-hot group: importance and SHAP values are summed over the
        # indicator columns that actually exist (index != -1).
        feature_idxs = [i for i in new_feature[key] if i != -1]
        features.append({
            "name": key,
            "range": [0, len(new_feature[key])],
            "importance": sum(exp.clf.feature_importances_[i] for i in feature_idxs),
            "dtype": "categoric",
        })
        shaps = shap_values[:, feature_idxs[0]]
        for i in feature_idxs[1:]:
            shaps = shaps + shap_values[:, i]

        for index, i in enumerate(new_feature[key]):
            if i != -1:
                # (position of the merged feature, category slot within it)
                feature_index[i] = [len(features) - 1, index]
                feature_type[i] = "categoric"
    new_shaps.append(shaps)

In [23]:
# Re-key every path's ranges from model-column index to merged-feature index.
# Numeric features keep their range; categorical features become a 0/1 list
# with one slot per category.
for path in paths:
    new_range = {}
    for index in path['range']:
        if feature_type[index] == 'numeric':
            i = feature_index[index]
            new_range[i] = path['range'][index]
        else:
            i, j = feature_index[index]
            # NOTE(review): only the first indicator column met for a
            # categorical feature is used; later columns of the same feature
            # are ignored. Confirm this is the intended merge rule.
            if i not in new_range:
                new_range[i] = [0] * features[i]['range'][1]
                if path['range'][index][0] <= 1 and 1 <= path['range'][index][1]:
                    # The column's range contains 1: only category j is marked.
                    new_range[i][j] = 1
                else:
                    # Otherwise every category except j is marked.
                    for k in range(len(new_range[i])):
                        if k != j:
                            new_range[i][k] = 1
                        new_range[i][j] = 0
    path['range'] = new_range
    path['represent'] = False

# Mark the finally selected paths. After the multi-level extraction loop `idx`
# holds positions in the PREVIOUS level's candidate list, not in `paths`, so
# indexing `paths` with it picks the wrong rules; `last_paths` is the final
# selection itself (the same dict objects that are stored in `paths`).
for path in last_paths:
    path['represent'] = True

output_data = {
    'paths': paths,
    'features': features,
    'selected': [path['name'] for path in last_paths],
    'shap_values': new_shaps,
    'model_info': {
        'accuracy': exp._accuracy[-1],
        'info': level_info,
        'num_of_rules': len(paths),
        'dataset': 'German Credit',
        'model': 'Random Forest',
    }
}

In [24]:
import pickle

# Write the export bundle; the with-block guarantees the file is flushed and
# closed even if pickling fails part-way.
with open('output/german1211.pkl', 'wb') as output_file:
    pickle.dump(output_data, output_file)

In [15]:
# Spot-check one merged feature record (position 17 in `features`).
# NOTE(review): this cell's execution count is out of order with the cells
# above, so the recorded output may come from an earlier kernel state.
features[17]

{'name': 'Occupation',
 'range': [0, 4],
 'importance': 0.03760634137209534,
 'dtype': 'categoric'}

In [None]:
# Count how many entries of ex.Mat[10] equal 1.
# NOTE(review): `Mat` is defined inside Extractor (not shown in this notebook);
# presumably a path-by-sample matrix - confirm before relying on this number.
sum(ex.Mat[10] == 1)

165

In [None]:
# Print the SHAP explanation object for the first sample.
# NOTE(review): the recorded output shows two values per feature, so it looks
# like it was produced before the `[:, :, 0]` slice in the SHAP cell - re-run to refresh.
print(shap_values[0])

.values =
array([[ 1.09110125e-01, -1.09110125e-01],
       [ 3.01264698e-03, -3.01264698e-03],
       [-1.24874332e-01,  1.24874332e-01],
       [ 2.10891519e-03, -2.10891519e-03],
       [-1.99799022e-02,  1.99799022e-02],
       [ 3.40283799e-02, -3.40283799e-02],
       [ 3.85628726e-02, -3.85628726e-02],
       [ 5.60976830e-03, -5.60976830e-03],
       [ 3.36153339e-02, -3.36153339e-02],
       [ 3.85308858e-03, -3.85308858e-03],
       [-2.39386825e-02,  2.39386825e-02],
       [ 7.02421386e-03, -7.02421386e-03],
       [ 1.96232689e-02, -1.96232689e-02],
       [-1.74483971e-02,  1.74483971e-02],
       [ 6.65478065e-02, -6.65478065e-02],
       [ 6.73321595e-03, -6.73321595e-03],
       [ 1.58162875e-03, -1.58162875e-03],
       [-6.93273863e-05,  6.93273863e-05],
       [ 5.12965848e-03, -5.12965848e-03],
       [ 2.51444887e-03, -2.51444887e-03]])

.base_values =
array([0.50028956, 0.49971044])

.data =
array([   1,   18,    4,    2, 1049,    1,    2,    4,    2,    1,    4,

In [None]:
# Keep the paths whose fidelity exceeds 0.75, ordered by decreasing coverage.
# NOTE(review): `all_paths` is not defined in any visible cell - confirm which
# path list this is meant to filter.
left_paths = sorted(
    (candidate for candidate in all_paths if candidate['fidelity'] > 0.75),
    key=lambda candidate: -candidate['coverage'],
)

In [None]:
from sample import create_sampler

# NOTE(review): `data_table` and `X_train` are not defined at notebook level in
# the visible cells (they exist only as attributes of `exp`) - confirm the
# intended source before running on a fresh kernel.

# Every column except the target is described by three parallel flag lists.
feature_columns = [column for column in data_table.columns if column != 'Creditability']

# Object-dtype columns are treated as categorical, everything else as continuous.
is_categorical = [bool(data_table[column].dtype == 'O') for column in feature_columns]
is_continuous = [not flag for flag in is_categorical]
is_integer = [False] * len(feature_columns)

sampler = create_sampler(X_train, is_continuous, is_categorical, is_integer)
#X2 = sampler(len(X) * 2)

0.76
