In [1]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pandas as pd
import numpy as np
import os
from imblearn.over_sampling import SMOTE
from sample import create_sampler
from tree_extractor import path_extractor
from model_extractor import Extractor
import pickle

random_state = 114

parameters = {
    'n_estimators': 150,
    'max_depth': 5,
    'random_state': random_state,
    'max_features': 'auto',
    'oob_score': True,
}


target = 'quality'
dataset = 'wine'
model = 'RF'

data_table = pd.read_csv('data/wine.csv')
X = data_table.drop(target, axis=1).values
y = data_table[target].values
y = np.array([0 if v < 6 else 1 for v in y])

sm = SMOTE(random_state=random_state)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=random_state)
X_train, y_train = sm.fit_resample(X_train, y_train)

clf = RandomForestClassifier(**parameters)

clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
features = data_table.drop(target, axis=1).columns.to_list()

output_data = {
    'accuracy': accuracy_score(y_test, y_pred),
    'precision': precision_score(y_test, y_pred),
    'recall': recall_score(y_test, y_pred),
    'f1_score': f1_score(y_test, y_pred),
}

print('Accuracy Score is', output_data['accuracy'])
print('Precision is', output_data['precision'])
print('Recall is', output_data['recall'])
print('F1 Score is', output_data['f1_score'])
is_continuous = []
is_categorical = []
is_integer = []

for feature in data_table.columns:
    values = data_table[feature].values
    if feature == target:
        continue
    if data_table[feature].dtype == 'O':
        is_continuous.append(False)
        is_categorical.append(True)
    else:
        is_continuous.append(True)
        is_categorical.append(False)
    is_integer.append(False)
sampler = create_sampler(X_train, is_continuous, is_categorical, is_integer)
X2 = sampler(len(X) * 2)


Accuracy Score is 0.75
Precision is 0.788135593220339
Recall is 0.7265625
F1 Score is 0.7560975609756098


In [2]:
if model == 'RF':
    paths = path_extractor(clf, 'random forest', (X_train, y_train))
else:
    paths = path_extractor(clf, 'lightgbm')
print('num of paths', len(paths))

from sbrl_extractor import make_sbrl, test_sbrl

num of paths 4010


In [6]:
from sbrl_extractor import make_sbrl, test_sbrl
paths = [p for p in paths if p['confidence'] > 0.85]
sbrl = make_sbrl(paths, X2, clf.predict(X2), 50, 1)

print('num of rules', len(sbrl.paths))
print('train', test_sbrl(sbrl, X_train, clf.predict(X_train)))
print(test_sbrl(sbrl, X_train, y_train))
print('test', test_sbrl(sbrl, X_test, clf.predict(X_test), True))
print(test_sbrl(sbrl, X_test, y_test, True))

all_uncover_bits 1148 3198
num of rules 42
train 0.8564273789649416
0.7712854757929883
209.0
test 0.8333333333333334
209.0
0.7125


In [4]:
for p in sbrl.paths:
    o = p['original']
    print(o['confidence'])

0.9139784946236559
0.8484848484848485
0.8554216867469879
0.8411214953271028
0.9473684210526315
0.8888888888888888
1.0
0.8947368421052632
0.8205128205128205
1.0
1.0
0.9545454545454546
1.0
1.0
0.9359605911330049
1.0
0.8333333333333334
0.8571428571428571
0.9705882352941176
1.0
1.0
1.0
1.0
0.8125
1.0
0.9090909090909091
0.841726618705036
0.8421052631578947
1.0
0.8571428571428571
0.9
1.0
0.8181818181818182
0.8095238095238095
0.8333333333333334
0.8214285714285714
0.875
0.9347826086956522
1.0
0.9722222222222222
1.0
0.864406779661017
1.0
0.9166666666666666
1.0
1.0
0.9345794392523364
1.0
0.9279279279279279
0.967391304347826
1.0
1.0


In [27]:
from sample import create_sampler

is_continuous = []
is_categorical = []
is_integer = []

for feature in data_table.columns:
    values = data_table[feature].values
    if feature == 'Creditability':
        continue
    if data_table[feature].dtype == 'O':
        is_continuous.append(False)
        is_categorical.append(True)
    else:
        is_continuous.append(True)
        is_categorical.append(False)
    is_integer.append(False)
sampler = create_sampler(X_train, is_continuous, is_categorical, is_integer)
X2 = sampler(len(X) * 2)

In [28]:
ex2 = Extractor(paths, X2, clf.predict(X2))
records2 = []
for n in range(100, 1000, 100):
    w, acc1, acc2 = ex2.extract(n, 1 + n * 0.002)
    records2.append((n, w))
    print(n, acc1, acc2)
    print(ex2.evaluate(w, X_test, y_test), ex2.evaluate(w, X_test, clf.predict(X_test)))

paths 11037
100 0.959 0.904
0.7466666666666667 0.8933333333333333
200 0.959 0.9475
0.7733333333333333 0.9466666666666667
300 0.959 0.952
0.7866666666666666 0.9466666666666667
400 0.959 0.9665
0.78 0.9533333333333334
500 0.959 0.971
0.7733333333333333 0.94
600 0.959 0.9755
0.7833333333333333 0.9433333333333334
700 0.959 0.977
0.78 0.9533333333333334
800 0.959 0.977
0.7866666666666666 0.96
900 0.959 0.9795
0.7866666666666666 0.9666666666666667


In [29]:
ex = Extractor(paths, X_train, clf.predict(X_train))
records = []
for n in range(50, 1000, 50):
    w, acc1, acc2 = ex.extract(n, 1 + n * 0.002)
    records.append((n, w))
    print(n, acc1, acc2)
    print(ex.evaluate(w, X_test, y_test), ex.evaluate(w, X_test, clf.predict(X_test)))

paths 11037
50 0.9741735537190083 0.8708677685950413
0.72 0.8466666666666667
100 0.9741735537190083 0.9132231404958677
0.77 0.9033333333333333
150 0.9741735537190083 0.9349173553719008
0.7733333333333333 0.9133333333333333
200 0.9741735537190083 0.9576446280991735
0.7933333333333333 0.92
250 0.9741735537190083 0.9648760330578512
0.78 0.9266666666666666
300 0.9741735537190083 0.9741735537190083
0.7866666666666666 0.92
350 0.9741735537190083 0.9793388429752066
0.77 0.9166666666666666
400 0.9741735537190083 0.984504132231405
0.7866666666666666 0.9133333333333333
450 0.9741735537190083 0.9886363636363636
0.7933333333333333 0.9333333333333333
500 0.9741735537190083 0.9907024793388429
0.78 0.9333333333333333


KeyboardInterrupt: 

In [58]:
for (w, n) in weights:
    print(n, ex.evaluate(w, X_test, y_test), ex.evaluate(w, X_test, clf.predict(X_test)))

100 0.8047619047619048 0.861904761904762
200 0.8047619047619048 0.8761904761904762
300 0.7904761904761904 0.8666666666666667


In [16]:
y1 = clf.predict(X_test)
y2 = ex.predict(X_test, paths[:300])
print((y1 == y2).sum() / len(y1))

0.84


0.99


In [None]:
y_train

array([1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1,
       1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1,
       1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0,
       0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1,

train_y

In [None]:
ret = ex.extract(75, 10)


In [None]:
print(ret[2:])
paths = rf.get_paths()

all_paths = []
for t in paths:
    all_paths = all_paths + t

import shap

explainer = shap.Explainer(r_clf)
shap_values = explainer(X)

(0.892, 0.804)


In [None]:
features = []
for index, feature in enumerate(data.columns):
    if len(feature_aggregate[index]) == 1:
        i = feature_aggregate[index][0]
        features.append({
            "name": feature,
            "range": [rf.feature_range[0][i], rf.feature_range[1][i]],
            "importance": r_clf.feature_importances_[i],
            "dtype": "numeric",
        })
    else:
        features.append({
            "name": feature,
            "range": [feature_unique[feature]],
            "importance": sum([r_clf.feature_importances_[i] for i in feature_aggregate[index]]),
            "dtype": "object",
        })

for path in all_paths:
    new_range = {}
    for index in path['range']:
        feature = feature_origin[int(index)]
        if type(feature) == list:
            if feature[0] not in new_range:
                new_range[feature[0]] = [-1] * len(feature_unique[feature[0]])
            new_range[feature[0]][feature[1]] = 0 if path['range'][index][0] == 0 else 1
        else:
            new_range[feature] = path['range'][index]
    path['range'] = new_range

output_data = {
    'paths': all_paths,
    'features': features,
    'selected': ret[0],
    'shap_values': shap_values,
}

import pickle
pickle.dump(output_data, open('output/german2.pkl', 'wb'))

In [None]:
all_paths[5]

{'name': 'r5-0',
 'tree_index': 0,
 'rule_index': 5,
 'range': {'Length of current employment': [-1, -1, 0, 0, -1],
  'Account Balance': [-1, -1, 0, 0],
  'Payment Status of Previous Credit': [-1, -1, 1, -1, 0],
  'Value Savings/Stocks': [-1, -1, 0, -1, -1],
  'Credit Amount': [669.5, 18424.000000001],
  'Purpose': [-1, -1, -1, -1, 1, -1, -1, -1, -1, -1]},
 'distribution': [22.0, 14.0],
 'coverage': 36.0,
 'fidelity': 0.6111111111111112,
 'sample': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 

In [None]:
features

['A11',
 'A12',
 'A13',
 'A14',
 'Duration of Credit (month)',
 'A30',
 'A31',
 'A32',
 'A33',
 'A34',
 'A40',
 'A41',
 'A410',
 'A42',
 'A43',
 'A44',
 'A45',
 'A46',
 'A48',
 'A49',
 'Credit Amount',
 'A61',
 'A62',
 'A63',
 'A64',
 'A65',
 'A71',
 'A72',
 'A73',
 'A74',
 'A75',
 'Installment per cent',
 'A91',
 'A92',
 'A93',
 'A94',
 'A101',
 'A102',
 'A103',
 'Duration in Current address',
 'A121',
 'A122',
 'A123',
 'A124',
 'Age (years)',
 'A141',
 'A142',
 'A143',
 'A151',
 'A152',
 'A153',
 'No of Credits at this Bank',
 'A171',
 'A172',
 'A173',
 'A174',
 'No of dependents',
 'A191',
 'A192',
 'A201',
 'A202']

In [None]:
sum(ex.Mat[10] == 1)

165

In [None]:
print(shap_values[0])

.values =
array([[ 1.09110125e-01, -1.09110125e-01],
       [ 3.01264698e-03, -3.01264698e-03],
       [-1.24874332e-01,  1.24874332e-01],
       [ 2.10891519e-03, -2.10891519e-03],
       [-1.99799022e-02,  1.99799022e-02],
       [ 3.40283799e-02, -3.40283799e-02],
       [ 3.85628726e-02, -3.85628726e-02],
       [ 5.60976830e-03, -5.60976830e-03],
       [ 3.36153339e-02, -3.36153339e-02],
       [ 3.85308858e-03, -3.85308858e-03],
       [-2.39386825e-02,  2.39386825e-02],
       [ 7.02421386e-03, -7.02421386e-03],
       [ 1.96232689e-02, -1.96232689e-02],
       [-1.74483971e-02,  1.74483971e-02],
       [ 6.65478065e-02, -6.65478065e-02],
       [ 6.73321595e-03, -6.73321595e-03],
       [ 1.58162875e-03, -1.58162875e-03],
       [-6.93273863e-05,  6.93273863e-05],
       [ 5.12965848e-03, -5.12965848e-03],
       [ 2.51444887e-03, -2.51444887e-03]])

.base_values =
array([0.50028956, 0.49971044])

.data =
array([   1,   18,    4,    2, 1049,    1,    2,    4,    2,    1,    4,

In [None]:
left_paths = [path for path in all_paths if path['fidelity'] > 0.75]
left_paths.sort(key=lambda x: -x['coverage'])

In [None]:
from sample import create_sampler

is_continuous = []
is_categorical = []
is_integer = []

for feature in data_table.columns:
    values = data_table[feature].values
    if feature == 'Creditability':
        continue
    if data_table[feature].dtype == 'O':
        is_continuous.append(False)
        is_categorical.append(True)
    else:
        is_continuous.append(True)
        is_categorical.append(False)
    is_integer.append(False)
sampler = create_sampler(X_train, is_continuous, is_categorical, is_integer)
#X2 = sampler(len(X) * 2)

0.76
