In [30]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pandas as pd
import numpy as np
from lightgbm import LGBMClassifier
#import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE

In [31]:
random_state = 114
criterion = "entropy"
max_depth = 8
max_features = "sqrt"
n_estimators = 150
max_leaf_nodes = 90
bootstrap = True


'''

data_table = pd.read_csv('data/german.csv')

X = data_table.drop('Creditability', axis=1).values
y = data_table['Creditability'].values
features = data_table.drop('Creditability', axis=1).columns.to_list()

data_table = pd.read_csv('data/cancer.csv')

X = data_table.drop('diagnosis', axis=1).values
y = data_table['diagnosis'].values
features = data_table.drop('diagnosis', axis=1).columns.to_list()

'''
data_table = pd.read_csv('data/bank.csv')

X = data_table.drop('Bankrupt?', axis=1).values
y = data_table['Bankrupt?'].values
features = data_table.drop('Bankrupt?', axis=1).columns.to_list()

sm = SMOTE(random_state=random_state)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=random_state)
X_train, y_train = sm.fit_resample(X_train, y_train)


best_acc = 0

clf = LGBMClassifier(
    learning_rate=0.1,
    n_estimators=100,
    num_leaves=50,
    reg_alpha=3,
    max_depth=8,
    random_state=random_state,
)
'''

clf = RandomForestClassifier(
    max_leaf_nodes=max_leaf_nodes,
    max_features=max_features,
    bootstrap=bootstrap,
    criterion=criterion, 
    max_depth=max_depth,
    random_state=random_state,
    n_estimators=n_estimators,
)
'''
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)

print('Test')
print((y_test == y_pred).sum() / len(y_test))
print('Accuracy Score is', accuracy_score(y_test, y_pred))
print('Precision is', precision_score(y_test, y_pred))
print('Recall is', recall_score(y_test, y_pred))
print('F1-Score is', f1_score(y_test, y_pred))

y_pred = clf.predict(X_train)

print('Train')
print((y_train == y_pred).sum() / len(y_train))
print('Accuracy Score is', accuracy_score(y_train, y_pred))
print('Precision is', precision_score(y_train, y_pred))
print('Recall is', recall_score(y_train, y_pred))
print('F1-Score is', f1_score(y_train, y_pred))


Test
0.9672531769305963
Accuracy Score is 0.9672531769305963
Precision is 0.4714285714285714
Recall is 0.5238095238095238
F1-Score is 0.4962406015037594
Train
0.9998916811091855
Accuracy Score is 0.9998916811091855
Precision is 0.9997834091401343
Recall is 1.0
F1-Score is 0.9998916928408967


In [32]:
from tree_extractor import path_extractor

paths = path_extractor(clf, 'lightgbm', (X_train, y_train))

In [4]:
print(paths[0])

{'range': {23: [-1000000000.0, 105.40000000000002], 28: [-1000000000.0, 0.12195000000000002]}, 'value': 0.1891891891891892, 'weight': 55.5, 'confidence': 1, 'tree': 0}


In [33]:

from sklearn.neighbors import LocalOutlierFactor
import pulp
from copy import deepcopy

class Extractor:
    # 可以调用的接口：compute_accuracy和extract
    def __init__(self, paths, X_train, y_train, X_test, y_test):
        # X_raw、y_raw：训练数据集
        self.X_raw = X_train
        self.y_raw = y_train
        self.X_test = X_test
        self.y_test = y_test
        self.paths = [path for path in paths if path['confidence'] > 0.7]
        print('paths', len(paths))

    def compute_accuracy_on_train(self, paths):
        # 计算训练集在给定规则集下的accuracy
        # paths：规则集
        y_pred = self.predict(self.X_raw, paths)
        y_pred = np.where(y_pred == 1, 1, 0)
        return np.sum(np.where(y_pred == self.y_raw, 1, 0)) / len(self.X_raw)

    def compute_accuracy_on_test(self, paths):
        # 计算测试集在给定规则集下的accuracy
        # paths：规则集
        y_pred = self.predict(self.X_test, paths)
        y_pred = np.where(y_pred == 1, 1, 0)
        return np.sum(np.where(y_pred == self.y_test, 1, 0)) / len(self.X_test)

    def extract(self, max_num, tau):
        # 根据给定的max_num和tau，使用rf的全部规则和数据集抽取出相应的规则
        # max_num：抽取出规则的最大数量
        # tau：每个样本允许的最大惩罚
        # 返回抽取出规则的列表、数据集使用全部规则的accuracy、数据集使用抽取规则的accuracy
        Mat = self.getMat(self.X_raw, self.y_raw, self.paths)
        print('getWeight')
        w = self.getWeight(self.getMat(self.X_raw, self.y_raw, self.paths))
        print('LP_extraction')
        paths_weight = self.LP_extraction(w, Mat, max_num, tau)
        print('compute_accuracy_on_test')
        accuracy_origin = self.compute_accuracy_on_test(self.paths)
        accuracy_origin1 = self.compute_accuracy_on_train(self.paths)
        path_copy = deepcopy(self.paths)
        for i in range(len(path_copy)):
            path_copy[i]['weight'] = paths_weight[i]
        accuracy_new = self.compute_accuracy_on_test(path_copy)
        accuracy_new1 = self.compute_accuracy_on_train(path_copy)
        return paths_weight, accuracy_origin1, accuracy_new1, accuracy_origin, accuracy_new

    def predict(self, X, paths):
        # 根据给定规则集对数据进行预测
        Y = np.zeros(X.shape[0])
        for p in paths:
            ans = np.ones(X.shape[0])
            m = p.get('range')
            for key in m:
                ans = ans * (X[:,int(key)] >= m[key][0]) * (X[:,int(key)] < m[key][1])
            Y += ans * (p.get('weight') * p.get('value'))
        Y = np.where(Y > 0, 1, 0)
        return Y

    def getMat(self, X_raw, y_raw, paths):
        Mat = np.array([self.path_score(p, X_raw, y_raw) for p in paths]).astype('float')
        return Mat

    def path_score(self, path, X, y):
        value = float(path.get('value'))
        ans = 2 * (value * y > 0) - 1
        m = path.get('range')
        for key in m:
            ans = ans * (X[:, int(key)] >= m[key][0]) * (X[:, int(key)] < m[key][1])
        return ans

    def getWeight(self, Mat):
        # 权重向量w
        RXMat = np.abs(Mat)
        XRMat = RXMat.transpose()
        XXAnd = np.dot(XRMat, RXMat)
        XROne = np.ones(XRMat.shape)
        XXOr = 2 * np.dot(XROne, RXMat) - XXAnd
        XXOr = (XXOr + XXOr.transpose()) / 2
        XXDis = 1 - XXAnd / XXOr
        K = int(np.ceil(np.sqrt(len(self.X_raw))))
        clf = LocalOutlierFactor(n_neighbors=K, metric="precomputed")
        clf.fit(XXDis)
        XW = -clf.negative_outlier_factor_
        MXW, mXW = np.max(XW), np.min(XW)
        XW = 1 + (3 - 1) * (XW - mXW) / (MXW - mXW)
        return XW / np.sum(XW)

    def LP_extraction(self, w, Mat, max_num, tau):
        m = pulp.LpProblem(sense=pulp.LpMinimize)
        # 创建最小化问题
        var = []
        for i in range(len(self.paths)):
            var.append(pulp.LpVariable(f'x{i}', cat=pulp.LpContinuous, lowBound=0, upBound=1))
        for i in range(len(w)):
            var.append(pulp.LpVariable(f'k{i}', cat=pulp.LpContinuous, lowBound=0))
        # 添加变量x_0至x_{M-1}, k_0至k_{N-1}

        m += pulp.lpSum([w[j] * (var[j + len(self.paths)])
                         for j in range(len(w))])
        # 添加目标函数

        m += (pulp.lpSum([var[j] for j in range(len(self.paths))]) <= max_num)
        # 筛选出不超过max_num条规则

        for j in range(len(w)):
            m += (var[j + len(self.paths)] >= 1000 + tau - pulp.lpSum(
                [var[k] * Mat[k][j] for k in range(len(self.paths))]))
            m += (var[j + len(self.paths)] >= 1000)
            # max约束

        m.solve(pulp.PULP_CBC_CMD())  # solver = pulp.solver.CPLEX())#
        paths_weight = [var[i].value() for i in range(len(self.paths))]
        paths_weight = np.array(paths_weight)
        paths_weight = paths_weight / np.sum(paths_weight)
        for k in np.argsort(paths_weight)[:-max_num]:
            paths_weight[k] = 0
        print('paths_weight', sum(paths_weight))
        return paths_weight


In [35]:
ex = Extractor(paths, X_train, y_train, X_test, y_test)
ret = ex.extract(50, 2)
print(ret[1:])

paths 4460
getWeight
LP_extraction


(0.9018595041322314, 0.9948347107438017, 0.8, 0.7566666666666667)


In [10]:
X_train.shape

(968, 20)

In [64]:
print(ret[2:])

(0.7733333333333333, 0.62)


In [17]:
import shap

explainer = shap.Explainer(r_clf)
shap_values = explainer(X)


In [20]:

features=[
    {
        "name": rf.features[i],
        "lbound":rf.feature_range[0][i],
        "rbound":rf.feature_range[1][i],
        "importance":r_clf.feature_importances_[i],
        "options":"+",
    } for i in range(rf.n_features)
]

data = {
    'paths': all_paths,
    'features': features,
    'selected': ret[0],
    'shap_values': shap_values,
}

import pickle
pickle.dump(data, open('output/german.pkl', 'wb'))

In [21]:
print(shap_values[0])

.values =
array([[ 1.09110125e-01, -1.09110125e-01],
       [ 3.01264698e-03, -3.01264698e-03],
       [-1.24874332e-01,  1.24874332e-01],
       [ 2.10891519e-03, -2.10891519e-03],
       [-1.99799022e-02,  1.99799022e-02],
       [ 3.40283799e-02, -3.40283799e-02],
       [ 3.85628726e-02, -3.85628726e-02],
       [ 5.60976830e-03, -5.60976830e-03],
       [ 3.36153339e-02, -3.36153339e-02],
       [ 3.85308858e-03, -3.85308858e-03],
       [-2.39386825e-02,  2.39386825e-02],
       [ 7.02421386e-03, -7.02421386e-03],
       [ 1.96232689e-02, -1.96232689e-02],
       [-1.74483971e-02,  1.74483971e-02],
       [ 6.65478065e-02, -6.65478065e-02],
       [ 6.73321595e-03, -6.73321595e-03],
       [ 1.58162875e-03, -1.58162875e-03],
       [-6.93273863e-05,  6.93273863e-05],
       [ 5.12965848e-03, -5.12965848e-03],
       [ 2.51444887e-03, -2.51444887e-03]])

.base_values =
array([0.50028956, 0.49971044])

.data =
array([   1,   18,    4,    2, 1049,    1,    2,    4,    2,    1,    4,

In [32]:
ret[0][3]

'r99-69'

In [18]:
data = pd.read_csv('data/german.data', sep=',')

In [34]:
data[data.columns[3]].dtype == 'O'

True