In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import manifold, datasets
import pickle
from sklearn.neighbors import LocalOutlierFactor
from sklearn.metrics import pairwise_distances
import pandas as pd
import seaborn as sns
from sklearn.covariance import MinCovDet
from sklearn.neighbors import LocalOutlierFactor
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import NearestNeighbors
from sklearn.svm import OneClassSVM
from sklearn.metrics import pairwise_distances
from sklearn.cluster import DBSCAN
from sklearn.linear_model import LinearRegression
import math

class DetectorEnsemble:
    def __init__(self):
        self.detectors = []
        '''
        self.detectors.append(('iforest1', IsolationForest(random_state = 0, max_samples = 128, n_estimators = 100)))
        self.detectors.append(('iforest2', IsolationForest(random_state = 0, max_samples = 128, n_estimators = 200)))
        self.detectors.append(('iforest3', IsolationForest(random_state = 0, max_samples = 256, n_estimators = 100)))
        self.detectors.append(('iforest4', IsolationForest(random_state = 0, max_samples = 256, n_estimators = 200)))
        self.detectors.append(('iforest5', IsolationForest(random_state = 0, max_samples = 512, n_estimators = 100)))
        self.detectors.append(('iforest6', IsolationForest(random_state = 0, max_samples = 512, n_estimators = 200)))
        '''
        self.detectors.append(('knn', NearestNeighbors(algorithm='ball_tree')))
        self.detectors.append(('lof', LocalOutlierFactor(metric="precomputed")))
        #self.detectors.append(('robustcov', MinCovDet()))
        self.detectors.append(('iforest', IsolationForest()))
        self.detectors.append(('ocsvm', OneClassSVM()))
        self.detectors.append(('dbscan',  DBSCAN()))
    
    def fit_detector(self, X, y):
        self.clf = LinearRegression(fit_intercept=True, normalize=False, copy_X=True).fit(X, y)

    def fit(self, mat):
        dist = pairwise_distances(X = mat, metric='euclidean')
        self.scores = []
        for (name, detector) in self.detectors:
            if name[:3] == 'lof':
                detector.fit_predict(dist)
                self.scores.append(-detector.negative_outlier_factor_)
            elif name == 'robustcov':
                detector.fit(mat)
                self.scores.append(detector.mahalanobis(mat))
            elif name == 'knn':
                detector.fit(mat)
                self.scores.append(-detector.kneighbors(mat)[0][:, -1])
            elif name == 'dbscan':
                detector.fit(mat)
                score = np.array([1 if x == -1 else 0 for x in detector.labels_])
                self.scores.append(score)
            else:
                detector.fit_predict(mat)
                self.scores.append(-detector.score_samples(mat))
            print(name, min(self.scores[-1]), max(self.scores[-1]), self.scores[-1].shape)
        tmp = []
        for score in self.scores:
            min_s = np.min(score)
            max_s = np.max(score)
            range_s = max(1, max_s - min_s)
            score = (score - min_s) / range_s
            tmp.append(score)
        self.n = mat.shape[0]
        self.scores = np.array(tmp)
        self.ground_truth = {}
        self.adjust_sample_weight = self.n // 100
        self.weights = np.ones(len(self.detectors))
        weights = self.weights / np.sum(self.weights)

        self.scores = self.scores.transpose()
        y = (self.scores * weights).sum(axis = 1)
        print('before fit', self.scores.shape, y.shape)
        self.fit_detector(self.scores, y)
        print('after fit')
    
    def weighted_score(self):
        y = self.clf.predict(self.scores)
        for i in self.ground_truth:
            y[i] = self.ground_truth[i]
        return y

    def adjust_weight(self, idx, score):
        self.ground_truth[idx] = score
        sample_weight = np.ones(self.n)
        for i in self.ground_truth:
            sample_weight[i] = self.adjust_sample_weight
        y = self.weighted_score()
        self.fit_detector(self.scores, y)

model = pickle.load(open('../../output/dump/german0315v2.pkl', 'rb'))
paths = model['paths']
features = model['features']
mat = np.array([p['sample'] for p in paths]).astype('float')
for i in range(mat.shape[0]):
    mat[i] = mat[i] > 0
    mat[i] /= mat[i].sum() #np.sqrt(mat[i].sum())
all_dist = pairwise_distances(X = mat, metric='euclidean')

expected_count = 50
expected_one_class_count = 40

output_labels = ['reject', 'accept']

In [2]:
current_encoding = {
    'credit_risk' : ['No', 'Yes'], 
    'credit_history' : [
        "delay in paying off in the past",
        "critical account/other credits elsewhere",
        "no credits taken/all credits paid back duly",
        "existing credits paid back duly till now",
        "all credits at this bank paid back duly",
    ],
    'purpose' : [
        "others",
        "car (new)",
        "car (used)",
        "furniture/equipment",
        "radio/television",
        "domestic appliances",
        "repairs",
        "education",
        "vacation",
        "retraining",
        "business"
    ],
    'installment_rate': ["< 20", "20 <= ... < 25",  "25 <= ... < 35", ">= 35"],
    'present_residence': [
        "< 1 yr", 
        "1 <= ... < 4 yrs",
        "4 <= ... < 7 yrs", 
        ">= 7 yrs"
    ],
    'number_credits': ["1", "2~3", "4~5", ">= 6"],
    'people_liable': ["0 to 2", "3 or more"],
    'savings': [
        "unknown/no savings account",
        "... <  100 DM", 
        "100 <= ... <  500 DM",
        "500 <= ... < 1000 DM", 
        "... >= 1000 DM",
    ],
    'employment_duration': [
        "unemployed", 
        "< 1 yr", 
        "1 <= ... < 4 yrs",
        "4 <= ... < 7 yrs", 
        ">= 7 yrs"
    ],
    'personal_status_sex': [
        "not married male",
        "married male",
    ],
    'other_debtors': [
        'none',
        'co-applicant',
        'guarantor'
    ],
    'property': [
        "real estate",
        "building soc. savings agr./life insurance", 
        "car or other",
        "unknown / no property",
    ],
    'other_installment_plans': ['bank', 'stores', 'none'],
    'housing': ["rent", "own", "for free"],
    'job': [
        'unemployed/ unskilled - non-resident',
        'unskilled - resident',
        'skilled employee / official',
        'management/ self-employed/ highly qualified employee/ officer'
    ],
    'status': [
        "no checking account",
        "... < 0 DM",
        "0<= ... < 200 DM",
        "... >= 200 DM / salary for at least 1 year",
    ],
    'telephone': ['No', 'Yes'],
    'foreign_worker': ['No', 'Yes'],
}

In [3]:
feature_by_name = {}

for feature in features:
    feature_by_name[feature['name']] = feature

data = pd.read_csv('../model/data/german.csv')
for col in data.columns:
    max_v = data[col].max()
    min_v = data[col].min()
    if min_v == 1:
        data[col] -= 1

feature_mat = []
for it, p in enumerate(paths):
    if it % 100 == 0:
        print('%d paths have been processed.' % it)
    embed = []
    for index, feature in enumerate(features):
        if index in p['range']:
            range_ = p['range'][index]
            val = data[feature['name']]
            #print(range_)
            #print(feature['name'])
            if feature['dtype'] == 'number':
                embed.append(np.array(((val >= range_[0]) & (val <= range_[1])).astype(int)))
            elif len(range_) == 2 and val.max() > 1:
                embed.append(np.array(((val >= range_[0]) & (val <= range_[1])).astype(int)))
            else:
                #print(len(range_), val.max())
                embed.append(np.array([range_[i] for i in val]))
        else:
            embed.append(np.zeros(len(data)))
    embed = np.concatenate(embed)
    feature_mat.append(embed)
feature_mat = np.array(feature_mat)
feature_dist = pairwise_distances(X = feature_mat, metric='cosine')


sample_mat = np.array([p['sample'] for p in paths]).astype('float')
for i in range(sample_mat.shape[0]):
    sample_mat[i] = sample_mat[i] > 0
    sample_mat[i] /= np.sqrt(sample_mat[i].sum())
sample_dist = pairwise_distances(X = sample_mat, metric='euclidean')
all_dist = sample_dist + feature_dist

0 paths have been processed.
100 paths have been processed.
200 paths have been processed.
300 paths have been processed.
400 paths have been processed.
500 paths have been processed.
600 paths have been processed.
700 paths have been processed.
800 paths have been processed.
900 paths have been processed.
1000 paths have been processed.
1100 paths have been processed.
1200 paths have been processed.
1300 paths have been processed.
1400 paths have been processed.
1500 paths have been processed.
1600 paths have been processed.
1700 paths have been processed.
1800 paths have been processed.
1900 paths have been processed.
2000 paths have been processed.
2100 paths have been processed.
2200 paths have been processed.
2300 paths have been processed.
2400 paths have been processed.
2500 paths have been processed.
2600 paths have been processed.
2700 paths have been processed.
2800 paths have been processed.
2900 paths have been processed.
3000 paths have been processed.
3100 paths have been

In [4]:
from sklearn.decomposition import PCA
pca = PCA(n_components=1000)
new_feature_mat = pca.fit_transform(feature_mat)
print(new_feature_mat.shape)

(10423, 1000)


In [5]:
all_dist = sample_dist + feature_dist
all_mat = np.concatenate((sample_mat,  new_feature_mat), axis = 1)
print('all_mat shape', all_mat.shape)

ensemble = DetectorEnsemble()
ensemble.fit(all_mat)
selected_path_idxes = ensemble.weighted_score().argsort()[::-1]

all_mat shape (10423, 2000)
knn -60.613529842767015 -1.4142135623730951 (10423,)
lof 0.9533603164386572 1.3885540164809769 (10423,)
iforest 0.29889968414006535 0.40539423056390256 (10423,)
ocsvm -967.5815332998574 -258.54513306962053 (10423,)
dbscan 1 1 (10423,)
before fit (10423, 5) (10423,)
after fit




In [25]:
rule_type = [
    1,1,1,1,1, 0,0,0,0,1, 
    0,0,0,1,1, 0,0,0,1,0,
    0,1,0,1,1, 0,0,0,0,1,
    1,1,0,0,0, 0,0,1,0,1,
    0,0,1,0,1, 0,0,1,0,1
]

topk = 5
new_idxes = []
new_dists = []
new_pos = []
for it, idx in enumerate(selected_path_idxes):
    if it >= len(rule_type):
        break
    if rule_type[it] > 0:
        nearest = all_dist[idx, :].argsort()[:topk]
        dists = all_dist[idx, nearest]
        new_idxes += nearest.tolist()
        new_dists += dists.tolist()
        new_pos += [it] * topk

In [26]:

def interpret_path(path, features):
    conds = []
    for key in path['range']:
        feature = features[key]
        values = path['range'][key]
        name = feature['name']
        op = 'is'
        value = ''
        if feature['dtype'] == 'category':
            if len(values) < len(feature['values']):
                t_values = [1 if (i >= values[0] and i <= values[1]) else 0 for i in range(1, len(feature['values']) + 1)]
                values = t_values
            is_negation = np.sum(values) + 1 == len(values)
            if is_negation:
                op = 'is not'
                for i, d in enumerate(values):
                    if d == 0:
                        value = feature['values'][i]
                        break
            else:
                for i, d in enumerate(values):
                    if d == 1:
                        value = value + ' or ' + feature['values'][i]
                value = value[4:]
        else:
            op = 'in'
            value = '%d ~ %d' % (values[0], values[1])
        conds.append((name, op, value))
    output_label = output_labels[path['output']]
    # print(output_labels, path['output'])
    return conds, output_label

for index, feature in enumerate(features):
    if feature['name'] in current_encoding:
        feature['values'] = current_encoding[feature['name']]
    else:
        feature['values'] = feature['range']

rules = []
class_count = {}
max_n_conds = 0
for it, i in enumerate(new_idxes):
    conds, output = interpret_path(paths[i], features)
    if class_count.get(output, 0) >= expected_one_class_count:
        continue
    class_count[output] = class_count.get(output, 0) + 1
    rules.append({'cond': conds, 'predict': output, 'index': i, 'dist': new_dists[it] })
    max_n_conds = max(len(conds), max_n_conds)
    #if len(rules) >= expected_count:
    #    break
conds_per_line = 4
max_n_conds = math.ceil(max_n_conds / conds_per_line) * conds_per_line


rule_idxes = [rule['index'] for rule in rules]

In [27]:
f = open('3.csv', 'w')

for it, rule in enumerate(rules):
    print(new_idxes[it])
    if it % topk == 0:
        s = '' + str(new_pos[it])
    else:
        s = 'dist: %.4f' % (new_dists[it])
    line = 0
    n_conds = len(rule['cond'])
    n_lines = math.ceil(n_conds / conds_per_line)
    base = it - it % topk
    overlap = np.sum(np.array(paths[rule['index']]['sample']) * np.array(paths[rules[base]['index']]['sample']))

    for line in range(n_lines):
        if line == 0:
            s += ',%d,%d,%d,IF,' % (rule['index'], np.sum(paths[rule['index']]['sample']), overlap)
        else:
            s += ',,,,,'
        for pos in range(conds_per_line):
            i = pos + line * conds_per_line
            if i < n_conds:
                item = rule['cond'][i]
                s += item[0] + ',' + item[1] + ',' + item[2] + ','
                s += 'AND,' if i < n_conds - 1 else ','
            else:
                s += '...,...,...,...,'
        if line == n_lines - 1:
            s += 'THEN,' + rule['predict']
        s += '\n'
    f.write(s + '\n')
f.close()


967
6854
963
965
966
966
1858
965
964
963
965
7758
7757
964
966
963
1753
964
965
966
964
10180
966
963
965
2951
7917
5259
2876
9572
2955
2876
3275
2951
2952
2956
4598
6260
2952
2953
7441
7438
7440
7439
7501
4078
4080
4471
4077
4076
9349
4148
4147
5072
9781
3994
3993
3018
3996
2529
9350
4068
8239
9352
9351
9351
9349
9352
9350
9348
2020
2019
2018
2017
2021
10154
10157
9846
10153
10152


In [7]:
selected_path_idxes = selected_path_idxes[:50]

def interpret_path(path, features):
    conds = []
    for key in path['range']:
        feature = features[key]
        values = path['range'][key]
        name = feature['name']
        op = 'is'
        value = ''
        if feature['dtype'] == 'category':
            if len(values) < len(feature['values']):
                t_values = [1 if (i >= values[0] and i <= values[1]) else 0 for i in range(1, len(feature['values']) + 1)]
                values = t_values
            is_negation = np.sum(values) + 1 == len(values)
            if is_negation:
                op = 'is not'
                for i, d in enumerate(values):
                    if d == 0:
                        value = feature['values'][i]
                        break
            else:
                for i, d in enumerate(values):
                    if d == 1:
                        value = value + ' or ' + feature['values'][i]
                value = value[4:]
        else:
            op = 'in'
            value = '%d ~ %d' % (values[0], values[1])
        conds.append((name, op, value))
    output_label = output_labels[path['output']]
    # print(output_labels, path['output'])
    return conds, output_label

for index, feature in enumerate(features):
    if feature['name'] in current_encoding:
        feature['values'] = current_encoding[feature['name']]
    else:
        feature['values'] = feature['range']

rules = []
class_count = {}
max_n_conds = 0
for it, i in enumerate(selected_path_idxes):
    conds, output = interpret_path(paths[i], features)
    #if class_count.get(output, 0) >= expected_one_class_count:
    #    continue
    class_count[output] = class_count.get(output, 0) + 1
    rules.append({'cond': conds, 'predict': output, 'index': i })
    max_n_conds = max(len(conds), max_n_conds)
    #if len(rules) >= expected_count:
    #    break
conds_per_line = 4
max_n_conds = math.ceil(max_n_conds / conds_per_line) * conds_per_line


rule_idxes = [rule['index'] for rule in rules]

In [11]:
f = open('4.csv', 'w')

for it, rule in enumerate(rules):
    s = '' + str(it)
    line = 0
    n_conds = len(rule['cond'])
    n_lines = math.ceil(n_conds / conds_per_line)
    # base = it - it % topk
    # overlap = np.sum(np.array(paths[rule['index']]['sample']) * np.array(paths[rules[base]['index']]['sample']))

    for line in range(n_lines):
        if line == 0:
            s += ',%d,%d,IF,' % (rule['index'], np.sum(paths[rule['index']]['sample']))#, overlap)
        else:
            s += ',,,,'
        for pos in range(conds_per_line):
            i = pos + line * conds_per_line
            if i < n_conds:
                item = rule['cond'][i]
                s += item[0] + ',' + item[1] + ',' + item[2] + ','
                s += 'AND,' if i < n_conds - 1 else ','
            else:
                s += '...,...,...,...,'
        if line == n_lines - 1:
            s += 'THEN,' + rule['predict']
        s += '\n'
    f.write(s + '\n')
f.close()


In [14]:
features[12]

{'name': 'job',
 'range': [0, 4],
 'importance': 0.034366424362505296,
 'dtype': 'category',
 'values': ['unemployed/ unskilled - non-resident',
  'unskilled - resident',
  'skilled employee / official',
  'management/ self-employed/ highly qualified employee/ officer']}

In [15]:
current_encoding['job']

['unemployed/ unskilled - non-resident',
 'unskilled - resident',
 'skilled employee / official',
 'management/ self-employed/ highly qualified employee/ officer']

In [24]:
paths[1000]['range']

{14: [1, 0],
 11: [0, 1, 0],
 16: [1, 0, 0, 0],
 19: [0, 1, 1, 1],
 12: [0, 0, 1, 0],
 3: [0.5, 2],
 2: [0, 2.5],
 18: [1, 1, 0, 1, 1],
 6: [1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1]}

In [23]:
paths[1001]['range']

{14: [1, 0],
 11: [0, 1, 0],
 16: [1, 0, 0, 0],
 19: [0, 1, 1, 1],
 12: [0, 0, 1, 0],
 3: [0.5, 2],
 2: [0, 2.5],
 18: [1, 1, 0, 1, 1],
 6: [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]}