In [230]:
import sys, os, io, json, numpy as np, random
#from sklearn.metrics import roc_auc_score
#import xgboost as xgb
import tensorflow as tf
import pandas as pd
from split import split_graph, split_interface, make_split
import graphviz
from sklearn import metrics

In [2]:
pool = np.load('../prepare_dataset/pool_00.npz')
features_bool, f_time, label = (pool[name] for name in ['features', 'f_time', 'label'])

In [3]:
features = np.random.normal(loc=features_bool*1.0, scale=1.0)

In [6]:
#features_swap = features.copy()
#features_swap[:,0],features_swap[:,7] = features_swap[:,7],features_swap[:,0]
features_swap = np.concatenate([features[:,7:8], features[:,1:7], features[:,0:1], features[:,8:]], axis=1)

In [7]:
%%time
make_split(np.zeros(label.shape), features, label)

CPU times: user 5.26 s, sys: 808 ms, total: 6.07 s
Wall time: 2.59 s


{'avg_current_loss': array([0.69314718]),
 'best_avg_loss': 0.6124292611643064,
 'best_delta_down': -0.8319551880501467,
 'best_delta_up': -0.7878868675267141,
 'best_feature_index': 0,
 'best_index': 195027,
 'best_loss': 183728.7783492919,
 'current_loss': array([207944.15416803]),
 'thr': 0.6264478451338642}

In [8]:
%%time
make_split(np.zeros(label.shape), features_swap, label)

CPU times: user 5.59 s, sys: 591 ms, total: 6.18 s
Wall time: 1.87 s


{'avg_current_loss': array([0.69314718]),
 'best_avg_loss': 0.6124292611643064,
 'best_delta_down': -0.8319551880501467,
 'best_delta_up': -0.7878868675267141,
 'best_feature_index': 7,
 'best_index': 195027,
 'best_loss': 183728.7783492919,
 'current_loss': array([207944.15416803]),
 'thr': 0.6264478451338642}

In [34]:
class EMatrix:
    def __init__(self, bias, features, label):
        self.bias = bias
        self.features = features
        self.label = label

In [140]:
class LeafData:
    def __init__(self, info):
        self.val = info['prediction']
        
    def to_text(self, floatformat = '.6f'):
        return ('{:'+ floatformat + '}').format(self.val)
    
    def shape(self):
        return 'box'

class SplitData:
    def __init__(self, val):
        self.val = val
        
    def to_text(self, floatformat = '.4f'):
        return ('f_{{{ind}}} < {thr:'+floatformat+'}').format(ind=self.val['best_feature_index'], thr=self.val['thr'])
    
        
class TreeNode:
    def __init__(self):
        self.left = None
        self.right = None
        self.depth = 0
        self.val = None
        self.id = None
        
    def to_text(self, floatformat = '.6f'):
        return self.val.to_text(floatformat)
    
    def shape(self):
        return 'circle'
        

In [134]:
def init_id_helper(node, current_id):
    node.id = current_id[0]
    current_id[0] += 1
    if not isinstance(node, TreeNode):
        return
    init_id_helper(node.left, current_id)
    init_id_helper(node.right, current_id)

def init_id(root):
    current_id = [0]
    init_id_helper(root, current_id)
    return current_id[0]

In [178]:
def init_arrays_helper(node, arrays):
    if not isinstance(node, TreeNode):
        arrays['is_leaf'][node.id] = 1
        arrays['leaf_data'][node.id, 0] = node.val  # Leaf
        return
    init_arrays_helper(node.left, arrays)
    init_arrays_helper(node.right, arrays)
    arrays['yes_node'][node.id] = node.left.id
    arrays['no_node'][node.id] = node.right.id
    arrays['thresholds'][node.id] = node.val.val['thr']
    arrays['features'][node.id] = node.val.val['best_feature_index']
    arrays['is_leaf'][node.id] = 0
    arrays['depths'][node.id] = node.depth
    
def init_arrays(root, n):
    def empty_array():
        return np.zeros(n, dtype=np.int32)
    arrays = dict(features=empty_array(),
                  thresholds=np.zeros(n, dtype=np.float32),
                  yes_node=empty_array(),
                  no_node=empty_array(),
                  is_leaf=empty_array(),
                  depths=empty_array(),
                  leaf_data=np.zeros((n,1), dtype=np.float32)
                 )
    init_arrays_helper(root, arrays)
    arrays['treedepth'] = np.max(arrays['depths'])
    return arrays

In [248]:
def prior_finish(params, info, parent):
    if parent is None:
        return False
    return params['max_depth'] <= parent.depth

def post_finish(params, info, split_info, parent):
    if split_info['left_info']['ematrix'].label.shape[0] < 2:
        return True
    if split_info['right_info']['ematrix'].label.shape[0] < 2:
        return True
    return False

In [261]:
def split_ematrix(ematrix):
    split_info = make_split(ematrix.bias, ematrix.features, ematrix.label)
    thr = split_info['thr']
    features = ematrix.features
    bias = ematrix.bias
    label = ematrix.label
    best_feature = features[:, split_info['best_feature_index']] < thr
    best_feature_not = np.logical_not(best_feature)
    left_ematrix = EMatrix(bias[best_feature], features[best_feature, :], label[best_feature])
    right_ematrix = EMatrix(bias[best_feature_not], features[best_feature_not, :], label[best_feature_not])
    left_info = {'prediction': split_info['best_delta_up'], 'ematrix': left_ematrix}
    right_info = {'prediction': split_info['best_delta_down'], 'ematrix': right_ematrix}
    split_info['left_info'] = left_info
    split_info['right_info'] = right_info
    return left_info, right_info, split_info

In [273]:
def build_tree_helper(params, info, parent):
    print("{d}".format(d=parent.depth) 
          if parent else '---',
          "({shape})".format(shape=info['ematrix'].label.shape[0]),
          end=' ', file=sys.stderr)
    if prior_finish(params, info, parent):
        return LeafData(info)
    left_info, right_info, split_info = split_ematrix(info['ematrix'])
    if post_finish(params, info, split_info, parent):
        return LeafData(info)
    node = TreeNode()
    node.depth = parent.depth + 1 if parent else 1
    node.val = SplitData(split_info)
    node.left = build_tree_helper(params, left_info, node)
    node.right = build_tree_helper(params, right_info, node)  
    return node

In [274]:
def build_tree(params, ematrix):
    info = {'ematrix': ematrix}
    return build_tree_helper(params, info=info, parent=None)

In [276]:
start_params = {'max_depth': 15}

tree = build_tree(start_params, EMatrix(np.zeros(label.shape), features, label))

--- (300000) 1 (195028) 2 (189790) 3 (122607) 4 (70681) 5 (29469) 6 (29449) 7 (29408) 8 (29387) 9 (138) 10 (129) 11 (106) 11 (23) 12 (11) 12 (12) 13 (5) 13 (7) 10 (9) 11 (6) 12 (3) 12 (3) 11 (3) 9 (29249) 10 (1834) 11 (200) 12 (7) 12 (193) 13 (92) 14 (85) 15 (83) 15 (2) 14 (7) 13 (101) 14 (94) 15 (16) 15 (78) 14 (7) 11 (1634) 12 (1544) 13 (232) 14 (207) 15 (202) 15 (5) 14 (25) 15 (14) 15 (11) 13 (1312) 14 (49) 15 (32) 15 (17) 14 (1263) 15 (146) 15 (1117) 12 (90) 13 (64) 13 (26) 14 (10) 15 (3) 15 (7) 14 (16) 10 (27415) 11 (8868) 12 (8552) 13 (115) 14 (104) 15 (2) 15 (102) 14 (11) 15 (7) 15 (4) 13 (8437) 14 (16) 15 (3) 15 (13) 14 (8421) 15 (247) 15 (8174) 12 (316) 13 (133) 14 (131) 15 (17) 15 (114) 14 (2) 13 (183) 14 (24) 14 (159) 15 (5) 15 (154) 11 (18547) 12 (15493) 13 (15439) 14 (402) 15 (254) 15 (148) 14 (15037) 15 (107) 15 (14930) 13 (54) 14 (7) 15 (4) 15 (3) 14 (47) 12 (3054) 13 (44) 13 (3010) 14 (2239) 15 (8) 15 (2231) 14 (771) 15 (30) 15 (741) 8 (21) 9 (6) 10 (3) 10 (3) 9 (15) 10

In [91]:
tree.left.left.left.left.val, tree.val

(-0.7472163664917022, <__main__.SplitData at 0x10753efd0>)

In [116]:
def tree2gv(tree):
    result = graphviz.Graph('ni')
    #result.attr(size='12,0')
    tree2gv_helper(tree, result, '')
    return result

In [117]:
def tree2gv_helper(node, result, id):
    idn = id
    result.node(idn, node.to_text(), shape='box') # node.shape())
    if isinstance(node, LeafData):
        return
    if node.left is not None:
        idl = id + '0'
        tree2gv_helper(node.left, result, idl)
        result.edge(idn, idl)
    if node.right is not None:
        idr = id + '1'
        tree2gv_helper(node.right, result, idr)
        result.edge(idn, idr)
    
    

In [125]:
#tree2gv(tree)

In [129]:
tree

<__main__.TreeNode at 0x107573710>

In [267]:
tree_arrays = init_arrays(tree, init_id(tree))

In [268]:
#tree_arrays

In [269]:
def apply(tree_arrays, features):
    qi = np.zeros(features.shape[0], dtype=np.int32)
    for current_depth in range(tree_arrays['treedepth']):
        fi = tree_arrays['features'][qi]
        f = np.choose(fi, features.T)
        t = tree_arrays['thresholds'][qi]
        #print(qi, fi, f, t)
        #if current_depth == 0: 
        #    print(fi, f.shape, features.shape, f)
        answer = (f < t)*1
        new_qi = answer*tree_arrays['yes_node'][qi] + (1-answer)*tree_arrays['no_node'][qi]
        qi = new_qi
    leaf_data = tree_arrays['leaf_data'][qi, 0]
    return leaf_data

In [270]:
pred = apply(tree_arrays, features)

In [271]:
metrics.roc_auc_score(label[:, 0], pred)

0.5245617998843118

In [272]:
metrics.roc_auc_score(label[:, 0], np.zeros(label.shape[0]))

0.5