In [27]:
import sys, os, io, json, numpy as np, random
#from sklearn.metrics import roc_auc_score
#import xgboost as xgb
import tensorflow as tf
import pandas as pd
from split import split_graph, split_interface, make_split
import graphviz

In [2]:
# Open the pool archive and pull out the three arrays used in this notebook,
# one key at a time (same keys, same order as before).
pool = np.load('../prepare_dataset/pool_00.npz')
features_bool = pool['features']
f_time = pool['f_time']
label = pool['label']

In [3]:
# Turn the raw pool features into continuous ones by drawing, for every entry,
# a Gaussian sample centred on that entry's value (times 1.0, which promotes
# it to float) with standard deviation 1.0.
#
# The draw uses a dedicated, seeded generator instead of the global NumPy RNG:
# without a seed, every kernel restart produces different features and hence
# different thresholds/losses in all the split results below. A local
# RandomState also leaves the global random state untouched.
NOISE_SEED = 0
features = np.random.RandomState(NOISE_SEED).normal(loc=features_bool*1.0, scale=1.0)

In [6]:
# Copy of `features` with columns 0 and 7 exchanged; every other column keeps
# its position. The copy is assembled from slices rather than swapped in place:
# a tuple swap such as `a[:, 0], a[:, 7] = a[:, 7], a[:, 0]` operates on views
# of the same buffer, so the first assignment would overwrite the data the
# second one still needs.
col_0 = features[:, 0:1]
col_7 = features[:, 7:8]
features_swap = np.concatenate(
    [col_7, features[:, 1:7], col_0, features[:, 8:]],
    axis=1,
)

In [7]:
%%time
# Baseline call: run make_split on the noisy features with an all-zero bias.
# The dict displayed below is its return value (chosen feature index,
# threshold, deltas and losses).
make_split(np.zeros(label.shape), features, label)

CPU times: user 5.26 s, sys: 808 ms, total: 6.07 s
Wall time: 2.59 s


{'avg_current_loss': array([0.69314718]),
 'best_avg_loss': 0.6124292611643064,
 'best_delta_down': -0.8319551880501467,
 'best_delta_up': -0.7878868675267141,
 'best_feature_index': 0,
 'best_index': 195027,
 'best_loss': 183728.7783492919,
 'current_loss': array([207944.15416803]),
 'thr': 0.6264478451338642}

In [8]:
%%time
# Same call on the column-swapped features. In the recorded output everything
# matches the previous cell except 'best_feature_index' (7 instead of 0),
# i.e. the result follows the column that was moved.
make_split(np.zeros(label.shape), features_swap, label)

CPU times: user 5.59 s, sys: 591 ms, total: 6.18 s
Wall time: 1.87 s


{'avg_current_loss': array([0.69314718]),
 'best_avg_loss': 0.6124292611643064,
 'best_delta_down': -0.8319551880501467,
 'best_delta_up': -0.7878868675267141,
 'best_feature_index': 7,
 'best_index': 195027,
 'best_loss': 183728.7783492919,
 'current_loss': array([207944.15416803]),
 'thr': 0.6264478451338642}

In [34]:
class EMatrix:
    """Bundle of the three inputs handed to ``make_split``.

    The arguments are stored by reference under attributes of the same
    name; no copying or validation is performed.
    """

    def __init__(self, bias, features, label):
        self.bias, self.features, self.label = bias, features, label

In [95]:
class LeafData:
    """Terminal node of the tree: holds one predicted value."""

    def __init__(self, info):
        # Only the 'prediction' entry of `info` is kept.
        self.val = info['prediction']

    def to_text(self, floatformat = '.6f'):
        """Return the stored value formatted with the spec `floatformat`."""
        return format(self.val, floatformat)

    def shape(self):
        """Name of the graphviz node shape associated with leaves."""
        return 'box'

class SplitData:
    """Wrapper around a split description dict that can render itself as text.

    `val` must provide the keys 'best_feature_index' and 'thr'; these are the
    only entries read here.
    """

    def __init__(self, val):
        self.val = val

    def to_text(self, floatformat = '.4f'):
        """Return a label of the form ``f_{<index>} < <threshold>``.

        The threshold is formatted with the spec `floatformat`; the braces
        around the index are literal characters in the output.
        """
        feature_index = self.val['best_feature_index']
        threshold_text = format(self.val['thr'], floatformat)
        return 'f_{{{}}} < {}'.format(feature_index, threshold_text)
    
        
class TreeNode:
    """Internal node of the binary tree.

    A fresh node has no children, depth 0 and no payload; the tree builder
    fills in `depth`, `val`, `left` and `right` afterwards.
    """

    def __init__(self):
        self.left = self.right = None   # child nodes (or leaves)
        self.depth = 0
        self.val = None                 # payload object exposing to_text()

    def to_text(self, floatformat = '.6f'):
        """Delegate to the payload's ``to_text``, forwarding `floatformat`.

        Note that the spec is always passed on, so the payload's own default
        format is never used when rendering through the node.
        """
        payload = self.val
        return payload.to_text(floatformat)

    def shape(self):
        """Name of the graphviz node shape associated with internal nodes."""
        return 'circle'
        

In [96]:
def prior_finish(params, info, parent):
    """Decide, before any split is computed, whether to stop and emit a leaf.

    Returns False for the root call (`parent` is None). Otherwise returns
    whether the parent's depth has reached ``params['max_depth']``.
    `info` is accepted but not inspected.
    """
    if parent is not None:
        return params['max_depth'] <= parent.depth
    return False

def post_finish(params, info, split_info, parent):
    """Decide, after a split has been computed, whether to discard it.

    Placeholder rule: never stops, whatever the arguments are.
    """
    return False

In [97]:
# Tree-building settings; 'max_depth' is the only key read by prior_finish.
start_params = dict(max_depth=4)

In [98]:
def split_ematrix(ematrix):
    """Split `ematrix` in two along the split returned by ``make_split``.

    Rows whose value in column ``split_info['best_feature_index']`` is
    strictly below ``split_info['thr']`` go to the left part, all other rows
    to the right part.

    Returns a tuple ``(left_info, right_info, split_info)`` where each
    ``*_info`` is a dict with keys 'prediction' and 'ematrix', and
    `split_info` is the untouched result of ``make_split``.

    NOTE(review): the "< thr" side is paired with 'best_delta_up' and the
    other side with 'best_delta_down' -- confirm this matches make_split's
    naming convention.
    """
    split_info = make_split(ematrix.bias, ematrix.features, ematrix.label)

    column = ematrix.features[:, split_info['best_feature_index']]
    below_thr = column < split_info['thr']

    def _part(mask, delta_key):
        # Row-filter all three arrays with the same boolean mask.
        subset = EMatrix(ematrix.bias[mask], ematrix.features[mask, :], ematrix.label[mask])
        return {'prediction': split_info[delta_key], 'ematrix': subset}

    left_info = _part(below_thr, 'best_delta_up')
    right_info = _part(np.logical_not(below_thr), 'best_delta_down')
    return left_info, right_info, split_info

In [99]:
def build_tree_helper(params, info, parent):
    """Recursively build the subtree for `info` underneath `parent`.

    Returns a LeafData when one of the finish rules fires, otherwise a
    TreeNode whose payload is the chosen split and whose children are built
    left first, then right. The root node gets depth 1; every other node gets
    its parent's depth plus one.

    Side effect: writes the parent's depth ('---' for the root call) followed
    by a space to stderr on every call, as a progress trace.

    NOTE(review): a leaf is built from ``info['prediction']``, but the root
    `info` created by build_tree has no such key, so a finish rule firing on
    the root call would raise KeyError. With the current rules that cannot
    happen.
    """
    trace_label = parent.depth if parent else '---'
    print(trace_label, end=' ', file=sys.stderr)

    # Cheap check first, so no split is computed for nodes that become leaves.
    if prior_finish(params, info, parent):
        return LeafData(info)

    left_info, right_info, split_info = split_ematrix(info['ematrix'])
    if post_finish(params, info, split_info, parent):
        return LeafData(info)

    node = TreeNode()
    if parent:
        node.depth = parent.depth + 1
    else:
        node.depth = 1
    node.val = SplitData(split_info)
    node.left = build_tree_helper(params, left_info, node)
    node.right = build_tree_helper(params, right_info, node)
    return node

In [100]:
def build_tree(params, ematrix):
    """Build a tree for `ematrix` and return its root.

    Wraps `ematrix` in the info dict expected by build_tree_helper and starts
    the recursion with no parent.
    """
    return build_tree_helper(params, info={'ematrix': ematrix}, parent=None)

In [101]:
# Build the tree on the noisy features, starting from an all-zero bias.
root_bias = np.zeros(label.shape)
tree = build_tree(start_params, EMatrix(root_bias, features, label))

--- 1 2 3 4 4 3 4 4 2 3 4 4 3 4 4 1 2 3 4 4 3 4 4 2 3 4 4 3 4 4 

In [91]:
# With max_depth=4 the split nodes sit at depths 1..4, so four steps to the
# left from the root land on a leaf. Show its value next to the root payload.
leftmost_leaf = tree.left.left.left.left
leftmost_leaf.val, tree.val

(-0.7472163664917022, <__main__.SplitData at 0x10753efd0>)

In [116]:
def tree2gv(tree):
    """Convert `tree` into a ``graphviz.Graph`` named 'ni' and return it.

    The root is registered under the empty-string id; tree2gv_helper derives
    the ids of all descendants from it.
    """
    graph = graphviz.Graph('ni')
    # Optional size hint, currently disabled: graph.attr(size='12,0')
    tree2gv_helper(tree, graph, '')
    return graph

In [117]:
def tree2gv_helper(node, result, id):
    """Add `node` and everything below it to the graph `result`.

    `id` is the graph identifier for `node`. A left child gets the parent's
    id with '0' appended, a right child with '1' appended, so an id spells
    out the path from the root. Each child's subtree is emitted before the
    edge that connects the child to `node`.

    Every node is drawn as a box; the per-class ``shape()`` methods are
    currently not consulted.
    """
    result.node(id, node.to_text(), shape='box')

    # Leaves carry no children, so there is nothing further to emit.
    if isinstance(node, LeafData):
        return

    for suffix, child in (('0', node.left), ('1', node.right)):
        if child is None:
            continue
        child_id = id + suffix
        tree2gv_helper(child, result, child_id)
        result.edge(id, child_id)
    
    

In [125]:
#tree2gv(tree)