In [1]:
# %load simulations
from TreeClassifier import *
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn import tree
import numpy as np
import datetime
import os

In [2]:
# Depth setting shared by both classifiers in run_comparison (it is passed as
# max_depth to the sklearn decision tree and as the second positional argument
# of TreeClassifier).
depth = 10
# Root folder under which each run creates its own timestamped sub-directory.
# The historical Windows location stays the default; set the
# SIMULATIONS_OUTPUT_DIR environment variable to redirect the output on
# machines where that path does not exist.
output_dir = os.environ.get("SIMULATIONS_OUTPUT_DIR", r"C:\Temp\simulations-tests")

In [3]:
def get_simulation_data_1(d=100,n=1000):
    """Simulation 1: standard-normal features labelled by the sign of their row sum.

    Parameters
    ----------
    d : int
        Number of feature columns.
    n : int
        Number of sample rows.

    Returns
    -------
    (X, y) : tuple
        X is an (n, d) array drawn from N(0, 1); y is a length-n boolean
        array that is True wherever the corresponding row of X sums to a
        value >= 0.
    """
    features = np.random.normal(size=(n, d))
    row_totals = features.sum(axis=1)
    labels = row_totals >= 0.0
    return features, labels


def get_simulation_data_2(d=100,n=1000):
    """Simulation 2: standard-normal features labelled by a random linear separator.

    Draws X ~ N(0, 1) with shape (n, d) and a weight column w ~ N(0, 1) with
    shape (d, 1); sample i is labelled True when dot(X[i], w) >= 0. The
    fraction of positive labels is printed as a side effect.

    Parameters
    ----------
    d : int
        Number of feature columns.
    n : int
        Number of sample rows.

    Returns
    -------
    (X, y) : tuple
        The (n, d) feature array and the length-n boolean label array.

    Raises
    ------
    ZeroDivisionError
        If n == 0, because the positive fraction divides by len(y).
    """
    X = np.random.normal(size=[n, d])
    w = np.random.normal(size=[d, 1])
    y = np.greater_equal(np.dot(X,w).reshape(-1), 0.0)
    # Single-argument parenthesised print: identical output under Python 2,
    # and valid syntax under Python 3 (the bare print statement is not).
    print('positive class weight {}'.format(float(y.sum()) / len(y)))
    return X, y


def get_simulation_data_3(d=100,n=1000):
    """Simulation 3: labels from the sign of a product of two random projections.

    Draws X ~ N(0, 1) with shape (n, d) and two independent weight columns
    w1, w2 ~ N(0, 1) with shape (d, 1). Sample i is labelled True when
    dot(X[i], w1) * dot(X[i], w2) >= 0, i.e. when both projections of the
    full feature vector have the same sign (or either is zero). The fraction
    of positive labels is printed as a side effect.

    Parameters
    ----------
    d : int
        Number of feature columns.
    n : int
        Number of sample rows.

    Returns
    -------
    (X, y) : tuple
        The (n, d) feature array and the length-n boolean label array.

    Raises
    ------
    ZeroDivisionError
        If n == 0, because the positive fraction divides by len(y).
    """
    X = np.random.normal(size=[n, d])
    w1 = np.random.normal(size=[d, 1])
    w2 = np.random.normal(size=[d, 1])
    y = np.greater_equal(np.multiply(np.dot(X,w1),np.dot(X,w2)).reshape(-1), 0.0)
    # Single-argument parenthesised print: identical output under Python 2,
    # and valid syntax under Python 3 (the bare print statement is not).
    print('positive class weight {}'.format(float(y.sum()) / len(y)))
    return X, y

def get_simulation_data_4(d=100,n=1000):
    """Simulation 4: labels from the product of projections of two feature halves.

    Draws X ~ N(0, 1) with shape (n, d) and splits its columns into a left
    part X1 (the first d // 2 columns) and a right part X2 (the remaining
    columns). Each part is projected onto its own random N(0, 1) weight
    column, and sample i is labelled True when the product of the two
    projections is >= 0. The fraction of positive labels is printed as a
    side effect.

    Parameters
    ----------
    d : int
        Number of feature columns. Odd values are supported: the right part
        then has one more column than the left part.
    n : int
        Number of sample rows.

    Returns
    -------
    (X, y) : tuple
        The full (n, d) feature array and the length-n boolean label array.

    Raises
    ------
    ZeroDivisionError
        If n == 0, because the positive fraction divides by len(y).
    """
    X = np.random.normal(size=[n, d])
    # Floor division keeps the split index an int on Python 3 as well; plain
    # `/` yields a float there, which is not a valid slice bound or size.
    half = d // 2
    X1 = X[:, :half]
    X2 = X[:, half:]
    w1 = np.random.normal(size=[half, 1])
    # Size w2 from the actual width of X2 so that odd d does not produce a
    # shape mismatch in the dot product below (X2 has d - half columns).
    w2 = np.random.normal(size=[d - half, 1])
    y = np.greater_equal(np.multiply(np.dot(X1,w1),np.dot(X2,w2)).reshape(-1), 0.0)
    # Single-argument parenthesised print: identical output under Python 2,
    # and valid syntax under Python 3 (the bare print statement is not).
    print('positive class weight {}'.format(float(y.sum()) / len(y)))
    return X, y

In [4]:
def run_comparison(X,y):
    """Cross-validate the project TreeClassifier against sklearn's decision tree.

    Both estimators are scored with ``cross_val_score(..., cv=5)`` on the same
    data, and the mean score of each is printed under the headings
    "stochastic" (TreeClassifier) and "regular" (DecisionTreeClassifier).

    Parameters
    ----------
    X : array-like
        Feature matrix passed straight through to cross_val_score.
    y : array-like
        Labels passed straight through to cross_val_score.

    Returns
    -------
    (stochastic_scores, tree_scores) : tuple
        The score arrays returned by cross_val_score for the TreeClassifier
        and for the sklearn DecisionTreeClassifier, in that order.
    """
    # NOTE(review): the meaning of the first positional argument (0.001) is
    # defined by TreeClassifier, which is not visible here -- confirm there.
    tree_classifier = TreeClassifier(0.001, depth, normalizer_mode="norm", feature_drop_probability=0.0)
    stochastic_scores = cross_val_score(tree_classifier, X, y, cv=5)
    # Single-argument parenthesised prints behave the same on Python 2 and
    # are valid syntax on Python 3 (the bare print statement is not).
    print("stochastic")
    print(stochastic_scores.mean())

    # Baseline: a standard decision tree using the same module-level depth.
    tree_scores = cross_val_score(tree.DecisionTreeClassifier(max_depth=depth), X, y, cv=5)
    print("regular")
    print(tree_scores.mean())

    return stochastic_scores, tree_scores

In [None]:
def manage_run(data_generation, sim_name):
    """Run one simulation trial and persist its data and scores to disk.

    A directory named ``sim_name`` followed by the current timestamp (with
    ':', ' ' and '.' replaced by '-') is created under the module-level
    ``output_dir`` if it does not already exist. Two archives are written
    into it with ``np.savez`` using positional arguments:

    * ``data.npz``    -- X, y, and the fraction of positive labels.
    * ``results.npz`` -- stochastic scores, their mean, tree scores, their mean.

    Parameters
    ----------
    data_generation : callable
        Called with no arguments; must return an ``(X, y)`` pair.
    sim_name : str
        Prefix for the run directory name.
    """
    # Build a filesystem-friendly timestamp from the default datetime string.
    stamp = str(datetime.datetime.now())
    for separator in (':', ' ', '.'):
        stamp = stamp.replace(separator, '-')

    run_dir = os.path.join(output_dir, sim_name + stamp)
    if not os.path.exists(run_dir):
        os.makedirs(run_dir)

    # Generate the dataset and store it together with its class balance.
    X, y = data_generation()
    positive_fraction = float(y.sum()) / len(y)
    data_path = os.path.join(run_dir, 'data.npz')
    np.savez(data_path, X, y, positive_fraction)

    # Score both classifiers and store the per-fold scores and their means.
    stochastic_scores, tree_scores = run_comparison(X, y)
    results_path = os.path.join(run_dir, 'results.npz')
    np.savez(
        results_path,
        stochastic_scores,
        stochastic_scores.mean(),
        tree_scores,
        tree_scores.mean(),
    )

In [None]:
# Single trial of simulation 1: generates the data, runs both classifiers and
# writes the results into a new timestamped 'sim1-...' directory under output_dir.
manage_run(get_simulation_data_1,'sim1-')

splitting leaf with weight 1.0 and purity 0.499374217772
l_child weight 0.499989479895 and purity 0.529376421491
r_child weight 0.500010520105 and purity 0.469373276532
splitting leaf with weight 0.499989479895 and purity 0.529376421491
l_child weight 0.252788026129 and purity 0.558927269539
r_child weight 0.247201453765 and purity 0.499157745854
splitting leaf with weight 0.252788026129 and purity 0.558927269539
l_child weight 0.12358983924 and purity 0.528662953992
r_child weight 0.129198186889 and purity 0.587877845265
splitting leaf with weight 0.12358983924 and purity 0.528662953992
l_child weight 0.0611284555449 and purity 0.498605260313
r_child weight 0.0624613836953 and purity 0.558079215417
splitting leaf with weight 0.0611284555449 and purity 0.498605260313
l_child weight 0.0305562290494 and purity 0.528311719701
r_child weight 0.0305722264955 and purity 0.468914345344
splitting leaf with weight 0.0305562290494 and purity 0.528311719701
l_child weight 0.0151149373739 and puri

In [None]:
# Run every simulation three times, in order; each trial writes to its own
# timestamped directory whose name starts with the given prefix.
simulation_plan = [
    (get_simulation_data_1, 'sim1-'),
    (get_simulation_data_2, 'sim2-'),
    (get_simulation_data_3, 'sim3-'),
    (get_simulation_data_4, 'sim4-'),
]
for generator, prefix in simulation_plan:
    for run_index in range(3):
        manage_run(generator, prefix)