In [1]:
# %load simulations
from TreeClassifier import *
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn import tree
from sklearn.svm import LinearSVC
import numpy as np
import datetime
import os

In [2]:
# Tree depth used by run_comparison: passed as the second positional argument
# to TreeClassifier and as max_depth to sklearn's DecisionTreeClassifier.
depth = 2
# Root directory under which manage_run creates one sub-directory per run.
# The original Windows path stays the default; set SIMULATIONS_OUTPUT_DIR to
# redirect the output without editing the notebook.
output_dir = os.environ.get("SIMULATIONS_OUTPUT_DIR", r"C:\Temp\simulations-tests")

In [3]:
def get_simulation_data_1(d, n):
    """Sample ``n`` standard-normal points in ``d`` dimensions, labelled by their coordinate sum.

    Returns:
        (X, y) where X is an (n, d) float array and y is a length-n boolean
        array that is True exactly where the row sum of X is >= 0.
    """
    samples = np.random.normal(size=[n, d])
    row_totals = samples.sum(axis=1)
    labels = row_totals >= 0.0
    return samples, labels


def get_simulation_data_2(d, n):
    """Sample a linearly separable problem with a random separating direction.

    Draws X as an (n, d) standard-normal matrix and a random (d, 1) weight
    vector w, then labels each row True when its projection on w is >= 0.
    The fraction of positive labels is printed as a side effect.

    Returns:
        (X, y) where y is a length-n boolean array.
    """
    X = np.random.normal(size=[n, d])
    w = np.random.normal(size=[d, 1])
    y = np.greater_equal(np.dot(X, w).reshape(-1), 0.0)
    # Single-argument parenthesised print: same output on Python 2, and valid
    # syntax on Python 3 (the bare print statement is not).
    print('positive class weight {}'.format(float(y.sum()) / len(y)))
    return X, y


def get_simulation_data_3(d, n):
    """Sample an XOR-like problem defined by two random directions.

    Draws X as an (n, d) standard-normal matrix and two random (d, 1) weight
    vectors. A row is labelled True when the product of its two projections
    is >= 0, i.e. when both projections have the same sign (or one is zero).
    The fraction of positive labels is printed as a side effect.

    Returns:
        (X, y) where y is a length-n boolean array.
    """
    X = np.random.normal(size=[n, d])
    w1 = np.random.normal(size=[d, 1])
    w2 = np.random.normal(size=[d, 1])
    y = np.greater_equal(np.multiply(np.dot(X, w1), np.dot(X, w2)).reshape(-1), 0.0)
    # Single-argument parenthesised print: same output on Python 2, and valid
    # syntax on Python 3 (the bare print statement is not).
    print('positive class weight {}'.format(float(y.sum()) / len(y)))
    return X, y

def get_simulation_data_4(d, n):
    """Sample an XOR-like problem over two disjoint halves of the features.

    Draws X as an (n, d) standard-normal matrix and splits its columns into a
    first block of ``d // 2`` columns and a second block holding the rest.
    Each block is projected on its own random weight vector, and a row is
    labelled True when the product of the two projections is >= 0.
    The fraction of positive labels is printed as a side effect.

    Returns:
        (X, y) where y is a length-n boolean array.
    """
    X = np.random.normal(size=[n, d])
    # Floor division keeps the split index an int on Python 3 as well.
    half = d // 2
    X1 = X[:, :half]
    X2 = X[:, half:]
    w1 = np.random.normal(size=[half, 1])
    # The second block has d - half columns, which is half + 1 when d is odd;
    # sizing w2 as d/2 made np.dot fail with a shape mismatch for odd d.
    w2 = np.random.normal(size=[d - half, 1])
    y = np.greater_equal(np.multiply(np.dot(X1, w1), np.dot(X2, w2)).reshape(-1), 0.0)
    # Single-argument parenthesised print: same output on Python 2, and valid
    # syntax on Python 3 (the bare print statement is not).
    print('positive class weight {}'.format(float(y.sum()) / len(y)))
    return X, y

In [4]:
def run_comparison(X, y):
    """Cross-validate three classifiers on the same data and print their mean scores.

    The classifiers are the project's TreeClassifier, sklearn's
    DecisionTreeClassifier (both limited to the module-level ``depth``) and a
    LinearSVC with default settings. Each is scored with 5-fold
    cross_val_score.

    Args:
        X: feature matrix passed straight to cross_val_score.
        y: label vector passed straight to cross_val_score.

    Returns:
        (stochastic_scores, tree_scores, svm_scores): the per-fold score
        arrays, in that order.
    """
    # NOTE(review): the meaning of the 0.001 first argument is defined by
    # TreeClassifier, which is not visible here - confirm against that module.
    tree_classifier = TreeClassifier(0.001, depth, normalizer_mode="norm", feature_drop_probability=0.0)
    stochastic_scores = cross_val_score(tree_classifier, X, y, cv=5)
    # Single-argument parenthesised prints: same output on Python 2, and valid
    # syntax on Python 3 (the bare print statement is not).
    print("stochastic")
    print(stochastic_scores.mean())

    tree_scores = cross_val_score(tree.DecisionTreeClassifier(max_depth=depth), X, y, cv=5)
    print("regular")
    print(tree_scores.mean())

    svm_scores = cross_val_score(LinearSVC(), X, y, cv=5)
    print("svm")
    print(svm_scores.mean())

    return stochastic_scores, tree_scores, svm_scores

In [5]:
def manage_run(data_generation, sim_name, d=100, n=1000, noise = False):
    """Generate one dataset, run the classifier comparison on it and save everything.

    A run directory named ``<sim_name>[noise-]<timestamp>`` is created under
    the module-level ``output_dir``. Two archives are written there:

    * ``data.npz``: X, X_n, y, rho (positional, so stored as arr_0..arr_3)
    * ``results.npz``: per-fold scores and their mean for each of the three
      classifiers, in the order returned by run_comparison (arr_0..arr_5)

    Args:
        data_generation: callable ``(d, n) -> (X, y)`` producing the dataset.
        sim_name: prefix for the run directory name.
        d: number of features requested from ``data_generation``.
        n: number of samples requested from ``data_generation``.
        noise: when True, Gaussian noise (scale 0.1) is added to the features
            and the classifiers are evaluated on the noisy features.
    """
    # Make the timestamp usable as a directory name: no ':', ' ' or '.'.
    test_time = str(datetime.datetime.now()).replace(':','-').replace(' ','-').replace('.','-')
    name = sim_name
    if noise:
        name = name + 'noise-'
    name = name + test_time
    current_dir = os.path.join(output_dir,name)
    if not os.path.exists(current_dir):
        os.makedirs(current_dir)

    X,y = data_generation(d,n)
    # Fraction of positive labels, stored next to the data for reference.
    rho = float(y.sum()) / len(y)
    if noise:
        # Labels stay those of the clean X; only the features are perturbed.
        X_n = X + np.random.normal(scale=0.1,size=X.shape)
    else:
        X_n = X
    np.savez(os.path.join(current_dir,'data.npz'), X, X_n, y, rho)

    # Evaluate on X_n, not X: previously the noisy matrix was only saved, so
    # "noise" runs silently reported clean-data results.
    stochastic_scores, tree_scores, svm_scores = run_comparison(X_n,y)
    np.savez(os.path.join(current_dir,'results.npz'), stochastic_scores, stochastic_scores.mean(),
             tree_scores, tree_scores.mean(), svm_scores, svm_scores.mean())

In [6]:
# Small smoke run: simulation 1 with 3 features, 100 samples and noise enabled.
manage_run(
    get_simulation_data_1,
    'sim1-',
    d=3,
    n=100,
    noise=True,
)

splitting leaf with weight 1.0 and purity 0.443037974684
l_child weight 0.50234870443 and purity 0.572855976151
r_child weight 0.49765129557 and purity 0.311994600679
stopping level 1
stopping level 1
splitting leaf with weight 1.0 and purity 0.4375
l_child weight 0.471258405354 and purity 0.574570931679
r_child weight 0.528741594646 and purity 0.315331006038
stopping level 1
stopping level 1
splitting leaf with weight 1.0 and purity 0.4375
l_child weight 0.491364245161 and purity 0.577908117637
r_child weight 0.508635754839 and purity 0.301859656039
stopping level 1
stopping level 1
splitting leaf with weight 1.0 and purity 0.4375
l_child weight 0.543956381672 and purity 0.322048956859
r_child weight 0.456043618328 and purity 0.575206853387
stopping level 1
stopping level 1
splitting leaf with weight 1.0 and purity 0.444444444444
l_child weight 0.529654087857 and purity 0.312779008384
r_child weight 0.470345912143 and purity 0.592712207889
stopping level 1
stopping level 1
stochastic


In [None]:
# Full experiment grid: for each simulation in turn, 5 repetitions, each run
# once with noise and once without, using manage_run's default d and n.
simulations = [
    (get_simulation_data_1, 'sim1-'),
    (get_simulation_data_2, 'sim2-'),
    (get_simulation_data_3, 'sim3-'),
    (get_simulation_data_4, 'sim4-'),
]
for data_generation, sim_name in simulations:
    for i in range(5):
        for b in [True, False]:
            manage_run(data_generation, sim_name, noise=b)