In [None]:
# %load simulations
from TreeClassifier import *
from TreeHardClassifier import *
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn import tree
from sklearn.svm import LinearSVC
import numpy as np
import datetime
import os

In [None]:
# Tree depth shared by the models compared in run_comparison: it is passed as the
# second positional argument to TreeClassifier and as max_depth to sklearn's
# DecisionTreeClassifier.
depth = 10
# Root folder for experiment artifacts; manage_run creates one sub-folder per run
# beneath it. NOTE(review): absolute Windows path -- adjust before running elsewhere.
output_dir = r"C:\Temp\simulations-tests-11"

In [None]:
def get_simulation_data_1(d,n):
    """Simulation 1: the response is the plain sum of the features.

    Draws an (n, d) matrix of standard-normal features and returns it together
    with the per-row sum of its entries.

    The response is returned as a real-valued score; turning it into class
    labels (score >= 0) is left to the caller.

    Returns:
        (X, y) with X of shape (n, d) and y of shape (n,).
    """
    features = np.random.normal(size=(n, d))
    row_totals = features.sum(axis=1)
    return features, row_totals


def get_simulation_data_2(d,n):
    """Simulation 2: the response is a random linear projection of the features.

    Draws an (n, d) matrix of standard-normal features, then a standard-normal
    weight column of shape (d, 1), and returns the features with the flattened
    product X . w.

    The response is returned as a real-valued score; turning it into class
    labels (score >= 0) is left to the caller.

    Returns:
        (X, y) with X of shape (n, d) and y of shape (n,).
    """
    features = np.random.normal(size=(n, d))
    weights = np.random.normal(size=(d, 1))
    projection = features.dot(weights).ravel()
    return features, projection


def get_simulation_data_3(d,n):
    """Simulation 3: the response is the product of two random linear projections.

    Draws an (n, d) matrix of standard-normal features and two independent
    standard-normal weight columns of shape (d, 1); the response for each row
    is (x . w1) * (x . w2).

    The response is returned as a real-valued score; turning it into class
    labels (score >= 0) is left to the caller.

    Returns:
        (X, y) with X of shape (n, d) and y of shape (n,).
    """
    features = np.random.normal(size=(n, d))
    first_weights = np.random.normal(size=(d, 1))
    second_weights = np.random.normal(size=(d, 1))
    first_score = features.dot(first_weights)
    second_score = features.dot(second_weights)
    return features, (first_score * second_score).ravel()

def get_simulation_data_4(d,n):
    """Simulation 4: product of two projections taken on disjoint feature halves.

    Draws an (n, d) matrix of standard-normal features and splits its columns
    into a leading block of d // 2 columns and a trailing block holding the
    remaining d - d // 2 columns. Each block gets its own standard-normal
    weight column, and the response for a row is
    (x_lead . w1) * (x_trail . w2).

    The response is returned as a real-valued score; turning it into class
    labels (score >= 0) is left to the caller.

    Args:
        d: number of features (columns of X).
        n: number of samples (rows of X).

    Returns:
        (X, y) with X of shape (n, d) and y of shape (n,).
    """
    # Explicit floor division: a plain `/` only yields an int on Python 2 and
    # would produce a float (invalid as slice index / array size) on Python 3.
    half = d // 2
    X = np.random.normal(size=[n, d])
    X1 = X[:, :half]
    X2 = X[:, half:]
    w1 = np.random.normal(size=[half, 1])
    # X2 has d - half columns (one more than X1 when d is odd), so w2 must be
    # sized to match it; sizing it with `half` breaks the dot product for odd d.
    w2 = np.random.normal(size=[d - half, 1])
    y = np.multiply(np.dot(X1, w1), np.dot(X2, w2)).reshape(-1)
    return X, y

def get_simulation_data_5(d,n):
    """Simulation 5: the response is the larger of two random linear projections.

    Draws an (n, d) matrix of standard-normal features and two independent
    standard-normal weight columns of shape (d, 1). For each row the larger of
    the two projections is taken, and the median over all rows is subtracted
    so that the returned scores are centred on a median of zero.

    The response is returned as a real-valued score; turning it into class
    labels (score >= 0) is left to the caller.

    Returns:
        (X, y) with X of shape (n, d) and y of shape (n,).
    """
    features = np.random.normal(size=(n, d))
    first_weights = np.random.normal(size=(d, 1))
    second_weights = np.random.normal(size=(d, 1))
    # Element-wise maximum of the two (n, 1) projections, flattened to (n,).
    larger = np.maximum(features.dot(first_weights), features.dot(second_weights)).ravel()
    centred = larger - np.median(larger)
    return features, centred

def get_simulation_data_6(d,n):
    """Simulation 6: product of the feature sums over the two halves of the columns.

    Draws an (n, d) matrix of standard-normal features and builds two 0/1
    indicator columns of length d: u1 selects the leading d // 2 features and
    u2 selects the remaining d - d // 2 features. The response for a row is
    (x . u1) * (x . u2), i.e. the sum of the leading features times the sum of
    the trailing features.

    The response is returned as a real-valued score; turning it into class
    labels (score >= 0) is left to the caller.

    Args:
        d: number of features (columns of X).
        n: number of samples (rows of X).

    Returns:
        (X, y) with X of shape (n, d) and y of shape (n,).
    """
    # Explicit floor division: a plain `/` only yields an int on Python 2 and
    # would produce a float (invalid as an array size) on Python 3.
    lead = d // 2
    # The trailing group takes the leftover column when d is odd, so that both
    # indicator vectors always have exactly d rows and can be dotted with X.
    trail = d - lead
    X = np.random.normal(size=[n, d])
    u1 = np.concatenate([np.ones([lead, 1]), np.zeros([trail, 1])])
    u2 = np.concatenate([np.zeros([lead, 1]), np.ones([trail, 1])])
    y = np.multiply(np.dot(X, u1), np.dot(X, u2)).reshape(-1)
    return X, y

In [None]:
def run_comparison(X,y):
    """Cross-validate three classifiers on (X, y) and report their mean scores.

    The estimators are evaluated one after another with 5-fold
    cross_val_score, in this order:
      1. "stochastic" - the project's TreeClassifier, built with the
         module-level `depth`,
      2. "regular"    - sklearn's DecisionTreeClassifier with max_depth=depth,
      3. "svm"        - sklearn's LinearSVC without an intercept term.
    For each one its label and the mean of its fold scores are printed.

    Returns:
        (stochastic_scores, tree_scores, svm_scores): the per-fold score
        arrays, in the evaluation order above.
    """
    def _score_and_report(label, estimator):
        # Run 5-fold cross-validation, print the label and the mean score,
        # and hand the raw per-fold scores back to the caller.
        fold_scores = cross_val_score(estimator, X, y, cv=5)
        print(label)
        print(fold_scores.mean())
        return fold_scores

    stochastic_scores = _score_and_report(
        "stochastic",
        TreeClassifier(0.001, depth, normalizer_mode="norm", fit_full_tree=True, print_debug=False),
    )

    # The TreeHardClassifier ("sharp") baseline is currently switched off:
    #     sharp_tree_classifier = TreeHardClassifier(0.001, depth, normalizer_mode="norm", print_debug=False)
    #     sharp_scores = cross_val_score(sharp_tree_classifier, X, y, cv=5)

    tree_scores = _score_and_report(
        "regular",
        tree.DecisionTreeClassifier(max_depth=depth),
    )

    svm_scores = _score_and_report(
        "svm",
        LinearSVC(fit_intercept=False),
    )

    return stochastic_scores, tree_scores, svm_scores

In [None]:
def manage_run(data_generation, sim_name, d=100, n=1000, noise = None):
    """Run one simulated experiment end to end and persist its data and scores.

    Steps:
      1. Build a run name from `sim_name`, the noise level (when set) and the
         current timestamp, print it, and make sure a folder with that name
         exists under the module-level `output_dir`.
      2. Call `data_generation(d, n)` to obtain features X and real-valued
         scores y.
      3. When `noise` is truthy, negate each score independently with
         probability `noise`, printing how many were flipped.
      4. Threshold the scores into boolean labels (score >= 0), print the
         fraction of positive labels, and save X, y and that fraction to
         'data.npz' in the run folder.
      5. Evaluate the classifiers with run_comparison and save the per-fold
         scores and their means to 'results.npz'.

    Args:
        data_generation: callable taking (d, n) and returning (X, y).
        sim_name: prefix of the run folder name.
        d: number of features requested from `data_generation`.
        n: number of samples requested from `data_generation`.
        noise: flip probability; None (or any falsy value) disables flipping.

    Returns:
        (stochastic mean, tree mean, svm mean) of the cross-validation scores.
    """
    # Timestamp with ':', ' ' and '.' swapped for '-' before use in a folder name.
    stamp = str(datetime.datetime.now())
    for separator in (':', ' ', '.'):
        stamp = stamp.replace(separator, '-')

    run_name = sim_name
    if noise:
        run_name += 'noise-' + str(noise) + '-'
    run_name += stamp
    print(run_name)

    run_dir = os.path.join(output_dir, run_name)
    if not os.path.exists(run_dir):
        os.makedirs(run_dir)

    X, y = data_generation(d, n)
    if noise:
        # flip is a 0/1 mask: entries equal to 1 get their score negated.
        flip = np.random.binomial(1, noise, y.shape)
        print('flipped: ' + str(flip.sum()))
        y = flip * (-y) + (1 - flip) * y

    # Binarize: non-negative scores form the positive class.
    y = np.greater_equal(y, 0.0)
    rho = float(y.sum()) / len(y)
    print('positive class weight {}'.format(rho))
    np.savez(os.path.join(run_dir, 'data.npz'), X, y, rho)

    stochastic_scores, tree_scores, svm_scores = run_comparison(X, y)
    stochastic_avg = stochastic_scores.mean()
    tree_avg = tree_scores.mean()
    svm_avg = svm_scores.mean()
    np.savez(
        os.path.join(run_dir, 'results.npz'),
        stochastic_scores, stochastic_avg,
        tree_scores, tree_avg,
        svm_scores, svm_avg,
    )
    print('')
    return stochastic_avg, tree_avg, svm_avg

In [None]:
# for i in range(5):
#     for noise in [None]:
#         X,y = get_simulation_data_6(100,1000)
#         y = np.greater_equal(y, 0.0)
#         print run_comparison(X,y)

In [None]:
# Noise sweep for simulation 2: for every noise level run the full experiment
# four times and collect the mean cross-validation score of each classifier.
noise_list = [None,0.1,0.2,0.3]
# noise_list = [None,0.1]    # shorter sweep for a quick check

# Per-classifier results, keyed by noise level -> list of per-run mean scores.
stochastic_means = {}
tree_means = {}
svm_means = {}

for noise in noise_list:
    stochastic_means[noise] = []
    tree_means[noise] = []
    svm_means[noise] = []
    for i in range(4):    # use range(2) for a quick check
        stochastic_mean, tree_mean, svm_mean = manage_run(get_simulation_data_2,'sim2-',noise=noise)
        stochastic_means[noise].append(stochastic_mean)
        tree_means[noise].append(tree_mean)
        svm_means[noise].append(svm_mean)

# Report, per noise level: first the three lists of per-run means
# (stochastic, tree, svm), then the average of each list in the same order.
for noise in noise_list:
    for means_by_noise in (stochastic_means, tree_means, svm_means):
        print(means_by_noise[noise])
    for means_by_noise in (stochastic_means, tree_means, svm_means):
        print(np.mean(means_by_noise[noise]))

In [None]:
# for i in range(4):
#     for noise in [None,0.1,0.2,0.3]:
#         manage_run(get_simulation_data_1,'sim1-',noise=noise)
#         manage_run(get_simulation_data_2,'sim2-',noise=noise)
#         manage_run(get_simulation_data_3,'sim3-',noise=noise)
#         manage_run(get_simulation_data_4,'sim4-',noise=noise)