In [41]:
from functools import partial
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.datasets import load_iris, load_breast_cancer, load_wine
from sporboost.forest import *
import pandas as pd
from time import time
import os

################################################################################
################################################################################
################################################################################
# Data Load
def load_simulated(path):
    """Load a comma-delimited numeric file and split it into features and labels.

    Parameters
    ----------
    path : str or path-like
        Location of the delimited text file. Every column except the last is
        returned as a feature; the last column is returned as the label.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` holds all columns but the last and ``y`` is the
        last column cast to ``int32``.
    """
    # NOTE(review): `np` is not imported explicitly in the import cell above;
    # presumably it arrives via `from sporboost.forest import *` -- confirm.
    # ndmin=2 keeps the result two-dimensional even when the file has a single
    # row; without it loadtxt returns a 1-D array and the slicing below fails.
    data = np.loadtxt(path, delimiter=",", ndmin=2)
    features = data[:, :-1]
    labels = data[:, -1].astype('int32')
    return features, labels

# Registry of benchmark datasets, keyed by the label later used for the report
# file name. Each value is the (X, y) pair handed to the grid search.
# Insertion order is the order in which the datasets are processed.
dfs = dict([
    # Datasets bundled with scikit-learn
    ('iris', load_iris(return_X_y=True)),
    ('breast_cancer', load_breast_cancer(return_X_y=True)),
    ('wine', load_wine(return_X_y=True)),
    # Simulated training sets read from CSV files on disk
    ('trunk', load_simulated("source_data/simulated/Trunk_train.csv")),
    ('orthant', load_simulated("source_data/simulated/Orthant_train.csv")),
    ('sparse_parity', load_simulated("source_data/simulated/Sparse_parity_train.csv")),
])


In [2]:
################################################################################
################################################################################
################################################################################
# Hyperparameters across all estimators
# Boosted forests: `max_depth` values tried in the AdaBoost, SPORBoost and
# RotBoost grids
boosted_max_depth = 1, 2

# Sparse Random Trees: `d_ratio` and `s` values tried in the SPORF and
# SPORBoost grids
d_ratio = 0.5, 1
s = 3, 4

# Rotational Decision Trees: `K` values tried in the RotationalForest and
# RotBoost grids
K = 2, 3


In [3]:
################################################################################
################################################################################
################################################################################
# Initialize cross-validation params
# Single-step pipeline. The RandomForest() instance is only a placeholder:
# every grid below assigns its own estimator to the 'clf' step.
pipe = Pipeline(steps=[('clf', RandomForest())])

# One grid per estimator; 'clf__<name>' entries are the hyperparameter values
# searched for that estimator.
params = [
    # Random forest: no additional hyperparameters searched
    {'clf': [RandomForest()]},
    # AdaBoost
    {'clf': [AdaBoost()],
     'clf__max_depth': boosted_max_depth},
    # SPORF
    {'clf': [SPORF()],
     'clf__d_ratio': d_ratio,
     'clf__s': s},
    # SPORBoost
    {'clf': [SPORBoost()],
     'clf__max_depth': boosted_max_depth,
     'clf__d_ratio': d_ratio,
     'clf__s': s},
    # Rotational forest
    {'clf': [RotationalForest()],
     'clf__K': K},
    # RotBoost
    {'clf': [RotBoost()],
     'clf__max_depth': boosted_max_depth,
     'clf__K': K},
]

# Shuffled, stratified 10-fold split; the fixed seed keeps fold assignment
# repeatable between runs.
kfold = StratifiedKFold(
    n_splits=10,
    shuffle=True,
    random_state=20220329,
)

# Search object reused for every dataset. Train scores are requested because
# the report written afterwards includes the mean/std train score columns.
cv_base = GridSearchCV(
    pipe,
    params,
    cv=kfold,
    n_jobs=-1,
    verbose=3,
    return_train_score=True,
)


In [4]:
################################################################################
################################################################################
################################################################################
# Func to write results to disk
def write_results(cv, label):
    """Write the results table of a fitted search to ``report/<label>.csv``.

    The table is built from ``cv.cv_results_`` with the ``params`` column
    removed. Estimator objects in ``param_clf`` are replaced by their class
    names, rows are sorted by estimator name and then mean test score (both
    descending), and the summary columns are moved to the front.

    Parameters
    ----------
    cv : object
        Fitted search object exposing a ``cv_results_`` mapping.
    label : str
        Base name (without extension) of the CSV file to write.
    """
    # Columns shown first, in this order, when they exist in the results.
    front_cols = ['param_clf', 'mean_test_score', 'std_test_score',
                  'mean_train_score', 'std_train_score',
                  'param_clf__max_depth',
                  'param_clf__d_ratio', 'param_clf__s',
                  'param_clf__K'
                  ]

    results = pd.DataFrame(cv.cv_results_).drop(['params'], axis=1)
    # Estimator instances do not serialise usefully; keep only the class name.
    results['param_clf'] = results['param_clf'].apply(lambda c: type(c).__name__)
    results = results.sort_values(['param_clf', 'mean_test_score'],
                                  ascending=False, ignore_index=True)

    # Only front-load columns that are present, so a search that does not
    # include every hyperparameter does not fail with a KeyError.
    leading = [col for col in front_cols if col in results.columns]
    trailing = [col for col in results.columns if col not in leading]
    results = results[leading + trailing]

    # Create the output directory on first use; to_csv does not create it.
    out_path = f"report/{label}.csv"
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    results.to_csv(out_path, index=False)

In [42]:
################################################################################
################################################################################
################################################################################
# Fit the CVs
# Run the grid search once per dataset. `df` is the (X, y) pair stored in
# `dfs`, unpacked straight into `fit`.
for name, df in dfs.items():
    # An existing report file marks the dataset as done, so it is not refitted.
    if os.path.exists(f"report/{name}.csv"):
        print(f"Skipping {name} as it has already been tested")
        continue

    start = time()
    print(f"Testing {name}")
    write_results(cv_base.fit(*df), name)
    end = time()
    # Elapsed wall-clock time, reported in minutes.
    print(f"Finished {name} in {(end - start) / 60:.1f} mins")

Skipping iris as it has already been tested
Skipping breast_cancer as it has already been tested
Skipping wine as it has already been tested
Testing trunk
Fitting 10 folds for each of 21 candidates, totalling 210 fits


 0.98 0.96 0.98 0.98  nan 0.92  nan]
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         0.99555775
        nan 1.                nan]


Finished trunk in 2.5 mins
Testing orthant
Fitting 10 folds for each of 21 candidates, totalling 210 fits




Finished orthant in 6.5 mins
Testing sparse_parity
Fitting 10 folds for each of 21 candidates, totalling 210 fits
Finished sparse_parity in 2.1 mins
