In [44]:
from functools import partial
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.datasets import load_iris, load_breast_cancer, load_wine
from sporboost.forest import *
import pandas as pd
from time import time
import os
from sklearn.metrics import make_scorer
# Explicit imports for names the notebook uses directly (np, cohen_kappa_score)
# so they no longer depend on what the star import above happens to re-export.
# Kept after the star import on purpose: import order matters with `import *`.
import numpy as np
from sklearn.metrics import cohen_kappa_score

################################################################################
################################################################################
################################################################################
# Data Load
def load_simulated(path):
    """Load a simulated dataset from a comma-delimited numeric text file.

    Every column except the last is returned as the feature matrix; the last
    column is returned as the label vector.

    Parameters
    ----------
    path : str or path-like
        File to read with ``np.loadtxt`` (comma delimiter, no rows skipped).

    Returns
    -------
    tuple of (ndarray, ndarray)
        ``(X, y)`` where ``X`` is 2-D with one row per line of the file and
        ``y`` is the last column cast to ``int32``.
    """
    # ndmin=2 keeps the array two-dimensional even when the file holds a single
    # row; without it loadtxt returns a 1-D array and the column slicing below
    # raises IndexError.
    data = np.loadtxt(path, delimiter=",", ndmin=2)
    return data[:, :-1], data[:, -1].astype('int32')

# Benchmark datasets keyed by the name later used for the report file.
# Each value is an (X, y) pair: the first three come from scikit-learn's
# bundled loaders, the rest are simulated sets read via load_simulated.
dfs = dict(
    iris=load_iris(return_X_y=True),
    breast_cancer=load_breast_cancer(return_X_y=True),
    wine=load_wine(return_X_y=True),
    trunk=load_simulated("source_data/simulated/Trunk_train.csv"),
    orthant=load_simulated("source_data/simulated/Orthant_train.csv"),
    sparse_parity=load_simulated("source_data/simulated/Sparse_parity_train.csv"),
)


In [45]:
################################################################################
################################################################################
################################################################################
# Candidate hyperparameter values shared by the grid-search entries

# max_depth values swept for the boosted estimators
# (AdaBoost, SPORBoost, RotBoost)
boosted_max_depth = (1, 2)

# d_ratio and s values swept for the sparse estimators (SPORF, SPORBoost)
d_ratio, s = (0.5, 1), (3, 4)

# K values swept for the rotational estimators (RotationalForest, RotBoost)
K = (2, 3)


In [53]:
################################################################################
################################################################################
################################################################################
# Scoring Functions

# Per the original note, sklearn's Cohen's kappa is undefined (NaN) when the
# predictions are perfect. To keep the metric comparable across folds, that
# undefined result is reported as a kappa of 1 instead.
def cohens_kappa(*args, **kwargs):
    """Cohen's kappa with a NaN result replaced by 1.

    All positional and keyword arguments are forwarded unchanged to
    ``cohen_kappa_score``.

    Returns
    -------
    float
        ``1.`` when ``cohen_kappa_score`` returns NaN, otherwise its result.
    """
    kappa = cohen_kappa_score(*args, **kwargs)
    return 1. if np.isnan(kappa) else kappa

# Metrics evaluated for every candidate. The keys become the suffixes of the
# cv_results_ columns (e.g. mean_test_cohens_kappa) that write_results reads.
scoring = dict(
    cohens_kappa=make_scorer(cohens_kappa),
    accuracy='accuracy',
)

In [47]:
################################################################################
################################################################################
################################################################################
# Initialize cross-validation params

# Single-step pipeline; every sub-grid below replaces the 'clf' step with its
# own estimator, so the RandomForest here only serves as the initial value.
pipe = Pipeline(steps=[('clf', RandomForest())])

# One sub-grid per estimator, each listing only the hyperparameters swept for it
params = [
    dict(clf=(RandomForest(),)),
    dict(clf=(AdaBoost(),),
         clf__max_depth=boosted_max_depth),
    dict(clf=(SPORF(),),
         clf__d_ratio=d_ratio,
         clf__s=s),
    dict(clf=(SPORBoost(),),
         clf__max_depth=boosted_max_depth,
         clf__d_ratio=d_ratio,
         clf__s=s),
    dict(clf=(RotationalForest(),),
         clf__K=K),
    dict(clf=(RotBoost(),),
         clf__max_depth=boosted_max_depth,
         clf__K=K),
]

# Ten shuffled, stratified folds with a fixed random_state
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=20220329)

# Multi-metric grid search over all sub-grids; refit is disabled, so no final
# estimator is trained after the search and only cv_results_ is consumed.
cv_base = GridSearchCV(
    estimator=pipe,
    param_grid=params,
    scoring=scoring,
    cv=kfold,
    refit=False,
    return_train_score=True,
    n_jobs=-1,
    verbose=3,
)


In [54]:
################################################################################
################################################################################
################################################################################
# Func to write results to disk
def write_results(cv, label):
    """Write a finished search's ``cv_results_`` to ``report/{label}.csv``.

    The ``params`` column is dropped, each ``param_clf`` entry is replaced by
    its class name, rows are sorted by classifier name and mean test kappa
    (both descending), and the summary/hyperparameter columns are moved to the
    front ahead of all remaining columns.

    Parameters
    ----------
    cv : object
        Anything exposing a ``cv_results_`` mapping (e.g. a fitted
        ``GridSearchCV``) that contains ``params`` and every column named in
        ``front_cols`` below.
    label : str
        Base name of the CSV file written under ``report/``.

    Raises
    ------
    KeyError
        If ``cv_results_`` lacks ``params`` or any of the ``front_cols``.
    """
    front_cols = ['param_clf',
                  'mean_test_cohens_kappa', 'std_test_cohens_kappa',
                  'mean_test_accuracy', 'std_test_accuracy',
                  'mean_train_cohens_kappa', 'std_train_cohens_kappa',
                  'mean_train_accuracy', 'std_train_accuracy',
                  'param_clf__max_depth',
                  'param_clf__d_ratio', 'param_clf__s',
                  'param_clf__K'
                  ]

    results = pd.DataFrame(cv.cv_results_).drop(['params'], axis=1)
    # Store the estimator's class name rather than the estimator object itself
    results['param_clf'] = results['param_clf'].apply(lambda c: type(c).__name__)
    results = results.sort_values(['param_clf', 'mean_test_cohens_kappa'],
                                  ascending=False, ignore_index=True)

    # Summary columns first, then every remaining column in its existing order
    results = results[front_cols + results.drop(front_cols, axis=1).columns.tolist()]

    out_path = f"report/{label}.csv"
    # Create the output directory up front: to_csv does not create missing
    # parent directories, and failing here would discard a completed search.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    results.to_csv(out_path, index=False)

In [55]:
################################################################################
################################################################################
################################################################################
# Run the grid search once per dataset, skipping any dataset whose report
# file already exists on disk
for name, df in dfs.items():
    if os.path.exists(f"report/{name}.csv"):
        print(f"Skipping {name} as it has already been tested")
        continue

    start = time()
    print(f"Testing {name}")
    # df is the (X, y) pair for this dataset, unpacked into fit(X, y)
    write_results(cv_base.fit(*df), name)
    end = time()
    print(f"Finished {name} in {(end - start) / 60:.1f} mins")

Testing iris
Fitting 10 folds for each of 21 candidates, totalling 210 fits


KeyboardInterrupt: 

In [52]:
# Debug inspection: list the result columns held by cv_base.
# NOTE(review): the saved output below shows scorer names "Cohen's Kappa" /
# "Accuracy", while the current `scoring` dict uses "cohens_kappa" / "accuracy"
# (the names write_results expects) - the output looks stale; re-run this cell
# after a completed fit to refresh it.
cv_base.cv_results_.keys()

dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_clf', 'param_clf__max_depth', 'param_clf__d_ratio', 'param_clf__s', 'param_clf__K', 'params', "split0_test_Cohen's Kappa", "split1_test_Cohen's Kappa", "split2_test_Cohen's Kappa", "split3_test_Cohen's Kappa", "split4_test_Cohen's Kappa", "split5_test_Cohen's Kappa", "split6_test_Cohen's Kappa", "split7_test_Cohen's Kappa", "split8_test_Cohen's Kappa", "split9_test_Cohen's Kappa", "mean_test_Cohen's Kappa", "std_test_Cohen's Kappa", "rank_test_Cohen's Kappa", "split0_train_Cohen's Kappa", "split1_train_Cohen's Kappa", "split2_train_Cohen's Kappa", "split3_train_Cohen's Kappa", "split4_train_Cohen's Kappa", "split5_train_Cohen's Kappa", "split6_train_Cohen's Kappa", "split7_train_Cohen's Kappa", "split8_train_Cohen's Kappa", "split9_train_Cohen's Kappa", "mean_train_Cohen's Kappa", "std_train_Cohen's Kappa", 'split0_test_Accuracy', 'split1_test_Accuracy', 'split2_test_Accuracy', 'split3_test_Accuracy