In [1]:
import pandas as pd
from sklearn.metrics import r2_score, accuracy_score
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn import tree
from sklearn.datasets import load_breast_cancer, make_classification
from matplotlib import pyplot as plt
from rustrees import Dataset, TreeNode
import time
import numpy as np

In [2]:
datasets = {
    "reg": ["diabetes", "housing"],
    "clf": ["breast_cancer", "titanic"]
}

In [3]:
n_repeats = 100
max_depth = 5

In [4]:
def evaluate_dataset(dataset, problem):
    df_train = pd.read_csv(f"datasets/{dataset}_train.csv")
    df_test = pd.read_csv(f"datasets/{dataset}_test.csv")
    
    df_train_rt = Dataset.read_csv(f"datasets/{dataset}_train.csv", sep=",")
    df_test_rt = Dataset.read_csv(f"datasets/{dataset}_test.csv", sep=",")
        
    if problem == "reg":
        metric_fn = r2_score
        metric = "r2"
        dt = DecisionTreeRegressor(max_depth=max_depth)
    elif problem == "clf":
        metric_fn = accuracy_score
        metric = "acc"
        dt = DecisionTreeClassifier(max_depth=max_depth)

    start_time = time.time()
    results = []
    for i in range(n_repeats):
        dt.fit(df_train.drop("target", axis=1), df_train.target)
        results.append(metric_fn(df_test.target, dt.predict(df_test.drop("target", axis=1))))
    sk_time = time.time() - start_time
    sk_mean = np.mean(results)
    sk_std = np.std(results)
    
    start_time = time.time()
    results = []
    for i in range(n_repeats):
        if problem == "reg":
            dt = TreeNode.train_reg(df_train_rt, 0, max_depth)
            pred_rt = dt.predict(df_test_rt)
            results.append(metric_fn(df_test.target, pred_rt))
        elif problem == "clf":
            dt = TreeNode.train_clf(df_train_rt, 0, max_depth)
            pred_rt = dt.predict(df_test_rt)
            results.append(metric_fn(df_test.target, np.array(pred_rt) > 0.5))
    rt_time = time.time() - start_time
    rt_mean = np.mean(results)
    rt_std = np.std(results)
        
    return (dataset, sk_mean, sk_std, sk_time, rt_mean, rt_std, rt_time, metric)

In [5]:
results_reg = [evaluate_dataset(d, "reg") for d in datasets["reg"]]
results_clf = [evaluate_dataset(d, "clf") for d in datasets["clf"]]
results = results_reg + results_clf

cols = "dataset sk_mean sk_std sk_time(s) rt_mean rt_std rt_time(s) metric".split()
pd.DataFrame(results, columns=cols)

Unnamed: 0,dataset,sk_mean,sk_std,sk_time(s),rt_mean,rt_std,rt_time(s),metric
0,diabetes,0.317059,0.03645488,0.353968,0.297726,0.03680381,0.32407,r2
1,housing,0.599732,1.246222e-16,4.584062,0.599732,1.110223e-16,12.55795,r2
2,breast_cancer,0.928596,0.006361917,0.521264,0.929263,0.006977429,0.767765,acc
3,titanic,0.786441,1.110223e-16,0.327158,0.779661,3.330669e-16,0.321554,acc
