In [1]:
import pandas as pd
from sklearn.metrics import r2_score, accuracy_score
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
import rustrees.decision_tree as rt_dt
import rustrees.random_forest as rt_rf
import time
import numpy as np

In [2]:
datasets = {
    "reg": ["diabetes", "housing", "dgp"],
    "clf": ["breast_cancer", "titanic"]
}

In [3]:
def evaluate_dataset(dataset, problem, model, max_depth, n_repeats, n_estimators=None):
    df_train = pd.read_csv(f"../../datasets/{dataset}_train.csv")
    df_test = pd.read_csv(f"../../datasets/{dataset}_test.csv")

    if problem == "reg":
        metric_fn = r2_score
        metric = "r2"
        if model == "dt":
            model_sk = DecisionTreeRegressor(max_depth=max_depth)
            model_rt = rt_dt.DecisionTreeRegressor(max_depth=max_depth)
        elif model == "rf":
            model_sk = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth, n_jobs=-1)
            model_rt = rt_rf.RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth)
    elif problem == "clf":
        metric_fn = accuracy_score
        metric = "acc"
        if model == "dt":
            model_sk = DecisionTreeClassifier(max_depth=max_depth)
            model_rt = rt_dt.DecisionTreeClassifier(max_depth=max_depth)
        elif model == "rf":
            model_sk = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, n_jobs=-1)
            model_rt = rt_rf.RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth)

    X = df_train.drop("target", axis=1)
    y = df_train.target
    start_time = time.time()
    results_sk = []
    for _ in range(n_repeats):
        model_sk.fit(X, y)
        results_sk.append(0)#results_sk.append(metric_fn(df_test.target, df_test.target))#model_sk.predict(df_test.drop("target", axis=1))))
    sk_time = (time.time() - start_time)/n_repeats
    sk_mean = np.mean(results_sk)
    sk_std = np.std(results_sk)
    
    start_time = time.time()
    results_rt = []
    for _ in range(n_repeats):
        # model_rt.fit(X, y)
        results_rt.append(0)#metric_fn(df_test.target, model_rt.predict(df_test.drop("target", axis=1)))
    rt_time = (time.time() - start_time)/n_repeats
    rt_mean = np.mean(results_rt)
    rt_std = np.std(results_rt)
        
    return (dataset, sk_mean, rt_mean, sk_std, rt_std, sk_time, rt_time, metric)

In [4]:
results_reg = [evaluate_dataset(d, "reg", model="dt", max_depth=5, n_repeats=100) for d in datasets["reg"]]
results_clf = [evaluate_dataset(d, "clf", model="dt", max_depth=5, n_repeats=100) for d in datasets["clf"]]
results = results_reg + results_clf

cols = "dataset sk_mean rt_mean sk_std rt_std sk_time(s) rt_time(s) metric".split()

pd.DataFrame(results, columns=cols)


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(p

Unnamed: 0,dataset,sk_mean,rt_mean,sk_std,rt_std,sk_time(s),rt_time(s),metric
0,diabetes,0.0,0.0,0.0,0.0,0.001312,0.001902,r2
1,housing,0.0,0.0,0.0,0.0,0.039629,0.047335,r2
2,dgp,0.0,0.0,0.0,0.0,0.053538,0.290211,r2
3,breast_cancer,0.0,0.0,0.0,0.0,0.002603,0.002763,acc
4,titanic,0.0,0.0,0.0,0.0,0.000863,0.001108,acc


In [5]:
results_reg = [evaluate_dataset(d, "reg", model="rf", max_depth=2, n_estimators=100, n_repeats=100) for d in datasets["reg"]]
results_clf = [evaluate_dataset(d, "clf", model="rf", max_depth=2, n_estimators=100, n_repeats=100) for d in datasets["clf"]]
results = results_reg + results_clf

cols = "dataset sk_mean rt_mean sk_std rt_std sk_time(s) rt_time(s) metric".split()

pd.DataFrame(results, columns=cols)


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(p

Unnamed: 0,dataset,sk_mean,rt_mean,sk_std,rt_std,sk_time(s),rt_time(s),metric
0,diabetes,0.0,0.0,0.0,0.0,0.060998,4.053116e-08,r2
1,housing,0.0,0.0,0.0,0.0,0.2203,4.053116e-08,r2
2,dgp,0.0,0.0,0.0,0.0,0.280662,4.053116e-08,r2
3,breast_cancer,0.0,0.0,0.0,0.0,0.073808,3.099442e-08,acc
4,titanic,0.0,0.0,0.0,0.0,0.084586,3.099442e-08,acc
