In [1]:
import pandas as pd
from sklearn.metrics import r2_score, accuracy_score
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from rustrees import Dataset, DecisionTree, RandomForest
import time
import numpy as np

In [2]:
datasets = {
    "reg": ["diabetes", "housing"],
    "clf": ["breast_cancer", "titanic"]
}

In [3]:
n_repeats = 100
max_depth = 5
n_estimators = 10

In [4]:
def evaluate_dataset(dataset, problem, model="dt"):
    df_train = pd.read_csv(f"datasets/{dataset}_train.csv")
    df_test = pd.read_csv(f"datasets/{dataset}_test.csv")
    
    df_train_rt = Dataset.read_csv(f"datasets/{dataset}_train.csv", sep=",")
    df_test_rt = Dataset.read_csv(f"datasets/{dataset}_test.csv", sep=",")

    if problem == "reg":
        metric_fn = r2_score
        metric = "r2"
        if model == "dt":
            model_sk = DecisionTreeRegressor(max_depth=max_depth)
        elif model == "rf":
            model_sk = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth, n_jobs=-1)
    elif problem == "clf":
        metric_fn = accuracy_score
        metric = "acc"
        if model == "dt":
            model_sk = DecisionTreeClassifier(max_depth=max_depth)
        elif model == "rf":
            model_sk = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, n_jobs=-1)
    
    start_time = time.time()
    results_sk = []
    for _ in range(n_repeats):
        model_sk.fit(df_train.drop("target", axis=1), df_train.target)
        results_sk.append(metric_fn(df_test.target, model_sk.predict(df_test.drop("target", axis=1))))
    sk_time = time.time() - start_time
    sk_mean = np.mean(results_sk)
    sk_std = np.std(results_sk)
    
    start_time = time.time()
    results_rt = []
    for _ in range(n_repeats):
        if problem == "reg" and model == "dt":
            model_rt = DecisionTree.train_reg(df_train_rt, max_depth=max_depth)
        elif problem == "reg" and model == "rf":
            model_rt = RandomForest.train_reg(df_train_rt, n_estimators=n_estimators, max_depth=max_depth)
        elif problem == "clf" and model == "dt":
            model_rt = DecisionTree.train_clf(df_train_rt, max_depth=max_depth)   
        elif problem == "clf" and model == "rf":
            model_rt = RandomForest.train_clf(df_train_rt, n_estimators=n_estimators, max_depth=max_depth)

        if problem == "reg":
            pred_rt = model_rt.predict(df_test_rt)
            results_rt.append(metric_fn(df_test.target, pred_rt))
        elif problem == "clf":
            pred_rt = model_rt.predict(df_test_rt)
            results_rt.append(metric_fn(df_test.target, np.array(pred_rt) > 0.5))
    rt_time = time.time() - start_time
    rt_mean = np.mean(results_rt)
    rt_std = np.std(results_rt)
        
    return (dataset, sk_mean, rt_mean, sk_std, rt_std, sk_time, rt_time, metric)

In [5]:
results_reg = [evaluate_dataset(d, "reg") for d in datasets["reg"]]
results_clf = [evaluate_dataset(d, "clf") for d in datasets["clf"]]
results = results_reg + results_clf

cols = "dataset sk_mean rt_mean sk_std rt_std sk_time(s) rt_time(s) metric".split()

pd.DataFrame(results, columns=cols)


Reading CSV file datasets/diabetes_train.csv
Reading CSV file datasets/diabetes_test.csv
Reading CSV file datasets/housing_train.csv
Reading CSV file datasets/housing_test.csv
Reading CSV file datasets/breast_cancer_train.csv
Reading CSV file datasets/breast_cancer_test.csv
Reading CSV file datasets/titanic_train.csv
Reading CSV file datasets/titanic_test.csv


Unnamed: 0,dataset,sk_mean,rt_mean,sk_std,rt_std,sk_time(s),rt_time(s),metric
0,diabetes,0.310188,0.309735,0.03828418,0.03887897,0.273454,0.323043,r2
1,housing,0.599732,0.599732,1.190582e-16,1.110223e-16,4.4797,10.571949,r2
2,breast_cancer,0.929228,0.92986,0.006316862,0.007647111,0.487107,0.67625,acc
3,titanic,0.786441,0.779661,1.110223e-16,3.330669e-16,0.27236,0.275078,acc


In [6]:
results_reg = [evaluate_dataset(d, "reg", model="rf") for d in datasets["reg"]]
results_clf = [evaluate_dataset(d, "clf", model="rf") for d in datasets["clf"]]
results = results_reg + results_clf

cols = "dataset sk_mean rt_mean sk_std rt_std sk_time(s) rt_time(s) metric".split()

pd.DataFrame(results, columns=cols)


Reading CSV file datasets/diabetes_train.csv
Reading CSV file datasets/diabetes_test.csv
Reading CSV file datasets/housing_train.csv
Reading CSV file datasets/housing_test.csv
Reading CSV file datasets/breast_cancer_train.csv
Reading CSV file datasets/breast_cancer_test.csv
Reading CSV file datasets/titanic_train.csv
Reading CSV file datasets/titanic_test.csv


Unnamed: 0,dataset,sk_mean,rt_mean,sk_std,rt_std,sk_time(s),rt_time(s),metric
0,diabetes,0.441172,0.434636,0.027669,0.026188,1.93476,0.757053,r2
1,housing,0.642172,0.64143,0.003613,0.003289,8.041145,25.780383,r2
2,breast_cancer,0.95593,0.941368,0.009354,0.010588,2.202806,2.380955,acc
3,titanic,0.795559,0.792542,0.014617,0.010805,2.342281,0.89069,acc
