# DRF + treefarms

In [1]:
# Import the package
import pandas as pd
import numpy as np
from pydrf.model import DRFModel 
from sklearn import datasets
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score
from treefarms import TREEFARMS


In [2]:
data = datasets.load_iris()
# iris_df = pd.DataFrame(data.data, columns=data.feature_names)
iris_df = pd.DataFrame(data= np.c_[data['data'], data['target']],
                       columns= data.feature_names + ['target'])
iris_df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0.0
1,4.9,3.0,1.4,0.2,0.0
2,4.7,3.2,1.3,0.2,0.0
3,4.6,3.1,1.5,0.2,0.0
4,5.0,3.6,1.4,0.2,0.0


In [3]:
X, y = iris_df.iloc[:, :-1], iris_df.iloc[:, -1]
X

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


In [4]:
y

0      0.0
1      0.0
2      0.0
3      0.0
4      0.0
      ... 
145    2.0
146    2.0
147    2.0
148    2.0
149    2.0
Name: target, Length: 150, dtype: float64

In [131]:
# Define the structure of DRF
param_list = [
    [{"max_leaf_nodes":6, "n_estimators":100, "max_features":0.8}],
    [{"max_leaf_nodes":6, "n_estimators":100, "max_features":0.8}],
    [{"max_leaf_nodes":6, "n_estimators":100, "max_features":0.8}],
    [{"max_leaf_nodes":2, "n_estimators":100, "max_features":0.8}],
    [{"max_leaf_nodes":2, "n_estimators":50, "max_features":0.8}]
    ]

# Fit the model
drf = DRFModel(parallel_cores=5, parameter_list = param_list)
drf_model = drf.fit(X, y)


Layer 0 training...finished.
Layer 1 training...finished.
Layer 2 training...finished.
Layer 3 training...finished.
Layer 4 training...finished.


In [132]:
transform_train = drf_model.transform_data(X.values, stop_layer = 5)
print(type(transform_train))
print(transform_train.shape)

Transforming data...finished.
<class 'numpy.ndarray'>
(150, 50)


In [133]:
V = ["V"+str(i) for i in range(1, len(transform_train[0])+1)]
df_transform_train = pd.DataFrame(transform_train, columns = V, dtype = "category")
df_transform_train.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V41,V42,V43,V44,V45,V46,V47,V48,V49,V50
0,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
1,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
2,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
3,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
4,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [134]:
for col in df_transform_train.columns:
    print(df_transform_train[col].value_counts())  

1    100
2     50
Name: V1, dtype: int64
1    100
2     50
Name: V2, dtype: int64
1    100
2     50
Name: V3, dtype: int64
2    100
1     50
Name: V4, dtype: int64
2    100
1     50
Name: V5, dtype: int64
2    100
1     50
Name: V6, dtype: int64
2    100
1     50
Name: V7, dtype: int64
1    100
2     50
Name: V8, dtype: int64
2    100
1     50
Name: V9, dtype: int64
1    100
2     50
Name: V10, dtype: int64
2    100
1     50
Name: V11, dtype: int64
2    100
1     50
Name: V12, dtype: int64
2    100
1     50
Name: V13, dtype: int64
2    100
1     50
Name: V14, dtype: int64
1    100
2     50
Name: V15, dtype: int64
1    100
2     50
Name: V16, dtype: int64
2    100
1     50
Name: V17, dtype: int64
1    100
2     50
Name: V18, dtype: int64
1    100
2     50
Name: V19, dtype: int64
2    100
1     50
Name: V20, dtype: int64
1    100
2     50
Name: V21, dtype: int64
1    100
2     50
Name: V22, dtype: int64
1    100
2     50
Name: V23, dtype: int64
2    100
1     50
Name: V24, dtype: int64
1

In [135]:
for col in df_transform_train.columns:
    # 進行 one-hot 編碼
    dummies = pd.get_dummies(df_transform_train[col], prefix=col)
    df_transform_train = pd.concat([df_transform_train, dummies], axis=1)
    # 刪除原始特徵
    df_transform_train.drop(col, axis=1, inplace=True)

In [136]:
print(df_transform_train.shape)
print(y.shape)

(150, 100)
(150,)


In [137]:
df_transform_train

Unnamed: 0,V1_1,V1_2,V2_1,V2_2,V3_1,V3_2,V4_1,V4_2,V5_1,V5_2,...,V46_1,V46_2,V47_1,V47_2,V48_1,V48_2,V49_1,V49_2,V50_1,V50_2
0,1,0,1,0,1,0,1,0,1,0,...,1,0,1,0,1,0,1,0,1,0
1,1,0,1,0,1,0,1,0,1,0,...,1,0,1,0,1,0,1,0,1,0
2,1,0,1,0,1,0,1,0,1,0,...,1,0,1,0,1,0,1,0,1,0
3,1,0,1,0,1,0,1,0,1,0,...,1,0,1,0,1,0,1,0,1,0
4,1,0,1,0,1,0,1,0,1,0,...,1,0,1,0,1,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145,0,1,0,1,0,1,0,1,0,1,...,0,1,0,1,0,1,0,1,0,1
146,0,1,0,1,0,1,0,1,0,1,...,0,1,0,1,0,1,0,1,0,1
147,0,1,0,1,0,1,0,1,0,1,...,0,1,0,1,0,1,0,1,0,1
148,0,1,0,1,0,1,0,1,0,1,...,0,1,0,1,0,1,0,1,0,1


In [138]:
# for col in df_transform_train.columns:
#     print(df_transform_train[col].unique())    
# # print(df_transform_train.iloc[:,:10])

In [139]:
# train TREEFARMS model
config = {
    "regularization": 0.01,  # regularization penalizes the tree with more leaves. We recommend to set it to relative high value to find a sparse tree.
    "rashomon_bound_multiplier": 0.01,  # rashomon bound multiplier indicates how large of a Rashomon set would you like to get
}

drf_treefarms_model = TREEFARMS(config)

drf_treefarms_model.fit(df_transform_train, y)


treefarms reported successful execution
training completed. Number of trees in the Rashomon set: 4784


<treefarms.model.treefarms.TREEFARMS at 0x7f411b9e9490>

In [140]:
# treefarms_model.visualize()


In [141]:
first_tree = drf_treefarms_model[1]
print("evaluating the first model in the Rashomon set", flush=True)

# get the results
train_acc = first_tree.score(df_transform_train, y)
n_leaves = first_tree.leaves()
n_nodes = first_tree.nodes()

print("Training accuracy: {}".format(train_acc))
print("# of leaves: {}".format(n_leaves))
print(first_tree)

evaluating the first model in the Rashomon set
Training accuracy: 1.0
# of leaves: 3
if feature_0 = true and feature_11 = true then:
    predicted Prediction: 1

else if feature_0 = true and feature_11 != true then:
    predicted Prediction: 0

else if feature_0 != true then:
    predicted Prediction: 2


# DRF+treefarms with train_test_split

In [3]:
# Import the package
import pandas as pd
import numpy as np
from pydrf.model import DRFModel 
from sklearn import datasets
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score
from treefarms import TREEFARMS


In [4]:
data = datasets.load_iris()
# iris_df = pd.DataFrame(data.data, columns=data.feature_names)
iris_df = pd.DataFrame(data= np.c_[data['data'], data['target']],
                       columns= data.feature_names + ['target'])
X, y = iris_df.iloc[:, :-1], iris_df.iloc[:, -1]

In [5]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.8, test_size = 0.2, random_state = 100)



In [6]:
X_train

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
81,5.5,2.4,3.7,1.0
99,5.7,2.8,4.1,1.3
119,6.0,2.2,5.0,1.5
12,4.8,3.0,1.4,0.1
16,5.4,3.9,1.3,0.4
...,...,...,...,...
87,6.3,2.3,4.4,1.3
103,6.3,2.9,5.6,1.8
67,5.8,2.7,4.1,1.0
24,4.8,3.4,1.9,0.2


In [7]:
y_train

81     1.0
99     1.0
119    2.0
12     0.0
16     0.0
      ... 
87     1.0
103    2.0
67     1.0
24     0.0
8      0.0
Name: target, Length: 120, dtype: float64

### train treefarm

In [10]:
# Define the structure of DRF
# param_list = [
#     [{"max_leaf_nodes":6, "n_estimators":100, "max_features":0.8}],
#     [{"max_leaf_nodes":6, "n_estimators":100, "max_features":0.8}],
#     [{"max_leaf_nodes":6, "n_estimators":100, "max_features":0.8}],
#     [{"max_leaf_nodes":2, "n_estimators":100, "max_features":0.8}],
#     [{"max_leaf_nodes":2, "n_estimators":100, "max_features":0.8}]
#     ]
param_list = [
    [{"max_leaf_nodes":6, "n_estimators":30, "max_features":0.7},
    {"max_leaf_nodes":8, "n_estimators":50, "max_features":0.8},
    {"max_leaf_nodes":4, "n_estimators":30, "max_features":0.6}],
    [{"max_leaf_nodes":4, "n_estimators":40, "max_features":0.8},
    {"max_leaf_nodes":4, "n_estimators":40, "max_features":0.8}]
    ]


# Fit the model
drf = DRFModel(parallel_cores=5, parameter_list = param_list)
drf_model = drf.fit(X_train, y_train)


Layer 0 training...finished.
Layer 1 training...finished.


In [11]:
# transform_train = drf_model.transform_data(X_train.values, stop_layer = 5)
transform_train = drf_model.transform_data(X_train.values, stop_layer = 2)

print(type(transform_train))
print(transform_train.shape)

Transforming data...finished.
<class 'numpy.ndarray'>
(120, 80)


### numpy 轉成dataframe 要把training set index記錄下來 (pd.Dataframe會自動從頭編號)

In [12]:
train_idx = X_train.index

In [13]:
train_idx

Int64Index([ 81,  99, 119,  12,  16,  51,  46,  89, 136, 114,
            ...
             53,  98,  94, 138,  79,  87, 103,  67,  24,   8],
           dtype='int64', length=120)

In [14]:
V = ["V"+str(i) for i in range(1, len(transform_train[0])+1)]
df_transform_train = pd.DataFrame(transform_train, columns = V, dtype = "category")
df_transform_train= df_transform_train.set_index(train_idx)
df_transform_train.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V71,V72,V73,V74,V75,V76,V77,V78,V79,V80
81,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,3,2,2,2,2
99,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,3,2,2,2,2
119,3,3,3,3,3,3,3,4,2,3,...,3,3,2,3,4,4,2,2,3,3
12,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
16,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [15]:
# for col in df_transform_train.columns:
#     print(df_transform_train[col].value_counts())  

In [16]:
for col in df_transform_train.columns:
    # 進行 one-hot 編碼
    dummies = pd.get_dummies(df_transform_train[col], prefix=col)
    df_transform_train = pd.concat([df_transform_train, dummies], axis=1)
    # 刪除原始特徵
    df_transform_train.drop(col, axis=1, inplace=True)

In [17]:
print(df_transform_train.shape)
print(y_train.shape)

(120, 247)
(120,)


In [18]:
df_transform_train

Unnamed: 0,V1_1,V1_2,V1_3,V2_1,V2_2,V2_3,V3_1,V3_2,V3_3,V4_1,...,V77_3,V78_1,V78_2,V78_3,V79_1,V79_2,V79_3,V80_1,V80_2,V80_3
81,0,1,0,0,1,0,0,1,0,0,...,0,0,1,0,0,1,0,0,1,0
99,0,1,0,0,1,0,0,1,0,0,...,0,0,1,0,0,1,0,0,1,0
119,0,0,1,0,0,1,0,0,1,0,...,0,0,1,0,0,0,1,0,0,1
12,1,0,0,1,0,0,1,0,0,1,...,0,1,0,0,1,0,0,1,0,0
16,1,0,0,1,0,0,1,0,0,1,...,0,1,0,0,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87,0,1,0,0,1,0,0,1,0,0,...,0,0,1,0,0,1,0,0,1,0
103,0,0,1,0,0,1,0,0,1,0,...,1,0,0,1,0,0,1,0,0,1
67,0,1,0,0,1,0,0,1,0,0,...,0,0,1,0,0,1,0,0,1,0
24,1,0,0,1,0,0,1,0,0,1,...,0,1,0,0,1,0,0,1,0,0


In [19]:
# for col in df_transform_train.columns:
#     print(df_transform_train[col].unique())    
# # print(df_transform_train.iloc[:,:10])

In [42]:
# train TREEFARMS model
config = {
    "regularization": 0.1,  # regularization penalizes the tree with more leaves. We recommend to set it to relative high value to find a sparse tree.
    "rashomon_bound_multiplier": 0.005,  # rashomon bound multiplier indicates how large of a Rashomon set would you like to get
}

drf_treefarms_model = TREEFARMS(config)

drf_treefarms_model.fit(df_transform_train, y_train)


treefarms reported successful execution
training completed. Number of trees in the Rashomon set: 1144


<treefarms.model.treefarms.TREEFARMS at 0x7f808d2ba130>

In [43]:
# treefarms_model.visualize()

In [44]:
first_tree = drf_treefarms_model[0]
print("evaluating the first model in the Rashomon set")

# get the results
train_acc = first_tree.score(df_transform_train, y_train)
n_leaves = first_tree.leaves()
n_nodes = first_tree.nodes()

print("Training accuracy: {}".format(train_acc))
print("# of leaves: {}".format(n_leaves))
print(first_tree)

evaluating the first model in the Rashomon set
Training accuracy: 1.0
# of leaves: 3
if feature_0 = true then:
    predicted Prediction: 0

else if feature_0 != true and feature_159 = true then:
    predicted Prediction: 2

else if feature_0 != true and feature_159 != true then:
    predicted Prediction: 1


In [68]:
for i in range(50):
    tree = drf_treefarms_model[i]
    print(tree.score(df_transform_train, y_train))

1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0


### test treefarm

In [45]:
transform_test = drf_model.transform_data(X_test.values, stop_layer = 2)

print(type(transform_test))
print(transform_test.shape)

Transforming data...finished.
<class 'numpy.ndarray'>
(30, 80)


numpy 轉成dataframe index記錄下來 (pd.Dataframe會自動從頭編號)

In [46]:
test_idx = X_test.index

In [47]:
test_idx

Int64Index([128,  11, 118,  15, 123, 135,  32,   1, 116,  45,  40, 115,  26,
             28, 145,  97,  62,  77, 122, 112, 125,  31, 146,  29,  69, 149,
             75,  20,  73, 120],
           dtype='int64')

In [48]:
V = ["V"+str(i) for i in range(1, len(transform_test[0])+1)]
df_transform_test = pd.DataFrame(transform_test, columns = V, dtype = "category")
df_transform_test= df_transform_test.set_index(test_idx)
df_transform_test.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V71,V72,V73,V74,V75,V76,V77,V78,V79,V80
128,3,3,3,3,3,3,3,3,3,3,...,3,3,3,3,4,4,3,3,3,3
11,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
118,3,3,3,3,3,3,3,3,3,3,...,3,3,3,3,4,4,3,3,3,3
15,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
123,3,2,3,3,3,2,2,3,3,3,...,3,3,3,3,4,4,3,3,2,3


In [49]:
for col in df_transform_test.columns:
    # 進行 one-hot 編碼
    dummies = pd.get_dummies(df_transform_test[col], prefix=col)
    df_transform_test = pd.concat([df_transform_test, dummies], axis=1)
    # 刪除原始特徵
    df_transform_test.drop(col, axis=1, inplace=True)

In [50]:
print(df_transform_test.shape)
print(y_test.shape)

(30, 241)
(30,)


In [51]:
df_transform_test

Unnamed: 0,V1_1,V1_2,V1_3,V2_1,V2_2,V2_3,V3_1,V3_2,V3_3,V4_1,...,V77_3,V78_1,V78_2,V78_3,V79_1,V79_2,V79_3,V80_1,V80_2,V80_3
128,0,0,1,0,0,1,0,0,1,0,...,1,0,0,1,0,0,1,0,0,1
11,1,0,0,1,0,0,1,0,0,1,...,0,1,0,0,1,0,0,1,0,0
118,0,0,1,0,0,1,0,0,1,0,...,1,0,0,1,0,0,1,0,0,1
15,1,0,0,1,0,0,1,0,0,1,...,0,1,0,0,1,0,0,1,0,0
123,0,0,1,0,1,0,0,0,1,0,...,1,0,0,1,0,1,0,0,0,1
135,0,0,1,0,0,1,0,0,1,0,...,1,0,0,1,0,0,1,0,0,1
32,1,0,0,1,0,0,1,0,0,1,...,0,1,0,0,1,0,0,1,0,0
1,1,0,0,1,0,0,1,0,0,1,...,0,1,0,0,1,0,0,1,0,0
116,0,0,1,0,0,1,0,0,1,0,...,1,0,0,1,0,0,1,0,0,1
45,1,0,0,1,0,0,1,0,0,1,...,0,1,0,0,1,0,0,1,0,0


In [64]:
first_tree = drf_treefarms_model[100]
print("evaluating the first model in the Rashomon set")

# get the results
test_acc = first_tree.score(df_transform_test, y_test)
n_leaves = first_tree.leaves()
n_nodes = first_tree.nodes()

print("testing accuracy: {}".format(test_acc))
print("# of leaves: {}".format(n_leaves))
print(first_tree)

evaluating the first model in the Rashomon set
testing accuracy: 0.0
# of leaves: 3
if feature_144 = true then:
    predicted Prediction: 0

else if feature_144 != true and feature_230 = true then:
    predicted Prediction: 2

else if feature_144 != true and feature_230 != true then:
    predicted Prediction: 1


In [66]:
for i in range(50):
    tree = drf_treefarms_model[i]
    print(tree.score(df_transform_test,y_test))

0.9666666666666667
0.9666666666666667
0.9333333333333333
0.9666666666666667
0.4
0.43333333333333335
0.16666666666666666
0.16666666666666666
0.16666666666666666
0.16666666666666666
0.0
0.0
0.16666666666666666
0.16666666666666666
0.16666666666666666
0.16666666666666666
0.0
0.0
0.16666666666666666
0.16666666666666666
0.16666666666666666
0.16666666666666666
0.0
0.0
0.16666666666666666
0.16666666666666666
0.16666666666666666
0.16666666666666666
0.0
0.0
0.16666666666666666
0.16666666666666666
0.16666666666666666
0.16666666666666666
0.0
0.0
0.16666666666666666
0.16666666666666666
0.16666666666666666
0.16666666666666666
0.0
0.0
0.9666666666666667
0.9666666666666667
0.9333333333333333
0.9666666666666667
0.4
0.43333333333333335
0.16666666666666666
0.16666666666666666
