In [1]:
# Import the package
import pandas as pd
import numpy as np
from pydrf.model import DRFModel 
from sklearn import datasets
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score
from treefarms import TREEFARMS

In [2]:
data = datasets.load_iris()
# iris_df = pd.DataFrame(data.data, columns=data.feature_names)
iris_df = pd.DataFrame(data = np.c_[data['data'], data['target']],
                       columns= data.feature_names + ['target'])
iris_df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0.0
1,4.9,3.0,1.4,0.2,0.0
2,4.7,3.2,1.3,0.2,0.0
3,4.6,3.1,1.5,0.2,0.0
4,5.0,3.6,1.4,0.2,0.0


In [3]:
print(type(iris_df))
iris_df.columns

<class 'pandas.core.frame.DataFrame'>


Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)', 'target'],
      dtype='object')

In [4]:
X, y = iris_df.iloc[:, :-1], iris_df.iloc[:, -1]
print(X.columns)
y

Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)'],
      dtype='object')


0      0.0
1      0.0
2      0.0
3      0.0
4      0.0
      ... 
145    2.0
146    2.0
147    2.0
148    2.0
149    2.0
Name: target, Length: 150, dtype: float64

In [5]:
for col in X.columns:
    # 使用 qcut 分為三個等分
    X[col + '_bin'] = pd.qcut(X[col], 3, labels=False, duplicates='drop')
    # 進行 one-hot 編碼
    dummies = pd.get_dummies(X[col + '_bin'], prefix=col)
    X = pd.concat([X, dummies], axis=1)
    # 刪除原始的分箱特徵
    X.drop(col + '_bin', axis=1, inplace=True)
    X.drop(col, axis=1, inplace=True)

print(X.head())

   sepal length (cm)_0  sepal length (cm)_1  sepal length (cm)_2  \
0                    1                    0                    0   
1                    1                    0                    0   
2                    1                    0                    0   
3                    1                    0                    0   
4                    1                    0                    0   

   sepal width (cm)_0  sepal width (cm)_1  sepal width (cm)_2  \
0                   0                   0                   1   
1                   0                   1                   0   
2                   0                   1                   0   
3                   0                   1                   0   
4                   0                   0                   1   

   petal length (cm)_0  petal length (cm)_1  petal length (cm)_2  \
0                    1                    0                    0   
1                    1                    0                    0

# Treefarms

In [6]:
# train TREEFARMS model
config = {
    "regularization": 0.01,  # regularization penalizes the tree with more leaves. We recommend to set it to relative high value to find a sparse tree.
    "rashomon_bound_multiplier": 0.05,  # rashomon bound multiplier indicates how large of a Rashomon set would you like to get
}

treefarms_model = TREEFARMS(config)

treefarms_model.fit(X, y)


null
Finding Optimal Objective...
treefarms reported successful execution
training completed. Number of trees in the Rashomon set: 56


<treefarms.model.treefarms.TREEFARMS at 0x7fb10792b820>

{
  "false": {
    "false": {
      "complexity": 0.009999999776482582,
      "loss": 0.013333333656191826,
      "name": "target",
      "prediction": 2.0
    },
    "feature": 9,
    "name": "petal width (cm)_0",
    "reference": 1,
    "relation": "==",
    "true": {
      "complexity": 0.009999999776482582,
      "loss": 0.0,
      "name": "target",
      "prediction": 0.0
    },
    "type": "integral"
  },
  "feature": 10,
  "model_objective": 0.05999999865889549,
  "name": "petal width (cm)_1",
  "reference": 1,
  "relation": "==",
  "true": {
    "false": {
      "complexity": 0.009999999776482582,
      "loss": 0.0,
      "name": "target",
      "prediction": 1.0
    },
    "feature": 8,
    "name": "petal length (cm)_2",
    "reference": 1,
    "relation": "==",
    "true": {
      "complexity": 0.009999999776482582,
      "loss": 0.006666666828095913,
      "name": "target",
      "prediction": 2.0
    },
    "type": "integral"
  },
  "type": "integral"
}
{
  "false": {
    "

In [7]:
# treefarms_model.visualize()


ty": 0.009999999776482582,
      "loss": 0.0,
      "name": "target",
      "prediction": 0.0
    },
    "feature": 8,
    "name": "petal length (cm)_2",
    "reference": 1,
    "relation": "==",
    "true": {
      "complexity": 0.009999999776482582,
      "loss": 0.013333333656191826,
      "name": "target",
      "prediction": 2.0
    },
    "type": "integral"
  },
  "feature": 7,
  "model_objective": 0.05999999865889549,
  "name": "petal length (cm)_1",
  "reference": 1,
  "relation": "==",
  "true": {
    "false": {
      "complexity": 0.009999999776482582,
      "loss": 0.0,
      "name": "target",
      "prediction": 1.0
    },
    "feature": 11,
    "name": "petal width (cm)_2",
    "reference": 1,
    "relation": "==",
    "true": {
      "complexity": 0.009999999776482582,
      "loss": 0.006666666828095913,
      "name": "target",
      "prediction": 2.0
    },
    "type": "integral"
  },
  "type": "integral"
}
{
  "false": {
    "false": {
      "complexity": 0.009999999776

In [8]:
for i in range(10):
    print (treefarms_model[i].score(X,y))

0.98
0.98
0.98
0.98
0.98
0.98
0.98
0.98
0.98
0.98


In [9]:
first_tree = treefarms_model[0]
print("evaluating the first model in the Rashomon set", flush=True)

# get the results
train_acc = first_tree.score(X, y)
n_leaves = first_tree.leaves()
n_nodes = first_tree.nodes()

print("Training accuracy: {}".format(train_acc))
print("# of leaves: {}".format(n_leaves))
print(first_tree)

evaluating the first model in the Rashomon set
Training accuracy: 0.98
# of leaves: 4
if feature_10 = true and feature_7 = true then:
    predicted Prediction: 1

else if feature_10 = true and feature_7 != true then:
    predicted Prediction: 2

else if feature_10 != true and feature_11 = true then:
    predicted Prediction: 2

else if feature_10 != true and feature_11 != true then:
    predicted Prediction: 0


# DRF + treefarms

In [10]:
# Define the structure of DRF
param_list = [
    [{"max_leaf_nodes":6, "n_estimators":100, "max_features":0.8}],
    [{"max_leaf_nodes":6, "n_estimators":100, "max_features":0.8}],
    [{"max_leaf_nodes":6, "n_estimators":100, "max_features":0.8}],
    [{"max_leaf_nodes":2, "n_estimators":100, "max_features":0.8}],
    [{"max_leaf_nodes":2, "n_estimators":100, "max_features":0.8}]
    ]

# Fit the model
drf = DRFModel(parallel_cores=5, parameter_list = param_list)
drf_model = drf.fit(X, y)


Layer 0 training...finished.
Layer 1 training...finished.
Layer 2 training...finished.
Layer 3 training...finished.
Layer 4 training...finished.


In [11]:
transform_train = drf_model.transform_data(X.values, stop_layer = 5)
print(type(transform_train))
print(transform_train.shape)

Transforming data...finished.
<class 'numpy.ndarray'>
(150, 100)


In [12]:
V = ["V"+str(i) for i in range(1, len(transform_train[0])+1)]
df_transform_train = pd.DataFrame(transform_train, columns = V, dtype = "category")
df_transform_train.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V91,V92,V93,V94,V95,V96,V97,V98,V99,V100
0,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
1,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
2,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
3,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
4,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [13]:
for col in df_transform_train.columns:
    print(df_transform_train[col].value_counts())  

2    100
1     50
Name: V1, dtype: int64
2    100
1     50
Name: V2, dtype: int64
2    100
1     50
Name: V3, dtype: int64
1    100
2     50
Name: V4, dtype: int64
2    100
1     50
Name: V5, dtype: int64
2    100
1     50
Name: V6, dtype: int64
2    100
1     50
Name: V7, dtype: int64
2    100
1     50
Name: V8, dtype: int64
2    100
1     50
Name: V9, dtype: int64
2    100
1     50
Name: V10, dtype: int64
2    100
1     50
Name: V11, dtype: int64
2    100
1     50
Name: V12, dtype: int64
2    100
1     50
Name: V13, dtype: int64
2    100
1     50
Name: V14, dtype: int64
2    100
1     50
Name: V15, dtype: int64
2    100
1     50
Name: V16, dtype: int64
2    100
1     50
Name: V17, dtype: int64
2    100
1     50
Name: V18, dtype: int64
2    100
1     50
Name: V19, dtype: int64
1    97
2    53
Name: V20, dtype: int64
2    100
1     50
Name: V21, dtype: int64
1    97
2    53
Name: V22, dtype: int64
2    100
1     50
Name: V23, dtype: int64
2    100
1     50
Name: V24, dtype: int64
2    

In [14]:
for col in df_transform_train.columns:
    # 進行 one-hot 編碼
    dummies = pd.get_dummies(df_transform_train[col], prefix=col)
    df_transform_train = pd.concat([df_transform_train, dummies], axis=1)
    # 刪除原始特徵
    df_transform_train.drop(col, axis=1, inplace=True)

In [15]:
print(df_transform_train.shape)
print(y.shape)

(150, 200)
(150,)


In [16]:
df_transform_train

Unnamed: 0,V1_1,V1_2,V2_1,V2_2,V3_1,V3_2,V4_1,V4_2,V5_1,V5_2,...,V96_1,V96_2,V97_1,V97_2,V98_1,V98_2,V99_1,V99_2,V100_1,V100_2
0,1,0,1,0,1,0,1,0,1,0,...,1,0,1,0,1,0,1,0,1,0
1,1,0,1,0,1,0,1,0,1,0,...,1,0,1,0,1,0,1,0,1,0
2,1,0,1,0,1,0,1,0,1,0,...,1,0,1,0,1,0,1,0,1,0
3,1,0,1,0,1,0,1,0,1,0,...,1,0,1,0,1,0,1,0,1,0
4,1,0,1,0,1,0,1,0,1,0,...,1,0,1,0,1,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145,0,1,0,1,0,1,0,1,0,1,...,0,1,0,1,0,1,0,1,0,1
146,0,1,0,1,0,1,0,1,0,1,...,0,1,0,1,0,1,0,1,0,1
147,0,1,0,1,0,1,0,1,0,1,...,0,1,0,1,0,1,0,1,0,1
148,0,1,0,1,0,1,0,1,0,1,...,0,1,0,1,0,1,0,1,0,1


In [17]:
# for col in df_transform_train.columns:
#     print(df_transform_train[col].unique())    
# # print(df_transform_train.iloc[:,:10])

In [26]:
# train TREEFARMS model
config = {
    "regularization": 0.01,  # regularization penalizes the tree with more leaves. We recommend to set it to relative high value to find a sparse tree.
    "rashomon_bound_multiplier": 0.05,  # rashomon bound multiplier indicates how large of a Rashomon set would you like to get
}

drf_treefarms_model = TREEFARMS(config)

drf_treefarms_model.fit(df_transform_train, y)


treefarms reported successful execution
training completed. Number of trees in the Rashomon set: 8848


<treefarms.model.treefarms.TREEFARMS at 0x7fb20c92cdc0>

In [27]:
# treefarms_model.visualize()


In [28]:
first_tree = drf_treefarms_model[1]
print("evaluating the first model in the Rashomon set", flush=True)

# get the results
train_acc = first_tree.score(df_transform_train, y)
n_leaves = first_tree.leaves()
n_nodes = first_tree.nodes()

print("Training accuracy: {}".format(train_acc))
print("# of leaves: {}".format(n_leaves))
print(first_tree)

evaluating the first model in the Rashomon set
Training accuracy: 0.98
# of leaves: 3
if feature_0 = true then:
    predicted Prediction: 0

else if feature_0 != true and feature_111 = true then:
    predicted Prediction: 2

else if feature_0 != true and feature_111 != true then:
    predicted Prediction: 1


In [24]:
for i in range(50):
    tree = drf_treefarms_model[i]
    print(tree.score(df_transform_train,y))

0.98
0.98
0.98
0.98
0.98
0.98
0.98
0.98
0.98
0.98
0.98
0.98
0.98
0.98
0.98
0.98
0.98
0.98
0.98
0.98
0.98
0.98
0.98
0.98
0.98
0.98
0.98
0.98
0.98
0.98
0.98
0.98
0.98
0.98
0.98
0.98
0.98
0.98
0.98
0.98
0.98
0.98
0.98
0.98
0.98
0.98
0.98
0.98
0.98
0.98
