In [2]:
using DecisionTree
using RDatasets

┌ Info: Recompiling stale cache file /home/ye/.julia/compiled/v0.7/RDatasets/JyIbx.ji for RDatasets [ce6b1742-4840-55fa-b093-852dadbb1d8b]
└ @ Base loading.jl:1185


### Classification Example

#### Load dataset

In [5]:
# Load the bundled iris dataset as a (features, labels) pair.
features, labels = load_data("iris")

# Convert the raw columns to concrete element types: floating-point
# features and string labels. The trailing `;` suppresses cell output.
features = float.(features)
labels = string.(labels);

#### Pruned Tree Classifier

In [8]:
# Train a classifier whose tree is limited to a depth of 2.
model = DecisionTreeClassifier(max_depth = 2)
DecisionTree.fit!(model, features, labels)

# Show the fitted tree (printing at most 5 levels).
print_tree(model, 5)

# Classify a single sample with the fitted model; the predicted label is
# the value of the cell.
predict(model, [5.9, 3.0, 5.1, 1.9])

Feature 3, Threshold 2.45
L-> Iris-setosa : 50/50
R-> Feature 4, Threshold 1.75
    L-> Iris-versicolor : 49/54
    R-> Iris-virginica : 45/46


"Iris-virginica"

In [10]:
# Per-label probabilities for the same sample; the column order is the one
# reported by get_classes(model).
DecisionTree.predict_proba(model, [5.9, 3.0, 5.1, 1.9])

3-element Array{Float64,1}:
 0.0                 
 0.021739130434782608
 0.9782608695652174  

In [11]:
# get_classes returns the ordering of the columns in predict_proba's output
println(DecisionTree.get_classes(model))

["Iris-setosa", "Iris-versicolor", "Iris-virginica"]


#### Train full-tree classifier

In [12]:
# Grow an unrestricted tree using the native (non-ScikitLearn) API.
# Note the argument order here: labels first, then features.
model = DecisionTree.build_tree(labels, features)

# Post-prune: merge leaves having >= 90% combined purity (default: 100%).
model = DecisionTree.prune_tree(model, 0.9)

# Optional: pretty-print the resulting tree, to a depth of 5 nodes.
DecisionTree.print_tree(model, 5)

Feature 4, Threshold 0.8
L-> Iris-setosa : 50/50
R-> Feature 4, Threshold 1.75
    L-> Feature 3, Threshold 4.95
        L-> Iris-versicolor : 47/48
        R-> Feature 4, Threshold 1.55
            L-> Iris-virginica : 3/3
            R-> Feature 3, Threshold 5.449999999999999
                L-> Iris-versicolor : 2/2
                R-> Iris-virginica : 1/1
    R-> Feature 3, Threshold 4.85
        L-> Feature 1, Threshold 5.95
            L-> Iris-versicolor : 1/1
            R-> Iris-virginica : 2/2
        R-> Iris-virginica : 43/43


In [13]:
# Per-class probabilities for one sample from the pruned tree; the third
# argument fixes the class order of the returned vector.
DecisionTree.apply_tree_proba(
    model,
    [5.9, 3.0, 5.1, 1.9],
    ["Iris-setosa", "Iris-versicolor", "Iris-virginica"],
)

3-element Array{Float64,1}:
 0.0
 0.0
 1.0

In [15]:
# Classification parameters and their respective default values.
n_subfeatures = 0          # number of features to select at random (default: 0, keep all)
max_depth = -1             # maximum depth of the decision tree (default: -1, no maximum)
min_samples_leaf = 1       # minimum number of samples each leaf needs to have (default: 1)
min_samples_split = 2      # minimum number of samples needed for a split (default: 2)
min_purity_increase = 0.0  # minimum purity needed for a split (default: 0.0)
pruning_purity = 1.0       # purity threshold used for post-pruning (default: 1.0, no pruning)
n_folds = 5                # number of cross-validation folds used below

# Build a single tree with every parameter passed explicitly.
model = build_tree(
    labels, features,
    n_subfeatures, max_depth, min_samples_leaf, min_samples_split, min_purity_increase,
)

# Run n-fold cross-validation for pruned trees; the value of the cell is
# the per-fold accuracy vector.
accuracy = nfoldCV_tree(
    labels, features,
    n_folds, pruning_purity, max_depth,
    min_samples_leaf, min_samples_split, min_purity_increase,
)


Fold 1
Classes:  

3×3 Array{Int64,2}:
 8   0  0
 0  15  0
 0   0  7

["Iris-setosa", "Iris-versicolor", "Iris-virginica"]
Matrix:   

3×3 Array{Int64,2}:
 10  0   0
  0  9   0
  0  0  11

3×3 Array{Int64,2}:
 16  0  0
  0  7  0
  0  0  7

3×3 Array{Int64,2}:
 9  0   0
 0  8   0
 0  0  13

3×3 Array{Int64,2}:
 7   0   0
 0  11   0
 0   0  12


Accuracy: 1.0
Kappa:    1.0

Fold 2
Classes:  ["Iris-setosa", "Iris-versicolor", "Iris-virginica"]
Matrix:   
Accuracy: 1.0
Kappa:    1.0

Fold 3
Classes:  ["Iris-setosa", "Iris-versicolor", "Iris-virginica"]
Matrix:   
Accuracy: 1.0
Kappa:    1.0

Fold 4
Classes:  ["Iris-setosa", "Iris-versicolor", "Iris-virginica"]
Matrix:   
Accuracy: 1.0
Kappa:    1.0

Fold 5
Classes:  ["Iris-setosa", "Iris-versicolor", "Iris-virginica"]
Matrix:   
Accuracy: 1.0
Kappa:    1.0

Mean Accuracy: 1.0


5-element Array{Float64,1}:
 1.0
 1.0
 1.0
 1.0
 1.0

### Random Forest Classifier

In [16]:
# Settings for the forest and for the cross-validation run below.
n_folds = 3
n_subfeatures = 2

# Positional arguments after the data: n_subfeatures, n_trees (10),
# partial_sampling (0.5) and max_depth (6).
model = build_forest(labels, features, n_subfeatures, 10, 0.5, 6)

# 3-fold cross-validation for forests, using 2 random features per split.
accuracy = nfoldCV_forest(labels, features, n_folds, n_subfeatures)


3×3 Array{Int64,2}:
 13   0   0
  0  16   0
  0   0  21

3×3 Array{Int64,2}:
 22   0   0
  0  16   1
  0   0  11

3×3 Array{Int64,2}:
 15   0   0
  0  17   0
  0   0  18


Fold 1
Classes:  ["Iris-setosa", "Iris-versicolor", "Iris-virginica"]
Matrix:   
Accuracy: 1.0
Kappa:    1.0

Fold 2
Classes:  ["Iris-setosa", "Iris-versicolor", "Iris-virginica"]
Matrix:   
Accuracy: 0.98
Kappa:    0.9689826302729528

Fold 3
Classes:  ["Iris-setosa", "Iris-versicolor", "Iris-virginica"]
Matrix:   
Accuracy: 1.0
Kappa:    1.0

Mean Accuracy: 0.9933333333333333


3-element Array{Float64,1}:
 1.0 
 0.98
 1.0 

In [18]:
# Random-forest parameters, all passed explicitly to build_forest and
# nfoldCV_forest below.
# NOTE(review): -1 for n_subfeatures is presumably the library's
# "use the default" sentinel — confirm against the DecisionTree.jl docs.
n_subfeatures=-1; n_trees=10; partial_sampling=0.7; max_depth=-1
min_samples_leaf=5; min_samples_split=2; min_purity_increase=0.0
# Define n_folds here instead of relying on the value left behind by an
# earlier cell, so the result does not depend on cell execution order.
n_folds=3

# Train a forest on the full dataset.
model    =   build_forest(labels, features,
                          n_subfeatures,
                          n_trees,
                          partial_sampling,
                          max_depth,
                          min_samples_leaf,
                          min_samples_split,
                          min_purity_increase)

# Cross-validate the same configuration; the value of the cell is the
# per-fold accuracy vector.
accuracy = nfoldCV_forest(labels, features,
                          n_folds,
                          n_subfeatures,
                          n_trees,
                          partial_sampling,
                          max_depth,
                          min_samples_leaf,
                          min_samples_split,
                          min_purity_increase)

3×3 Array{Int64,2}:
 18   0   0
  0  16   1
  0   0  15

3×3 Array{Int64,2}:
 17   0   0
  0  14   3
  0   0  16

3×3 Array{Int64,2}:
 15   0   0
  0  16   0
  0   2  17


Fold 1
Classes:  ["Iris-setosa", "Iris-versicolor", "Iris-virginica"]
Matrix:   
Accuracy: 0.98
Kappa:    0.969951923076923

Fold 2
Classes:  ["Iris-setosa", "Iris-versicolor", "Iris-virginica"]
Matrix:   
Accuracy: 0.94
Kappa:    0.9101258238466147

Fold 3
Classes:  ["Iris-setosa", "Iris-versicolor", "Iris-virginica"]
Matrix:   
Accuracy: 0.96
Kappa:    0.9399038461538461

Mean Accuracy: 0.96


3-element Array{Float64,1}:
 0.98
 0.94
 0.96