# Julia 機器學習：DecisionTree 決策樹

## 作業 030：乳癌預測資料集

請使用隨機森林模型建立一個分類模型，來預測乳癌資料集中，為良性(benign)或是惡性(malignant)的腫瘤。

In [1]:
using DecisionTree, RDatasets, DataFrames, MLDataUtils, Statistics

## 讀取資料

In [2]:
biopsy = dataset("MASS", "biopsy")

Unnamed: 0_level_0,ID,V1,V2,V3,V4,V5,V6,V7,V8,V9,Class
Unnamed: 0_level_1,String,Int32,Int32,Int32,Int32,Int32,Int32⍰,Int32,Int32,Int32,Categorical…
1,1000025,5,1,1,1,2,1,3,1,1,benign
2,1002945,5,4,4,5,7,10,3,2,1,benign
3,1015425,3,1,1,1,2,2,3,1,1,benign
4,1016277,6,8,8,1,3,4,3,7,1,benign
5,1017023,4,1,1,3,2,1,3,1,1,benign
6,1017122,8,10,10,8,7,10,9,7,1,malignant
7,1018099,1,1,1,1,2,10,3,1,1,benign
8,1018561,2,1,2,1,2,1,3,1,1,benign
9,1033078,2,1,1,1,2,1,1,1,5,benign
10,1033078,4,2,1,1,2,1,2,1,1,benign


In [3]:
describe(biopsy)

Unnamed: 0_level_0,variable,mean,min,median,max,nunique,nmissing,eltype
Unnamed: 0_level_1,Symbol,Union…,Any,Union…,Any,Union…,Union…,Type
1,ID,,1000025,,95719,645.0,,String
2,V1,4.41774,1,4.0,10,,,Int32
3,V2,3.13448,1,1.0,10,,,Int32
4,V3,3.20744,1,1.0,10,,,Int32
5,V4,2.80687,1,1.0,10,,,Int32
6,V5,3.21602,1,2.0,10,,,Int32
7,V6,3.54466,1,1.0,10,,16.0,"Union{Missing, Int32}"
8,V7,3.43777,1,3.0,10,,,Int32
9,V8,2.86695,1,1.0,10,,,Int32
10,V9,1.58941,1,1.0,10,,,Int32


## 拆分資料

In [27]:
train_features = dropmissing(biopsy)[1:2:end, 2:10];
train_labels = dropmissing(biopsy)[1:2:end, 11];
test_features = dropmissing(biopsy)[2:2:end, 2:10];
test_labels = dropmissing(biopsy)[2:2:end, 11];

In [29]:
train_features = convert(Array{Int}, train_features);
train_labels = convert(Array{String}, train_labels);
test_features = convert(Array{Int}, test_features);
test_labels = convert(Array{String}, test_labels);

## 建立隨機森林模型

In [30]:
model = DecisionTree.RandomForestClassifier(n_trees=30, max_depth=2)

RandomForestClassifier
n_trees:             30
n_subfeatures:       -1
partial_sampling:    0.7
max_depth:           2
min_samples_leaf:    1
min_samples_split:   2
min_purity_increase: 0.0
classes:             nothing
ensemble:            nothing

## 訓練

In [31]:
DecisionTree.fit!(model, train_features, train_labels)

RandomForestClassifier
n_trees:             30
n_subfeatures:       -1
partial_sampling:    0.7
max_depth:           2
min_samples_leaf:    1
min_samples_split:   2
min_purity_increase: 0.0
classes:             ["benign", "malignant"]
ensemble:            Ensemble of Decision Trees
Trees:      30
Avg Leaves: 3.9
Avg Depth:  2.0

## 預測

In [33]:
predicted_labels = DecisionTree.predict(model, test_features);

341-element Array{String,1}:
 "malignant"
 "malignant"
 "malignant"
 "benign"
 "benign"
 "benign"
 "benign"
 "malignant"
 "benign"
 "benign"
 "malignant"
 "benign"
 "benign"
 ⋮
 "benign"
 "benign"
 "benign"
 "malignant"
 "benign"
 "benign"
 "benign"
 "benign"
 "malignant"
 "benign"
 "benign"
 "malignant"

## 評估

In [35]:
using Printf
@printf("Accuracy: %.2f%%\n", mean((predicted_labels .== test_labels))*100)

Accuracy: 96.77%


In [39]:
using ScikitLearn.CrossValidation: cross_val_score

features = convert(Array{Int}, dropmissing(biopsy)[:, 2:10]);
labels = convert(Array{String}, dropmissing(biopsy)[:, 11]);
n_folds = 10;
accuracy = nfoldCV_tree(labels, features, n_folds)

2×2 Array{Int64,2}:
 46   0
  0  22

2×2 Array{Int64,2}:
 42   0
  0  26

2×2 Array{Int64,2}:
 42   0
  0  26

2×2 Array{Int64,2}:
 42   0
  0  26

2×2 Array{Int64,2}:
 44   0
  0  24

2×2 Array{Int64,2}:
 45   0
  0  23

2×2 Array{Int64,2}:
 40   0
  0  28


Fold 1
Classes:  ["benign", "malignant"]
Matrix:   
Accuracy: 1.0
Kappa:    1.0

Fold 2
Classes:  ["benign", "malignant"]
Matrix:   
Accuracy: 1.0
Kappa:    1.0

Fold 3
Classes:  ["benign", "malignant"]
Matrix:   
Accuracy: 1.0
Kappa:    1.0

Fold 4
Classes:  ["benign", "malignant"]
Matrix:   
Accuracy: 1.0
Kappa:    1.0

Fold 5
Classes:  ["benign", "malignant"]
Matrix:   
Accuracy: 1.0
Kappa:    1.0

Fold 6
Classes:  ["benign", "malignant"]
Matrix:   
Accuracy: 1.0
Kappa:    1.0

Fold 7
Classes:  ["benign", "malignant"]

2×2 Array{Int64,2}:
 52   0
  0  16

2×2 Array{Int64,2}:
 52   0
  0  16

2×2 Array{Int64,2}:
 37   0
  0  31


Matrix:   
Accuracy: 1.0
Kappa:    1.0

Fold 8
Classes:  ["benign", "malignant"]
Matrix:   
Accuracy: 1.0
Kappa:    1.0

Fold 9
Classes:  ["benign", "malignant"]
Matrix:   
Accuracy: 1.0
Kappa:    1.0

Fold 10
Classes:  ["benign", "malignant"]
Matrix:   
Accuracy: 1.0
Kappa:    1.0

Mean Accuracy: 1.0


10-element Array{Float64,1}:
 1.0
 1.0
 1.0
 1.0
 1.0
 1.0
 1.0
 1.0
 1.0
 1.0