In [1]:
from sklearn import datasets, metrics

# 如果是分類問題，請使用 DecisionTreeClassifier，若為回歸問題，請使用 DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score

In [2]:
# Load the iris dataset.
iris = datasets.load_iris()

# Hold out 25% of the samples as a test set; the seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.25, random_state=4)

# Build a shallow decision tree.
# random_state is pinned because scikit-learn permutes the candidate features at
# every split; when two splits score identically the fitted tree (and therefore
# feature_importances_) can differ from run to run unless the estimator is seeded.
clf = DecisionTreeClassifier(
    criterion='gini',
    max_depth=2,
    min_samples_split=2,
    min_samples_leaf=10,
    random_state=4,
)

# Fit on the training split.
clf.fit(x_train, y_train)

# Predict labels for the held-out test split.
y_pred = clf.predict(x_test)

In [3]:
# Score the held-out predictions, then show which features the fitted tree used.
acc = accuracy_score(y_test, y_pred)
print("Acuuracy: ", acc)
print(iris.feature_names)
print("Feature importance: ", clf.feature_importances_)

Acuuracy:  0.9736842105263158
['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
Feature importance:  [0. 0. 0. 1.]


In [4]:
# Hyperparameter experiment: switch the split criterion to entropy and restrict
# the tree to a single split (max_depth = 1), reusing the existing train/test split.
# random_state is pinned so that ties between equally good splits are broken the
# same way on every run and the accuracy below is repeatable.
clf = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=1,
    min_samples_split=2,
    min_samples_leaf=10,
    random_state=4,
)
clf.fit(x_train, y_train)

# Re-score on the same held-out test split.
y_pred = clf.predict(x_test)
acc = metrics.accuracy_score(y_test, y_pred)
print("Acuuracy: ", acc)

Acuuracy:  0.6842105263157895


In [5]:
# Accuracy drops sharply only when max_depth is set below 2 (i.e. max_depth = 1).
# The other hyperparameters have little effect on this dataset.

In [6]:
# Load further datasets bundled with scikit-learn.
wine = datasets.load_wine()  # classification, labels 0, 1, 2
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this call fails on current releases. Pin scikit-learn < 1.2 or switch to
# another regression dataset (the next cell reads boston.data / boston.target).
boston = datasets.load_boston()  # regression, target values roughly 5-50
breast_cancer = datasets.load_breast_cancer()  # classification, binary labels 0, 1

# Wine: hold out 10% of the samples for testing (seeded split).
x_train, x_test, y_train, y_test = train_test_split(wine.data, wine.target, test_size=0.1, random_state=4)

# Entropy-based tree limited to depth 2. random_state is pinned so that ties
# between equally good splits are resolved identically on every run.
clf = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=2,
    min_samples_split=4,
    min_samples_leaf=5,
    random_state=4,
)

clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)

# Report accuracy on the held-out wine samples.
acc = accuracy_score(y_test, y_pred)
print("Accuracy: ", acc)

Accuracy:  0.9444444444444444


In [7]:
# Boston housing regression: hold out 10% of the samples for testing (seeded split).
x_train, x_test, y_train, y_test = train_test_split(boston.data, boston.target, test_size=0.1, random_state=4)

# The criterion is left at its default, which is mean squared error. The explicit
# name 'mse' was deprecated in scikit-learn 1.0 (renamed 'squared_error') and
# removed in 1.2, so relying on the default works on both sides of the rename.
# random_state is pinned so that ties between equally good splits are resolved
# identically on every run and the R2 score is repeatable.
clf = DecisionTreeRegressor(
    max_depth=5,
    min_samples_split=2,
    min_samples_leaf=6,
    random_state=4,
)

clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)

# Coefficient of determination on the held-out samples.
R2 = r2_score(y_test, y_pred)
print("R2 score: ", R2)

R2 score:  0.8074288749864341
