In [1]:
from sklearn import datasets, metrics

# 如果是分類問題，請使用 DecisionTreeClassifier，若為回歸問題，請使用 DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier 
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score

In [2]:
# 讀取鳶尾花資料集
iris = datasets.load_iris()

# 切分訓練集/測試集
x_train, x_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.25, random_state=4)

# 建立模型 (使用 20 顆樹，每棵樹的最大深度為 4)
clf = RandomForestClassifier(n_estimators=20, max_depth=4)

# 訓練模型
clf.fit(x_train, y_train)

# 預測測試集
y_pred = clf.predict(x_test)
acc = metrics.accuracy_score(y_test, y_pred)
print("Accuracy: ", acc)
print(iris.feature_names)
print("Feature importance: ", clf.feature_importances_)

Accuracy:  0.9736842105263158
['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
Feature importance:  [0.04599703 0.01676606 0.61062415 0.32661276]


In [3]:
# using entropy as criterion
clf = RandomForestClassifier(criterion = 'entropy', n_estimators=20, max_depth=4)
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
acc = metrics.accuracy_score(y_test, y_pred)
print("Accuracy: ", acc)
print(iris.feature_names)
print("Feature importance: ", clf.feature_importances_)

Accuracy:  0.9736842105263158
['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
Feature importance:  [0.08548423 0.01420678 0.44912023 0.45118876]


In [4]:
# get those dataset from sklaern dataset
wine = datasets.load_wine() # classification label: 0 ,1 ,2
boston = datasets.load_boston() # regression label:  values from 5~50
breast_cancer = datasets.load_breast_cancer() # classification label: 0,1,2

In [5]:
# boston
x_train, x_test, y_train, y_test = train_test_split(boston.data, boston.target, test_size=0.25, random_state=4)
clf = RandomForestRegressor(criterion = 'mse', n_estimators=200, max_depth=10)
clf.fit(x_train, y_train)

# testing
y_pred = clf.predict(x_test)
R2 = metrics.r2_score(y_test, y_pred)
print("R2 score: ", R2 )
print(iris.feature_names)
print("Feature importance: ", clf.feature_importances_)

R2 score:  0.8528349536792491
['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
Feature importance:  [0.0567746  0.000987   0.00728137 0.00187051 0.01950913 0.46635337
 0.01186981 0.0515858  0.00349441 0.0193993  0.01745591 0.00969829
 0.33372049]


In [6]:
#wine
x_train, x_test, y_train, y_test = train_test_split(wine.data, wine.target, test_size=0.25, random_state=4)
clf = RandomForestClassifier(criterion = 'entropy', n_estimators=5, max_depth=4)
clf.fit(x_train, y_train)

# testing
y_pred = clf.predict(x_test)
acc = metrics.accuracy_score(y_test, y_pred)
print("R2 score: ", acc )
print(iris.feature_names)
print("Feature importance: ", clf.feature_importances_)

R2 score:  0.9777777777777777
['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
Feature importance:  [0.10257256 0.01928132 0.04209864 0.070268   0.05900861 0.08933133
 0.34217697 0.00608285 0.01545547 0.03765563 0.02975349 0.10819842
 0.07811671]
