## [範例重點]
了解隨機森林的建模方法及其中超參數的意義

In [1]:
from sklearn import datasets, metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [2]:
# 讀取鳶尾花資料集
iris = datasets.load_iris()

# 切分訓練集/測試集
x_train, x_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.25, random_state=4)

# 建立模型 (使用 20 顆樹，每棵樹的最大深度為 4)
clf = RandomForestClassifier(n_estimators=20, max_depth=4)

# 訓練模型
clf.fit(x_train, y_train)

# 預測測試集
y_pred = clf.predict(x_test)

In [3]:
acc = metrics.accuracy_score(y_test, y_pred)
print("Accuracy: ", acc)

Accuracy:  0.9736842105263158


In [4]:
print(iris.feature_names)

['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']


In [5]:
print("Feature importance: ", clf.feature_importances_)

Feature importance:  [0.10910628 0.03162841 0.42021317 0.43905215]


## [作業重點]
確保你了解隨機森林模型中每個超參數的意義，並觀察調整超參數對結果的影響

## 作業

1. 試著調整 RandomForestClassifier(...) 中的參數，並觀察是否會改變結果？
2. 改用其他資料集 (boston, wine)，並與回歸模型與決策樹的結果進行比較

#### Random Forest Classifier
- n_estimators: default(10) 決策樹數量
- criterion: default(gini) gini, entropy 計算資訊增益度的方式
- max_depth: default(None) 隨機森林最大深度
- min_samples_split: default(2) 最少樣本數才切分
- min_samples_leaf: default(1) 尾端葉子最少需要的樣本數
- oob_score: default(False) 是否利用out of bag評估accuracy(與其他沒有該樣本的樹進行較量)
- n_jobs: default(1) 計算所用的CPU數目

In [6]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold


clf = RandomForestClassifier()

param_grid = {"criterion" : ["gini", "entropy"], 
              "n_estimators" : [10, 20, 50, 100],
              "max_depth" : [None], 
              "min_samples_split" : [2, 4, 8], 
              "min_samples_leaf" : [1, 3, 5],
              "oob_score" : [True],
              "n_jobs" : [-1]}

folder = StratifiedKFold(n_splits=3, shuffle=True)

gs = GridSearchCV(estimator = clf,
                  param_grid=param_grid,
                  scoring='balanced_accuracy',
                  cv=3,
                  n_jobs=-1)

gs = gs.fit(x_train, y_train)
print(gs.best_estimator_)
print ("oob score: %.4f" % gs.best_estimator_.oob_score_)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
                       oob_score=True, random_state=None, verbose=0,
                       warm_start=False)
oob score: 0.9196




In [7]:
y_pred = gs.predict(x_test)
acc = metrics.accuracy_score(y_test, y_pred)
print("Accuracy: ", acc)

Accuracy:  0.9473684210526315


In [8]:
import pandas as pd

feature_name = iris.feature_names
pd.concat((pd.DataFrame(feature_name, columns=['Feautres']), pd.DataFrame(gs.best_estimator_.feature_importances_, columns=['importance'])),
          axis=1).sort_values(by='importance', ascending=False)

Unnamed: 0,Feautres,importance
3,petal width (cm),0.495973
2,petal length (cm),0.380005
0,sepal length (cm),0.091756
1,sepal width (cm),0.032266


### Boston House-Prices Dataset (Regression)

### Random Forest

In [9]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

boston = datasets.load_boston()

x_train, x_test, y_train, y_test = train_test_split(boston.data, boston.target, test_size=0.25, random_state=4)

rg = RandomForestRegressor(random_state=4)

rg.fit(x_train, y_train)

predict = rg.predict(x_test)



In [10]:
y_pred = rg.predict(x_test)
MSE = mean_squared_error(y_test, y_pred)
print("MSE: ", MSE)

R2 = r2_score(y_test, y_pred)
print("R2: ", R2)

MSE:  21.00935984251969
R2:  0.7906047863477124


In [11]:
feature_name = boston.feature_names
pd.concat((pd.DataFrame(feature_name, columns=['Feautres']), pd.DataFrame(rg.feature_importances_, columns=['importance'])),
          axis=1).sort_values(by='importance', ascending=False)

Unnamed: 0,Feautres,importance
5,RM,0.523235
12,LSTAT,0.278094
0,CRIM,0.064198
7,DIS,0.048455
4,NOX,0.018962
9,TAX,0.018144
10,PTRATIO,0.017282
6,AGE,0.010835
11,B,0.009802
2,INDUS,0.003668


#### Decision Tree

In [12]:
from sklearn.tree import DecisionTreeRegressor

DT_rg = DecisionTreeRegressor(random_state=4)
DT_rg.fit(x_train, y_train)
DT_pred = DT_rg.predict(x_test)

DT_MSE = mean_squared_error(y_test, DT_pred)
print("MSE: ", DT_MSE)

DT_R2 = r2_score(y_test, DT_pred)
print("R2: ", DT_R2)

MSE:  27.938425196850392
R2:  0.7215444660354143


#### Linear Regression

In [13]:
from sklearn.linear_model import LinearRegression

LR_rg = LinearRegression()
LR_rg.fit(x_train, y_train)
LR_pred = LR_rg.predict(x_test)

LR_MSE = mean_squared_error(y_test, LR_pred)
print("MSE: ", LR_MSE)

LR_R2 = r2_score(y_test, LR_pred)
print("R2: ", LR_R2)

MSE:  26.95142562423582
R2:  0.7313816523148398


#### Lasso

In [14]:
from sklearn.linear_model import Lasso

LS_rg = Lasso(random_state=4)
LS_rg.fit(x_train, y_train)
LS_pred = LS_rg.predict(x_test)

LS_MSE = mean_squared_error(y_test, LS_pred)
print("MSE: ", LS_MSE)

LS_R2 = r2_score(y_test, LS_pred)
print("R2: ", LS_R2)

MSE:  34.59249580775357
R2:  0.6552249518916471


#### Ridge

In [15]:
from sklearn.linear_model import Ridge

RD_rg = Ridge(random_state=4)
RD_rg.fit(x_train, y_train)
RD_pred = RD_rg.predict(x_test)

RD_MSE = mean_squared_error(y_test, RD_pred)
print("MSE: ", RD_MSE)

RD_R2 = r2_score(y_test, RD_pred)
print("R2: ", RD_R2)

MSE:  27.46959660445639
R2:  0.726217167345997


In [16]:
sheet = {'Model':['Random Forest', 'Decision Tree', 'Linear Regression', 'Lasso', 'Ridge'],
        'MSE':[MSE, DT_MSE, LR_MSE, LS_MSE, RD_MSE],
        'R2' :[R2, DT_R2, LR_R2, LS_R2, RD_R2]}
sheet = pd.DataFrame(sheet)
sheet

Unnamed: 0,Model,MSE,R2
0,Random Forest,21.00936,0.790605
1,Decision Tree,27.938425,0.721544
2,Linear Regression,26.951426,0.731382
3,Lasso,34.592496,0.655225
4,Ridge,27.469597,0.726217


### Wine Dataset (Classification)

#### Random Forest

In [17]:
wine = datasets.load_wine()

x_train, x_test, y_train, y_test = train_test_split(wine.data, wine.target, test_size=0.25, random_state=4)

clf = RandomForestClassifier(random_state=4)

clf.fit(x_train, y_train)

predict = clf.predict(x_test)



In [18]:
y_pred = clf.predict(x_test)
acc = metrics.accuracy_score(y_test, y_pred)
print("Accuracy: ", acc)

Accuracy:  1.0


In [19]:
feature_name = wine.feature_names
pd.concat((pd.DataFrame(feature_name, columns=['Feautres']), pd.DataFrame(clf.feature_importances_, columns=['importance'])),
          axis=1).sort_values(by='importance', ascending=False)

Unnamed: 0,Feautres,importance
0,alcohol,0.231655
11,od280/od315_of_diluted_wines,0.214509
12,proline,0.134798
6,flavanoids,0.098122
9,color_intensity,0.097076
10,hue,0.063147
4,magnesium,0.057414
5,total_phenols,0.030599
3,alcalinity_of_ash,0.028093
1,malic_acid,0.012822


#### Decision Tree

In [20]:
from sklearn.tree import DecisionTreeClassifier

DT_clf = DecisionTreeClassifier(random_state=4)
DT_clf.fit(x_train, y_train)
DT_pred = DT_clf.predict(x_test)
DT_acc = metrics.accuracy_score(y_test, DT_pred)
print("Accuracy: ", DT_acc)

Accuracy:  0.8888888888888888


#### Logistic Regression

In [21]:
from sklearn.linear_model import LogisticRegression

LR_clf = LogisticRegression(random_state=4)
LR_clf.fit(x_train, y_train)
LR_pred = LR_clf.predict(x_test)
LR_acc = metrics.accuracy_score(y_test, LR_pred)
print("Accuracy: ", LR_acc)

Accuracy:  0.9333333333333333




In [22]:
sheet = {'Model':['Random Forest', 'Decision Tree', 'Logistic Regression'],
        'Accuray':[acc, DT_acc, LR_acc]}
sheet = pd.DataFrame(sheet)
sheet

Unnamed: 0,Model,Accuray
0,Random Forest,1.0
1,Decision Tree,0.888889
2,Logistic Regression,0.933333
