In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [2]:
from sklearn.datasets import load_iris

# Load the iris dataset; keep only the last two feature columns
# (petal length and petal width) as inputs and the species label as target.
iris = load_iris()
X, y = iris.data[:, 2:], iris.target

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the instances for evaluation. A fixed random_state makes the
# split (and therefore every score below) reproducible under Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [4]:
# Three different kinds of base learner, each left at its scikit-learn defaults.
log_clf, rnd_clf, svm_clf = LogisticRegression(), RandomForestClassifier(), SVC()

In [5]:
# Hard-voting ensemble: the predicted class is the majority vote of the
# three named base classifiers.
voting_clf = VotingClassifier(
    estimators=[
        ("lr", log_clf),
        ("rf", rnd_clf),
        ("svc", svm_clf),
    ],
    voting="hard",
)

In [6]:
# Train the hard-voting ensemble on the training split.
voting_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('lr', LogisticRegression()),
                             ('rf', RandomForestClassifier()), ('svc', SVC())])

In [7]:
from sklearn.metrics import accuracy_score

In [8]:
# Fit each base classifier and the voting ensemble on the training split,
# then report its accuracy on the held-out test split.
for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    # accuracy_score expects (y_true, y_pred): ground truth first, predictions second.
    print("The accuracy score of",clf.__class__.__name__,'is: ',accuracy_score(y_test, y_pred))

The accuracy score of LogisticRegression is:  0.9666666666666667
The accuracy score of RandomForestClassifier is:  0.9666666666666667
The accuracy score of SVC is:  0.9666666666666667
The accuracy score of VotingClassifier is:  0.9666666666666667


## Bagging and Pasting

In [9]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

In [10]:
# Bagging ensemble of 500 decision trees; each tree is trained on 100
# instances sampled with replacement (bootstrap=True) from the training split.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=100,
    bootstrap=True,
    oob_score=True,  # also compute the out-of-bag score during fit
    n_jobs=-1,  # n_jobs tells scikit-learn how many CPU cores to use for training and predictions
)
bag_clf.fit(X_train, y_train)


BaggingClassifier(base_estimator=DecisionTreeClassifier(), max_samples=100,
                  n_estimators=500, n_jobs=-1, oob_score=True)

In [11]:
# Predict test-set labels with the fitted bagging ensemble.
y_pred = bag_clf.predict(X_test)

In [12]:
# Test-set accuracy of the bagging ensemble.
# accuracy_score expects (y_true, y_pred): ground truth first, predictions second.
accuracy_score(y_test, y_pred)

0.9666666666666667

In [13]:
# Out-of-bag accuracy estimate; available because bag_clf was built with oob_score=True.
bag_clf.oob_score_

0.9666666666666667

In [14]:
# Out-of-bag class probabilities: one row per training instance, one column per class.
bag_clf.oob_decision_function_

array([[0.        , 0.00966184, 0.99033816],
       [0.        , 0.03878357, 0.96121643],
       [0.        , 0.80547138, 0.19452862],
       [0.        , 0.        , 1.        ],
       [0.        , 1.        , 0.        ],
       [0.        , 0.        , 1.        ],
       [0.        , 0.16061947, 0.83938053],
       [0.        , 1.        , 0.        ],
       [0.        , 1.        , 0.        ],
       [0.        , 0.99570815, 0.00429185],
       [0.        , 0.        , 1.        ],
       [0.        , 0.00193237, 0.99806763],
       [0.        , 1.        , 0.        ],
       [0.        , 0.86538462, 0.13461538],
       [0.        , 0.        , 1.        ],
       [1.        , 0.        , 0.        ],
       [0.        , 0.00938967, 0.99061033],
       [0.        , 1.        , 0.        ],
       [0.        , 0.        , 1.        ],
       [1.        , 0.        , 0.        ],
       [1.        , 0.        , 0.        ],
       [0.        , 1.        , 0.        ],
       [0.

## Random Forests


In [15]:
from sklearn.ensemble import RandomForestClassifier

In [16]:
# Random forest of 500 trees, each limited to at most 16 leaf nodes.
# NOTE(review): the other ensembles in this notebook use n_jobs=-1; confirm
# that single-core training (n_jobs=1) is intended here.
rnd_clf = RandomForestClassifier(
    n_estimators=500,
    max_leaf_nodes=16,
    n_jobs=1,
)
rnd_clf.fit(X_train, y_train)

RandomForestClassifier(max_leaf_nodes=16, n_estimators=500, n_jobs=1)

In [17]:
# Predict test-set labels with the fitted random forest.
y_pred_rf = rnd_clf.predict(X_test)
# Hyperparameters: a RandomForestClassifier combines DecisionTree hyperparameters (e.g. max_leaf_nodes) with Bagging hyperparameters (e.g. n_estimators)

## Feature Importance

In [18]:
# Reload the dataset so that all four features (not just the petal columns in X) are used below.
iris = load_iris()

In [19]:
# Refit a 500-tree forest on the complete iris dataset (all features, all
# instances) so that per-feature importances can be read from it.
rnd_clf = RandomForestClassifier(
    n_jobs=-1,
    n_estimators=500,
)
rnd_clf.fit(iris["data"], iris["target"])

RandomForestClassifier(n_estimators=500, n_jobs=-1)

In [20]:
# Use feature_importances_ to get the importance of each feature, e.g. as a
# basis for feature selection. Prints one "<feature name> <importance>" line per feature.
importances = rnd_clf.feature_importances_
for idx, name in enumerate(iris['feature_names']):
    print(name, importances[idx])


sepal length (cm) 0.09311938650770295
sepal width (cm) 0.024660247650018296
petal length (cm) 0.4332253636656913
petal width (cm) 0.44899500217658744


## Boosting

### AdaBoost

The general idea of most boosting methods is to train predictors sequentially, each trying to correct its predecessor.

In [21]:
from sklearn.ensemble import AdaBoostClassifier

In [22]:
# AdaBoost over 200 decision stumps (max_depth=1 trees) with learning rate 0.5.
# The `algorithm` argument is deliberately left at its default: 'SAMME.R' was
# already the default in the scikit-learn version that produced this notebook,
# and newer releases deprecate and then remove that option, so passing it
# explicitly breaks on upgrade.
ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1),
    n_estimators=200,
    learning_rate=0.5,
)
ada_clf.fit(X_train, y_train)

AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1),
                   learning_rate=0.5, n_estimators=200)

### Gradient Boosting

Instead of tweaking the instance weights at every iteration as AdaBoost does, Gradient Boosting fits each new predictor to the *residual errors* made by the previous predictor.

In [23]:
from sklearn.tree import DecisionTreeRegressor

In [24]:
# First tree of a hand-built gradient-boosting ensemble: fit directly on the targets.
# NOTE: X and y are the full (unsplit) petal-feature data, not X_train / y_train.
tree_reg1 = DecisionTreeRegressor(max_depth = 2)
tree_reg1.fit(X, y)

DecisionTreeRegressor(max_depth=2)

In [25]:
# Second tree: its targets are the residual errors left by the first tree.
tree_reg2 = DecisionTreeRegressor(max_depth=2)
y2 = y - tree_reg1.predict(X)
tree_reg2.fit(X, y2)

DecisionTreeRegressor(max_depth=2)

In [26]:
# Third tree: its targets are the residual errors left by the second tree.
tree_reg3 = DecisionTreeRegressor(max_depth=2)
y3 = y2 - tree_reg2.predict(X)
tree_reg3.fit(X, y3)

DecisionTreeRegressor(max_depth=2)

In [27]:
# The ensemble prediction is the sum of the three trees' predictions.
# NOTE: the trees were fit on the full X, which contains the X_test rows, so
# this is an illustration of the mechanism rather than a held-out evaluation.
y_pred = (
    tree_reg1.predict(X_test)
    + tree_reg2.predict(X_test)
    + tree_reg3.predict(X_test)
)

In [28]:
# Now build the same kind of ensemble with scikit-learn's built-in GradientBoostingRegressor

In [29]:
from sklearn.ensemble import GradientBoostingRegressor

In [30]:
# Built-in counterpart of the hand-built ensemble: 3 trees of depth 2,
# with learning_rate=1.0 so each tree's contribution is not scaled down.
gbrt = GradientBoostingRegressor(
    n_estimators=3,
    max_depth=2,
    learning_rate=1.0,
)
gbrt.fit(X, y)

GradientBoostingRegressor(learning_rate=1.0, max_depth=2, n_estimators=3)

### Choosing the number of trees with `staged_predict()`

In [35]:
from sklearn.metrics import mean_squared_error
import numpy as np

In [32]:
# Start with 120 trees; the error after each additional tree is recorded
# afterwards so the tree count that minimizes the error can be selected.
gbrt = GradientBoostingRegressor(n_estimators=120, max_depth=2)
gbrt.fit(X_train, y_train)

GradientBoostingRegressor(max_depth=2, n_estimators=120)

In [39]:
# MSE on the test split after each boosting stage (1 tree, 2 trees, ...).
errors = [
    mean_squared_error(y_test, staged_pred)
    for staged_pred in gbrt.staged_predict(X_test)
]

In [40]:
bst_n_estimators = np.argmin(errors) + 1
# This is the best number of trees, i.e. the one that minimizes the error (+1 because argmin returns a 0-based index)

In [41]:
# Retrain from scratch with the tree count that minimized the measured error.
gbrt_best = GradientBoostingRegressor(
    n_estimators=bst_n_estimators,
    max_depth=2,
)
gbrt_best.fit(X_train, y_train)

GradientBoostingRegressor(max_depth=2, n_estimators=34)

### Implementing early stopping with `warm_start=True`

In [42]:
# warm_start=True lets repeated fit() calls keep the trees already trained, so the ensemble can be grown incrementally.
gbrt = GradientBoostingRegressor(max_depth = 2, warm_start = True)

In [43]:
# Early stopping: grow the warm-started ensemble one tree at a time and stop
# once the error on the evaluation split has failed to improve 5 times in a row.
min_val_error = float('inf')
best_n_estimators = 0  # tree count at which min_val_error was observed
error_going_up = 0
for n_estimators in range(1, 120):
    gbrt.n_estimators = n_estimators  # set the number of trees
    gbrt.fit(X_train, y_train)  # with warm_start, only the newly requested tree is trained
    y_pred = gbrt.predict(X_test)
    val_error = mean_squared_error(y_test, y_pred)
    if val_error < min_val_error:
        min_val_error = val_error
        best_n_estimators = n_estimators  # remember where the minimum occurred
        error_going_up = 0
    else:
        error_going_up += 1
        if error_going_up == 5:
            # Five consecutive non-improvements: stop early. At this point gbrt
            # holds 5 trees more than best_n_estimators, so use best_n_estimators
            # (not gbrt.n_estimators) as the selected ensemble size.
            break


## Stochastic Gradient Boosting

The `GradientBoostingRegressor` class also supports a `subsample` hyperparameter, which specifies the fraction of training instances to be used for training each tree. **This technique trades a higher bias for a lower variance.**