# Ensembles: From Decision Trees to Extra Trees

In [69]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor, ExtraTreesRegressor

cars = pd.read_csv('cars.csv')

In [70]:
cars.shape

(261, 8)

In [71]:
cars.head()

Unnamed: 0,mpg,cylinders,cubicinches,hp,weightlbs,time-to-60,year,brand
0,14.0,8,350,165,4209,12,1972,US.
1,31.9,4,89,71,1925,14,1980,Europe.
2,17.0,8,302,140,3449,11,1971,US.
3,15.0,8,400,150,3761,10,1971,US.
4,30.5,4,98,63,2051,17,1978,US.


In [72]:
# Let's check the datatypes of these columns

cars.dtypes

mpg             float64
 cylinders        int64
 cubicinches     object
 hp               int64
 weightlbs       object
 time-to-60       int64
 year             int64
 brand           object
dtype: object

In [73]:
# Let's look for nulls
cars.isnull().sum()


mpg             0
 cylinders      0
 cubicinches    0
 hp             0
 weightlbs      0
 time-to-60     0
 year           0
 brand          0
dtype: int64

In [74]:
# Cubicinches and weightlbs are objects. Let's see if we can make them
# ints. Notice they have a space.



In [75]:
# Let's look for the problematic rows.
cars[cars[' cubicinches'] == ' ']


Unnamed: 0,mpg,cylinders,cubicinches,hp,weightlbs,time-to-60,year,brand
40,16.0,6,,105,3897,19,1976,US.
180,19.8,6,,85,2990,18,1980,US.


In [76]:
# We'll replace the blanks with an average.

cars[' cubicinches'].replace(' ', 0, inplace = True)



In [77]:
cars[' cubicinches'] = cars[' cubicinches'].map(int)

In [78]:
#we don't we don't want those zeros to mess up our mean
cars[' cubicinches'].replace(0, 261/259*cars[' cubicinches'].mean(), inplace = True)

In [79]:
cars.dtypes

mpg             float64
 cylinders        int64
 cubicinches    float64
 hp               int64
 weightlbs       object
 time-to-60       int64
 year             int64
 brand           object
dtype: object

In [80]:
cars[' weightlbs'].replace(' ', 0, inplace=True)

cars[' weightlbs'] = cars[' weightlbs'].map(int)

cars[' weightlbs'].replace(0, 261 / 259 * cars[' weightlbs'].mean(), inplace=True)

## Fitting a Decision Tree

In [81]:
X = cars.drop(['mpg', ' brand'], axis=1)
y = cars['mpg']

X_train, X_test, y_train, y_test = train_test_split(X, y)

rt = DecisionTreeRegressor()
rt.fit(X_train, y_train)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [18]:
rt.score(X_test, y_test)

0.6422366332358836

A single decision tree will often overfit your training data. There are steps one can take to help with this, like limiting the "depth" of the nodes. But it's often better to do something else: Plant another tree!

Of course, if a second tree is going to be of any value, it has to be *different from* the first. Here's a good algorithm for achieving that:

## Fitting a Set of Bagged Decision Trees

### Bagging Algorithm

Take a sample of your X_train and fit a decision tree to it. <br/>
Replace the first batch of data and repeat. <br/>
When you've got as many trees as you like, make use of all your individual trees' predictions to come up with some holistic prediction. (Most obviously, we could take the average of our predictions, but there are other methods we might try.)

<br/>

Because we're resampling our data with replacement, we're *bootstrapping*. <br/>
Because we're making use of our many samples' predictions, we're *aggregating*. <br/>
Because we're bootstrapping and aggregating all in the same algorithm, we're *bagging*.

In [29]:
# Initializing a bagging regressor object and fitting it to our training data.
bag = BaggingRegressor(n_estimators=100)
bag.fit(X_train, y_train)



BaggingRegressor(base_estimator=None, bootstrap=True,
         bootstrap_features=False, max_features=1.0, max_samples=1.0,
         n_estimators=100, n_jobs=None, oob_score=False, random_state=None,
         verbose=0, warm_start=False)

In [30]:
# And scoring it on our testing data.
bag.score(X_test, y_test)
#much better

0.8798473369578059

That's a significant improvement in accuracy! Let's see if we can do even better.

## Fitting a Random Forest

### Random Forest Algorithm

Let's add an extra layer of randomization: Instead of using *all* the features of my model to optimize a branch at each node, I'll just choose a subset of my features.

In [31]:
# Same with the Random Forest Regressor
rf = RandomForestRegressor(n_estimators=100)
rf.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [32]:
# Scoring
rf.score(X_test, y_test)


0.8749269755702529

## Fitting a Stand of Extremely Randomized Trees

### Extra Trees Algorithm

Sometimes we might want even one more bit of randomization. Instead of always choosing the *optimal* branching path, we might just choose a branching path at random. If we're doing that, then we've got extremely randomized trees.

In [34]:
# And with the Extra Trees Regressor
et = ExtraTreesRegressor(n_estimators=100)
et.fit(X_train, y_train)


ExtraTreesRegressor(bootstrap=False, criterion='mse', max_depth=None,
          max_features='auto', max_leaf_nodes=None,
          min_impurity_decrease=0.0, min_impurity_split=None,
          min_samples_leaf=1, min_samples_split=2,
          min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
          oob_score=False, random_state=None, verbose=0, warm_start=False)

In [48]:
# Scoring
et.score(X_test, y_test)



0.8776927572555683

In [49]:
#try with bootstrapping:
eb = ExtraTreesRegressor(n_estimators=100, bootstrap=True)
eb.fit(X_train, y_train)


ExtraTreesRegressor(bootstrap=True, criterion='mse', max_depth=None,
          max_features='auto', max_leaf_nodes=None,
          min_impurity_decrease=0.0, min_impurity_split=None,
          min_samples_leaf=1, min_samples_split=2,
          min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
          oob_score=False, random_state=None, verbose=0, warm_start=False)

In [50]:
eb.score(X_test, y_test)

0.874684713160444

## Gridsearching

In [39]:
# Let's set a parameter grid of options for our gridsearch to
# optimize over.
param_grid = {
    'n_estimators': [100, 500, 2000],
    'min_samples_split': [2, 4, 8],
    'min_samples_leaf': [2, 4, 8]
}



In [51]:
# Instantiating the gridsearch object on our model and the parameter grid:
gs = GridSearchCV(et, param_grid = param_grid, cv = 5)



In [52]:
# Fitting it to our training data:
gs.fit(X_train, y_train)



GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=ExtraTreesRegressor(bootstrap=True, criterion='mse', max_depth=None,
          max_features='auto', max_leaf_nodes=None,
          min_impurity_decrease=0.0, min_impurity_split=None,
          min_samples_leaf=1, min_samples_split=2,
          min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
          oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': [100, 500, 2000], 'min_samples_split': [2, 4, 8], 'min_samples_leaf': [2, 4, 8]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [54]:
# And scoring it on our test set:
gs.score(X_test, y_test)



0.8834756786149613

In [55]:
# Let's find out the best parameters!

gs.best_params_


{'min_samples_leaf': 2, 'min_samples_split': 4, 'n_estimators': 100}

In [61]:
#look at a prediction
cars.columns

Index(['mpg', ' cylinders', ' cubicinches', ' hp', ' weightlbs', ' time-to-60',
       ' year', ' brand'],
      dtype='object')

In [62]:
gs.predict([[7, 81, 2000, 20, 1, 2100]])

array([26.62811865])

In [63]:
gs.predict([[6, 40, 200, 2000, 3, 1987]])

array([29.14074484])

## Bonus: let's try it with our bootstrapped model!

In [56]:
gb = GridSearchCV(eb, param_grid = param_grid, cv = 5)

In [57]:
gb.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=ExtraTreesRegressor(bootstrap=True, criterion='mse', max_depth=None,
          max_features='auto', max_leaf_nodes=None,
          min_impurity_decrease=0.0, min_impurity_split=None,
          min_samples_leaf=1, min_samples_split=2,
          min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
          oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': [100, 500, 2000], 'min_samples_split': [2, 4, 8], 'min_samples_leaf': [2, 4, 8]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [58]:
gb.score(X_test, y_test)

0.8839648648869973

In [59]:
gb.best_params_

{'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 500}

That didn't matter because bootstrapping is already set to True