# Ensembles: Bagging

In [1]:
import numpy as np
import pandas as pd
import xlrd
import os
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV,\
cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier,\
ExtraTreesClassifier, VotingClassifier, StackingRegressor
from sklearn.metrics import r2_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Objectives

- Use `sklearn` to build voting models
- Describe the algorithm of bagging
- Describe the differences among simple bagging, random forest, and extra trees algorithms
- Implement bagging models in `sklearn`

# Ensemble Methods

Because many heads are better than one!

<img width=50% src='images/captain_planet.jpg'/>

> "With our powers combined..."

These models tend to perform very well and generalize well!

## Advantages &  Disadvantages

- Decreases variance → Less overfitting!
- More complexity (you have to train each model or part of model)
- Tends to take up more space (have to keep each model)

## Bagging 

![](images/bag_of_marbles.jpg)

- Many models naturally overfit
- Randomization → New models
- New models overfit in different ways
- Aggregation → Smooth over different ways of overfitting to reduce variance

> Low variance since it averages out quirks individual trees might've learned

#### Aggregation

- **B**ootstrap **AGG**regating
- Algorithm to repeat many times:
    + Create a sample from your data
    + Train a model (e.g. a decision tree) on that sample
- Final model comes by averaging over those many models

#### Three Varieties, Three Levels of Randomization

1. **Simple Bag**: Train each model on random sample
2. **Random Forest**: Choose a random set of features at each decision point
3. **Extra Trees**: Choose a path at random!

## Data Preparation for Examples

> Let's prepare some data to do some examples

In [2]:
# NOTE: every column header except `mpg` carries a leading space in this CSV
# (e.g. ' brand', ' cubicinches'), so later cells index with that space.
df = pd.read_csv('data/cars.csv')
df.head()

Unnamed: 0,mpg,cylinders,cubicinches,hp,weightlbs,time-to-60,year,brand
0,14.0,8,350,165,4209,12,1972,US.
1,31.9,4,89,71,1925,14,1980,Europe.
2,17.0,8,302,140,3449,11,1971,US.
3,15.0,8,400,150,3761,10,1971,US.
4,30.5,4,98,63,2051,17,1978,US.


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 261 entries, 0 to 260
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           261 non-null    float64
 1    cylinders    261 non-null    int64  
 2    cubicinches  261 non-null    object 
 3    hp           261 non-null    int64  
 4    weightlbs    261 non-null    object 
 5    time-to-60   261 non-null    int64  
 6    year         261 non-null    int64  
 7    brand        261 non-null    object 
dtypes: float64(1), int64(4), object(3)
memory usage: 16.4+ KB


In [4]:
df.isna().sum().sum()

0

### Defining Our Problem

Let's see if we can predict whether a car is American or not.

In [5]:
df[' brand'].value_counts()

 US.        162
 Japan.      51
 Europe.     48
Name:  brand, dtype: int64

In [6]:
# Binary label: True for American cars, False for Japanese/European ones.
df['target'] = df[' brand'].eq(' US.')

In [7]:
df.head()

Unnamed: 0,mpg,cylinders,cubicinches,hp,weightlbs,time-to-60,year,brand,target
0,14.0,8,350,165,4209,12,1972,US.,True
1,31.9,4,89,71,1925,14,1980,Europe.,False
2,17.0,8,302,140,3449,11,1971,US.,True
3,15.0,8,400,150,3761,10,1971,US.,True
4,30.5,4,98,63,2051,17,1978,US.,True


### Fix Columns with Missing Values

In [8]:
df[' cubicinches'].value_counts(dropna=False).sort_index()

        2
101     1
105     3
107     3
108     4
       ..
90      6
91      6
96      1
97     16
98     13
Name:  cubicinches, Length: 75, dtype: int64

In [9]:
# Show the rows whose cubicinches entry is a blank string (' ');
# these are the values that need to become NaN before the column is numeric.
df[df[' cubicinches'] == ' ']

Unnamed: 0,mpg,cylinders,cubicinches,hp,weightlbs,time-to-60,year,brand,target
40,16.0,6,,105,3897,19,1976,US.,True
180,19.8,6,,85,2990,18,1980,US.,True


In [10]:
# Blank strings (' ') mark missing values: turn them into NaN, then parse the
# rest as numbers (anything else unparseable still raises).
# Unlike mapping int() over the column, this cell is safe to re-run after the
# column is already numeric.
df[' cubicinches'] = pd.to_numeric(
    df[' cubicinches'].mask(df[' cubicinches'] == ' '))

In [11]:
df[' weightlbs'].value_counts(dropna=False).sort_index()

        3
1613    1
1649    1
1755    1
1760    1
       ..
4735    1
4906    1
4951    1
4952    1
4997    1
Name:  weightlbs, Length: 240, dtype: int64

In [12]:
df[df[' weightlbs'] == ' ']

Unnamed: 0,mpg,cylinders,cubicinches,hp,weightlbs,time-to-60,year,brand,target
14,19.1,6,225.0,90,,19,1981,US.,True
33,21.0,6,199.0,90,,15,1971,US.,True
172,29.0,4,68.0,49,,20,1974,Europe.,False


In [13]:
# Blank strings (' ') mark missing values: turn them into NaN, then parse the
# rest as numbers (anything else unparseable still raises).
# Unlike mapping int() over the column, this cell is safe to re-run after the
# column is already numeric.
df[' weightlbs'] = pd.to_numeric(
    df[' weightlbs'].mask(df[' weightlbs'] == ' '))

In [14]:
# Features are everything except the label and ' brand' (the column the label
# was derived from, which would leak the answer). No test_size is given, so
# train_test_split uses its default hold-out fraction; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(['target', ' brand'], axis=1), df['target'], random_state=42)

In [15]:
# Learn the fill values from the training split only, then apply the same
# fill values to the test split so no test information leaks into training.
si = SimpleImputer()
X_tr_im = si.fit_transform(X_train)
X_te_im = si.transform(X_test)

## Averaging

> Each model uses the same data to train and then we "vote" to make a prediction

### Model 1 - Logistic Regression

In [16]:
lr = LogisticRegression(max_iter=1000, random_state=42)

lr.fit(X_tr_im, y_train)

LogisticRegression(max_iter=1000, random_state=42)

In [17]:
scores = cross_val_score(estimator=lr, X=X_tr_im,
                        y=y_train, cv=5)
scores

array([0.84615385, 0.92307692, 0.76923077, 0.94871795, 0.79487179])

In [18]:
np.median(scores)

0.8461538461538461

In [19]:
lr.score(X_te_im, y_test)

0.8939393939393939

### Model 2 - KNN

In [20]:
knn = KNeighborsClassifier(3)

knn.fit(X_tr_im, y_train)

KNeighborsClassifier(n_neighbors=3)

In [21]:
scores = cross_val_score(estimator=knn, X=X_tr_im,
                y=y_train, cv=5)
np.median(scores)

0.7692307692307693

In [22]:
knn.score(X_te_im, y_test)

0.7878787878787878

### Model 3 - Decision Tree

In [23]:
ct = DecisionTreeClassifier(random_state=42)

ct.fit(X_tr_im, y_train)

DecisionTreeClassifier(random_state=42)

In [24]:
scores = cross_val_score(estimator=ct, X=X_tr_im,
               y=y_train, cv=5)
scores

array([0.8974359 , 0.87179487, 0.82051282, 0.8974359 , 0.79487179])

In [25]:
np.median(scores)

0.8717948717948718

In [26]:
ct.score(X_te_im, y_test)

0.7878787878787878

### Averaging the Models

#### Building a `VotingClassifier`

> Of course there's a Scikit-Learn class for that!

In [31]:
#Instantiate and fit our Voting Classifier
avg = VotingClassifier(estimators=[('lr', lr), ('knn', knn), ('ct', ct)])

avg.fit(X_tr_im, y_train)

VotingClassifier(estimators=[('lr',
                              LogisticRegression(max_iter=1000,
                                                 random_state=42)),
                             ('knn', KNeighborsClassifier(n_neighbors=3)),
                             ('ct', DecisionTreeClassifier(random_state=42))])

In [32]:
#Get the cross validation scores
scores = cross_val_score(estimator=avg, X=X_tr_im, y=y_train, cv=5)
scores

array([0.87179487, 0.8974359 , 0.82051282, 0.8974359 , 0.79487179])

In [33]:
np.median(scores)

0.8717948717948718

In [34]:
#Get the scores on our test set
avg.score(X_te_im, y_test)

0.8636363636363636

#### Weighted Averaging with the `VotingClassifier`

> Even if the vote is 50-50, you'd probably side with the "smart" ones more

This meta-estimator is not as good as one of our base estimators, so in this case the averaging did not work very well. Realizing that the logistic regression is performing better than the decision tree and the k-nearest-neighbors model, however, we might decide to build a meta-estimator by calculating a **weighted average** of the base estimators' predictions. And we can weight, or bias, this estimator in favor of the best-performing base estimator. Suppose we weight the logistic regression 50%, the knn model 25%, and the decision tree 25%:

In [35]:
# `weights` pairs with `estimators` by position: the logistic regression gets
# 0.5, while the KNN model and the decision tree get 0.25 each.
# No `voting` argument is passed, so the classifier's default voting scheme is used.
w_avg = VotingClassifier(estimators=[
    ('lr', lr),
    ('knn', knn),
    ('ct', ct)],
    weights=[0.5,0.25,0.25])
w_avg.fit(X_tr_im, y_train)

VotingClassifier(estimators=[('lr',
                              LogisticRegression(max_iter=1000,
                                                 random_state=42)),
                             ('knn', KNeighborsClassifier(n_neighbors=3)),
                             ('ct', DecisionTreeClassifier(random_state=42))],
                 weights=[0.5, 0.25, 0.25])

In [36]:
scores = cross_val_score(estimator=w_avg, X=X_tr_im,
                        y=y_train, cv=5)
scores

array([0.87179487, 0.92307692, 0.79487179, 0.92307692, 0.79487179])

In [37]:
np.median(scores)

0.8717948717948718

In [38]:
w_avg.score(X_te_im, y_test)

0.9242424242424242

## Bagging

A single decision tree will often overfit your training data. Let's see if we have evidence of that in the current case:

In [39]:
ct.score(X_tr_im, y_train)

1.0

<details>
    <summary><b>🧠 Knowledge Check</b>: What is this score? And why is it equal to 1?</summary>
    <br/>
    <blockquote>
    This perfect score on the training data is already evidence of model overfitting. There are steps one can take to help with this, like limiting the "depth" of the nodes. And of course we can use cross-validation to get a more honest estimate of model quality.
    </blockquote>
</details>




In [40]:
scores = cross_val_score(estimator=ct, X=X_tr_im,
                y=y_train, cv=5)
scores

array([0.8974359 , 0.87179487, 0.82051282, 0.8974359 , 0.79487179])

In [41]:
np.median(scores)

0.8717948717948718

In [42]:
ct.score(X_te_im, y_test)

0.7878787878787878

But it's often better to do something else: Plant another tree!

Of course, if a second tree is going to be of any value, it has to be *different from* the first. Here's a good algorithm for achieving that:

### Bagging Algorithm

- Take a sample of your X_train and fit a decision tree to it.
- Replace the first batch of data and repeat.
- When you've got as many trees as you like, make use of all your individual trees' predictions to come up with some holistic prediction. 
    - (Most obviously, we could take the average of our predictions, but there are other methods we might try.)

* Because we're resampling our data with replacement, we're *bootstrapping*.
* Because we're making use of our many samples' predictions, we're *aggregating*.
* Because we're bootstrapping and aggregating all in the same algorithm, we're *bagging*.

### Bagging by Hand

In [58]:
def simple_bagger(X_train, y_train, X_test=None, n_trees=10, random_state=None):
    """
    Bag `n_trees` decision trees by hand and return their averaged prediction.

    Each tree (built with random_state=42) is fit on a bootstrap sample, drawn
    with replacement, whose size is 10% of the training rows (at least one
    row). The trees' .predict_proba() outputs are averaged and the most
    probable class is chosen for every row of X_test.

    Parameters
    ----------
    X_train : array-like of shape (n_samples, n_features)
    y_train : array-like of shape (n_samples,)
        Class labels; a pandas Series or a NumPy array both work.
    X_test : array-like, optional
        Rows to predict on. If not specified, predictions are made on X_train.
    n_trees : int
        Number of trees to fit.
    random_state : int, optional
        Seed for the bootstrap draws. If None, NumPy's global random stream
        is used, so results vary from call to call.

    Returns
    -------
    numpy.ndarray of shape (n_test_rows,)
        For each row, the index of the winning class within the sorted unique
        labels of y_train (0/1 for a boolean target).
    """
    import numpy as np
    X_train = np.asarray(X_train)
    y_train = np.asarray(y_train).ravel()
    X_test = X_train if X_test is None else np.asarray(X_test)

    # np.random exposes the same .choice() API as a RandomState instance
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    classes = np.unique(y_train)
    num_recs = y_train.shape[0]
    # Train on 10% of the training data, but never on an empty sample
    sample_size = max(1, num_recs // 10)

    preds = []
    for _ in range(n_trees):
        # Note that there is still randomness here, even though every tree
        # shares random_state=42: each one sees a different bootstrap sample.
        subset = rng.choice(num_recs, size=sample_size)
        ct = DecisionTreeClassifier(random_state=42)
        ct.fit(X_train[subset], y_train[subset])

        # A small sample can miss a class entirely, in which case the tree
        # reports fewer probability columns. Place its columns into the full
        # class layout so every tree contributes an array of the same shape.
        proba = np.zeros((X_test.shape[0], classes.size))
        proba[:, np.searchsorted(classes, ct.classes_)] = ct.predict_proba(X_test)
        preds.append(proba)

    # Average the trees' predicted probabilities,
    # then pick the class with the highest average probability
    return np.mean(preds, axis=0).argmax(axis=1)

In [59]:
simple_bagger(X_tr_im, y_train)

array([1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1,
       0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
       1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1,
       1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1,
       0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1,
       1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0])

In [60]:
ct = DecisionTreeClassifier(random_state=42)

ct.fit(X_tr_im, y_train)
ct.score(X_te_im, y_test)

0.7878787878787878

In [61]:
accuracy_score(y_test, simple_bagger(X_tr_im, y_train, X_te_im, n_trees=10))

0.8636363636363636

### Bagging with `sklearn`

In [62]:
# Instantiate a BaggingClassifier
# Note the base estimator is by default a decision tree
bag = BaggingClassifier(n_estimators=100, random_state=42)

In [63]:
# Fit it

bag.fit(X_tr_im, y_train)

BaggingClassifier(n_estimators=100, random_state=42)

In [64]:
# Cross-validation

scores = cross_val_score(estimator=bag, X=X_tr_im,
               y=y_train, cv=5)
scores

array([0.8974359 , 0.92307692, 0.84615385, 0.92307692, 0.82051282])

In [65]:
np.median(scores)

0.8974358974358975

In [66]:
# Score on test

bag.score(X_te_im, y_test)

0.8484848484848485

### Fitting a Random Forest

### An Aside Story - Bananas 🍌

Banana trees can be susceptible to [Panama disease](https://en.wikipedia.org/wiki/Panama_disease)

![Many individual yellow bananas](images/bananas.jpg)

They're all clones!

Similarly, all the Decision Trees will be the same if given the same data! (A clone!!!)

### The Goods & The Bads

**The Goods**

- Super friend! 
- High performance 
    + low variance
- Transparent
    + inherited from Decision Trees
    

**The Bads**

- We got so many trees to plant...
- Computationally expensive
- Memory
    + all trees stored in memory
    + think back to k-Nearest Neighbors

### Breed a Variety of Trees

Let's add an extra layer of randomization: Instead of using *all* the features of my model to optimize a branch at each node, I'll just choose a subset of my features.

That's the essence of a random forest model. Note that there are now **two** levels of random sampling happening: To build a new tree, I'll be taking only some of my data points; and at any branching point in a tree, I'll be using only some of my features to determine the split.

#### Steps:

1. Save a portion of data for validation (**out-of-bag**), the rest for training (**bag**)
2. The data for training (**bag**) is then split up by randomly selecting predictors
3. Grow/train your tree with the training data using just those features
4. Take our validation set (**out-of-bag**), keep only the columns used by our tree from the previous step, and predict on this *out-of-bag* data with the tree
5. Measure how well the tree did on that data (the *out-of-bag error*)
6. Repeat to make new trees and use the result to "vote" for the final decision

### Random Forest by Hand

In [67]:
def bagger(X_train, y_train, X_test=None, n_trees=10, random_forest=False,
           random_state=None):
    """
    Bag `n_trees` decision trees by hand, optionally on random feature subsets.

    Each tree (built with random_state=42) is fit on a bootstrap sample, drawn
    with replacement, whose size is 10% of the training rows (at least one
    row). The trees' .predict_proba() outputs are averaged and the most
    probable class is chosen for every row of X_test.

    If `random_forest` is True, each tree only sees int(sqrt(n_features))
    distinct features. Note that the features are drawn once per tree here,
    not once per split.

    Parameters
    ----------
    X_train : array-like of shape (n_samples, n_features)
    y_train : array-like of shape (n_samples,)
        Class labels; a pandas Series or a NumPy array both work.
    X_test : array-like, optional
        Rows to predict on. If not specified, predictions are made on X_train.
    n_trees : int
        Number of trees to fit.
    random_forest : bool
        Whether to restrict every tree to a random subset of the features.
    random_state : int, optional
        Seed for the row and feature draws. If None, NumPy's global random
        stream is used, so results vary from call to call.

    Returns
    -------
    numpy.ndarray of shape (n_test_rows,)
        For each row, the index of the winning class within the sorted unique
        labels of y_train (0/1 for a boolean target).
    """
    import numpy as np
    X_train = np.asarray(X_train)
    y_train = np.asarray(y_train).ravel()
    X_test = X_train if X_test is None else np.asarray(X_test)

    # np.random exposes the same .choice() API as a RandomState instance
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    classes = np.unique(y_train)
    num_recs, num_feats = X_train.shape
    # Train on 10% of the training data, but never on an empty sample
    sample_size = max(1, num_recs // 10)
    feats_per_tree = int(np.sqrt(num_feats))

    preds = []
    for _ in range(n_trees):
        subset = rng.choice(num_recs, size=sample_size)

        # For random forest, choose only some features. replace=False keeps
        # them distinct; drawing with replacement could hand a tree the same
        # column twice and leave it with fewer real features than intended.
        if random_forest:
            subfeatures = rng.choice(num_feats, size=feats_per_tree, replace=False)
        else:
            subfeatures = np.arange(num_feats)

        ct = DecisionTreeClassifier(random_state=42)
        ct.fit(X_train[np.ix_(subset, subfeatures)], y_train[subset])

        # A small sample can miss a class entirely, in which case the tree
        # reports fewer probability columns. Place its columns into the full
        # class layout so every tree contributes an array of the same shape.
        proba = np.zeros((X_test.shape[0], classes.size))
        proba[:, np.searchsorted(classes, ct.classes_)] = ct.predict_proba(
            X_test[:, subfeatures])
        preds.append(proba)

    # Average the probabilities over all trees and pick the likeliest class
    return np.mean(preds, axis=0).argmax(axis=1)

In [68]:
accuracy_score(y_test, bagger(X_tr_im, y_train, X_te_im, random_forest=True))

0.8181818181818182

### Random Forest with `sklearn`

> Here's the [documentation](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html#sklearn.ensemble.RandomForestClassifier) on `RandomForestClassifier`

In [69]:
# Instantiate a RandomForestClassifier

rfc = RandomForestClassifier(max_features='sqrt', max_samples=0.3, random_state=42)

In [70]:
# Fit it

rfc.fit(X_tr_im, y_train)

RandomForestClassifier(max_features='sqrt', max_samples=0.3, random_state=42)

In [71]:
# Cross-validation

scores = cross_val_score(estimator=rfc, X=X_tr_im,
               y=y_train, cv=5)
scores

array([0.87179487, 0.94871795, 0.84615385, 0.87179487, 0.79487179])

In [72]:
np.median(scores)

0.8717948717948718

In [73]:
# Score on test

score = rfc.score(X_te_im, y_test)
score

0.8636363636363636

### Cool Features of Random Forests

There are some extra investigations we can do with random forests since they're built of decision trees.

> **NOTE**
>
> Not all of these are _specific_ to random forests and can be applied to other (ensemble) models

#### Investigate Your Forest 🌲🌲👀🌲🌲

We can check out our trained estimators after training the ensemble. This isn't necessarily unique to random forests, but since the base model is always a decision tree we can really investigate how the model is working!

In [74]:
#Get our trained estimators
model_estimators = rfc.estimators_

print(len(model_estimators))
model_estimators

100


[DecisionTreeClassifier(max_features='sqrt', random_state=1608637542),
 DecisionTreeClassifier(max_features='sqrt', random_state=1273642419),
 DecisionTreeClassifier(max_features='sqrt', random_state=1935803228),
 DecisionTreeClassifier(max_features='sqrt', random_state=787846414),
 DecisionTreeClassifier(max_features='sqrt', random_state=996406378),
 DecisionTreeClassifier(max_features='sqrt', random_state=1201263687),
 DecisionTreeClassifier(max_features='sqrt', random_state=423734972),
 DecisionTreeClassifier(max_features='sqrt', random_state=415968276),
 DecisionTreeClassifier(max_features='sqrt', random_state=670094950),
 DecisionTreeClassifier(max_features='sqrt', random_state=1914837113),
 DecisionTreeClassifier(max_features='sqrt', random_state=669991378),
 DecisionTreeClassifier(max_features='sqrt', random_state=429389014),
 DecisionTreeClassifier(max_features='sqrt', random_state=249467210),
 DecisionTreeClassifier(max_features='sqrt', random_state=1972458954),
 DecisionTreeC

In [76]:
# Compare the forest's test accuracy with that of a few of its individual trees.
# NOTE(review): the saved output lists every tree in the forest, so it predates
# the [-5:] slice below -- re-run this cell to refresh it.
print(f"Overall model's score was {score:.3f}")
print(70 * '=')
for model in model_estimators[-5:]:
    display(model)
    model_score = model.score(X_te_im, y_test)
    print(f'\tModel gave score of {model_score:.3f}')

Overall model's score was 0.864


DecisionTreeClassifier(max_features='sqrt', random_state=1608637542)

	Model gave score of 0.697


DecisionTreeClassifier(max_features='sqrt', random_state=1273642419)

	Model gave score of 0.848


DecisionTreeClassifier(max_features='sqrt', random_state=1935803228)

	Model gave score of 0.803


DecisionTreeClassifier(max_features='sqrt', random_state=787846414)

	Model gave score of 0.773


DecisionTreeClassifier(max_features='sqrt', random_state=996406378)

	Model gave score of 0.803


DecisionTreeClassifier(max_features='sqrt', random_state=1201263687)

	Model gave score of 0.788


DecisionTreeClassifier(max_features='sqrt', random_state=423734972)

	Model gave score of 0.788


DecisionTreeClassifier(max_features='sqrt', random_state=415968276)

	Model gave score of 0.742


DecisionTreeClassifier(max_features='sqrt', random_state=670094950)

	Model gave score of 0.833


DecisionTreeClassifier(max_features='sqrt', random_state=1914837113)

	Model gave score of 0.606


DecisionTreeClassifier(max_features='sqrt', random_state=669991378)

	Model gave score of 0.712


DecisionTreeClassifier(max_features='sqrt', random_state=429389014)

	Model gave score of 0.727


DecisionTreeClassifier(max_features='sqrt', random_state=249467210)

	Model gave score of 0.818


DecisionTreeClassifier(max_features='sqrt', random_state=1972458954)

	Model gave score of 0.833


DecisionTreeClassifier(max_features='sqrt', random_state=1572714583)

	Model gave score of 0.864


DecisionTreeClassifier(max_features='sqrt', random_state=1433267572)

	Model gave score of 0.697


DecisionTreeClassifier(max_features='sqrt', random_state=434285667)

	Model gave score of 0.864


DecisionTreeClassifier(max_features='sqrt', random_state=613608295)

	Model gave score of 0.803


DecisionTreeClassifier(max_features='sqrt', random_state=893664919)

	Model gave score of 0.742


DecisionTreeClassifier(max_features='sqrt', random_state=648061058)

	Model gave score of 0.682


DecisionTreeClassifier(max_features='sqrt', random_state=88409749)

	Model gave score of 0.682


DecisionTreeClassifier(max_features='sqrt', random_state=242285876)

	Model gave score of 0.727


DecisionTreeClassifier(max_features='sqrt', random_state=2018247425)

	Model gave score of 0.652


DecisionTreeClassifier(max_features='sqrt', random_state=953477463)

	Model gave score of 0.682


DecisionTreeClassifier(max_features='sqrt', random_state=1427830251)

	Model gave score of 0.712


DecisionTreeClassifier(max_features='sqrt', random_state=1883569565)

	Model gave score of 0.803


DecisionTreeClassifier(max_features='sqrt', random_state=911989541)

	Model gave score of 0.803


DecisionTreeClassifier(max_features='sqrt', random_state=3344769)

	Model gave score of 0.652


DecisionTreeClassifier(max_features='sqrt', random_state=780932287)

	Model gave score of 0.879


DecisionTreeClassifier(max_features='sqrt', random_state=2114032571)

	Model gave score of 0.742


DecisionTreeClassifier(max_features='sqrt', random_state=787716372)

	Model gave score of 0.727


DecisionTreeClassifier(max_features='sqrt', random_state=504579232)

	Model gave score of 0.788


DecisionTreeClassifier(max_features='sqrt', random_state=1306710475)

	Model gave score of 0.848


DecisionTreeClassifier(max_features='sqrt', random_state=479546681)

	Model gave score of 0.697


DecisionTreeClassifier(max_features='sqrt', random_state=106328085)

	Model gave score of 0.742


DecisionTreeClassifier(max_features='sqrt', random_state=30349564)

	Model gave score of 0.909


DecisionTreeClassifier(max_features='sqrt', random_state=1855189739)

	Model gave score of 0.773


DecisionTreeClassifier(max_features='sqrt', random_state=99052376)

	Model gave score of 0.742


DecisionTreeClassifier(max_features='sqrt', random_state=1250819632)

	Model gave score of 0.803


DecisionTreeClassifier(max_features='sqrt', random_state=106406362)

	Model gave score of 0.803


DecisionTreeClassifier(max_features='sqrt', random_state=480404538)

	Model gave score of 0.788


DecisionTreeClassifier(max_features='sqrt', random_state=1717389822)

	Model gave score of 0.591


DecisionTreeClassifier(max_features='sqrt', random_state=599121577)

	Model gave score of 0.742


DecisionTreeClassifier(max_features='sqrt', random_state=200427519)

	Model gave score of 0.864


DecisionTreeClassifier(max_features='sqrt', random_state=1254751707)

	Model gave score of 0.909


DecisionTreeClassifier(max_features='sqrt', random_state=2034764475)

	Model gave score of 0.652


DecisionTreeClassifier(max_features='sqrt', random_state=1573512143)

	Model gave score of 0.803


DecisionTreeClassifier(max_features='sqrt', random_state=999745294)

	Model gave score of 0.682


DecisionTreeClassifier(max_features='sqrt', random_state=1958805693)

	Model gave score of 0.712


DecisionTreeClassifier(max_features='sqrt', random_state=389151677)

	Model gave score of 0.788


DecisionTreeClassifier(max_features='sqrt', random_state=1224821422)

	Model gave score of 0.712


DecisionTreeClassifier(max_features='sqrt', random_state=508464061)

	Model gave score of 0.833


DecisionTreeClassifier(max_features='sqrt', random_state=857592370)

	Model gave score of 0.818


DecisionTreeClassifier(max_features='sqrt', random_state=1642661739)

	Model gave score of 0.727


DecisionTreeClassifier(max_features='sqrt', random_state=61136438)

	Model gave score of 0.727


DecisionTreeClassifier(max_features='sqrt', random_state=2075460851)

	Model gave score of 0.712


DecisionTreeClassifier(max_features='sqrt', random_state=396917567)

	Model gave score of 0.818


DecisionTreeClassifier(max_features='sqrt', random_state=2004731384)

	Model gave score of 0.848


DecisionTreeClassifier(max_features='sqrt', random_state=199502978)

	Model gave score of 0.879


DecisionTreeClassifier(max_features='sqrt', random_state=1545932260)

	Model gave score of 0.833


DecisionTreeClassifier(max_features='sqrt', random_state=461901618)

	Model gave score of 0.848


DecisionTreeClassifier(max_features='sqrt', random_state=774414982)

	Model gave score of 0.636


DecisionTreeClassifier(max_features='sqrt', random_state=732395540)

	Model gave score of 0.758


DecisionTreeClassifier(max_features='sqrt', random_state=1934879560)

	Model gave score of 0.712


DecisionTreeClassifier(max_features='sqrt', random_state=279394470)

	Model gave score of 0.591


DecisionTreeClassifier(max_features='sqrt', random_state=56972561)

	Model gave score of 0.788


DecisionTreeClassifier(max_features='sqrt', random_state=1927948675)

	Model gave score of 0.833


DecisionTreeClassifier(max_features='sqrt', random_state=1899242072)

	Model gave score of 0.576


DecisionTreeClassifier(max_features='sqrt', random_state=1999874363)

	Model gave score of 0.758


DecisionTreeClassifier(max_features='sqrt', random_state=271820813)

	Model gave score of 0.576


DecisionTreeClassifier(max_features='sqrt', random_state=1324556529)

	Model gave score of 0.818


DecisionTreeClassifier(max_features='sqrt', random_state=1655351289)

	Model gave score of 0.727


DecisionTreeClassifier(max_features='sqrt', random_state=1308306184)

	Model gave score of 0.742


DecisionTreeClassifier(max_features='sqrt', random_state=68574553)

	Model gave score of 0.727


DecisionTreeClassifier(max_features='sqrt', random_state=419498548)

	Model gave score of 0.697


DecisionTreeClassifier(max_features='sqrt', random_state=991681409)

	Model gave score of 0.773


DecisionTreeClassifier(max_features='sqrt', random_state=791274835)

	Model gave score of 0.803


DecisionTreeClassifier(max_features='sqrt', random_state=1035196507)

	Model gave score of 0.652


DecisionTreeClassifier(max_features='sqrt', random_state=1890440558)

	Model gave score of 0.894


DecisionTreeClassifier(max_features='sqrt', random_state=787110843)

	Model gave score of 0.727


DecisionTreeClassifier(max_features='sqrt', random_state=524150214)

	Model gave score of 0.727


DecisionTreeClassifier(max_features='sqrt', random_state=472432043)

	Model gave score of 0.773


DecisionTreeClassifier(max_features='sqrt', random_state=2126768636)

	Model gave score of 0.788


DecisionTreeClassifier(max_features='sqrt', random_state=1431061255)

	Model gave score of 0.758


DecisionTreeClassifier(max_features='sqrt', random_state=147697582)

	Model gave score of 0.758


DecisionTreeClassifier(max_features='sqrt', random_state=744595490)

	Model gave score of 0.818


DecisionTreeClassifier(max_features='sqrt', random_state=1758017741)

	Model gave score of 0.848


DecisionTreeClassifier(max_features='sqrt', random_state=1679592528)

	Model gave score of 0.909


DecisionTreeClassifier(max_features='sqrt', random_state=1111451555)

	Model gave score of 0.758


DecisionTreeClassifier(max_features='sqrt', random_state=782698033)

	Model gave score of 0.803


DecisionTreeClassifier(max_features='sqrt', random_state=698027879)

	Model gave score of 0.909


DecisionTreeClassifier(max_features='sqrt', random_state=1096768899)

	Model gave score of 0.652


DecisionTreeClassifier(max_features='sqrt', random_state=1338788865)

	Model gave score of 0.712


DecisionTreeClassifier(max_features='sqrt', random_state=1826030589)

	Model gave score of 0.636


DecisionTreeClassifier(max_features='sqrt', random_state=86191493)

	Model gave score of 0.742


DecisionTreeClassifier(max_features='sqrt', random_state=893102645)

	Model gave score of 0.742


DecisionTreeClassifier(max_features='sqrt', random_state=200619113)

	Model gave score of 0.712


DecisionTreeClassifier(max_features='sqrt', random_state=290770691)

	Model gave score of 0.864


DecisionTreeClassifier(max_features='sqrt', random_state=793943861)

	Model gave score of 0.818


DecisionTreeClassifier(max_features='sqrt', random_state=134489564)

	Model gave score of 0.773


#### Feature Importance

We can use [`.feature_importances_`](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html#sklearn.ensemble.RandomForestClassifier.feature_importances_) property of the trained model to get an idea of what features mattered the most

In [77]:
rfc.feature_importances_

array([0.14360934, 0.11362139, 0.2917449 , 0.11350194, 0.19769359,
       0.05948543, 0.08034342])

In [78]:
# Pair each training column with its importance score; the importances come
# back in the same order as the columns the forest was fit on.
feat_import = dict(zip(X_train.columns, rfc.feature_importances_))
feat_import

{'mpg': 0.143609339528514,
 ' cylinders': 0.11362138678976928,
 ' cubicinches': 0.29174490390579544,
 ' hp': 0.11350193639792439,
 ' weightlbs': 0.19769358621453248,
 ' time-to-60': 0.059485431111608145,
 ' year': 0.0803434160518563}

### Extremely Randomized Trees (Extra Trees)

Sometimes we might want even one more bit of randomization. Instead of always choosing the *optimal* branching path, we might just choose a branching path at random. If we're doing that, then we've got extremely randomized trees.

There are now **three** levels of randomization: sampling of data, sampling of features, and random selection of branching paths.

Extra trees are typically computationally less expensive than our random forests, because they skip the search for the optimal split.

In [79]:
# Instantiate an ExtraTreesClassifier

etc = ExtraTreesClassifier(max_features='sqrt', max_samples=0.3, bootstrap=True, random_state=1)

In [80]:
# Fit it

etc.fit(X_tr_im, y_train)

ExtraTreesClassifier(bootstrap=True, max_features='sqrt', max_samples=0.3,
                     random_state=1)

In [81]:
# Cross-validation

scores = cross_val_score(estimator=etc, X=X_tr_im,
               y=y_train, cv=5)
scores

array([0.87179487, 0.92307692, 0.87179487, 0.8974359 , 0.82051282])

In [82]:
np.median(scores)

0.8717948717948718

In [83]:
# Score on test

etc.score(X_te_im, y_test)

0.8636363636363636

# Level Up: Stacking

#### Meta-Classifier/Meta-Regressor

- First, we ask several different models to make predictions about the target
- Rather than taking a simple average or vote to determine the outcome, feed these results into a final model that makes the prediction based on the other models’ predictions
- If it seems like we are approaching a neural network...you are correct!

Remember weighted averaging? Stacking is about using DS models to estimate those weights for us. This means we'll have one layer of base estimators and another layer that is "**trained to optimally combine the model predictions to form a new set of predictions**". See [this short blog post](https://blogs.sas.com/content/subconsciousmusings/2017/05/18/stacked-ensemble-models-win-data-science-competitions/) for more.

## Initial Data Prep

In [84]:
# Load the sales workbook. xlrd's parser messages are routed to the null
# device; the `with` block makes sure that file handle is closed again
# instead of leaving an anonymous open() handle dangling.
with open(os.devnull, 'w') as devnull:
    wb = xlrd.open_workbook('data/Sales Report.xls', logfile=devnull)
    # Read while the log sink is still open, in case parsing logs anything.
    sales = pd.read_excel(wb)

# Keep only the rows that have no missing values.
sales = sales.dropna()

In [85]:
# Peek at the first five rows of the cleaned sales table.
sales.head(n=5)

Unnamed: 0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,Country,City,...,Postal Code,Region,Product ID,Category,Sub-Category,Product Name,Sales,Quantity,Discount,Profit
0,1,CA-2017-152156,2017-11-08,2017-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420.0,South,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,261.96,2,0.0,41.9136
1,2,CA-2017-152156,2017-11-08,2017-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420.0,South,FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",731.94,3,0.0,219.582
2,3,CA-2017-138688,2017-06-12,2017-06-16,Second Class,DV-13045,Darrin Van Huff,Corporate,United States,Los Angeles,...,90036.0,West,OFF-LA-10000240,Office Supplies,Labels,Self-Adhesive Address Labels for Typewriters b...,14.62,2,0.0,6.8714
3,4,US-2016-108966,2016-10-11,2016-10-18,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,33311.0,South,FUR-TA-10000577,Furniture,Tables,Bretford CR4500 Series Slim Rectangular Table,957.5775,5,0.45,-383.031
4,5,US-2016-108966,2016-10-11,2016-10-18,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,33311.0,South,OFF-ST-10000760,Office Supplies,Storage,Eldon Fold 'N Roll Cart System,22.368,2,0.2,2.5164


In [86]:
# Inspect the column dtypes to tell numeric features from string/date ones.
sales.dtypes

Row ID                    int64
Order ID                 object
Order Date       datetime64[ns]
Ship Date        datetime64[ns]
Ship Mode                object
Customer ID              object
Customer Name            object
Segment                  object
Country                  object
City                     object
State                    object
Postal Code             float64
Region                   object
Product ID               object
Category                 object
Sub-Category             object
Product Name             object
Sales                   float64
Quantity                  int64
Discount                float64
Profit                  float64
dtype: object

In [87]:
# Row count per top-level product category.
sales.loc[:, 'Category'].value_counts()

Office Supplies    6020
Furniture          2119
Technology         1844
Name: Category, dtype: int64

In [88]:
# Row count per product sub-category.
sales.loc[:, 'Sub-Category'].value_counts()

Binders        1523
Paper          1368
Furnishings     957
Phones          888
Storage         845
Art             795
Accessories     773
Chairs          616
Appliances      465
Labels          364
Tables          319
Envelopes       253
Bookcases       227
Fasteners       217
Supplies        190
Machines        115
Copiers          68
Name: Sub-Category, dtype: int64

In [89]:
# Column *labels* (pandas Index objects, not data) for the numeric and
# categorical feature groups.
X_num = sales.loc[:, ['Discount', 'Profit']].columns
X_cat = sales.loc[:, ['Category', 'Sub-Category']].columns

In [90]:
# Feature matrix: two numeric and two categorical columns.
X = sales.loc[:, ['Discount', 'Profit', 'Category', 'Sub-Category']]
# Regression target.
y = sales.loc[:, 'Sales']

## Splitting

In [91]:
# Seeded train/test split using scikit-learn's default proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

## Setting Up a Pipeline

In [92]:
# Numeric branch: standardise each column.
numTrans = Pipeline(steps=[('scaler', StandardScaler())])

# Categorical branch: dense one-hot encoding that drops the first level
# of every feature.
# NOTE(review): newer scikit-learn releases rename `sparse` to
# `sparse_output` - confirm the installed version before upgrading.
catTrans = Pipeline(
    steps=[('ohe', OneHotEncoder(sparse=False, drop='first'))]
)

In [93]:
# Route each column group through its own sub-pipeline.
num_branch = ('num', numTrans, X_num)
cat_branch = ('cat', catTrans, X_cat)
pp = ColumnTransformer(transformers=[num_branch, cat_branch])

In [94]:
# Learn scaling statistics and category levels from the training split only.
pp.fit(X=X_train)

ColumnTransformer(transformers=[('num',
                                 Pipeline(steps=[('scaler', StandardScaler())]),
                                 Index(['Discount', 'Profit'], dtype='object')),
                                ('cat',
                                 Pipeline(steps=[('ohe',
                                                  OneHotEncoder(drop='first',
                                                                sparse=False))]),
                                 Index(['Category', 'Sub-Category'], dtype='object'))])

In [95]:
# Preprocessed training features.
X_tr_pp = pp.transform(X=X_train)

## Setting Up a Stack

In [96]:
# Base layer of the stack: three different regressors, all with default
# hyper-parameters.
# NOTE(review): the tree is unseeded, so reruns may give slightly
# different scores - confirm whether that matters here.
estimators = [
    ('lr', LinearRegression()), ('knn', KNeighborsRegressor()), ('rt', DecisionTreeRegressor()),
]

# No final estimator is passed, so StackingRegressor uses its default.
sr = StackingRegressor(estimators=estimators)

In [97]:
# Fit the base estimators and the meta-regressor on the training data.
sr.fit(X=X_tr_pp, y=y_train)

StackingRegressor(estimators=[('lr', LinearRegression()),
                              ('knn', KNeighborsRegressor()),
                              ('rt', DecisionTreeRegressor())])

In [98]:
# Apply the already-fitted preprocessing to the test split.
X_test_pp = pp.transform(X=X_test)

In [99]:
# R^2 of the stacked model on the held-out test set.
sr.score(X=X_test_pp, y=y_test)

0.8106553602701593

In [100]:
# Baseline: a lone linear regression on the same preprocessed features,
# scored (R^2) on the test split.
linreg = LinearRegression()
linreg.fit(X_tr_pp, y_train)

linreg.score(X=X_test_pp, y=y_test)

0.429412381411298

## Comparison with Base Estimators

In [101]:
# Base estimator 1 of 3 on its own: linear regression, test-set R^2.
lr = LinearRegression()
lr.fit(X_tr_pp, y_train)
lr.score(X=X_test_pp, y=y_test)

0.429412381411298

In [102]:
# Base estimator 2 of 3 on its own: k-nearest neighbours, test-set R^2.
knn = KNeighborsRegressor()
knn.fit(X_tr_pp, y_train)
knn.score(X=X_test_pp, y=y_test)

0.7963215988357311

In [103]:
# Base estimator 3 of 3 on its own: an unpruned, unseeded regression
# tree, test-set R^2.
rt = DecisionTreeRegressor()
rt.fit(X_tr_pp, y_train)
rt.score(X=X_test_pp, y=y_test)

0.24593850039142062

In [104]:
# Training-set R^2 of the lone tree; the gap to its test score shows
# how strongly it overfits.
rt.score(X=X_tr_pp, y=y_train)

0.9955666197715463