In [None]:
## Code from Brian Bargh
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from scipy.misc import comb

## Ensembles: Intuition

Suppose we have 5 *independent* hard binary classifiers (they only output a probability of 0 or 1). If each of them is 70% accurate, what is the accuracy of an ensemble of them?

In [None]:
def find_ensemble_accuracy(n, p):
    '''Return the accuracy of an ensemble of n independent classifiers.

    Each classifier is taken to be correct with probability p, independently
    of the others. The result is the binomial probability that at least
    (n + 1) // 2 of the n classifiers are correct:

        sum_{k = (n + 1) // 2}^{n} C(n, k) * p**k * (1 - p)**(n - k)

    Parameters
    ----------
    n : int
        Number of classifiers. For even n the sum starts at n / 2, so an
        exact tie is counted as a correct ensemble prediction.
    p : float
        Accuracy of each individual classifier.

    Returns
    -------
    float
        Probability that the ensemble is correct.
    '''
    # scipy.misc.comb no longer exists in current SciPy releases;
    # scipy.special.comb is the supported equivalent. Imported locally so
    # the function does not depend on the notebook's import cell.
    from scipy.special import comb

    # Floor division keeps the lower bound an integer on Python 3, where
    # (n + 1) / 2 is a float and range() would raise TypeError.
    min_correct = (n + 1) // 2
    return sum(comb(n, k) * p**k * (1 - p)**(n - k)
               for k in range(min_correct, n + 1))

In [None]:
# Ensemble accuracy for 5 independent classifiers, each 70% accurate
find_ensemble_accuracy(5, 0.7)

In [None]:
# Odd ensemble sizes only (1, 3, ..., 53), so a vote can never tie
ns = np.arange(1, 55, 2)

# Vectorize over n; p is excluded so it is passed through unchanged
vfea = np.vectorize(find_ensemble_accuracy, excluded=['p'])
ensemble_accuracies = vfea(ns, p=0.7)

# One point per ensemble size, markers only (no connecting line)
fig, ax = plt.subplots()
ax.plot(ns, ensemble_accuracies, marker='.', linestyle='None')
ax.set_xlabel("Number of independent 0.7-accuracy classifiers")
ax.set_ylabel("Ensemble accuracy")
ax.set_title("Accuracy of an Ensemble of Independent Classifiers")

## Random Forest 

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split

# Load Boston data
# NOTE(review): load_boston was removed in scikit-learn 1.2, so this cell
# needs an older scikit-learn (or a replacement dataset) to run.
data = load_boston()

# Split into test/train (33% held out, fixed seed so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(data.data,
                                                    data.target,
                                                    test_size=.33,
                                                    random_state=0)

# Train and fit model
# max_features=1.0 considers every feature at each split, which is what the
# deprecated 'auto' setting meant for RandomForestRegressor; 'auto' is
# rejected by recent scikit-learn releases.
rf = RandomForestRegressor(n_estimators=1000,
                           max_features=1.0,
                           oob_score=True,
                           random_state=0)
rf.fit(X_train, y_train)

# Test Prediction: mean squared error on the held-out set
pred = rf.predict(X_test)
mse = np.mean((y_test - pred)**2)
print('MSE Score: ' + str(mse))

# R2 score using built in scoring method
r2 = rf.score(X_test, y_test)
print('R2 Score: ' + str(r2))

### Random Forest Interpretation

#### Mean Decrease Impurity

In [None]:
# Rank the fitted forest's impurity-based importances (smallest first)
# and show them as a horizontal bar chart, one bar per feature.
feat_scores = pd.DataFrame(
    rf.feature_importances_,
    index=data.feature_names,
    columns=['Mean Decrease Impurity'],
).sort_values(by='Mean Decrease Impurity')
feat_scores.plot.barh()

#### Mean Decrease Accuracy

In [None]:
from collections import defaultdict

# sklearn.cross_validation no longer exists; ShuffleSplit lives in
# sklearn.model_selection (the module this notebook already uses for
# train_test_split).
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import r2_score


boston = load_boston()
names = boston.feature_names
X = boston["data"]
Y = boston["target"]

# Fixed seeds so the importances are reproducible on Restart & Run All
rf = RandomForestRegressor(random_state=0)
rng = np.random.RandomState(0)
scores = defaultdict(list)

# crossvalidate the scores on a number of
# different random splits of the data
# (100 splits, 30% of the rows held out in each)
splitter = ShuffleSplit(n_splits=100, test_size=.3, random_state=0)
for train_idx, test_idx in splitter.split(X):
    X_train, X_test = X[train_idx], X[test_idx]
    Y_train, Y_test = Y[train_idx], Y[test_idx]
    rf.fit(X_train, Y_train)
    # Baseline R2 on the untouched held-out rows
    acc = r2_score(Y_test, rf.predict(X_test))
    for i in range(X.shape[1]):
        # Shuffle one feature column in a copy of the test set and record
        # the relative drop in R2 caused by scrambling that feature.
        X_t = X_test.copy()
        rng.shuffle(X_t[:, i])
        shuff_acc = r2_score(Y_test, rf.predict(X_t))
        scores[names[i]].append((acc-shuff_acc)/acc)

# Average the relative drop over all splits, one value per feature
score_series = pd.DataFrame(scores).mean()
scores = pd.DataFrame({'Mean Decrease Accuracy' : score_series})
scores.sort_values(by='Mean Decrease Accuracy').plot(kind='barh')