**Important: This notebook will only work with fastai-0.7.x. Do not try to run any fastai-1.x code from this path in the repository because it will load fastai-0.7.x**

# Random Forest Model interpretation

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
%matplotlib inline

from fastai.imports import *
from fastai.structured import *
from pandas_summary import DataFrameSummary
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from IPython.display import display
from sklearn import metrics

In [3]:
def display_all(df):
    """Display `df` with pandas' row and column display limits raised to 1000.

    The limits are set through a context manager, so they only apply
    while this frame is being rendered.
    """
    wide_limits = ("display.max_rows", 1000, "display.max_columns", 1000)
    with pd.option_context(*wide_limits):
        display(df)

In [4]:
set_plot_sizes(12,14,16)

## Load in our data from last lesson

In [5]:
def drop_irrelevant_columns(df):
    """Return `df` without the `saleElapsed` and `kcal_na` columns.

    saleElapsed (epoch time) and kcal_na (Boolean) are both considered
    redundant. A new frame is returned; the frame passed in is not modified.
    """
    redundant_columns = ['saleElapsed', 'kcal_na']
    return df.drop(redundant_columns, axis=1)

In [6]:
# NOTE: PATH is defined but the read below uses its own relative path.
PATH = "./"

# Load the raw frame saved in feather format.
df_raw = pd.read_feather('tmp/lunchbox-raw')

# Remove saleElapsed and kcal_na before any further processing.
df_raw = drop_irrelevant_columns(df_raw)

# Split off the 'responsevalue' target from the feature frame.
# NOTE(review): `nas` is presumably the dict of fill values proc_df used for
# missing data — confirm against the fastai 0.7 proc_df documentation.
df_trn, y_trn, nas = proc_df(df_raw, 'responsevalue')

In [7]:
def split_vals(a, n):
    """Cut `a` at position `n` and return the pair (a[:n], a[n:])."""
    head = a[:n]
    tail = a[n:]
    return head, tail
# Hold out the last n_valid rows for validation; everything before them is training data.
n_valid = 40 # same as test set sample number
n_trn = len(df_trn)-n_valid
# Processed features, targets and the raw frame are all cut at the same row index.
X_train, X_valid = split_vals(df_trn, n_trn)
y_train, y_valid = split_vals(y_trn, n_trn)
raw_train, raw_valid = split_vals(df_raw, n_trn)

In [8]:
def rmse(x, y):
    """Root mean squared error between `x` and `y`."""
    squared_error = (x - y) ** 2
    return math.sqrt(squared_error.mean())

def print_score(m):
    """Print a list of fit metrics for the fitted model `m`.

    The list holds, in order: RMSE on the training split, RMSE on the
    validation split, m.score on the training split, m.score on the
    validation split and, when the model exposes `oob_score_`, that
    value as a fifth entry. Reads the notebook-level X_train, y_train,
    X_valid and y_valid.
    """
    train_rmse = rmse(m.predict(X_train), y_train)
    valid_rmse = rmse(m.predict(X_valid), y_valid)
    scores = [train_rmse, valid_rmse,
              m.score(X_train, y_train), m.score(X_valid, y_valid)]
    if hasattr(m, 'oob_score_'):
        scores.append(m.oob_score_)
    print(scores)

# Confidence based on tree variance

For model interpretation, there's no need to use the full dataset on each tree - using a subset will be both faster, and also provide better interpretability (since an overfit model will not provide much variance across trees).

In [9]:
# Baseline: 40 trees fitted on the training split only.
# print_score reports [train RMSE, valid RMSE, train score, valid score].
m = RandomForestRegressor(n_estimators=40, n_jobs=-1)
m.fit(X_train, y_train)
print_score(m)

[6.4106829813542605, 17.5657657106088, 0.9592991021976386, 0.13079721116940746]


In [10]:
m = RandomForestRegressor(n_estimators=40, n_jobs=-1)
#Taking the whole Training set including the Validation set separated earlier
# NOTE: X_valid/y_valid are the tail rows of df_trn/y_trn, so the "validation"
# numbers printed by print_score are measured on rows the model was fitted on
# and are not a held-out estimate.
m.fit(df_trn, y_trn)
print_score(m)

[6.200368204249121, 5.078653241756125, 0.9619258384262565, 0.9273418958414028]


In [11]:
# Bootstrap resampling switched off. The recorded output shows a training RMSE
# of 0.0 together with a negative validation score.
m = RandomForestRegressor(n_estimators=40, n_jobs=-1, bootstrap=False)
m.fit(X_train, y_train)
print_score(m)

[0.0, 21.859490215922236, 1.0, -0.34606799535194877]


**- Overfitting example, using the entire Training set**

In [12]:
# oob_score=True gives the fitted model an oob_score_ attribute, which
# print_score appends as the fifth value of its output.
m = RandomForestRegressor(n_estimators=40, n_jobs=-1, oob_score=True)
m.fit(X_train, y_train)
print_score(m)

[6.3725749761260015, 18.29259071318221, 0.9597815531371131, 0.057378428817916105, 0.7056476016264986]


- With bootstrap=True (the default), roughly one-third of the samples are left out of each tree's bootstrap sample, so oob_score uses those out-of-bag samples as a validation set for that tree.

In [13]:
m = RandomForestRegressor(n_estimators=40, n_jobs=-1, oob_score=True)
#Using the whole Training set with oob_score
# The validation rows are part of df_trn here, so the OOB score (last value
# printed) is the figure to read as a held-out estimate.
m.fit(df_trn, y_trn)
print_score(m)

[6.105885180081208, 5.443239786652799, 0.9630773678141703, 0.9165354854044157, 0.775880772133376]


In [14]:
m = RandomForestRegressor(n_estimators=80, n_jobs=-1, oob_score=True)
#Using the whole Training set with oob_score (80 trees)
# The validation rows are part of df_trn; read the OOB score (last value printed).
m.fit(df_trn, y_trn)
print_score(m)

[5.920713743716203, 5.08381008693283, 0.9652828939611495, 0.9271942674037819, 0.7723377887407956]


In [15]:
m = RandomForestRegressor(n_estimators=120, n_jobs=-1, oob_score=True)
#Using the whole Training set with oob_score (120 trees)
# The validation rows are part of df_trn; read the OOB score (last value printed).
m.fit(df_trn, y_trn)
print_score(m)

[5.930399569609361, 5.893963042573495, 0.9651692120944404, 0.9021407786267904, 0.7699606942219437]


In [16]:
m = RandomForestRegressor(n_estimators=100, n_jobs=-1, oob_score=True)
#Using the whole Training set with oob_score (100 trees)
# The validation rows are part of df_trn; read the OOB score (last value printed).
m.fit(df_trn, y_trn)
print_score(m)

[5.752744775764855, 5.472464709799415, 0.9672247811015486, 0.9156368322828268, 0.7840647951686446]


In [17]:
m = RandomForestRegressor(n_estimators=90, n_jobs=-1, oob_score=True)
#Using the whole Training set with oob_score (90 trees)
# The validation rows are part of df_trn; read the OOB score (last value printed).
m.fit(df_trn, y_trn)
print_score(m)

[5.845486690155448, 5.260203781904319, 0.966159502469809, 0.9220543150754308, 0.7780874961569884]


In [18]:
m = RandomForestRegressor(n_estimators=110, n_jobs=-1, oob_score=True)
#Using the whole Training set with oob_score (110 trees)
# The validation rows are part of df_trn; read the OOB score (last value printed).
m.fit(df_trn, y_trn)
print_score(m)

[5.6060757943366575, 5.275082741899344, 0.9688747163577269, 0.921612738662955, 0.7825761591524527]


In [19]:
# The original note suggested commenting this call out because the training set
# has so few samples, but the call is active and does run.
# 167 = 207 train samples (November 18, 2013 to September 30, 2014) - 40 test samples
# NOTE(review): in fastai 0.7 this setting presumably stays in effect for every
# forest fitted afterwards until reset_rf_samples() is called — confirm.
set_rf_samples(167)

- Now each tree draws a random sample of 167 rows from the Training set, which is in fact the same size as the Training set itself!

In [20]:
# 40 trees with OOB scoring on the training split, fitted after set_rf_samples(167).
m = RandomForestRegressor(n_estimators=40, n_jobs=-1, oob_score=True)
# We have far too few samples, so min_samples_leaf=3 and max_features=0.5 are omitted
#m = RandomForestRegressor(n_estimators=40, min_samples_leaf=3, max_features=0.5, n_jobs=-1, oob_score=True)
m.fit(X_train, y_train)
print_score(m)

[6.282300962961068, 18.021954926561104, 0.960912952758559, 0.08506395471671524, 0.7010578921345584]


In [21]:
# Bootstrap resampling switched off, fitted after set_rf_samples(167).
# The recorded output again shows a perfect training fit (RMSE 0.0, score 1.0).
m = RandomForestRegressor(n_estimators=40, n_jobs=-1, bootstrap=False)
m.fit(X_train, y_train)
print_score(m)

[0.0, 21.79760630493633, 1.0, -0.3384573840628191]


In [22]:
# Lower the per-tree sample size to 110 rows.
# NOTE(review): presumably this remains active for all later forest fits in the
# notebook unless reset_rf_samples() is called — confirm against fastai 0.7.
set_rf_samples(110)

In [23]:
# 40 trees with OOB scoring on the training split, fitted after set_rf_samples(110).
m = RandomForestRegressor(n_estimators=40, n_jobs=-1, oob_score=True)
# We have far too few samples, so min_samples_leaf=3 and max_features=0.5 are omitted
#m = RandomForestRegressor(n_estimators=40, min_samples_leaf=3, max_features=0.5, n_jobs=-1, oob_score=True)
m.fit(X_train, y_train)
print_score(m)

[8.909530110864836, 16.766817117449573, 0.9213850780866538, 0.2080674495580831, 0.7257816410708554]


In [24]:
# Bootstrap resampling switched off, fitted after set_rf_samples(110).
# The recorded output still shows a perfect training fit (RMSE 0.0, score 1.0).
m = RandomForestRegressor(n_estimators=40, n_jobs=-1, bootstrap=False)
m.fit(X_train, y_train)
print_score(m)

[0.0, 21.986642251603584, 1.0, -0.3617731258142891]


**- Although set_rf_samples(110) asks for a random sample of 110 rows, fewer than the Training set (167), each tree still uses the ENTIRE Training set because bootstrap=False!**

# Remove Only Two Irrelevant Features

In [27]:
# 190 trees with OOB scoring, fitted on the full frame (validation rows included,
# so only the OOB score, the last value printed, is a held-out estimate).
# NOTE(review): set_rf_samples(110) was called earlier and is never reset in this
# notebook, so this fit presumably still samples 110 rows per tree — confirm
# whether reset_rf_samples() should be called first.
m = RandomForestRegressor(n_estimators=190, n_jobs=-1, oob_score=True)
#m = RandomForestRegressor(n_estimators=160, max_features=0.5, n_jobs=-1, oob_score=True)
#%time m.fit(df_ext, y_train)
m.fit(df_trn, y_trn)
print_score(m)

[9.386164433395923, 9.346113113610919, 0.9127487406785195, 0.7539354756648896, 0.7650441926753309]


In [28]:
df_test = pd.read_feather('tmp/lunchbox-test')
# Convert every feature value to numeric. Passing the training-time `nas` makes
# proc_df handle missing test values with the same fill values that were used
# for the training frame, instead of recomputing them from the test rows alone.
# The returned target and na dict are not needed here.
df_test, _, _ = proc_df(df_test, na_dict=nas)

# The number of features must match what the model was fitted on, so drop the
# same two columns that were removed from the training frame.
df_test = drop_irrelevant_columns(df_test)

predictions = m.predict(df_test)

In [29]:
predictions

array([ 61.10526,  54.42632,  53.83158,  57.77368,  56.51053,  54.79474,  53.93684,  92.02105,  53.47895,
       116.91579,  68.6    ,  58.45263,  59.78947,  61.52105, 116.98947, 112.19474,  83.54211,  62.02105,
        66.4    ,  62.62105,  64.16842,  67.72632,  68.64211, 119.53684,  64.78421,  89.56842,  65.94737,
       119.58421, 117.37895,  66.92632, 107.28947, 120.61579, 117.41053, 115.20526, 109.18421, 111.98421,
       138.85263, 137.93158, 134.5    , 130.57368])

# K-Fold Cross Validation Models

In [32]:
#REFERENCE: https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74
#Turn off Autosave
%autosave 0

# NOTE(review): RandomForestRegressor is already imported in the notebook's
# import cell, so this re-import is redundant.
from sklearn.ensemble import RandomForestRegressor

# Unfitted forest, created only to inspect its hyper-parameter settings
rf = RandomForestRegressor(random_state = 42)

# Look at parameters used by our current forest
print ("Parameters currently in use:\n")
rf.get_params()

Autosave disabled
Parameters currently in use:



{'bootstrap': True,
 'criterion': 'mse',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 'warn',
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}

In [34]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter candidates for the grid search.
# Forest sizes: 20 evenly spaced values from 40 to 150, truncated to int
n_estimators = list(map(int, np.linspace(start=40, stop=150, num=20)))
# How many features each split may consider
max_features = ['auto', 'sqrt']
# Whether each tree is trained on a bootstrap sample
bootstrap = [True, False]

# Candidates left out of the search for now:
# Maximum number of levels in tree
#max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
#max_depth.append(None)
# Minimum number of samples required to split a node
#min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
#min_samples_leaf = [1, 2, 4]

# Assemble the grid (key order kept so the printed dict is unchanged)
grid = dict(n_estimators=n_estimators,
            max_features=max_features,
            bootstrap=bootstrap)

print(grid)

{'n_estimators': [40, 45, 51, 57, 63, 68, 74, 80, 86, 92, 97, 103, 109, 115, 121, 126, 132, 138, 144, 150], 'max_features': ['auto', 'sqrt'], 'bootstrap': [True, False]}


In [37]:
# Base estimator; the grid supplies n_estimators, max_features and bootstrap.
rf = RandomForestRegressor()

#cv = 5 as test sample (40) is roughly one-fifth of training test sample (207 train samples (November 18, 2013 to September 30, 2014)) 
rf_grid = GridSearchCV(estimator = rf, param_grid = grid, cv = 5, verbose=2, n_jobs = -1)

# Fit the grid search (every parameter combination is tried) on the full frame
rf_grid.fit(df_trn, y_trn)

Fitting 5 folds for each of 80 candidates, totalling 400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    3.6s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   10.4s
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:   22.6s
[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed:   25.4s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'n_estimators': [40, 45, 51, 57, 63, 68, 74, 80, 86, 92, 97, 103, 109, 115, 121, 126, 132, 138, 144, 150], 'max_features': ['auto', 'sqrt'], 'bootstrap': [True, False]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=2)

In [38]:
rf_grid.best_params_

{'bootstrap': True, 'max_features': 'auto', 'n_estimators': 121}

In [39]:
# Build the final forest from the grid-search winner rather than a hand-copied
# tree count, so this cell stays in sync with rf_grid whenever the search is
# re-run. (In the recorded run the winner was bootstrap=True,
# max_features='auto', n_estimators=121, i.e. the same settings as before.)
m = RandomForestRegressor(n_jobs=-1, **rf_grid.best_params_)
m.fit(df_trn, y_trn)

predictions = m.predict(df_test)

In [40]:
predictions

array([ 61.30579,  55.23967,  57.91736,  60.6281 ,  57.17355,  55.42975,  54.73554,  91.30579,  54.36364,
       118.17355,  70.93388,  59.71074,  58.52066,  58.80992, 117.1157 , 108.68595,  81.12397,  60.54545,
        65.32231,  61.04132,  60.61983,  66.66116,  71.91736, 118.41322,  67.47934,  88.95041,  65.89256,
       116.45455, 114.77686,  70.43802, 107.33058, 123.47107, 118.36364, 115.77686, 108.67769, 111.92562,
       137.03306, 136.35537, 130.42975, 125.91736])