# Random Forests

## Prerequisites

In [1]:
# Helper packages
import numpy as np
import pandas as pd
from plotnine import *
from scipy.stats import uniform
from scipy.stats import randint

# Modeling packages
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from category_encoders.ordinal import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.inspection import partial_dependence
from sklearn.pipeline import Pipeline

In [2]:
# Ames housing data
ames = pd.read_csv("../data/ames.csv")

# create train/test split
train, test = train_test_split(ames, train_size=0.7, random_state=123)

# separate features from labels and only use numeric features
X_train = train.drop("Sale_Price", axis=1)
y_train = train[["Sale_Price"]]

## Out-of-the-box performance

In [3]:
# Ordinal encode our quality-based features 
ord_cols = list(X_train.filter(regex=("Qual$|QC$|Cond$")).columns)
lvs = ["Very_Poor", "Poor", "Fair", "Below_Average", "Average", "Typical", 
       "Above_Average", "Good", "Very_Good", "Excellent", "Very_Excellent"]
val = range(0, len(lvs))
lvl_map = dict(zip(lvs, val))
category_mapping = [{'col': col, 'mapping': lvl_map} for col in ord_cols]
ord_encoder = OrdinalEncoder(cols=ord_cols, mapping=category_mapping)

# one hot encode remaining nominal features
encoder = OneHotEncoder(handle_unknown="ignore", sparse=False)

# combine into a pre-processing pipeline
preprocessor = ColumnTransformer(
  remainder="passthrough",
  transformers=[
   ("ord_encode", ord_encoder, ord_cols),
   ("one-hot", encoder, selector(dtype_include="object")),
   ]
  )

In [4]:
# create random forest estimator
rf_mod = RandomForestRegressor()

# create modeling pipeline
model_pipeline = Pipeline(steps=[
  ("preprocessor", preprocessor),
  ("rf_mod", rf_mod),
])

# define loss function
loss = 'neg_root_mean_squared_error'

# create 5 fold CV object
kfold = KFold(n_splits=5, random_state=123, shuffle=True)

# fit model with 5-fold CV
results = cross_val_score(model_pipeline, X_train, y_train, cv=kfold, scoring=loss)

np.abs(np.mean(results))



28062.467835371743

## Tuning strategies

### Cartesian grid search

In [None]:
# create random forest estimator with 1,000 trees
rf_mod = RandomForestRegressor(n_estimators=1000)

# create modeling pipeline
model_pipeline = Pipeline(steps=[
  ("preprocessor", preprocessor),
  ("rf_mod", rf_mod),
])

# Create grid of hyperparameter values
hyper_grid = {
  'rf_mod__max_features': [.05, .15, .25, .333, .4],
  'rf_mod__min_samples_leaf': [1, 3, 5, 10],
  'rf_mod__bootstrap': [True, False],
  'rf_mod__max_samples': [.5, .63, .8]
  }
  
# Tune a knn model using grid search
grid_search = GridSearchCV(model_pipeline, hyper_grid, cv=kfold, scoring=loss, n_jobs=-1)
results = grid_search.fit(X_train, y_train)

# best model score
np.abs(results.best_score_)

In [None]:
# best hyperparameter values
results.best_params_

### Random grid search

In [None]:
# Create grid of hyperparameter values
hyper_distributions = {
  'rf_mod__max_features': uniform(.05, .35),
  'rf_mod__min_samples_leaf': randint(1, 9),
  'rf_mod__bootstrap': [True, False],
  'rf_mod__max_samples': uniform(.5, .3)
  }
  
# Tune a knn model using grid search
random_search = RandomizedSearchCV(
  model_pipeline, 
  param_distributions=hyper_distributions, 
  n_iter=20,
  cv=kfold, 
  scoring=loss, 
  n_jobs=-1, 
  random_state=13
  )
random_search_results = random_search.fit(X_train, y_train)

# best model score
np.abs(random_search_results.best_score_)

In [None]:
# best hyperparameter values
random_search_results.best_params_

## Feature interpretation