# Random Forest

## Import and load data

In [17]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

In [56]:
housing_prices_df_raw = pd.read_csv('../src/data/train.csv')

In [57]:
hdf = housing_prices_df_raw.copy()
hdf = hdf.drop(columns = 'Id')

## Inspect data

In [59]:
hdf.shape

(1460, 80)

In [22]:
hdf.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


In [5]:
hdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 80 columns):
MSSubClass       1460 non-null int64
MSZoning         1460 non-null object
LotFrontage      1201 non-null float64
LotArea          1460 non-null int64
Street           1460 non-null object
Alley            91 non-null object
LotShape         1460 non-null object
LandContour      1460 non-null object
Utilities        1460 non-null object
LotConfig        1460 non-null object
LandSlope        1460 non-null object
Neighborhood     1460 non-null object
Condition1       1460 non-null object
Condition2       1460 non-null object
BldgType         1460 non-null object
HouseStyle       1460 non-null object
OverallQual      1460 non-null int64
OverallCond      1460 non-null int64
YearBuilt        1460 non-null int64
YearRemodAdd     1460 non-null int64
RoofStyle        1460 non-null object
RoofMatl         1460 non-null object
Exterior1st      1460 non-null object
Exterior2nd      1460 non-

## Data cleaning

### Identify features by variable type

In [108]:
ord_feat_num = ['OverallQual', 'OverallCond', 'BsmtFullBath', 
                'BsmtHalfBath', 'FullBath', 'HalfBath',
                'TotRmsAbvGrd', 'Fireplaces', 'BedroomAbvGr', 
                'KitchenAbvGr']

ord_feat_cat = ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 
                'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 
                'HeatingQC', 'KitchenQual', 'FireplaceQu', 
                'GarageQual', 'GarageCond', 'PoolQC']

ord_feat = ord_feat_num + ord_feat_cat

nom_feat  = ['MSSubClass', 'MSZoning', 'Alley', 'LotShape', 
             'LandContour', 'Utilities', 'Neighborhood', 
             'Condition1', 'Condition2', 'BldgType', 'RoofStyle', 
             'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 
             'Foundation', 'Heating', 'CentralAir', 'Electrical', 
             'GarageType', 'MiscFeature', 'SaleType', 'SaleCondition', 
             'GarageFinish', 'PavedDrive', 'Fence', 'Functional', 
             'GarageCars', 'HouseStyle','LotConfig', 'Street']

cat_feat = nom_feat + ord_feat

cont_feat = ['LotFrontage', 'LotArea', 'LandSlope', 'YearBuilt', 
             'YearRemodAdd', 'MasVnrType', 'MasVnrArea', 'BsmtFinSF1', 
             'BsmtFinSF2', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 
             'GarageYrBlt', 'GarageCars', 'WoodDeckSF', 'OpenPorchSF', 
             'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 
             'MiscVal', 'MoSold', 'YrSold','BsmtUnfSF', 'GarageArea', 
             'LowQualFinSF', 'GrLivArea']

# set(hdf.columns) - (set(cat_feat + cont_feat))

### Removing outliers

In [24]:
import statsmodels.api as sm
from statsmodels.api import OLS

In [25]:
# Variable Influence Plot
# fig, ax = plt.subplots(figsize=(12, 8))
# fig = sm.graphics.influence_plot(hdf[['GrLivArea', 'SalePrice']], ax = ax, criterion = 'cooks')

In [26]:
# hdf.plot(kind = 'scatter', x = 'GrLivArea', y = 'SalePrice')
hdf = hdf[hdf['GrLivArea'] < 4000]
# hdf.plot(kind = 'scatter', x = 'GrLivArea', y = 'SalePrice')

In [27]:
# hdf.plot(kind = 'scatter', x = 'LotArea', y = 'SalePrice')
hdf = hdf.loc[hdf['LotArea'] < 100000,:]

In [28]:
# hdf.plot(kind = 'scatter', x = 'LotFrontage', y = 'SalePrice')
hdf = hdf.loc[hdf['LotFrontage'] < 250, ]

### Finding NAs

Check for NAs in all predictors:

In [29]:
hdf.loc[:,hdf.isnull().sum() > 0]
hdf.isnull().sum()[hdf.isnull().sum() > 0]

Alley           1109
MasVnrType         6
MasVnrArea         6
BsmtQual          31
BsmtCond          31
BsmtExposure      32
BsmtFinType1      31
BsmtFinType2      32
Electrical         1
FireplaceQu      600
GarageType        74
GarageYrBlt       74
GarageFinish      74
GarageQual        74
GarageCond        74
PoolQC          1191
Fence            968
MiscFeature     1156
dtype: int64

In [30]:
# In terms of proportion
round(100*hdf.isnull().sum()[hdf.isnull().sum() > 0]/len(hdf), 2)

Alley           92.80
MasVnrType       0.50
MasVnrArea       0.50
BsmtQual         2.59
BsmtCond         2.59
BsmtExposure     2.68
BsmtFinType1     2.59
BsmtFinType2     2.68
Electrical       0.08
FireplaceQu     50.21
GarageType       6.19
GarageYrBlt      6.19
GarageFinish     6.19
GarageQual       6.19
GarageCond       6.19
PoolQC          99.67
Fence           81.00
MiscFeature     96.74
dtype: float64

Impute missing values for `GarageYrBlt`:

In [31]:
from sklearn.impute import SimpleImputer
imp = SimpleImputer(strategy='mean')
hdf.loc[:,'GarageYrBlt'] = imp.fit_transform(hdf[['GarageYrBlt']])

Drop `MiscFeature` since feature is already in `miscval`:

In [32]:
hdf = hdf.drop(columns = 'MiscFeature') 
nom_feat.remove('MiscFeature')

Drop `MasVnrType`, `MasVnrArea`, and `Electrical`:

In [33]:
round(100*hdf.isnull().sum()[hdf.isnull().sum() > 0]/len(hdf), 2)

Alley           92.80
MasVnrType       0.50
MasVnrArea       0.50
BsmtQual         2.59
BsmtCond         2.59
BsmtExposure     2.68
BsmtFinType1     2.59
BsmtFinType2     2.68
Electrical       0.08
FireplaceQu     50.21
GarageType       6.19
GarageFinish     6.19
GarageQual       6.19
GarageCond       6.19
PoolQC          99.67
Fence           81.00
dtype: float64

In [46]:
hdf = hdf.drop(labels = hdf.loc[hdf['MasVnrType'].isnull(),:].index)
hdf = hdf.drop(labels = hdf.loc[hdf['Electrical'].isnull(),:].index)

### Visualizing data

In [18]:
corr = hdf[cont_feat].corr()
mask = np.zeros(corr.shape, dtype=bool)
mask[np.triu_indices(len(mask))] = True
with sns.set_style(style = 'white'):
    fig, ax = plt.subplots(figsize = (12,6))
    sns.heatmap(corr, annot=True, fmt='.2f', mask=mask, ax=ax, cmap='coolwarm', vmin = -1, vmax = 1, center = 0,
           cbar_kws = {'shrink': .8,'ticks': [-1, -.5, 0, 0.5, 1]})
fig.subplots_adjust(top=0.93)

# corr.style.background_gradient(cmap='coolwarm').set_precision(2)
# sns.heatmap(corr, vmin = -1, vmax = 1, center = 0, 
#             mask=mask, ax=ax, cmap="coolwarm", 
#             cbar_kws = {'shrink': .6,'ticks': [-1, -.5, 0, 0.5, 1]},
#             annot=True, fmt='.2f')
# fig.subplots_adjust(top=0.93)
# # ax.set_yticklabels(corr.columns)
# ax.set_xticklabels(corr.columns, rotation = 20)
# sns.set_style({'xtick.bottom': True}, {'ytick.left': True})

IndentationError: expected an indented block (<ipython-input-18-4b9502f0399e>, line 5)

In [19]:
# List top 10 correlations
corr_asc = corr.abs().where(np.triu(np.ones(corr.abs().shape), k=1).astype(np.bool)).stack().sort_values(ascending=False)
print(corr_asc[:10])
# Filter correlations over 0.69
corr_asc[corr_asc > 0.69] 

NameError: name 'corr' is not defined

## Data preprocessing

### Define features and target

In [47]:
X = hdf.loc[:,'MSSubClass':'SaleCondition']
y = np.log1p(hdf.loc[:, 'SalePrice'])

### Encode categorical features

In [48]:
import category_encoders as ce

#### Ordinal

In [49]:
# Convert ord_feat_num to ord_feat
for ord_ in ord_feat_cat:
    hdf.loc[hdf[ord_].isnull(), ord_] = 0
    
hdf[ord_feat_cat] = hdf[ord_feat_cat].replace({'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1})
hdf[['BsmtExposure']] = hdf[['BsmtExposure']].replace({'Gd': 4, 'Av': 3, 'Mn': 2, 'No': 1})
hdf[['BsmtFinType1', 'BsmtFinType2']] = hdf[['BsmtFinType1', 'BsmtFinType2']].replace({'GLQ': 6, 'ALQ': 5, 'BLQ': 4, 'Rec': 3, 'LwQ': 2, 'Unf': 1})

In [50]:
ord_enc = ce.OrdinalEncoder(cols=ord_feat).fit(X,y)
X = ord_enc.transform(X)

#### Nominal

In [51]:
# Convert fence to nominal
hdf.loc[hdf['Fence'].isnull(), 'Fence'] = 'NoFence'
hdf[['Fence']] = hdf[['Fence']].replace({'MnPrv': 'HasFence', 'GdWo': 'HasFence', 'GdPrv': 'HasFence', 'MnWw': 'HasFence'})

In [52]:
perm = np.random.permutation(len(X))
X = X.iloc[perm].reset_index(drop=True)
y = y.iloc[perm].reset_index(drop=True)

nom_enc = ce.CatBoostEncoder(cols=nom_feat).fit(X,y)
X = nom_enc.transform(X)

### Final NA checks

In [53]:
round(100*X.isnull().sum()[X.isnull().sum() > 0]/len(hdf), 2)

Series([], dtype: float64)

### Split dataset into training and testing data sets

In [43]:
from sklearn.model_selection import train_test_split

In [54]:
np.random.seed(0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30)

In [55]:
print('Training Features Shape:', X_train.shape)
print('Training Labels Shape:', y_train.shape)
print('Testing Features Shape:', X_test.shape)
print('Testing Labels Shape:', y_test.shape)

Training Features Shape: (831, 78)
Training Labels Shape: (831,)
Testing Features Shape: (357, 78)
Testing Labels Shape: (357,)


## Model: Random Forest

In [11]:
from sklearn.ensemble import RandomForestRegressor
from pprint import pprint

In [12]:
# Create base model
rf = RandomForestRegressor()

In [13]:
# Check out base/default parameters
print('Parameters currently in use:\n')
pprint(rf.get_params())

Parameters currently in use:

{'bootstrap': True,
 'criterion': 'mse',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 'warn',
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}


### Choose hyperparameters

Narrow down search for wide range of values for hyperparameters:

In [31]:
from sklearn.model_selection import RandomizedSearchCV

In [32]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt', 'log2']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10, 50, 100, 500, 1000]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 5, 10, 15, 35, 50, 100, 150]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Experimenting with random_state
random_state = [0, 10, 50, 100]

In [33]:
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap,
               'oob_score': 
               'random_state': random_state}

In [34]:
pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 5, 10, 15, 35, 50, 100, 150],
 'min_samples_split': [2, 5, 10, 50, 100, 500, 1000],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000],
 'random_state': [0, 10, 50, 100]}


In [35]:
# Use the random grid to search for best hyperparameters
# Random search of parameters, using 5-fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, 
                               param_distributions = random_grid, 
                               n_iter = 100, cv = 5, verbose=2, n_jobs = -1)

# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   13.6s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   50.1s
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  3.8min finished


RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=RandomForestRegressor(bootstrap=True,
                                                   criterion='mse',
                                                   max_depth=None,
                                                   max_features='auto',
                                                   max_leaf_nodes=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   n_estimators='warn',
                                                   n_jobs=None, oob_score=False,
                                                   random_sta...


In [36]:
rf_random.best_params_

{'random_state': 100,
 'n_estimators': 1800,
 'min_samples_split': 10,
 'min_samples_leaf': 2,
 'max_features': 'sqrt',
 'max_depth': 110,
 'bootstrap': False}

Evaluate combination of settings from randomized search:

In [38]:
from sklearn.model_selection import GridSearchCV

In [39]:
# Create parameter grid based on results of random search
param_grid = {
    'random_state': [0, 10, 50, 100],
    'bootstrap': [True],
    'max_depth': list(range(10, 120, 10)),
    'max_features': ['sqrt'],
    'min_samples_split': [2, 15, 50, 100],
    'min_samples_leaf': [1, 15, 35, 50],
    'n_estimators': [400, 600, 800, 1000, 1200]
}

# Initlize grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid,
                          cv = 5, n_jobs = -1, verbose = 2)

In [40]:
#Fit grid search to data
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 3520 candidates, totalling 17600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    4.8s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   42.8s
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed:  4.5min
[Parallel(n_jobs=-1)]: Done 1442 tasks      | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done 1969 tasks      | elapsed:  8.7min
[Parallel(n_jobs=-1)]: Done 2576 tasks      | elapsed: 11.3min
[Parallel(n_jobs=-1)]: Done 3265 tasks      | elapsed: 14.1min
[Parallel(n_jobs=-1)]: Done 4034 tasks      | elapsed: 17.4min
[Parallel(n_jobs=-1)]: Done 4885 tasks      | elapsed: 21.0min
[Parallel(n_jobs=-1)]: Done 5816 tasks      | elapsed: 25.1min
[Parallel(n_jobs=-1)]: Done 6829 tasks      | elapsed: 29.7min
[Parallel(n_jobs=-1)]: Done 7922 tasks      | elapsed: 33.9min
[Parallel(n_jobs=-1)]: Done 9097 tasks      | e

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=RandomForestRegressor(bootstrap=True, criterion='mse',
                                             max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators='warn', n_jobs=None,
                                             oob_score=False, random_state=None,
                                             verbose=0, warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid={'bootstrap': [True],
            

In [41]:
grid_search.best_params_

{'bootstrap': True,
 'max_depth': 40,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 800,
 'random_state': 100}

### Make predictions

Predict test values:

In [None]:
predictions = rf.predict(X_test)

Calculate absolute errors:

In [None]:
abs_err = abs(predictions - y_test)

Calculate mean absolute error:

In [None]:
print('Mean Absolute Error:', round(np.mean(abs_err), 2), 'degrees.')

Calculate relative feature importance:

In [None]:
# 
importances = list(rf.feature_importances_)

# 

# Sort by descending order
sorted(, key = lambda f: f[1], reverse = True)

# Print
[print(f'Feature: {:} Importance: {}') for pair in ]

### Evaluate model performance

Calculate mean absolute percentage error (MAPE):

In [None]:
mape = (abs_errors/y_test)*100

Calculate and display accuracy:

In [None]:
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2),'%.')

### Data visualization

#### Feature importance

In [None]:
importance:

# feature_importance = list(zip(spam_train.columns[:-2], randomForest.feature_importances_))
# dtype = [('feature', 'S10'), ('importance', 'float')]
# feature_importance = np.array(feature_importance, dtype=dtype)
# feature_sort = np.sort(feature_importance, order='importance')[::-1]
# name, score = zip(*list(feature_sort))
# pd.DataFrame({'name':name,'score':score})[:15].plot.bar(x='name', y='score')