# Random Forest
Tutorial: https://towardsdatascience.com/deep-neural-networks-for-regression-problems-81321897ca33

In [24]:
# Libraries and options
# If error [No module named 'sklearn'], in terminal: conda install -c conda-forge scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error 
%matplotlib notebook
from matplotlib import pyplot as plt
import seaborn as sb
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import warnings 
warnings.filterwarnings('ignore')
warnings.filterwarnings('ignore', category=DeprecationWarning)
from xgboost import XGBRegressor

# 1. Data preprocessing
Data: https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data

In [9]:
# Load data
gt = pd.read_csv('train.csv')
gt.describe()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1460.0,1460.0,1201.0,1460.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,730.5,56.89726,70.049958,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,443.639726,...,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589
std,421.610009,42.300571,24.284752,9981.264932,1.382997,1.112799,30.202904,20.645407,181.066207,456.098091,...,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883
min,1.0,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,365.75,20.0,59.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,730.5,50.0,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,...,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,1095.25,70.0,80.0,11601.5,7.0,6.0,2000.0,2004.0,166.0,712.25,...,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,1460.0,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,...,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


In [10]:
# Number of rows & columns
gt.shape

(1460, 81)

In [11]:
# Exclude columns containing missing values
def get_cols_with_no_nans(df):
    cols_with_no_nans = []
    for col in df.columns:
        if not df[col].isnull().any():
            cols_with_no_nans.append(col)
    return cols_with_no_nans

cols_no_nans = get_cols_with_no_nans(gt)
gt = gt[cols_no_nans]
gt.describe()

Unnamed: 0,Id,MSSubClass,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,730.5,56.89726,10516.828082,6.099315,5.575342,1971.267808,1984.865753,443.639726,46.549315,567.240411,...,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589
std,421.610009,42.300571,9981.264932,1.382997,1.112799,30.202904,20.645407,456.098091,161.319273,441.866955,...,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883
min,1.0,20.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,365.75,20.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,223.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,730.5,50.0,9478.5,6.0,5.0,1973.0,1994.0,383.5,0.0,477.5,...,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,1095.25,70.0,11601.5,7.0,6.0,2000.0,2004.0,712.25,0.0,808.0,...,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,1460.0,190.0,215245.0,10.0,9.0,2010.0,2010.0,5644.0,1474.0,2336.0,...,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


In [12]:
# One-hot encoding 
# (Random Forest need only renaming categorical values using numbered ID for the class) 
# (but we already a function for one-hot-encoding, and decision trees will function well with that too)
def oneHotEncode(df,colNames):
    for col in colNames:
        if( df[col].dtype == np.dtype('object')):
            dummies = pd.get_dummies(df[col],prefix=col)
            df = pd.concat([df,dummies],axis=1)

            #drop the encoded column
            df.drop([col],axis = 1 , inplace=True)
    return df

# Get index of categorical columns 
cat_cols = gt.select_dtypes(include=['object'])
cat_cols = cat_cols.columns

# Transform data
print('There were {} columns before encoding categorical features'.format(gt.shape[1]))
gt = oneHotEncode(gt, cat_cols)
print('There are {} columns after encoding categorical features'.format(gt.shape[1]))


There were 62 columns before encoding categorical features
There are 217 columns after encoding categorical features


In [13]:
# Extract the variable to predict
target = gt.SalePrice
gt.drop(['SalePrice'],axis = 1 , inplace = True)

# Drop column ID
gt.drop(['Id'],axis = 1 , inplace = True)

In [14]:
# Split into train & test set
train_X, test_X, train_y, test_y = train_test_split(gt, target, test_size = 0.4, random_state = 14)

# 2. Random Forest Model

In [15]:
# Instantiate a Random Forest model
model = RandomForestRegressor(min_samples_split=5, n_estimators=20)

# Train the model
model.fit(train_X,train_y)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=5, min_weight_fraction_leaf=0.0,
                      n_estimators=20, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [16]:
# Test the model
RF_prediction = model.predict(test_X)

# Compute errors
MAE = mean_absolute_error(test_y , RF_prediction)
print('Random Forest MAE = ', MAE)

Random Forest MAE =  17793.190953547975


In [18]:
plt.figure()
plt.scatter(RF_prediction, test_y, alpha=0.2)
plt.xlabel('Random Forest Predictions')
plt.ylabel('True Values')
lims = [0, 800000]
plt.xlim(lims)
plt.ylim(lims)
_ = plt.plot(lims, lims)
plt.show()

<IPython.core.display.Javascript object>

In [19]:
error = RF_prediction - test_y
plt.figure()
plt.hist(error, bins = 30)
plt.xlabel("Absolute Prediction Error for Random Forest")
plt.ylabel("Count")
plt.show()

<IPython.core.display.Javascript object>

# 3. Boosting Model

In [29]:
# Instantiate Boosting model
XGBModel = XGBRegressor(n_estimators=50, max_depth=2)

# Train Boosting model
XGBModel.fit(train_X, train_y, verbose=False)



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=2, min_child_weight=1, missing=None, n_estimators=50,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)

In [30]:
# Test the model
XGB_prediction = XGBModel.predict(test_X)

# Compute error
MAE = mean_absolute_error(test_y , XGB_prediction)
print('Boosting MAE = ',MAE)

Boosting MAE =  19442.022327161816


In [31]:
plt.figure()
plt.scatter(XGB_prediction, test_y, alpha=0.2)
plt.xlabel('Boosting Predictions')
plt.ylabel('True Values')
lims = [0, 800000]
plt.xlim(lims)
plt.ylim(lims)
_ = plt.plot(lims, lims)
plt.show()

<IPython.core.display.Javascript object>

In [32]:
error = XGB_prediction - test_y
plt.figure()
plt.hist(error, bins = 30)
plt.xlabel("Absolute Prediction Error for Boosting")
plt.ylabel("Count")
plt.show()

<IPython.core.display.Javascript object>