In [399]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.impute import SimpleImputer
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV, cross_val_score
from scipy.stats.distributions import randint, uniform
import category_encoders as ce
import warnings

# Strategy

* Target Encode Nominal features with high cardinality
* One Hot encode just a few nominal features not to add sparsity to the dataset
* Use Extra Trees instead of Random Forest
* Keep **max_features** hyperparameter below 0.65 to increase diversity among trees given that there is a lot of multicolinearity
* High **n_estimators** to add stability to the forest

In [400]:
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('train.csv')

In [3]:
X = df.drop(['SalePrice'], axis=1)
y = df['SalePrice']

In [282]:
def write_test(mod,name):
    df_t = pd.read_csv('test.csv')
    df_pred = pd.concat([df_t.Id, pd.Series(mod.predict(df_t), name='SalePrice')], axis=1)
    df_pred.to_csv(name, index=False)

In [862]:
cols_subjective = ['OverallQual', 'OverallCond', 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond',
                  'BsmtExposure', 'BsmtFinType1', 'HeatingQC', 'KitchenQual', 'FireplaceQu',
                  'GarageQual','GarageCond', 'Fence']
#Ou só manter 1stFlr e 2nd Floor
cols_SF = ['House_Age', 'GrLivArea', '1stFlrSF', '2ndFlrSF', 'BsmtFinSF1', 'GarageArea']
cols_int = ['TotRmsAbvGrd', 'KitchenAbvGr', 'BedroomAbvGr', 'FullBath', 'BsmtFullBath', 'GarageCars']

In [872]:
pipe_pre = ColumnTransformer([
    ('Already_Encoded_Subjective', 'passthrough', ['OverallQual', 'OverallCond']),
    ('Encode_Subjective', ce.OrdinalEncoder(drop_invariant=True,
                                           mapping = [
                                            {'col':'ExterQual', 'mapping':{'Po':0, 'Fa':1, 'TA':2, 'Gd':3, 'Ex':4}},
                                            #{'col':'ExterCond', 'mapping':{'Po':0, 'Fa':1, 'TA':2, 'Gd':3, 'Ex':4}},
                                            {'col':'BsmtQual', 'mapping':{np.nan:0, 'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5}},
                                            #{'col':'BsmtCond', 'mapping':{np.nan:0, 'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5}},
                                            {'col':'BsmtExposure', 'mapping':{np.nan:0, 'No':1, 'Mn':2, 'Av':3, 'Gd':4}},
                                            {'col':'BsmtFinType1', 'mapping':{np.nan:0, 'Unf':1, 'LwQ':2, 'Rec':3, 'BLQ':4,'ALQ':5, 'GLQ':6}},
                                            #{'col':'BsmtFinType2', 'mapping:':{np.nan:0, 'Unf':1, 'LwQ':2, 'Rec':3, 'BLQ':4,'ALQ':5, 'GLQ':6}},
                                            {'col':'HeatingQC', 'mapping':{'Po':0, 'Fa':1, 'TA':2, 'Gd':3, 'Ex':4}},
                                            {'col':'KitchenQual', 'mapping':{'Po':0, 'Fa':1, 'TA':2, 'Gd':3, 'Ex':4}},
                                            {'col':'FireplaceQu', 'mapping':{np.nan:0, 'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5}},
                                            {'col':'GarageQual', 'mapping':{np.nan:0, 'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5}},
                                            #{'col':'GarageCond', 'mapping':{np.nan:0, 'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5}},
                                            {'col':'Fence', 'mapping':{np.nan:0, 'MnWw':1, 'GdWo':2, 'MnPrv':3, 'GdPrv':4}}
                                                     ]),
    ['ExterQual', #'ExterCond', 
     'BsmtQual', #'BsmtCond', 
     'BsmtExposure', 'BsmtFinType1', 'HeatingQC', 'KitchenQual', 'FireplaceQu', 'GarageQual', 
     #'GarageCond', 
     'Fence']),
    ('Measures', 'passthrough', ['YearBuilt', 'GrLivArea', '1stFlrSF', 'BsmtFinSF1']),
    ('Integers', 'passthrough', ['TotRmsAbvGrd', 'KitchenAbvGr', 'BedroomAbvGr', 'FullBath', 'BsmtFullBath', 'GarageCars']),
    ('OneHot', ce.OneHotEncoder(), ['SaleCondition', 'SaleType', 'BldgType']),
    ('Target', ce.TargetEncoder(), ['MSZoning', 'Neighborhood', 'Condition1',  'MSSubClass',])
])                                            

In [873]:
pipe_pred = Pipeline([
 ('Process', pipe_pre),
 ('Impute', SimpleImputer(strategy='median')),
 ('Model', ExtraTreesRegressor(n_jobs=-1))
])

In [936]:
param_d = {
    'Model__n_estimators':[200],
    'Model__max_features':uniform(0.62, 0.01),
}

In [937]:
grid_RF = RandomizedSearchCV(pipe_pred, param_d, cv=10, scoring='neg_mean_squared_log_error', n_iter=15,
                            verbose=4)

In [938]:
grid_RF.fit(X, y)

Fitting 10 folds for each of 15 candidates, totalling 150 fits
[CV 1/10] END Model__max_features=0.6230859882890797, Model__n_estimators=200;, score=-0.018 total time=   0.8s
[CV 2/10] END Model__max_features=0.6230859882890797, Model__n_estimators=200;, score=-0.012 total time=   0.8s
[CV 3/10] END Model__max_features=0.6230859882890797, Model__n_estimators=200;, score=-0.019 total time=   0.8s
[CV 4/10] END Model__max_features=0.6230859882890797, Model__n_estimators=200;, score=-0.029 total time=   0.7s
[CV 5/10] END Model__max_features=0.6230859882890797, Model__n_estimators=200;, score=-0.025 total time=   0.8s
[CV 6/10] END Model__max_features=0.6230859882890797, Model__n_estimators=200;, score=-0.012 total time=   0.8s
[CV 7/10] END Model__max_features=0.6230859882890797, Model__n_estimators=200;, score=-0.017 total time=   0.9s
[CV 8/10] END Model__max_features=0.6230859882890797, Model__n_estimators=200;, score=-0.013 total time=   0.7s
[CV 9/10] END Model__max_features=0.62308

[CV 4/10] END Model__max_features=0.6213261505101142, Model__n_estimators=200;, score=-0.029 total time=   1.0s
[CV 5/10] END Model__max_features=0.6213261505101142, Model__n_estimators=200;, score=-0.025 total time=   1.1s
[CV 6/10] END Model__max_features=0.6213261505101142, Model__n_estimators=200;, score=-0.012 total time=   1.0s
[CV 7/10] END Model__max_features=0.6213261505101142, Model__n_estimators=200;, score=-0.017 total time=   1.1s
[CV 8/10] END Model__max_features=0.6213261505101142, Model__n_estimators=200;, score=-0.014 total time=   1.2s
[CV 9/10] END Model__max_features=0.6213261505101142, Model__n_estimators=200;, score=-0.013 total time=   1.1s
[CV 10/10] END Model__max_features=0.6213261505101142, Model__n_estimators=200;, score=-0.021 total time=   1.2s
[CV 1/10] END Model__max_features=0.622264360114284, Model__n_estimators=200;, score=-0.018 total time=   1.1s
[CV 2/10] END Model__max_features=0.622264360114284, Model__n_estimators=200;, score=-0.013 total time= 

[CV 8/10] END Model__max_features=0.621088086224182, Model__n_estimators=200;, score=-0.014 total time=   0.9s
[CV 9/10] END Model__max_features=0.621088086224182, Model__n_estimators=200;, score=-0.014 total time=   1.0s
[CV 10/10] END Model__max_features=0.621088086224182, Model__n_estimators=200;, score=-0.020 total time=   1.1s


RandomizedSearchCV(cv=10,
                   estimator=Pipeline(steps=[('Process',
                                              ColumnTransformer(transformers=[('Already_Encoded_Subjective',
                                                                               'passthrough',
                                                                               ['OverallQual',
                                                                                'OverallCond']),
                                                                              ('Encode_Subjective',
                                                                               OrdinalEncoder(drop_invariant=True,
                                                                                              mapping=[{'col': 'ExterQual',
                                                                                                        'mapping': {'Ex': 4,
                                                          

In [940]:
grid_RF.best_params_, grid_RF.best_score_

({'Model__max_features': 0.6213261505101142, 'Model__n_estimators': 200},
 -0.01796155307502295)

In [941]:
write_test(grid_RF.best_estimator_, 'Extra_Tuned.csv')