In [192]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import category_encoders as ce
from scipy.stats.distributions import randint, uniform
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, cross_val_score

In [193]:
warnings.filterwarnings('ignore')

In [2]:
# Load the training data; the path is relative to the current working directory.
df = pd.read_csv('train.csv')

In [174]:
# Feature frame: every column of the training data except the SalePrice target.
X = df.drop(columns=['SalePrice'])

In [12]:
# Names of the int64 columns used as numeric features, minus the identifier,
# the target, and the integer columns that are excluded from this group.
num_cols = (
    df.select_dtypes(include='int64')
      .columns
      .drop(['Id', 'MSSubClass', 'MoSold', 'MiscVal', 'SalePrice',
             'OverallQual', 'OverallCond'])
)

In [13]:
# Display the selected numeric column names.
num_cols

Index(['LotArea', 'YearBuilt', 'YearRemodAdd', 'BsmtFinSF1', 'BsmtFinSF2',
       'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
       'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'YrSold'],
      dtype='object')

In [14]:
# Numeric-only slice of the training data, and the regression target.
X_num = df.loc[:, num_cols]
y = df.loc[:, 'SalePrice']

* **GrLivArea = 1stFlrSF + 2ndFlrSF**

    Pop: 1stFlrSF, 2ndFlrSF   
    Keep: GrLivArea  
    Create: 1stFlrRatio = 1stFlrSF/GrLivArea
* **Merge All Porches**
    Porch = EnclosedPorch + 3SsnPorch + ScreenPorch + OpenPorchSF
    

In [175]:
def write_test(mod, name, test_path='test.csv'):
    """Predict on a held-out CSV and write the result as a two-column CSV.

    Parameters
    ----------
    mod : object
        Fitted estimator whose ``predict`` accepts the raw DataFrame read
        from ``test_path``.
    name : str or path-like
        Destination path of the CSV that is written.
    test_path : str or path-like, default 'test.csv'
        CSV file to predict on. It must contain an ``Id`` column.

    Returns
    -------
    None
        The file at ``name`` is written with columns ``Id`` and
        ``SalePrice`` and no index column.
    """
    df_t = pd.read_csv(test_path)
    predictions = pd.Series(mod.predict(df_t), name='SalePrice')
    # Pair ids and predictions by position (raw arrays) so that an index
    # carried by the prediction object can never misalign the rows.
    df_pred = pd.DataFrame({
        'Id': df_t['Id'].to_numpy(),
        'SalePrice': predictions.to_numpy(),
    })
    df_pred.to_csv(name, index=False)

In [254]:
class PreProcessor(TransformerMixin, BaseEstimator):
    """Stateless feature-engineering step for the numeric columns.

    ``transform`` adds four derived columns (``House_Age``, ``1stFlrRatio``,
    ``Porch``, ``Bath``) and then removes the raw columns listed in
    ``_COLUMNS_TO_DROP``. The input frame is copied, never modified.

    The input must be a pandas DataFrame containing every column referenced
    in ``transform`` and in ``_COLUMNS_TO_DROP``.
    """

    # Raw columns removed after the derived features have been computed.
    # Note that '1stFlrSF' is not listed, so it is kept next to '1stFlrRatio'.
    _COLUMNS_TO_DROP = [
        'YrSold', 'YearBuilt', 'YearRemodAdd', '2ndFlrSF',
        'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'OpenPorchSF',
        'PoolArea', 'WoodDeckSF', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
        'LowQualFinSF', 'BsmtHalfBath', 'FullBath', 'HalfBath',
    ]

    def __init__(self):
        # No hyperparameters.
        pass

    def fit(self, X, y=None):
        """Learn nothing; return ``self`` so the step can be used in a Pipeline."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with derived features added and raw columns dropped.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame holding the raw numeric columns.
        y : ignored
            Accepted for signature compatibility only.

        Returns
        -------
        pandas.DataFrame
            New frame; rows where ``GrLivArea`` is 0 get NaN in
            ``1stFlrRatio`` instead of +/-inf.
        """
        Xt = X.copy()

        # Mean of (years since construction, years since remodel) at sale time.
        Xt['House_Age'] = (2*Xt['YrSold'] - Xt['YearBuilt'] - Xt['YearRemodAdd'])/2

        # Mask zero living area to NaN before dividing: x/0 would produce inf,
        # which a downstream imputer cannot fill, whereas NaN can be imputed.
        living_area = Xt['GrLivArea'].where(Xt['GrLivArea'] != 0)
        Xt['1stFlrRatio'] = Xt['1stFlrSF']/living_area

        # Total porch area across the four porch columns.
        Xt['Porch'] = (Xt['EnclosedPorch'] + Xt['3SsnPorch'] + Xt['ScreenPorch'] + Xt['OpenPorchSF'])

        # Weighted bath count: full baths weigh 3/4, half baths 1/4.
        Xt['Bath'] = (3*Xt['FullBath']/4 + Xt['HalfBath']/4)

        return Xt.drop(columns=self._COLUMNS_TO_DROP)

In [243]:
# Numeric branch: engineered features first, then median imputation.
# The optional ('Numeric_Scalor', StandardScaler()) step is left disabled.
pipe_num = Pipeline(steps=[
    ('Basic_Pre', PreProcessor()),
    ('Impute_Basic', SimpleImputer(strategy='median')),
])

In [270]:
# Route the numeric columns through pipe_num and target-encode eight
# categorical columns; the two outputs are concatenated by ColumnTransformer.
pipe_all = ColumnTransformer(transformers=[
    ('Numeric', pipe_num, num_cols),
    (
        'Categorical',
        ce.TargetEncoder(drop_invariant=True),
        [
            'Neighborhood', 'SaleType', 'SaleCondition', 'Electrical',
            'Heating', 'Foundation', 'Exterior1st', 'Condition1',
        ],
    ),
])

In [272]:
# Interactive help lookup (`?ce.TargetEncoder`) removed so the notebook runs unattended;
# see the category_encoders documentation for TargetEncoder parameters such as `smoothing`.

In [273]:
# Full model: column-wise preprocessing followed by a random forest.
# random_state is fixed so cross-validation scores and the written
# predictions are reproducible from a fresh kernel.
model_pipe = Pipeline([
    ('Transform', pipe_all),
    ('Model', RandomForestRegressor(n_jobs=-1, random_state=42))
])

In [295]:
# Search space keyed as <pipeline step>__<parameter>; every list currently
# holds a single candidate value.
param_dist = dict(
    Model__n_estimators=[200],
    Model__max_features=[0.6],
    Transform__Categorical__smoothing=[0.6],
)

In [296]:
# 10-fold randomized search over param_dist, scored by negative mean
# squared log error (values closer to zero are better).
grid_RF = RandomizedSearchCV(
    estimator=model_pipe,
    param_distributions=param_dist,
    n_iter=10,
    cv=10,
    scoring='neg_mean_squared_log_error',
    random_state=42,
    verbose=4,
)

In [297]:
# Run the cross-validated search on the full feature frame and target.
grid_RF.fit(X, y)

Fitting 10 folds for each of 1 candidates, totalling 10 fits
[CV 1/10] END Model__max_features=0.6, Model__n_estimators=200, Transform__Categorical__smoothing=0.6;, score=-0.020 total time=   0.7s
[CV 2/10] END Model__max_features=0.6, Model__n_estimators=200, Transform__Categorical__smoothing=0.6;, score=-0.019 total time=   0.7s
[CV 3/10] END Model__max_features=0.6, Model__n_estimators=200, Transform__Categorical__smoothing=0.6;, score=-0.020 total time=   0.7s
[CV 4/10] END Model__max_features=0.6, Model__n_estimators=200, Transform__Categorical__smoothing=0.6;, score=-0.033 total time=   0.8s
[CV 5/10] END Model__max_features=0.6, Model__n_estimators=200, Transform__Categorical__smoothing=0.6;, score=-0.032 total time=   0.8s
[CV 6/10] END Model__max_features=0.6, Model__n_estimators=200, Transform__Categorical__smoothing=0.6;, score=-0.015 total time=   0.9s
[CV 7/10] END Model__max_features=0.6, Model__n_estimators=200, Transform__Categorical__smoothing=0.6;, score=-0.022 total 

RandomizedSearchCV(cv=10,
                   estimator=Pipeline(steps=[('Transform',
                                              ColumnTransformer(transformers=[('Numeric',
                                                                               Pipeline(steps=[('Basic_Pre',
                                                                                                PreProcessor()),
                                                                                               ('Impute_Basic',
                                                                                                SimpleImputer(strategy='median'))]),
                                                                               Index(['LotArea', 'YearBuilt', 'YearRemodAdd', 'BsmtFinSF1', 'BsmtFinSF2',
       'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', '...
                                                                               TargetEncoder(drop_invari

In [305]:
# Show the best mean CV score (negative MSLE) and the parameters that produced it.
grid_RF.best_score_, grid_RF.best_params_

(-0.02266955976236511,
 {'Transform__Categorical__smoothing': 0.6,
  'Model__n_estimators': 200,
  'Model__max_features': 0.6})

In [306]:
# Predict on test.csv with the best pipeline found by the search and save the result as CSV.
write_test(grid_RF.best_estimator_, 'RF_estimator_numeric_target_tuned(N3).csv')