In [154]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tensorflow import keras
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin

In [155]:
# Read both CSV files from the working directory: train.csv becomes df_train
# (it still contains the SalePrice column) and test.csv becomes X_test.
df_train, X_test = map(pd.read_csv, ('train.csv', 'test.csv'))

In [6]:
# Separate the regression target (SalePrice) from the predictor columns.
y_train = df_train['SalePrice']
X_train = df_train.drop(columns='SalePrice')

# 1- Basic Preprocessing

Basic preprocessing transformer that performs feature engineering and removes unimportant variables.

In [7]:
# Keep the names of every column whose dtype is not `object`.
num_cols = X_train.columns[X_train.dtypes.ne('object')]

In [8]:
# Display the selected non-object column names.
num_cols

Index(['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual',
       'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd',
       'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF',
       'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea',
       'MiscVal', 'MoSold', 'YrSold'],
      dtype='object')

## 1.1 Numeric Variables

* **GrLivArea = 1stFlrSF + 2ndFlrSF + LowQualFinSF, so one variable is redundant. Drop 2ndFlrSF. Also drop LowQualFinSF, because it has few non-zero values**

In [40]:
# Share of rows where GrLivArea equals 1stFlrSF + 2ndFlrSF + LowQualFinSF.
X_train['GrLivArea'].eq(
    X_train['1stFlrSF'].add(X_train['2ndFlrSF']).add(X_train['LowQualFinSF'])
).mean()

1.0

In [107]:
# Share of rows with a strictly positive LowQualFinSF.
X_train['LowQualFinSF'].gt(0).mean()

0.01780821917808219

* **TotalBsmtSF = BsmtFinSF1 + BsmtFinSF2 + BsmtUnfSF. Redundancy, drop BsmtFinSF2**

In [41]:
# Share of rows where TotalBsmtSF equals BsmtFinSF1 + BsmtFinSF2 + BsmtUnfSF.
X_train['TotalBsmtSF'].eq(
    X_train['BsmtFinSF1'].add(X_train['BsmtFinSF2']).add(X_train['BsmtUnfSF'])
).mean()

1.0

* **BsmtHalfBath has almost no variance. Most of it is zero. Can be removed**

In [45]:
# Share of training rows with a strictly positive BsmtHalfBath.
np.mean(X_train['BsmtHalfBath'] > 0)

0.056164383561643834

* **MiscVal should be removed: its number represents different things. Too hard to deal with at this moment**

In [58]:
# Summarise how many rows have a strictly positive MiscVal.
X_train['MiscVal'].gt(0).describe()

count      1460
unique        2
top       False
freq       1408
Name: MiscVal, dtype: object

* **LotFrontage has missing values, which probably mean some houses have no lot frontage. Impute with zeros**

In [64]:
# Number of rows with a missing LotFrontage (cast to a plain Python int).
int(X_train['LotFrontage'].isna().sum())

259

* **GarageYrBlt has missing values (when there is no garage). Impute with zero**

In [70]:
# Number of rows with a missing GarageYrBlt (cast to a plain Python int).
int(X_train['GarageYrBlt'].isna().sum())

81

In [85]:
# Share of rows whose GarageYrBlt equals either YearBuilt or YearRemodAdd.
(
    X_train['GarageYrBlt'].eq(X_train['YearBuilt'])
    | X_train['GarageYrBlt'].eq(X_train['YearRemodAdd'])
).mean()

0.7828767123287671

* **MoSold doesn't add much value to the model and would only add noise. The month a house is sold doesn't have much impact on its price. Drop**

* **Most values of Pool Area are 0 (Means the house doesn't have a pool). Drop**

In [94]:
# (share, count) of rows with a strictly positive PoolArea.
(X_train['PoolArea'].gt(0).mean(), X_train['PoolArea'].gt(0).sum())

(0.004794520547945206, 7)

In [239]:
class Preprocessing_Numeric(BaseEstimator, TransformerMixin):
    """Stateless transformer that prunes and engineers the numeric columns.

    Parameters
    ----------
    age : bool, default=True
        If True, replace ``YrSold``, ``YearBuilt`` and ``YearRemodAdd`` with a
        single ``Age`` column: the mean of the house age and the remodel age
        at the time of sale.
    grl : bool, default=False
        If True, replace ``1stFlrSF`` with ``1stFlr_ratio``
        (``1stFlrSF / GrLivArea``).
    """

    def __init__(self, age=True, grl=False):
        self.age = age
        self.grl = grl

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X, y=None):
        """Return a transformed copy of ``X``; ``X`` itself is not modified.

        ``y`` is accepted but ignored. A ``KeyError`` is raised if any of the
        columns that are dropped, imputed or combined below is missing.
        """
        # Use Id as the index. When Id is not a column (e.g. it already is the
        # index) there is nothing to move, so just work on a copy.
        Xt = X.set_index('Id') if 'Id' in X.columns else X.copy()

        dropped_columns = [
            # GrLivArea = 1stFlrSF + 2ndFlrSF + LowQualFinSF, so one term is redundant.
            '2ndFlrSF',
            # Few non-zero values.
            'LowQualFinSF',
            # TotalBsmtSF = BsmtFinSF1 + BsmtFinSF2 + BsmtUnfSF, so one term is redundant.
            'BsmtFinSF2',
            # Almost no variance: most values are zero.
            'BsmtHalfBath',
            # Its number represents different things; too hard to deal with for now.
            'MiscVal',
            # The month of sale is expected to add noise rather than signal.
            'MoSold',
            # Most values are 0 (the house has no pool).
            'PoolArea',
        ]
        Xt = Xt.drop(columns=dropped_columns)

        # Missing LotFrontage is taken to mean "no lot frontage" and missing
        # GarageYrBlt to mean "no garage"; both are imputed with zero.
        for column in ('LotFrontage', 'GarageYrBlt'):
            Xt[column] = Xt[column].fillna(0)

        # Feature engineering options
        if self.age:
            # Mean of the house age and the remodel age at the time of sale.
            Xt['Age'] = (2*Xt['YrSold'] - Xt['YearBuilt'] - Xt['YearRemodAdd'])/2
            Xt = Xt.drop(columns=['YrSold', 'YearBuilt', 'YearRemodAdd'])
        if self.grl:
            # Multicollinearity between GrLivArea and 1stFlrSF:
            # keep GrLivArea and the ratio 1stFlrSF/GrLivArea instead.
            Xt['1stFlr_ratio'] = Xt['1stFlrSF']/Xt['GrLivArea']
            Xt = Xt.drop(columns=['1stFlrSF'])
        return Xt

## 1.2 Ordinal Variables

## 1.3 Nominal Variables

# 2 - Full preprocessing Pipeline

Includes feature engineering, imputation, and scaling.

In [240]:
# Steps run in the order listed: numeric preprocessing first, then standardisation.
# TODO: the following steps are not wired in yet:
#   ('Preprocessing_Ordinal', Preprocessing_Ordinal()),
#   ('Preprocessing_Nominal', Preprocessing_Nominal()),
#   ('Imputation', SimpleImputer(strategy='constant', fill_value=0)),
pipe_preprocessing = Pipeline(steps=[
    ('Preprocessing_Numeric', Preprocessing_Numeric()),
    ('Scaling', StandardScaler()),
])

# 3 - Creating a generic Sequential Neural Network Regressor with Keras

### 3.1 Creating the function to generate an RNA (artificial neural network) with given parameters

In [237]:
def Sequential_RNA_Regressor(n_hidden=2, n_neurons_hidden=30,
                             activation_hidden='relu', weights_method='he_normal',
                             loss='mse',
                             lr=1e-3):
    """Build and compile a fully connected Keras ``Sequential`` regressor.

    Parameters
    ----------
    n_hidden : int
        Number of hidden layers.
    n_neurons_hidden : int or sequence of int
        Number of neurons per hidden layer. An integer (Python ``int`` or a
        NumPy integer scalar) is broadcast to every hidden layer. Otherwise
        the value is indexed once per layer, so it must hold at least
        ``n_hidden`` entries; entries beyond ``n_hidden`` are not used.
    activation_hidden : str
        Activation function for the hidden layers.
    weights_method : str
        Kernel initializer for the hidden layers. The default ``he_normal``
        is good for ReLU and variants; for SELU it must be ``lecun_normal``.
    loss : str
        Loss function to be minimized.
    lr : float
        Learning rate for the ``keras.optimizers.SGD`` optimizer.

    Returns
    -------
    The compiled model: ``n_hidden`` dense hidden layers followed by a
    single-neuron output layer with no activation function.
    """
    # Broadcast a scalar width to every hidden layer. NumPy integers are
    # accepted too, because they are not instances of the built-in ``int``.
    if isinstance(n_neurons_hidden, (int, np.integer)):
        n_neurons_hidden = [int(n_neurons_hidden) for _ in range(n_hidden)]

    model = keras.models.Sequential()

    # No input layer is declared, so Keras has to infer the input shape at the
    # start of training; the shape depends on the preprocessing pipeline
    # hyperparameters that were chosen.
    for i in range(n_hidden):
        model.add(keras.layers.Dense(n_neurons_hidden[i],
                                     activation=activation_hidden,
                                     kernel_initializer=weights_method,
                                     name='Hidden_{}'.format(i + 1)))

    # Output layer. Regressor, so just one neuron with no activation function.
    model.add(keras.layers.Dense(1, name='Output_Layer'))

    model.compile(loss=loss,
                  optimizer=keras.optimizers.SGD(learning_rate=lr))
    return model

### 3.2 Wrapping the Keras model as a Scikit-Learn regressor to make it compatible with hyperparameter tuning

In [247]:
# Scikit-Learn-compatible regressor whose network is built by Sequential_RNA_Regressor.
# NOTE(review): this cell's output echoes the source line, which looks like a warning;
# keras.wrappers.scikit_learn was deprecated in later TensorFlow releases in favour of
# SciKeras - confirm against the installed version before relying on it.
rna_sk_regressor = keras.wrappers.scikit_learn.KerasRegressor(
    build_fn=Sequential_RNA_Regressor
)

  rna_sk_regressor = keras.wrappers.scikit_learn.KerasRegressor(Sequential_RNA_Regressor)
