# BI Monthly Challenge - House Price Prediction

#### Import of data
Defining the path and names of the input csv.

In [1]:
import random
import numpy as np
import os
random.seed(42)
np.random.seed(42)

PATH = os.getcwd()

DATA_PATH = '/obj/data/'
MODEL_PATH = '/obj/model/'

train_path = 'input_data/train.csv'
test_path = 'input_data/test.csv'

In [2]:
import pandas as pd
from load_data import load_data

X_train, X_test, y_train, y_test = load_data(PATH + DATA_PATH + train_path)

#X_test = pd.read_csv(PATH + DATA_PATH + test_path)
#submission_index = X_test['Id']

The data set contains 43 categorical features and 37 numerical features. The train data set contains 1168 rows and the test data set contains 292 rows.
The categorical features are ['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition'] and the numerical features are ['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQual

### Exploratory Data Analysis
Get an overview of the data and most important features by plotting a correlation matrix and a scatter plot.

In [3]:
from exploratory_data_analysis.eda import plot_correlation_matrix, plot_scatter_matrix

plot_correlation_matrix(X_train)
#plot_scatter_matrix(X_train, columns=['SalePrice', 'LotArea'])

<Figure size 1000x800 with 2 Axes>

0

### Data preprocessing

#### Replacement of NAs
NA values are replaced with either a 0 (in numerical columns) or 'None' (in categorical columns) or with the median of the column if feasible.

In [4]:
from feature_preprocessing import preprocess_alley, preprocess_LotFrontage, preprocess_MasVnrType, preprocess_BsmtQual, preprocess_BsmtCond, preprocess_BsmtExposure, preprocess_BsmtFinType1, preprocess_BsmtFinType2, preprocess_Electrical, preprocess_FireplaceQu, preprocess_GarageType, preprocess_GarageFinish, preprocess_GarageQual, preprocess_GarageCond, preprocess_PoolQC, preprocess_Fence, preprocess_MiscFeature, preprocess_MSZoning, preprocess_Utilities, preprocess_Exterior1st, preprocess_Exterior2nd, preprocess_KitchenQual, preprocess_Functional, preprocess_SaleType, preprocess_GarageYrBlt, preprocess_MasVnrArea, preprocess_MoSold
import pandas as pd
import numpy as np

X_train, X_test = preprocess_alley(X_train, X_test)
X_train, X_test = preprocess_MSZoning(X_train, X_test)
X_train, X_test = preprocess_LotFrontage(X_train, X_test)
X_train, X_test = preprocess_MasVnrType(X_train, X_test)
X_train, X_test = preprocess_BsmtQual(X_train, X_test)
X_train, X_test = preprocess_BsmtCond(X_train, X_test)
X_train, X_test = preprocess_BsmtExposure(X_train, X_test)
X_train, X_test = preprocess_BsmtFinType1(X_train, X_test)
X_train, X_test = preprocess_BsmtFinType2(X_train, X_test)
X_train, X_test = preprocess_Electrical(X_train, X_test)
X_train, X_test = preprocess_FireplaceQu(X_train, X_test)
X_train, X_test = preprocess_GarageType(X_train, X_test)
X_train, X_test = preprocess_GarageFinish(X_train, X_test)
X_train, X_test = preprocess_GarageQual(X_train, X_test)
X_train, X_test = preprocess_GarageCond(X_train, X_test)
X_train, X_test = preprocess_PoolQC(X_train, X_test)
X_train, X_test = preprocess_Fence(X_train, X_test)
X_train, X_test = preprocess_MiscFeature(X_train, X_test)
X_train, X_test = preprocess_Utilities(X_train, X_test)
X_train, X_test = preprocess_Exterior1st(X_train, X_test)
X_train, X_test = preprocess_Exterior2nd(X_train, X_test)
X_train, X_test = preprocess_KitchenQual(X_train, X_test)
X_train, X_test = preprocess_Functional(X_train, X_test)
X_train, X_test = preprocess_SaleType(X_train, X_test)
X_train, X_test = preprocess_GarageYrBlt(X_train, X_test)
X_train, X_test = preprocess_MasVnrArea(X_train, X_test)
X_train, X_test = preprocess_MoSold(X_train, X_test)

X_train.drop(columns=['Street', 'Alley', 'Utilities'], inplace=True)
X_test.drop(columns=['Street', 'Alley', 'Utilities'], inplace=True)


#### One Hot Encoding
Dummy Coding of categorical columns.

In [5]:
from feature_preprocessing import one_hot_encoding

X_train.fillna(value=0, inplace=True)
X_test.fillna(value=0, inplace=True)
X_train, X_test = one_hot_encoding(X_train, X_test)


#### Scaling
Scaling to a range between 0 and 1 // Mean of 0 and Variance of 1

In [6]:
# SCALING
from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler

scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

X_train = pd.DataFrame(data=X_train_scaled, columns=list(X_train), index=X_train.index)
X_test = pd.DataFrame(data=X_test_scaled, columns=list(X_train), index=X_test.index)


  return self.partial_fit(X, y)
  
  import sys


#### Principal Component Analysis
Dimensionality Reduction

In [7]:
from feature_preprocessing import principal_component_analysis

#X_train, X_test = principal_component_analysis(X_train, X_test, PATH)

In [8]:
from sklearn.decomposition import PCA

pca = PCA(n_components=0.8)

pca.fit(X=X_train)
X_train_decomposed = pca.transform(X_train)
X_test_decomposed = pca.transform(X_test)

X_train = pd.DataFrame(data=X_train_decomposed, index=X_train.index)
X_test = pd.DataFrame(data=X_test_decomposed, index=X_test.index)

print(X_train.shape)

(1168, 19)


### Building models

#### XGBoost

In [9]:
import xgboost as xgb
from model import train_gridsearch_cv
from model import negative_mean_absolute_percentage_error

XGBoostRegressor = xgb.XGBRegressor()
param_grid = {'eta':[0.2, 0.3, 0.5], 
              'max_deth':[3, 5], 
              'learning_rate':[0.01, 0.07], 
              'n_estimators':[1000,],
              'booster':['gbtree',], 
              'min_child_weight':[0.5, 1.0, 2.0], 
              'subsample':[0.7, 1.0],
              'random_state':[42,], 
              'tree_method': ['auto',], 
              'alpha': [2,],
              'gamma': [1,],
              'lambda':[1,], 
              'colsample_bytree': [1,]}
param_grid = {}
model_xgb = train_gridsearch_cv(XGBoostRegressor, X_train, y_train, param_grid)
print(-negative_mean_absolute_percentage_error(estimator=model_xgb, X=X_test, y_true=y_test))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 1 candidates, totalling 3 fits
MSE 932387309.1625006
MAPE: 7.1598547230548535
10.832583533665966


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.4s finished


#### sklearn Random Forest Regressor

In [10]:
from sklearn.ensemble import RandomForestRegressor
from model import train_gridsearch_cv
from model import negative_mean_absolute_percentage_error

param_grid = {'n_estimators': [50, 100],
              'max_features': [None, 'auto'],
              'min_samples_leaf': [2]}

model_rf = train_gridsearch_cv(RandomForestRegressor(), X_train, y_train, param_grid)
print(-negative_mean_absolute_percentage_error(estimator=model_rf, X=X_train, y_true=y_train))

Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:    5.0s finished


MSE 945692249.8547063
MAPE: 4.393567844575243
4.393567844575243


#### Elastic Net

In [11]:
from sklearn.linear_model import ElasticNet
from model import negative_mean_absolute_percentage_error
from model import train_gridsearch_cv

param_grid = {'alpha': [0.5, 1.0, 2.0],
              'fit_intercept': ['True'],
              'normalize': [True, False],
              'positive': [True, False],
              'max_iter': [5000,],
              'tol': [0.0001, 0.00001]}
model_el = train_gridsearch_cv(ElasticNet(), X_train, y_train, param_grid)

Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


MSE 1498292145.889163
MAPE: 12.650870103489186


[Parallel(n_jobs=1)]: Done  72 out of  72 | elapsed:    0.3s finished


#### Neural Network

In [12]:
from sklearn.neural_network import MLPRegressor
from model import train_gridsearch_cv

model_nn = MLPRegressor()

inputs = int(len(list(X_train)))
hidden_layers = (int(0.2*inputs), int(0.3*inputs), int(0.1*inputs))
print("Layer Sizes:", inputs, "->", hidden_layers, "->", 1)

param_grid = {'hidden_layer_sizes': [hidden_layers],
              'activation': ['relu'],
              'alpha': [.001, 0.0001],
              'solver': ['adam', 'lbfgs'],
              'batch_size': [32, 128],
              'learning_rate': ['invscaling',],
              'learning_rate_init':[0.001, 0.01],
              'max_iter': [7000,],
              'early_stopping': [True,],
              'n_iter_no_change': [15,],
              'random_state':[42]}

model_nn = train_gridsearch_cv(model_nn, X_train, y_train, param_grid)
print(-negative_mean_absolute_percentage_error(estimator=model_nn, X=X_test, y_true=y_test))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Layer Sizes: 19 -> (3, 5, 1) -> 1
Fitting 3 folds for each of 16 candidates, totalling 48 fits




KeyboardInterrupt: 

#### Save model

In [None]:
from pickle import dump, load
#from sklearn.joblib import dump, load

model = XGBoostRegressor
dump(model, PATH+MODEL_PATH+str(model)+'.joblib')

model = model_nn
dump(model, PATH+MODEL_PATH+str(model)+'.joblib')

### Prediction
Predicting results and combining regressors.

In [None]:
prediction_xgboost = model_xgb.predict(X_test)
prediction_rf = model_rf.predict(X_test)
prediction_el = model_el.predict(X_test)
prediction_nn = model_nn.predict(X_test)

prediction = pd.DataFrame(data=prediction_xgboost,
                          columns=['XGBoost'],
                          index=submission_index)
#prediction['RandomForest'] = prediction_rf
#prediction['ElasticNet'] = prediction_el
#prediction['NeuralNet'] = prediction_nn
prediction['SalePrice'] = prediction.mean(axis=1)
prediction = prediction['SalePrice']

In [None]:
prediction = pd.DataFrame(data=prediction, columns=['SalePrice'], index=submission_index)
prediction.to_csv(PATH + DATA_PATH + 'prediction_xgb.csv')