# BI Monthly Challenge - House Price Prediction

#### Import of data
Define the base path, the data and model directories, and the file names of the input CSV files.

In [1]:
import random
import numpy as np
import os
# --- Reproducibility -------------------------------------------------------
# Seed both random number generators used in this notebook (Python's
# `random` module and NumPy's global generator) with the same fixed value.
np.random.seed(42)
random.seed(42)

# --- Locations -------------------------------------------------------------
# The base directory is the current working directory. The sub-directory
# constants start with a slash because they are concatenated directly onto
# PATH (PATH + DATA_PATH + <file>).
PATH = os.getcwd()
DATA_PATH, MODEL_PATH = '/obj/data/', '/obj/model/'

# Input CSV files, relative to PATH + DATA_PATH.
train_path, pred_path = 'input_data/train.csv', 'input_data/test.csv'

In [2]:
import pandas as pd
from load_data import load_data

# load_data reads the training CSV and returns the train/test features and
# targets as four separate objects.
X_train, X_test, y_train, y_test = load_data(''.join((PATH, DATA_PATH, train_path)))

# The prediction set is read as-is; its 'Id' column is kept separately so it
# can be used later as the index of the prediction output.
X_pred = pd.read_csv(''.join((PATH, DATA_PATH, pred_path)))
submission_index = X_pred['Id']

The data set contains 43 categorical features and 37 numerical features. The train data set contains 1168 rows and the test data set contains 292 rows.
The categorical features are ['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition'] and the numerical features are ['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQual

### Exploratory Data Analysis
Get an overview of the data and most important features by plotting a correlation matrix and a scatter plot.

In [3]:
from exploratory_data_analysis.eda import plot_correlation_matrix, plot_scatter_matrix

# Plot the correlation matrix of the training features.
plot_correlation_matrix(X_train)
# Optional scatter-matrix plot, currently disabled.
# NOTE(review): 'SalePrice' is requested here, but the target appears to live
# in y_train rather than X_train -- confirm before re-enabling.
#plot_scatter_matrix(X_train, columns=['SalePrice', 'LotArea'])

<Figure size 1000x800 with 2 Axes>

0

### Data preprocessing

#### Replacement of NAs
NA values are replaced with 0 (numerical columns), with 'None' (categorical columns), or, where feasible, with the median of the column.

In [4]:
from feature_preprocessing import preprocess_alley, preprocess_LotFrontage, preprocess_MasVnrType, preprocess_BsmtQual, preprocess_BsmtCond, preprocess_BsmtExposure, preprocess_BsmtFinType1, preprocess_BsmtFinType2, preprocess_Electrical, preprocess_FireplaceQu, preprocess_GarageType, preprocess_GarageFinish, preprocess_GarageQual, preprocess_GarageCond, preprocess_PoolQC, preprocess_Fence, preprocess_MiscFeature, preprocess_MSZoning, preprocess_Utilities, preprocess_Exterior1st, preprocess_Exterior2nd, preprocess_KitchenQual, preprocess_Functional, preprocess_SaleType, preprocess_GarageYrBlt, preprocess_MasVnrArea, preprocess_MoSold, calculate_TotalSF, preprocess_YrSold
import pandas as pd
import numpy as np

# Preprocessing steps, applied strictly in the order listed. Every step
# receives the three frames (train, test, prediction) and returns them again.
preprocessing_steps = (
    preprocess_alley,
    preprocess_MSZoning,
    preprocess_LotFrontage,
    preprocess_MasVnrType,
    preprocess_BsmtQual,
    preprocess_BsmtCond,
    preprocess_BsmtExposure,
    preprocess_BsmtFinType1,
    preprocess_BsmtFinType2,
    preprocess_Electrical,
    preprocess_FireplaceQu,
    preprocess_GarageType,
    preprocess_GarageFinish,
    preprocess_GarageQual,
    preprocess_GarageCond,
    preprocess_PoolQC,
    preprocess_Fence,
    preprocess_MiscFeature,
    preprocess_Utilities,
    preprocess_Exterior1st,
    preprocess_Exterior2nd,
    preprocess_KitchenQual,
    preprocess_Functional,
    preprocess_SaleType,
    preprocess_GarageYrBlt,
    preprocess_MasVnrArea,
    preprocess_MoSold,
    preprocess_YrSold,
    calculate_TotalSF,
)
for apply_step in preprocessing_steps:
    X_train, X_test, X_pred = apply_step(X_train, X_test, X_pred)

# Drop the same three columns from every frame, in place.
for frame in (X_train, X_test, X_pred):
    frame.drop(columns=['Street', 'Alley', 'Utilities'], inplace=True)


#### One Hot Encoding
Dummy Coding of categorical columns.

In [5]:
from feature_preprocessing import one_hot_encoding

# Replace every NA that is still present after preprocessing with 0,
# in place on each of the three frames.
for frame in (X_train, X_test, X_pred):
    frame.fillna(value=0, inplace=True)

# Encode the three frames together with the project's one_hot_encoding helper.
X_train, X_test, X_pred = one_hot_encoding(X_train, X_test, X_pred)


#### Scaling
Scale the features either to the range [0, 1] (min-max scaling) or to zero mean and unit variance (standardization). This step is currently disabled.

In [6]:
from feature_preprocessing import normalize_features, normalize_targets
#X_train, X_test = normalize_features(X_train, X_test)

#### Principal Component Analysis
Dimensionality reduction of the feature matrix via principal component analysis. This step is currently disabled.

In [7]:
from feature_preprocessing import principal_component_analysis

#X_train, X_test = principal_component_analysis(X_train, X_test, PATH)

In [8]:
from sklearn.decomposition import PCA

# Inline PCA variant, currently disabled. If re-enabled, it fits a PCA with
# n_components=0.8 on X_train and replaces X_train / X_test with the
# transformed data (keeping the original row indices).
#pca = PCA(n_components=0.8)

#pca.fit(X=X_train)
#X_train_decomposed = pca.transform(X_train)
#X_test_decomposed = pca.transform(X_test)

#X_train = pd.DataFrame(data=X_train_decomposed, index=X_train.index)
#X_test = pd.DataFrame(data=X_test_decomposed, index=X_test.index)

# Show the (rows, columns) shape of the feature matrix that goes into the models.
print(X_train.shape)

(1168, 37)


### Building models

#### XGBoost

In [9]:
import xgboost as xgb
from model import train_gridsearch_cv
from model import negative_mean_absolute_percentage_error

# Base (unfitted) XGBoost regressor that is handed to the grid search.
XGBoostRegressor = xgb.XGBRegressor()

# Full hyper-parameter grid for XGBoost.
# Fixed: the key was misspelled 'max_deth', so the tree depth would never have
# been tuned; the parameter is called 'max_depth'.
# NOTE(review): in XGBoost 'eta' is an alias of 'learning_rate'; searching over
# both at once is ambiguous -- consider keeping only one of the two.
param_grid = {'eta':[0.2, 0.3, 0.5], 
              'max_depth':[3, 5], 
              'learning_rate':[0.01, 0.07], 
              'n_estimators':[1000,],
              'booster':['gbtree',], 
              'min_child_weight':[0.5, 1.0, 2.0], 
              'subsample':[0.7, 1.0],
              'random_state':[42,], 
              'tree_method': ['auto',], 
              'alpha': [2,],
              'gamma': [1,],
              'lambda':[1,], 
              'colsample_bytree': [1,]}

# The full grid above is deliberately overridden with an empty grid, so the
# search evaluates a single candidate with the estimator's default parameters.
# Remove this line to run the full search.
param_grid = {}
model_xgb = train_gridsearch_cv(XGBoostRegressor, X_train, y_train, param_grid)

# The scorer returns the negated error, so flip the sign before printing.
print(-negative_mean_absolute_percentage_error(estimator=model_xgb, X=X_test, y_true=y_test))

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.9s finished


MSE 893373418.7976304
10.312361442559633


#### sklearn Gradient Boosting Regressor

In [10]:
from sklearn.ensemble import GradientBoostingRegressor
from model import train_gridsearch_cv
from model import negative_mean_absolute_percentage_error

# Search space for the scikit-learn gradient boosting regressor: only the
# number of estimators is varied, every other parameter is pinned.
param_grid = dict(
    n_estimators=[1000, 2000],
    learning_rate=[0.05],
    max_depth=[5],
    max_features=['sqrt'],
    min_samples_leaf=[10],
    min_samples_split=[5],
    loss=['huber'],
)

model_gb = train_gridsearch_cv(GradientBoostingRegressor(), X_train, y_train, param_grid)

# The scorer returns the negated error, so flip the sign before printing.
print(-negative_mean_absolute_percentage_error(
    y_true=y_test,
    X=X_test,
    estimator=model_gb,
))

Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   31.2s finished


MSE 826394824.6326499
9.799971113832003


#### sklearn Random Forest Regressor

In [11]:
from sklearn.ensemble import RandomForestRegressor
from model import train_gridsearch_cv
from model import negative_mean_absolute_percentage_error

# Search space for the random forest regressor.
# Fixed: 'max_features' used to be [None, 'auto']. For RandomForestRegressor
# 'auto' means "use all features", i.e. exactly the same as None, so half of
# the candidates were duplicates; 'auto' has also been removed in newer
# scikit-learn releases. 'sqrt' is used instead so that the grid compares two
# genuinely different settings.
param_grid = {'n_estimators': [50, 100],
              'max_features': [None, 'sqrt'],
              'min_samples_leaf': [2]}

model_rf = train_gridsearch_cv(RandomForestRegressor(), X_train, y_train, param_grid)

# The scorer returns the negated error, so flip the sign before printing.
print(-negative_mean_absolute_percentage_error(estimator=model_rf, X=X_test, y_true=y_test))

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    9.0s finished


MSE 917463152.0080231
10.87995937623422


#### Elastic Net

In [18]:
from sklearn.linear_model import ElasticNet
from model import negative_mean_absolute_percentage_error
from model import train_gridsearch_cv

# Search space for the elastic net.
# Fixed: 'fit_intercept' was given as the string 'True'. That only behaved
# like True because a non-empty string is truthy; the parameter is a boolean
# and stricter parameter validation rejects a string.
# NOTE(review): 'normalize' is only accepted by older scikit-learn releases;
# it needs to be dropped when upgrading.
param_grid = {'alpha': [1.0, 2.0],
              'fit_intercept': [True],
              'normalize': [True, False],
              'positive': [True, False],
              'max_iter': [20000,],
              'tol': [0.0001, 0.00001]}
model_el = train_gridsearch_cv(ElasticNet(), X_train, y_train, param_grid)

# The scorer returns the negated error, so flip the sign before printing.
print(-negative_mean_absolute_percentage_error(estimator=model_el, X=X_test, y_true=y_test))

Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed:   11.0s finished


MSE 1557528422.2164705
13.14901089920045


#### Neural Network

In [13]:
from sklearn.neural_network import MLPRegressor
from model import train_gridsearch_cv

# Unfitted multi-layer perceptron; the grid search below is currently disabled.
model_nn = MLPRegressor()

# Derive the three hidden-layer widths from the number of input features:
# 20 %, 30 % and 10 % of the input width, truncated to integers.
inputs = len(X_train.columns)
hidden_layers = tuple(int(share * inputs) for share in (0.2, 0.3, 0.1))
print("Layer Sizes:", inputs, "->", hidden_layers, "->", 1)

# Search space for the MLP (only used once the lines at the bottom are enabled).
param_grid = dict(
    hidden_layer_sizes=[hidden_layers],
    activation=['relu'],
    alpha=[.001, 0.0001],
    solver=['adam', 'lbfgs'],
    batch_size=[32, 128],
    learning_rate=['invscaling'],
    learning_rate_init=[0.001, 0.01],
    max_iter=[7000],
    early_stopping=[True],
    n_iter_no_change=[15],
    random_state=[42],
)

#model_nn = train_gridsearch_cv(model_nn, X_train, y_train, param_grid)
#print(-negative_mean_absolute_percentage_error(estimator=model_nn, X=X_test, y_true=y_test))

Layer Sizes: 37 -> (7, 11, 3) -> 1


#### Save model

In [14]:
from sklearn.externals import joblib

# Make sure the target directory exists so that joblib.dump cannot fail on a
# fresh checkout.
os.makedirs(PATH + MODEL_PATH, exist_ok=True)

# Fixed: this used to dump `XGBoostRegressor`, the base estimator that was
# passed into the grid search, instead of `model_xgb`, the trained model that
# is used for evaluation and prediction.
model = model_xgb
joblib.dump(model, PATH+MODEL_PATH+'XGBoostRegressor'+'.joblib')

#model = model_nn
#joblib.dump(model, PATH+MODEL_PATH+'NeuralNetwork'+'.joblib')

model = model_rf
joblib.dump(model, PATH+MODEL_PATH+'RandomForestRegressor'+'.joblib')

model = model_el
joblib.dump(model, PATH+MODEL_PATH+'ElasticNet'+'.joblib')


['/Users/tobias/PycharmProjects/house-price-prediction/obj/model/ElasticNet.joblib']

### Prediction
Predicting results and combining regressors.

In [15]:
# Per-model predictions for the prediction set. The random forest prediction
# is computed but is not part of the ensemble below.
prediction_xgboost = model_xgb.predict(X_pred)
prediction_rf = model_rf.predict(X_pred)
#prediction_el = model_el.predict(X_pred)
#prediction_nn = model_nn.predict(X_pred)
prediction_gb = model_gb.predict(X_pred)

# Ensemble: the final 'SalePrice' is the row-wise mean of the XGBoost and the
# gradient boosting predictions, indexed by the ids of the prediction set.
ensemble_members = {
    'XGBoost': prediction_xgboost,
    #'RandomForest': prediction_rf,
    #'ElasticNet': prediction_el,
    #'NeuralNet': prediction_nn,
    'GradientBoost': prediction_gb,
}
prediction = (
    pd.DataFrame(ensemble_members, index=submission_index)
    .mean(axis=1)
    .rename('SalePrice')
)

In [16]:
# Wrap the averaged predictions in a single-column frame indexed by the ids of
# the prediction set.
prediction = pd.DataFrame(data=prediction, columns=['SalePrice'], index=submission_index)

# Create the output directory if it is missing; to_csv does not create
# directories, so the notebook would otherwise fail at its very last step.
output_dir = PATH + DATA_PATH + 'predictions/'
os.makedirs(output_dir, exist_ok=True)
prediction.to_csv(output_dir + 'prediction.csv')