# BI Monthly Challenge - House Price Prediction

#### Import of data
Define the base paths and the file names of the input CSV files.

In [11]:
# Notebook-wide setup: fix the RNG seeds and declare the path fragments used
# to locate the input data.
import os
import random

import numpy as np

# Seed both Python's and NumPy's global generators so reruns are repeatable.
random.seed(42)
np.random.seed(42)

# Working directory of the kernel. The fragments below are appended to it by
# plain string concatenation, so their leading/trailing slashes matter.
PATH = os.getcwd()

DATA_PATH = '/obj/data/'
MODEL_PATH = '/obj/model'

# CSV file locations, appended after DATA_PATH when a file is loaded.
train_path = 'input_data/train.csv'
test_path = 'input_data/test.csv'

In [12]:
import pandas as pd
from load_data import load_data

# Load the training CSV. load_data hands back four objects from this single
# file, so the train/test split presumably happens inside it (verify there).
X_train, X_test, y_train, y_test = load_data(f"{PATH}{DATA_PATH}{train_path}")

# Disabled: reading the separate test CSV and keeping its 'Id' column.
#X_test = pd.read_csv(PATH + DATA_PATH + test_path)
#submission_index = X_test['Id']

The data set contains 43 categorical features and 37 numerical features. The train data set contains 1168 rows and the test data set contains 292 rows.
The categorical features are ['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition'] and the numerical features are ['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQual

### Exploratory Data Analysis
Get an overview of the data and most important features by plotting a correlation matrix and a scatter plot.

In [6]:
from exploratory_data_analysis.eda import plot_correlation_matrix, plot_scatter_matrix

# Disabled: scatter matrix for a hand-picked pair of columns.
#plot_scatter_matrix(X_train, columns=['SalePrice', 'LotArea'])

# Plot the correlation matrix of the training frame. The call is kept as the
# last statement with a trailing semicolon so IPython does not echo the
# helper's return value (presumably the stray `0` seen under the figure).
plot_correlation_matrix(X_train);

<Figure size 1000x800 with 2 Axes>

0

### Data ranges of each individual feature

#### Categorical Features 

In [15]:
# For every object-dtype column, print its name followed by the frequency of
# each value; missing values are counted as their own level.
for feature, column in X_train.select_dtypes(include=['object']).items():
    print(feature)
    print(column.value_counts(dropna=False))

MSZoning
RL         924
RM         172
FV          53
RH          15
C (all)      4
Name: MSZoning, dtype: int64
Street
Pave    1164
Grvl       4
Name: Street, dtype: int64
Alley
NaN     1094
Grvl      44
Pave      30
Name: Alley, dtype: int64
LotShape
Reg    729
IR1    394
IR2     37
IR3      8
Name: LotShape, dtype: int64
LandContour
Lvl    1059
Bnk      48
HLS      35
Low      26
Name: LandContour, dtype: int64
Utilities
AllPub    1167
NoSeWa       1
Name: Utilities, dtype: int64
LotConfig
Inside     822
Corner     221
CulDSac     84
FR2         38
FR3          3
Name: LotConfig, dtype: int64
LandSlope
Gtl    1108
Mod      51
Sev       9
Name: LandSlope, dtype: int64
Neighborhood
NAmes      181
CollgCr    115
OldTown     91
Edwards     87
Somerst     69
NWAmes      66
Gilbert     65
NridgHt     61
Sawyer      58
BrkSide     45
Crawfor     44
SawyerW     44
Mitchel     40
NoRidge     33
Timber      28
IDOTRR      26
SWISU       21
StoneBr     20
ClearCr     19
Blmngtn     15
BrDale  

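The feature "Street" takes the value Pave in 1164 of the 1168 training rows.
The feature "Alley" is NaN in 1094 of the 1168 training rows.
The feature "Utilities" takes the value AllPub in 1167 of the 1168 training rows.
These three features are therefore almost constant in the training data.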

#### Numerical features

In [31]:
# Select the integer and float columns once and reuse that frame for both
# summaries below.
numeric_frame = X_train.select_dtypes(include=['integer', 'float'])
numerical_features = numeric_frame.columns.tolist()

# Descriptive statistics per numerical column.
print(numeric_frame.describe())
# Per column: True if it contains at least one missing value.
print(numeric_frame.isnull().any())

                Id   MSSubClass  LotFrontage        LotArea  OverallQual  \
count  1168.000000  1168.000000   951.000000    1168.000000  1168.000000   
mean    730.904966    56.849315    70.343849   10689.642123     6.121575   
std     425.369088    42.531862    24.897021   10759.366198     1.367619   
min       1.000000    20.000000    21.000000    1300.000000     1.000000   
25%     360.750000    20.000000    59.000000    7587.250000     5.000000   
50%     732.500000    50.000000    70.000000    9600.000000     6.000000   
75%    1101.750000    70.000000    80.000000   11700.000000     7.000000   
max    1460.000000   190.000000   313.000000  215245.000000    10.000000   

       OverallCond    YearBuilt  YearRemodAdd   MasVnrArea   BsmtFinSF1  ...  \
count  1168.000000  1168.000000   1168.000000  1162.000000  1168.000000  ...   
mean      5.584760  1970.965753   1984.897260   103.771945   446.023973  ...   
std       1.116062    30.675495     20.733955   173.032238   459.070977  ..

###### First Step:
Treat YrSold and MoSold (and possibly YearBuilt) as categorical features by casting them to str.
###### Second Step:
OverallQual and OverallCond as categorical features.
###### Third Step:
Encode YearRemodAdd either as a binary `is_remod` flag or as its difference to YearBuilt.

### Data preprocessing

#### Replacement of NAs
NA values are replaced with 0 (in numerical columns), with 'None' (in categorical columns), or, where feasible, with the median of the column.

In [3]:
from feature_preprocessing import (
    preprocess_alley,
    preprocess_LotFrontage,
    preprocess_MasVnrType,
    preprocess_BsmtQual,
    preprocess_BsmtCond,
    preprocess_BsmtExposure,
    preprocess_BsmtFinType1,
    preprocess_BsmtFinType2,
    preprocess_Electrical,
    preprocess_FireplaceQu,
    preprocess_GarageType,
    preprocess_GarageFinish,
    preprocess_GarageQual,
    preprocess_GarageCond,
    preprocess_PoolQC,
    preprocess_Fence,
    preprocess_MiscFeature,
    preprocess_MSZoning,
    preprocess_Utilities,
    preprocess_Exterior1st,
    preprocess_Exterior2nd,
    preprocess_KitchenQual,
    preprocess_Functional,
    preprocess_SaleType,
)
import pandas as pd
import numpy as np

# Per-feature preprocessing helpers in the order they are applied. Each one
# takes the train and test frames and returns the updated pair; what exactly
# a helper fills in is defined in feature_preprocessing, not here.
NA_PREPROCESSING_STEPS = (
    preprocess_alley,
    preprocess_MSZoning,
    preprocess_LotFrontage,
    preprocess_MasVnrType,
    preprocess_BsmtQual,
    preprocess_BsmtCond,
    preprocess_BsmtExposure,
    preprocess_BsmtFinType1,
    preprocess_BsmtFinType2,
    preprocess_Electrical,
    preprocess_FireplaceQu,
    preprocess_GarageType,
    preprocess_GarageFinish,
    preprocess_GarageQual,
    preprocess_GarageCond,
    preprocess_PoolQC,
    preprocess_Fence,
    preprocess_MiscFeature,
    preprocess_Utilities,
    preprocess_Exterior1st,
    preprocess_Exterior2nd,
    preprocess_KitchenQual,
    preprocess_Functional,
    preprocess_SaleType,
)

# Thread both frames through every helper, rebinding them after each step.
# Requires X_train / X_test from the data-loading cell to exist already.
for preprocess_step in NA_PREPROCESSING_STEPS:
    X_train, X_test = preprocess_step(X_train, X_test)

NameError: name 'X_train' is not defined