# "House Prices"
> "Predict the sales price for each house"
- toc: false
- branch: master
- badges: true
- comments: true
- categories: [fastpages, jupyter]
- image: images/some_folder/your_image.png
- hide: false
- search_exclude: true
- metadata_key1: metadata_value1
- metadata_key2: metadata_value2

Import Libraries

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics

Read datasets into data frames

In [4]:
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')
df = pd.concat([train,test],keys=['train','test'],sort=False)

### Data Cleaning

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 2919 entries, (train, 0) to (test, 1458)
Data columns (total 81 columns):
Id               2919 non-null int64
MSSubClass       2919 non-null int64
MSZoning         2915 non-null object
LotFrontage      2433 non-null float64
LotArea          2919 non-null int64
Street           2919 non-null object
Alley            198 non-null object
LotShape         2919 non-null object
LandContour      2919 non-null object
Utilities        2917 non-null object
LotConfig        2919 non-null object
LandSlope        2919 non-null object
Neighborhood     2919 non-null object
Condition1       2919 non-null object
Condition2       2919 non-null object
BldgType         2919 non-null object
HouseStyle       2919 non-null object
OverallQual      2919 non-null int64
OverallCond      2919 non-null int64
YearBuilt        2919 non-null int64
YearRemodAdd     2919 non-null int64
RoofStyle        2919 non-null object
RoofMatl         2919 non-null object
Exterior1

In [10]:
df.isnull().sum()/df.isnull().count()

Id               0.000000
MSSubClass       0.000000
MSZoning         0.001370
LotFrontage      0.166495
LotArea          0.000000
Street           0.000000
Alley            0.932169
LotShape         0.000000
LandContour      0.000000
Utilities        0.000685
LotConfig        0.000000
LandSlope        0.000000
Neighborhood     0.000000
Condition1       0.000000
Condition2       0.000000
BldgType         0.000000
HouseStyle       0.000000
OverallQual      0.000000
OverallCond      0.000000
YearBuilt        0.000000
YearRemodAdd     0.000000
RoofStyle        0.000000
RoofMatl         0.000000
Exterior1st      0.000343
Exterior2nd      0.000343
MasVnrType       0.008222
MasVnrArea       0.007879
ExterQual        0.000000
ExterCond        0.000000
Foundation       0.000000
                   ...   
BedroomAbvGr     0.000000
KitchenAbvGr     0.000000
KitchenQual      0.000343
TotRmsAbvGrd     0.000000
Functional       0.000685
Fireplaces       0.000000
FireplaceQu      0.486468
GarageType  

#### Numerical Features

In [13]:
df_num = df.select_dtypes('number')

In [14]:
df_num.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 2919 entries, (train, 0) to (test, 1458)
Data columns (total 38 columns):
Id               2919 non-null int64
MSSubClass       2919 non-null int64
LotFrontage      2433 non-null float64
LotArea          2919 non-null int64
OverallQual      2919 non-null int64
OverallCond      2919 non-null int64
YearBuilt        2919 non-null int64
YearRemodAdd     2919 non-null int64
MasVnrArea       2896 non-null float64
BsmtFinSF1       2918 non-null float64
BsmtFinSF2       2918 non-null float64
BsmtUnfSF        2918 non-null float64
TotalBsmtSF      2918 non-null float64
1stFlrSF         2919 non-null int64
2ndFlrSF         2919 non-null int64
LowQualFinSF     2919 non-null int64
GrLivArea        2919 non-null int64
BsmtFullBath     2917 non-null float64
BsmtHalfBath     2917 non-null float64
FullBath         2919 non-null int64
HalfBath         2919 non-null int64
BedroomAbvGr     2919 non-null int64
KitchenAbvGr     2919 non-null int64
TotRmsAbv

In [15]:
df_num = df_num.fillna(df.mean())

In [16]:
df_num.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 2919 entries, (train, 0) to (test, 1458)
Data columns (total 38 columns):
Id               2919 non-null int64
MSSubClass       2919 non-null int64
LotFrontage      2919 non-null float64
LotArea          2919 non-null int64
OverallQual      2919 non-null int64
OverallCond      2919 non-null int64
YearBuilt        2919 non-null int64
YearRemodAdd     2919 non-null int64
MasVnrArea       2919 non-null float64
BsmtFinSF1       2919 non-null float64
BsmtFinSF2       2919 non-null float64
BsmtUnfSF        2919 non-null float64
TotalBsmtSF      2919 non-null float64
1stFlrSF         2919 non-null int64
2ndFlrSF         2919 non-null int64
LowQualFinSF     2919 non-null int64
GrLivArea        2919 non-null int64
BsmtFullBath     2919 non-null float64
BsmtHalfBath     2919 non-null float64
FullBath         2919 non-null int64
HalfBath         2919 non-null int64
BedroomAbvGr     2919 non-null int64
KitchenAbvGr     2919 non-null int64
TotRmsAbv

In [17]:
df_num['log_sp'] = np.log(df_num['SalePrice'])

In [18]:
df_num.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 2919 entries, (train, 0) to (test, 1458)
Data columns (total 39 columns):
Id               2919 non-null int64
MSSubClass       2919 non-null int64
LotFrontage      2919 non-null float64
LotArea          2919 non-null int64
OverallQual      2919 non-null int64
OverallCond      2919 non-null int64
YearBuilt        2919 non-null int64
YearRemodAdd     2919 non-null int64
MasVnrArea       2919 non-null float64
BsmtFinSF1       2919 non-null float64
BsmtFinSF2       2919 non-null float64
BsmtUnfSF        2919 non-null float64
TotalBsmtSF      2919 non-null float64
1stFlrSF         2919 non-null int64
2ndFlrSF         2919 non-null int64
LowQualFinSF     2919 non-null int64
GrLivArea        2919 non-null int64
BsmtFullBath     2919 non-null float64
BsmtHalfBath     2919 non-null float64
FullBath         2919 non-null int64
HalfBath         2919 non-null int64
BedroomAbvGr     2919 non-null int64
KitchenAbvGr     2919 non-null int64
TotRmsAbv

#### Categorical Variables

In [6]:
df_cat = df.select_dtypes('object')

In [7]:
df_cat.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 2919 entries, (train, 0) to (test, 1458)
Data columns (total 43 columns):
MSZoning         2915 non-null object
Street           2919 non-null object
Alley            198 non-null object
LotShape         2919 non-null object
LandContour      2919 non-null object
Utilities        2917 non-null object
LotConfig        2919 non-null object
LandSlope        2919 non-null object
Neighborhood     2919 non-null object
Condition1       2919 non-null object
Condition2       2919 non-null object
BldgType         2919 non-null object
HouseStyle       2919 non-null object
RoofStyle        2919 non-null object
RoofMatl         2919 non-null object
Exterior1st      2918 non-null object
Exterior2nd      2918 non-null object
MasVnrType       2895 non-null object
ExterQual        2919 non-null object
ExterCond        2919 non-null object
Foundation       2919 non-null object
BsmtQual         2838 non-null object
BsmtCond         2837 non-null object
Bsm

In [8]:
df_cat = pd.get_dummies(df_cat,dummy_na=True,drop_first=True)

In [11]:
df_cat.columns

Index(['MSZoning_FV', 'MSZoning_RH', 'MSZoning_RL', 'MSZoning_RM',
       'MSZoning_nan', 'Street_Pave', 'Street_nan', 'Alley_Pave', 'Alley_nan',
       'LotShape_IR2',
       ...
       'SaleType_New', 'SaleType_Oth', 'SaleType_WD', 'SaleType_nan',
       'SaleCondition_AdjLand', 'SaleCondition_Alloca', 'SaleCondition_Family',
       'SaleCondition_Normal', 'SaleCondition_Partial', 'SaleCondition_nan'],
      dtype='object', length=252)

In [19]:
df_all = pd.concat([df_num,df_cat],axis=1)

### Model Building

In [20]:
X = df_all.drop(['SalePrice','log_sp','Id'],axis=1)
y = df_num['log_sp']

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X[:'train'],y[:'train'],test_size=0.3)

In [22]:
clf = RandomForestRegressor(n_jobs=-1)
clf.fit(X_train,y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [23]:
clf.score(X_train,y_train)

0.9719286284854999

In [24]:
clf.score(X_test, y_test)

0.8430682997448733

In [25]:
feature_importances = pd.DataFrame(clf.feature_importances_,
                                   index = X_train.columns,
                                    columns=['importance']).sort_values('importance',ascending=False)
feature_importances

Unnamed: 0,importance
OverallQual,0.535933
GrLivArea,0.131919
GarageCars,0.060035
CentralAir_Y,0.032251
TotalBsmtSF,0.027301
BsmtFinSF1,0.027256
GarageArea,0.023395
1stFlrSF,0.015684
YearBuilt,0.010039
YearRemodAdd,0.009184


In [29]:
test['log_sp']= clf.predict(X.loc['test'])
test['SalePrice']= np.exp(test.log_sp)
submission = test[['Id','SalePrice']]
submission.to_csv('./data/submission.csv',index=False)