In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
data = pd.read_csv("data/house-price/train.csv").drop("Id",axis=1)
data

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,8,2007,WD,Normal,175000
1456,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,Inside,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,Inside,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,4,2010,WD,Normal,142125


In [3]:
# Check the missing value and the percentage of missing values
for col in data.columns:
    na = data[col].isna().sum()
    if na !=0:
        print(f"{col}: {na}, {na*100/len(data):.2f}%")

LotFrontage: 259, 17.74%
Alley: 1369, 93.77%
MasVnrType: 8, 0.55%
MasVnrArea: 8, 0.55%
BsmtQual: 37, 2.53%
BsmtCond: 37, 2.53%
BsmtExposure: 38, 2.60%
BsmtFinType1: 37, 2.53%
BsmtFinType2: 38, 2.60%
Electrical: 1, 0.07%
FireplaceQu: 690, 47.26%
GarageType: 81, 5.55%
GarageYrBlt: 81, 5.55%
GarageFinish: 81, 5.55%
GarageQual: 81, 5.55%
GarageCond: 81, 5.55%
PoolQC: 1453, 99.52%
Fence: 1179, 80.75%
MiscFeature: 1406, 96.30%


In [4]:
data.drop(["GarageYrBlt"],axis=1,inplace=True)

In [5]:
# Preprocessing, replacing missing value by hand to see the insight each the column
data["Alley"].fillna("NA",inplace=True)
data["LotFrontage"].fillna(data["LotFrontage"].median(),inplace=True)
data["MasVnrType"].fillna("None",inplace=True)
data["MasVnrArea"].fillna(data["MasVnrArea"].median(),inplace=True)
data["BsmtQual"].fillna("NA",inplace=True)
data["BsmtCond"].fillna("NA",inplace=True)
data["BsmtExposure"].fillna("NA",inplace=True)
data["BsmtFinType1"].fillna("NA",inplace=True)
data["BsmtFinType2"].fillna("NA",inplace=True)
data["Electrical"].fillna("SBrkr",inplace=True)
data["FireplaceQu"].fillna("NA",inplace=True)
data["GarageType"].fillna("NA",inplace=True)
data["GarageFinish"].fillna("NA",inplace=True)
data["GarageQual"].fillna("NA",inplace=True)
data["GarageCond"].fillna("NA",inplace=True)
data["PoolQC"].fillna("NA",inplace=True)
data["Fence"].fillna("NA",inplace=True)
data["MiscFeature"].fillna("NA",inplace=True)

In [6]:
data.isna().sum().max()

0

In [7]:
data.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


In [8]:
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype

numerical_columns = []
categorical_columns = []
for col in data.columns:
    if is_numeric_dtype(data[col]):
        numerical_columns.append(col)
        continue
    if is_string_dtype(data[col]):
        categorical_columns.append(col)
len(numerical_columns),len(categorical_columns)

(36, 43)

In [9]:
numerical_data = pd.DataFrame(data[numerical_columns])
categorical_data = pd.DataFrame(data[categorical_columns])

In [10]:
# Train with only numerical data
X_numerical = numerical_data.drop("SalePrice",axis=1)
y = numerical_data["SalePrice"]

In [11]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(X_numerical)
X_scaled = scaler.transform(X_numerical)

In [12]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X_scaled,y,test_size=0.2,random_state=0)

In [13]:
# Random Forest
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor()
rf.fit(X_train,y_train)
rf.score(X_test,y_test)

0.832545768561054

In [14]:
from sklearn.metrics import mean_squared_error

mean_squared_error(y_test,rf.predict(X_test),squared=False)

34006.08472251532

In [15]:
# Linear Regression
from sklearn.linear_model import LinearRegression

reg = LinearRegression()
reg.fit(X_train,y_train)
reg.score(X_test,y_test)

0.634174114602846

In [16]:
from sklearn.metrics import mean_squared_error

mean_squared_error(y_test,reg.predict(X_test),squared=False)

50262.69667956056

In [17]:
test_data = pd.read_csv("data/house-price/test.csv").drop("Id",axis=1)
test_data

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,Inside,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,6,2010,WD,Normal
4,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,Inside,...,144,0,,,,0,1,2010,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,6,2006,WD,Normal
1455,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,4,2006,WD,Abnorml
1456,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,9,2006,WD,Abnorml
1457,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,MnPrv,Shed,700,7,2006,WD,Normal


In [18]:
test_data.drop(["GarageYrBlt"],axis=1,inplace=True)

In [19]:
# Preprocessing, replacing missing value by hand to see the insight each the column
test_data["Alley"].fillna("NA",inplace=True)
test_data["LotFrontage"].fillna(test_data["LotFrontage"].median(),inplace=True)
test_data["MasVnrType"].fillna("None",inplace=True)
test_data["MasVnrArea"].fillna(test_data["MasVnrArea"].median(),inplace=True)
test_data["BsmtQual"].fillna("NA",inplace=True)
test_data["BsmtCond"].fillna("NA",inplace=True)
test_data["BsmtExposure"].fillna("NA",inplace=True)
test_data["BsmtFinType1"].fillna("NA",inplace=True)
test_data["BsmtFinType2"].fillna("NA",inplace=True)
test_data["Electrical"].fillna("SBrkr",inplace=True)
test_data["FireplaceQu"].fillna("NA",inplace=True)
test_data["GarageType"].fillna("NA",inplace=True)
test_data["GarageFinish"].fillna("NA",inplace=True)
test_data["GarageQual"].fillna("NA",inplace=True)
test_data["GarageCond"].fillna("NA",inplace=True)
test_data["PoolQC"].fillna("NA",inplace=True)
test_data["Fence"].fillna("NA",inplace=True)
test_data["MiscFeature"].fillna("NA",inplace=True)

In [20]:
numerical_columns.pop(-1)

'SalePrice'

In [21]:
test_data_numerical = pd.DataFrame(test_data[numerical_columns])

In [22]:
test_data_numerical.isna().sum().max()

2

In [23]:
X_test_scaled = scaler.transform(test_data_numerical)
X_test_scaled

array([[-8.72562756e-01,  4.60319735e-01,  1.10762574e-01, ...,
        -8.76878115e-02, -1.19109702e-01,  1.64520971e+00],
       [-8.72562756e-01,  5.05732724e-01,  3.75849846e-01, ...,
         2.51163088e+01, -1.19109702e-01,  1.64520971e+00],
       [ 7.33749635e-02,  1.87841806e-01,  3.32052818e-01, ...,
        -8.76878115e-02, -1.22911075e+00,  1.64520971e+00],
       ...,
       [-8.72562756e-01,  4.09335880e+00,  9.50422748e-01, ...,
        -8.76878115e-02,  9.90891347e-01, -1.36765473e+00],
       [ 6.64586038e-01, -3.57114054e-01, -7.59964439e-03, ...,
         1.32373600e+00,  2.50890648e-01, -1.36765473e+00],
       [ 7.33749635e-02,  1.87841806e-01, -8.91803775e-02, ...,
        -8.76878115e-02,  1.73089205e+00, -1.36765473e+00]])

In [24]:
rf.predict(X_test_scaled)

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').