# [Housing Prices Competition for Kaggle Learn Users](https://www.kaggle.com/competitions/home-data-for-ml-course/overview)

In [1]:
import pandas as pd

# Load the training and test splits; the 'Id' column becomes the row index of each frame
X_full, X_tst_full = (
    pd.read_csv(csv_path, index_col='Id')
    for csv_path in ('./data/train.csv', './data/test.csv')
)

In [2]:
# Preview the first five rows of the raw training frame
X_full.head(n=5)

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


# Data Preprocessing
Remove rows with a missing target, then separate the target from the predictors.

In [3]:
# Split the target ('SalePrice') off the predictors.
# The guard keeps the cell safe to re-run: once 'SalePrice' has been removed
# from X_full, running the cell again is a no-op instead of raising KeyError.
if 'SalePrice' in X_full.columns:
    # Rows without a target value cannot be used for training
    X_full = X_full.dropna(axis=0, subset=['SalePrice'])
    y_trn = X_full['SalePrice']
    # From here on X_full holds predictors only
    X_full = X_full.drop(columns=['SalePrice'])

## Numeric Values

In [4]:
# Treat every non-object column as a numeric feature, for both splits
X_trn_num = X_full.select_dtypes(exclude='object')
X_tst_num = X_tst_full.select_dtypes(exclude='object')

(X_trn_num.shape, X_tst_num.shape)

((1460, 36), (1459, 36))

### Missing Values

In [5]:
# Per-column count of missing entries in the numeric training features
missing_trn_num = X_trn_num.isna().sum()
# Show only the columns that actually contain missing values
print(missing_trn_num.loc[lambda counts: counts > 0])

LotFrontage    259
MasVnrArea       8
GarageYrBlt     81
dtype: int64


In [6]:
# Per-column count of missing entries in the numeric test features
missing_tst_num = X_tst_num.isna().sum()
# Show only the columns that actually contain missing values
print(missing_tst_num[missing_tst_num.gt(0)])

LotFrontage     227
MasVnrArea       15
BsmtFinSF1        1
BsmtFinSF2        1
BsmtUnfSF         1
TotalBsmtSF       1
BsmtFullBath      2
BsmtHalfBath      2
GarageYrBlt      78
GarageCars        1
GarageArea        1
dtype: int64


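- Drop the numeric columns that have missing values in the training dataset (from both the training and test datasets):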
```
LotFrontage    259
MasVnrArea       8
GarageYrBlt     81
```

In [7]:
# Drop, from both splits, every numeric column that has missing values in the
# training set (as counted in missing_trn_num).
all_columns = X_trn_num.columns
# Select the columns by label rather than by applying a positional boolean mask
# to the current column index: the mask was built on the pre-drop frame, so it
# would no longer line up with the columns if this cell were run a second time.
cols_with_missing = missing_trn_num[missing_trn_num > 0].index
# errors='ignore' makes a re-run (columns already gone) a no-op instead of a KeyError
X_trn_num = X_trn_num.drop(columns=cols_with_missing, errors='ignore')
X_tst_num = X_tst_num.drop(columns=cols_with_missing, errors='ignore')
X_trn_num.shape, X_tst_num.shape

((1460, 33), (1459, 33))

- Missing values that remain in the test dataset:

In [8]:
# Recount the missing entries per numeric test column
missing_tst_num = X_tst_num.isnull().sum()
has_missing = missing_tst_num > 0
print(missing_tst_num.loc[has_missing])

BsmtFinSF1      1
BsmtFinSF2      1
BsmtUnfSF       1
TotalBsmtSF     1
BsmtFullBath    2
BsmtHalfBath    2
GarageCars      1
GarageArea      1
dtype: int64


- Fill the missing values of the test dataset with the minima of the training dataset.

In [9]:
# Column-wise minima of the numeric training features
trn_minima = X_trn_num.min()
# Each missing test value is replaced by the training minimum of its column
X_tst_num = X_tst_num.fillna(value=trn_minima)

In [10]:
# Sanity check: after filling, no numeric test column should report missing values
missing_tst_num = X_tst_num.isnull().sum()
print(missing_tst_num.loc[missing_tst_num.gt(0)])

Series([], dtype: int64)


## Categorical Values

In [11]:
# Object-dtype columns are treated as the categorical features, for both splits
X_trn_cat, X_tst_cat = (
    frame.select_dtypes(include=['object'])
    for frame in (X_full, X_tst_full)
)

(X_trn_cat.shape, X_tst_cat.shape)

((1460, 43), (1459, 43))

## One-Hot Encoding

In [12]:
import numpy as np
from sklearn.preprocessing import OneHotEncoder

# Dense float32 one-hot encoder: two-category features collapse to a single
# column, and categories not seen during fitting are ignored at transform time.
enc = OneHotEncoder(
    drop='if_binary',
    handle_unknown='ignore',
    sparse_output=False,
    dtype=np.float32,
)

# Fit on the training categoricals only, then reuse the fitted encoder for the test set
X_trn_onehot = enc.fit_transform(X_trn_cat)
X_tst_onehot = enc.transform(X_tst_cat)

(X_trn_onehot.shape, X_tst_onehot.shape)



((1460, 264), (1459, 264))

In [13]:
# True if any entry of the encoded test matrix is NaN
np.any(np.isnan(X_tst_onehot))

False

## Concatenate

In [14]:
# Place the numeric features and the one-hot columns side by side (column-wise stack)
X_trn = np.hstack([X_trn_num, X_trn_onehot])
X_tst = np.hstack([X_tst_num, X_tst_onehot])

(X_trn.shape, X_tst.shape)

((1460, 297), (1459, 297))

## Save to csv file

In [15]:
# Wrap the feature matrices in DataFrames indexed by the original 'Id' values.
# Column labels are restored as well: the numeric column names come first,
# followed by the encoder's generated one-hot names, matching the order used
# when the matrices were concatenated. Without them the frames (and any CSV
# written from them) would only carry anonymous integer column labels.
feature_names = [*X_trn_num.columns, *enc.get_feature_names_out()]
X_trn = pd.DataFrame(X_trn, index=X_full.index, columns=feature_names)
X_tst = pd.DataFrame(X_tst, index=X_tst_full.index, columns=feature_names)

In [16]:
# Export is currently disabled. Uncomment the lines below to write the processed
# training features, test features and training target to CSV files in the
# current working directory.
# X_trn.to_csv('./trn_X.csv')
# X_tst.to_csv('./tst_X.csv')
# y_trn.to_csv('./trn_y.csv')