# Importing Libraries

In [1]:
import os
from pathlib import Path

# data manipulation
import pandas as pd
import numpy as np
np.set_printoptions(suppress=True)

# plotting
import seaborn as sn
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelBinarizer



# Load dataset

In [2]:
# Read the raw train and test splits; both paths are relative to the working directory.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/raw/train.csv', '../data/raw/test.csv')
)

In [3]:
# Stack train and test into one frame so categorical encoders are fitted on the
# union of both splits. Columns present only in one split are kept and filled
# with NaN for the rows of the other split.
df = pd.concat([train_df, test_df], ignore_index=True)

In [4]:
# Sanity check: combined rows should equal train rows + test rows.
df.shape, train_df.shape, test_df.shape

((2919, 81), (1460, 81), (1459, 80))

In [5]:
# Preview the first rows of the combined frame.
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500.0
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500.0
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500.0
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000.0
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000.0


In [6]:
# Preview the first rows of the raw training split for comparison.
train_df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


# Encoding Labels

## CentralAir

In [7]:
# Convert CentralAir to binary 0/1 classification.
# The binarizer is fitted on the combined train+test column and its output
# overwrites the original column in `df`.
lb = LabelBinarizer()
df['CentralAir'] = lb.fit_transform(df['CentralAir'])

## Encoding to integer labels and one-hot columns

In [8]:
# For every object-dtype (categorical) column: fill missing values with the
# string 'None', add an integer-coded '<col>_label' column, and add one 0/1
# column per category.
cat_df = df.select_dtypes(include=['object'])
for col in cat_df.columns.values:
    # fill missing value
    # Assign the result back: `df[col].fillna(..., inplace=True)` works on a
    # temporary Series and is not guaranteed to update `df` (chained assignment;
    # it is a no-op under pandas Copy-on-Write).
    df[col] = df[col].fillna('None')

    # label encode
    le = LabelEncoder()
    col_enc = str(col) + '_label'
    le_labels = le.fit_transform(df[col])
    df[col_enc] = le_labels

    # one hot encode
    # The integer codes are 0..k-1 in the order of le.classes_, so the encoder's
    # output columns line up with `labels_enc` below.
    ohe = OneHotEncoder()
    arr_enc = ohe.fit_transform(df[[col_enc]]).toarray()
    labels_enc = list(le.classes_)
    # Build on df's index so the column assignment below aligns row by row.
    ohe_enc_df = pd.DataFrame(arr_enc, columns=labels_enc, index=df.index)

    # add encoded attributes to categorical dataframe
    # NOTE(review): the one-hot columns are named by the bare category value, so
    # a value shared by several features (e.g. 'None') is overwritten by the last
    # feature processed. Prefixing with the feature name would avoid this but
    # changes the output schema — confirm with downstream consumers first.
    df[labels_enc] = ohe_enc_df[labels_enc]

In [9]:
# Shape after encoding: the row count is unchanged, the column count grows by
# the added '_label' and one-hot columns.
df.shape

(2919, 297)

In [10]:
def is_missing(df, columns):
    """Return True if any of the given columns of ``df`` contains a missing value.

    Every column in ``columns`` is inspected. The previous version returned
    from inside the loop on the first iteration, so only the first column was
    ever checked.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    columns : iterable
        Column labels of ``df`` to check. Pass only the columns that are
        expected to be complete.

    Returns
    -------
    bool
        True if at least one listed column has a null value, otherwise False
        (including when ``columns`` is empty).
    """
    return any(df[column].isnull().values.any() for column in columns)

In [11]:
# Only the categorical columns were filled with 'None' above; numeric columns
# (e.g. LotFrontage) can still contain NaN at this point, so they are not checked.
assert not is_missing(df, cat_df.columns), 'categorical columns still contain missing values'

In [12]:
# return datasets to train and test
n_train = train_df.shape[0]
train_df = df[:n_train]
test_df = df[n_train:]

In [13]:
# Both splits should now have the same (encoded) column count.
train_df.shape, test_df.shape

((1460, 297), (1459, 297))

In [14]:
# Preview the encoded training split, including the new label / one-hot columns.
train_df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,New,Oth,WD,SaleCondition_label,Abnorml,AdjLand,Alloca,Family,Normal,Partial
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0.0,0.0,1.0,4,0.0,0.0,0.0,0.0,1.0,0.0
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0.0,0.0,1.0,4,0.0,0.0,0.0,0.0,1.0,0.0
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0.0,0.0,1.0,4,0.0,0.0,0.0,0.0,1.0,0.0
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0.0,0.0,1.0,0,1.0,0.0,0.0,0.0,0.0,0.0
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0.0,0.0,1.0,4,0.0,0.0,0.0,0.0,1.0,0.0


In [15]:
# Report which train columns still contain missing values after encoding.
# For each such column print its name and its number of nulls, then print
# how many columns were affected in total.
null_counts = train_df.isnull().sum()
count = 0
for col in train_df.columns:
    n_null = null_counts[col]
    if n_null > 0:
        print(col)
        print(n_null)
        count += 1

print('==========================')
print('Missing counter %d' % count)

LotFrontage
259
MasVnrArea
8
GarageYrBlt
81
Missing counter 3


# Encoded Labels

In [16]:
# Load the encoded train/test frames from the interim data folder.
# NOTE(review): no visible cell writes these CSVs — presumably train_df/test_df
# were exported elsewhere; confirm so the notebook survives Restart & Run All.
train_categorized = pd.read_csv('../data/interim/train_categorized.csv')
test_categorized = pd.read_csv('../data/interim/test_categorized.csv')

In [17]:
# The reloaded frames should match the shapes of the in-memory encoded splits.
train_categorized.shape, test_categorized.shape

((1460, 297), (1459, 297))

In [18]:
# NOTE(review): this cell repeats the previous shape check and can be removed.
train_categorized.shape, test_categorized.shape

((1460, 297), (1459, 297))

In [19]:
# Preview the reloaded encoded training data.
train_categorized.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,New,Oth,WD,SaleCondition_label,Abnorml,AdjLand,Alloca,Family,Normal,Partial
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0.0,0.0,1.0,4,0.0,0.0,0.0,0.0,1.0,0.0
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0.0,0.0,1.0,4,0.0,0.0,0.0,0.0,1.0,0.0
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0.0,0.0,1.0,4,0.0,0.0,0.0,0.0,1.0,0.0
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0.0,0.0,1.0,0,1.0,0.0,0.0,0.0,0.0,0.0
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0.0,0.0,1.0,4,0.0,0.0,0.0,0.0,1.0,0.0
