# Importing Libraries

In [1]:
import os
from pathlib import Path

# data manipulation
import pandas as pd
import numpy as np
np.set_printoptions(suppress=True)

# plotting
import seaborn as sn
import matplotlib.pyplot as plt
%matplotlib inline

import yaml

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelBinarizer



# Load categorized dataset

In [2]:
# Read the categorized train and test splits from the interim data folder.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/interim/train_categorized.csv',
        '../data/interim/test_categorized.csv',
    )
)

In [3]:
# Quick sanity check: (rows, columns) of the train and test frames right after loading.
train_df.shape, test_df.shape

((1460, 297), (1459, 297))

In [4]:
# Preview the first rows of the training frame before any imputation.
train_df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,New,Oth,WD,SaleCondition_label,Abnorml,AdjLand,Alloca,Family,Normal,Partial
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0.0,0.0,1.0,4,0.0,0.0,0.0,0.0,1.0,0.0
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0.0,0.0,1.0,4,0.0,0.0,0.0,0.0,1.0,0.0
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0.0,0.0,1.0,4,0.0,0.0,0.0,0.0,1.0,0.0
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0.0,0.0,1.0,0,1.0,0.0,0.0,0.0,0.0,0.0
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0.0,0.0,1.0,4,0.0,0.0,0.0,0.0,1.0,0.0


# Replace NaN

## Numerical

In [5]:
def is_missing(df, columns):
    """Report whether any of the given columns contains a missing value.

    Args:
        df (pd.DataFrame): dataset to inspect
        columns (iterable): labels of the columns of ``df`` to check

    Returns:
        bool: True if at least one of the listed columns holds a null value,
            False otherwise (including when ``columns`` is empty)
    """
    # Every column has to be inspected before answering False; returning from
    # inside the loop on the first clean column would skip all the others.
    return any(df[column].isnull().values.any() for column in columns)

In [6]:
def load_params(filepath='params.yaml') -> dict:
    """Helper function to load params.yaml

    Args:
        filepath (str): filename or full filepath to yaml file with parameters

    Returns:
        dict: dictionary of parameters

    Raises:
        FileNotFoundError: if ``filepath`` does not point to an existing file
    """

    # Validate with an explicit raise instead of `assert`: assertions are
    # stripped under `python -O`, and `assert cond, FileNotFoundError` raises
    # an AssertionError whose message is merely the exception class.
    if not os.path.isfile(filepath):
        raise FileNotFoundError(f'Parameter file not found: {filepath}')

    # read params.yaml
    with open(filepath, 'r') as file:
        params = yaml.safe_load(file)

    return params

In [7]:
# Load the project parameters from the YAML file one directory above the notebook.
params = load_params('../params.yaml')

In [8]:
# Columns that are excluded from the numerical imputation further down.
params['ignore_cols']

['Id', 'SalePrice']

In [9]:
# Strategy name that is handed to SimpleImputer when filling numerical NaNs.
params['imputation']['method']

'mean'

In [10]:
# Stack train and test so that both splits are imputed with the same statistics.
# NOTE(review): the fill value is therefore computed from train and test rows
# together — confirm this is intended.
df = pd.concat([train_df, test_df], ignore_index=True)

# Numerical columns to impute, minus the ones listed in params['ignore_cols'].
num_cols = df.select_dtypes(include=[np.number]).columns.difference(params['ignore_cols'])

# `np.nan` rather than `np.NaN`: the capitalised alias was removed in NumPy 2.0.
imputer = SimpleImputer(missing_values=np.nan, strategy=params['imputation']['method'])
for col in num_cols:
    # fit on the single column and write the imputed values back
    # (the imputer returns floats, so integer columns become float columns)
    df[col] = imputer.fit_transform(df[[col]]).ravel()

# Make sure the imputed (numerical) columns hold no missing values any more.
# Only num_cols are checked: non-numerical and ignored columns are not touched here.
assert not df[num_cols].isnull().values.any(), 'numerical columns still contain NaN after imputation'

# Return datasets to train and test; copies keep the two frames independent of
# `df`, so later assignments do not trigger SettingWithCopy warnings.
n_train = train_df.shape[0]
train_df = df.iloc[:n_train].copy()
test_df = df.iloc[n_train:].copy()

In [11]:
# Confirm that splitting the combined frame back gives the expected train/test shapes.
train_df.shape, test_df.shape

((1460, 297), (1459, 297))

In [12]:
# Preview the training frame after the numerical imputation step.
train_df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,New,Oth,WD,SaleCondition_label,Abnorml,AdjLand,Alloca,Family,Normal,Partial
0,1,60.0,RL,65.0,8450.0,Pave,,Reg,Lvl,AllPub,...,0.0,0.0,1.0,4.0,0.0,0.0,0.0,0.0,1.0,0.0
1,2,20.0,RL,80.0,9600.0,Pave,,Reg,Lvl,AllPub,...,0.0,0.0,1.0,4.0,0.0,0.0,0.0,0.0,1.0,0.0
2,3,60.0,RL,68.0,11250.0,Pave,,IR1,Lvl,AllPub,...,0.0,0.0,1.0,4.0,0.0,0.0,0.0,0.0,1.0,0.0
3,4,70.0,RL,60.0,9550.0,Pave,,IR1,Lvl,AllPub,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,5,60.0,RL,84.0,14260.0,Pave,,IR1,Lvl,AllPub,...,0.0,0.0,1.0,4.0,0.0,0.0,0.0,0.0,1.0,0.0


In [5]:
# NOTE(review): `replace_num_missing` is not defined in any visible cell, and this
# cell's execution count (In [5]) is out of sequence with the cells around it — it
# looks like a leftover from an earlier revision. On a fresh Restart & Run All this
# line would raise NameError unless the function is defined elsewhere. TODO confirm and remove.
train_df = replace_num_missing(train_df)

In [6]:
# NOTE(review): `replace_num_missing` is not defined in any visible cell, and this
# cell's execution count (In [6]) is out of sequence with the cells around it — it
# looks like a leftover from an earlier revision. On a fresh Restart & Run All this
# line would raise NameError unless the function is defined elsewhere. TODO confirm and remove.
test_df = replace_num_missing(test_df)

In [13]:
# Per-column count and share of missing values in the training set, most-missing first.
train_total_missing = train_df.isnull().sum().sort_values(ascending=False)
train_ratio_missing = train_df.isnull().mean().sort_values(ascending=False)
missing_train_data = pd.concat([train_total_missing, train_ratio_missing], axis=1, keys=['Total', 'Ratio'])
# dtypes is indexed by column name, so the assignment aligns row by row
missing_train_data['Type'] = train_df.dtypes

# keep only the columns that actually have missing values
missing_train_data = missing_train_data[missing_train_data['Total'] > 0]

# Numerical columns are picked via select_dtypes rather than by comparing the
# dtype with 'object', which would also count any other non-numeric dtype as numerical.
numeric_cols = train_df.select_dtypes(include=[np.number]).columns
numeric_missing = missing_train_data.loc[missing_train_data.index.isin(numeric_cols), 'Total']

print('Numerical Missing Values:')
print('=========================')
for col_missing, total_missing in numeric_missing.items():
    print(col_missing, '\t', total_missing, 'NaNs')
print('=========================')

# view missing data stats
# The table is the last expression of the cell so that the notebook renders it;
# a bare expression in the middle of a cell is evaluated but never displayed.
print('=== Missing Train Data Stats ===')
missing_train_data

=== Missing Train Data Stats ===
Numerical Missing Values:


In [14]:
# Post-imputation check on the training set: list every column that still has
# nulls together with its null count, then report how many such columns exist.
null_totals = train_df.isnull().sum()
still_missing = null_totals[null_totals > 0]

for col, n_null in still_missing.items():
    print(col)
    print(n_null)

count = len(still_missing)
print('==========================')
print('Missing counter %d' % count)

Missing counter 0
