## Measure the quality of each approach to missing values: dropping columns, imputation, and an extension to imputation

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Load the raw table and separate the prediction target (Price) from it.
data = pd.read_csv('dataset/melb_data.csv')
y = data['Price']

# Keep only the non-object (numeric) predictors, with the target removed.
melb_predictors = data.drop(columns=['Price'])
X = melb_predictors.select_dtypes(exclude=['object'])

# Hold out 33% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.33, random_state=0
)

In [3]:
# Get names of columns with missing values

In [5]:
# Select the training columns that contain at least one missing value.
cols_with_missings = X_train.columns[X_train.isnull().any()].tolist()

In [6]:
# Display the names of the training columns that contain missing values.
cols_with_missings

['Car', 'BuildingArea', 'YearBuilt']

## Method 1: Drop columns with missing values

In [7]:
# Remove the same set of incomplete columns from both splits so that the
# training and validation frames keep identical feature sets.
dropped_X_train = X_train.drop(columns=cols_with_missings)
dropped_X_valid = X_valid.drop(columns=cols_with_missings)

## Score function

In [8]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

def score_dataset(X_train, X_valid, y_train, y_valid, n_estimators=10, random_state=0):
    """Fit a random forest on the training split and return its validation MAE.

    Parameters
    ----------
    X_train, X_valid : feature tables for the training and validation splits.
    y_train, y_valid : target values matching ``X_train`` and ``X_valid``.
    n_estimators : int, default 10
        Number of trees in the forest.
    random_state : int or None, default 0
        Seed passed to the forest so that repeated calls on the same data give
        the same score; pass ``None`` to get an unseeded (non-reproducible) fit.

    Returns
    -------
    float
        Mean absolute error of the predictions on ``X_valid`` against ``y_valid``.
    """
    model = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
    model.fit(X_train, y_train)
    prediction = model.predict(X_valid)
    # scikit-learn documents the argument order as (y_true, y_pred).
    return mean_absolute_error(y_valid, prediction)

In [9]:
## Score for method 1: original data without the columns that have missing values

In [11]:
# Validation MAE for method 1 (columns with missing values dropped).
print(score_dataset(dropped_X_train, dropped_X_valid, y_train, y_valid))

190185.0117538939


## Method 2: Imputation

### Make copies to avoid changing the original data

In [12]:
# Work on independent copies so later modifications leave X_train / X_valid untouched.
X_train_impulation, X_valid_impulation = (
    frame.copy() for frame in (X_train, X_valid)
)

In [13]:
# For every column that has gaps in the training data, add a boolean indicator
# column to both splits marking which rows were missing.
for col in cols_with_missings:
    flag = col + '_was_missing'
    X_train_impulation[flag] = X_train_impulation[col].isnull()
    X_valid_impulation[flag] = X_valid_impulation[col].isnull()

In [14]:
from sklearn.impute import SimpleImputer
# Learn the fill values from the training split only, then reuse those same
# fitted statistics on the validation split. Calling fit_transform on the
# training frame a second time would make the "validation" frame a copy of the
# imputed training data instead of the held-out rows.
my_impulater = SimpleImputer()
imputed_X_train_impulation = pd.DataFrame(
    my_impulater.fit_transform(X_train_impulation),
    columns=X_train_impulation.columns,
)
imputed_X_valid_impulation = pd.DataFrame(
    my_impulater.transform(X_valid_impulation),
    # The imputer returns a plain array, so put the column names back.
    columns=X_valid_impulation.columns,
)

array([[ 3.0000000e+00,  1.3800000e+01,  3.1880000e+03, ...,
        -3.7944350e+01,  1.4500927e+02,  5.4540000e+03],
       [ 1.0000000e+00,  4.6000000e+00,  3.1420000e+03, ...,
        -3.7844000e+01,  1.4500500e+02,  7.2170000e+03],
       [ 4.0000000e+00,  5.4000000e+00,  3.1010000e+03, ...,
        -3.7801270e+01,  1.4503733e+02,  1.0331000e+04],
       ...,
       [ 4.0000000e+00,  6.7000000e+00,  3.0580000e+03, ...,
        -3.7735720e+01,  1.4497256e+02,  1.1204000e+04],
       [ 3.0000000e+00,  1.2000000e+01,  3.0730000e+03, ...,
        -3.7720570e+01,  1.4502615e+02,  2.1650000e+04],
       [ 4.0000000e+00,  6.4000000e+00,  3.0110000e+03, ...,
        -3.7794300e+01,  1.4488750e+02,  7.5700000e+03]])