## Measure the quality of each approach: dropping columns, imputation, and an extension to imputation

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Load the Melbourne housing data from disk.
data = pd.read_csv('dataset/melb_data.csv')

# The sale price is the prediction target.
y = data['Price']

# Everything except the target is a candidate predictor.
melb_predictors = data.drop(columns=['Price'])

# Keep only the non-object columns as features.
X = melb_predictors.select_dtypes(exclude='object')

# Hold out 33% of the rows for validation; the fixed seed makes the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=0,
)

In [3]:
# Get names of columns with missing values

In [4]:
# Select the training columns that contain at least one missing value,
# keeping them in their original column order.
cols_with_missings = X_train.columns[X_train.isnull().any()].tolist()

In [5]:
# Display the names of the columns that contain missing values.
cols_with_missings

['Car', 'BuildingArea', 'YearBuilt']

## Method 1: Drop columns with missing values

In [6]:
# Remove every column that has a missing value in the training data.
# The same columns are removed from both splits so their features stay aligned.
dropped_X_train = X_train.drop(columns=cols_with_missings)
dropped_X_valid = X_valid.drop(columns=cols_with_missings)

## Score function

In [7]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

def score_dataset(X_train, X_valid, y_train, y_valid, random_state=0):
    """Fit a random forest and return its mean absolute error on validation data.

    Parameters
    ----------
    X_train, y_train
        Features and target used to fit the model.
    X_valid, y_valid
        Features and target used to evaluate the fitted model.
    random_state
        Seed forwarded to ``RandomForestRegressor``. The default of ``0``
        makes repeated calls return the same score, so the different
        missing-value strategies can be compared fairly. Pass ``None`` to
        get an unseeded (run-to-run varying) forest.

    Returns
    -------
    The mean absolute error between ``y_valid`` and the model's predictions.
    """
    model = RandomForestRegressor(n_estimators=10, random_state=random_state)
    model.fit(X_train, y_train)
    predictions = model.predict(X_valid)
    # scikit-learn documents the order as (y_true, y_pred). MAE is symmetric,
    # so the value is unchanged, but the call now matches the documented API.
    return mean_absolute_error(y_valid, predictions)

In [8]:
## Baseline: score the dataset with the missing-value columns dropped

In [9]:
# Score for method 1: validation MAE after dropping the columns with missing values.
dropped_mae = score_dataset(dropped_X_train, dropped_X_valid, y_train, y_valid)
print(dropped_mae)

192762.49186375132


## Method 2: Imputation

### Make copies to avoid changing the original data

In [10]:
# Work on independent copies so later steps never modify X_train / X_valid.
X_train_impulation, X_valid_impulation = X_train.copy(), X_valid.copy()

In [11]:
# Preview the first rows of the training copy before any imputation.
X_train_impulation.head()

Unnamed: 0,Rooms,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
12520,3,13.8,3188.0,3.0,1.0,2.0,697.0,,,-37.94435,145.00927,5454.0
6371,1,4.6,3142.0,1.0,1.0,1.0,0.0,,,-37.844,145.005,7217.0
12339,4,5.4,3101.0,4.0,2.0,4.0,350.0,168.0,1900.0,-37.80127,145.03733,10331.0
5625,1,3.3,3141.0,1.0,1.0,1.0,0.0,41.0,1980.0,-37.8415,144.991,14887.0
9223,2,4.3,3032.0,2.0,2.0,2.0,37000.0,90.0,2004.0,-37.77518,144.89254,4918.0


In [12]:
from sklearn.impute import SimpleImputer
# SimpleImputer with default settings fills each missing value with its column mean.
my_impulater = SimpleImputer()

# Learn the fill values from the training split only, then apply them to both
# splits, so nothing from the validation data influences the imputer.
train_filled = my_impulater.fit_transform(X_train_impulation)
valid_filled = my_impulater.transform(X_valid_impulation)

# The imputer returns plain arrays; wrap them back into DataFrames.
imputed_X_train_impulation = pd.DataFrame(train_filled)
imputed_X_valid_impulation = pd.DataFrame(valid_filled)

In [13]:
# Wrapping the imputer output in a DataFrame discarded the column labels;
# copy them back from the corresponding pre-imputation frames.
for imputed, source in (
    (imputed_X_train_impulation, X_train_impulation),
    (imputed_X_valid_impulation, X_valid_impulation),
):
    imputed.columns = source.columns

In [14]:
# Score for method 2: validation MAE after filling in the missing values.
imputed_mae = score_dataset(
    imputed_X_train_impulation,
    imputed_X_valid_impulation,
    y_train,
    y_valid,
)
print(imputed_mae)

179068.99536417273


## Method 3: Extension to imputation

In [15]:
# Before imputing, record which entries were missing: for every affected column
# add a boolean "<column>_was_missing" indicator to both splits.
for column in cols_with_missings:
    indicator = column + '_was_missing'
    X_train_impulation[indicator] = X_train_impulation[column].isna()
    X_valid_impulation[indicator] = X_valid_impulation[column].isna()

In [18]:
# Re-fit the imputer on the extended training frame (original features plus the
# *_was_missing indicators) and apply it to the extended validation frame.
# A bare pd.DataFrame(array) would replace the column names and row labels with
# 0..n-1, so pass them explicitly to keep the feature names and the original
# row labels on the imputed frames.
ext_X_train_impulation = pd.DataFrame(
    my_impulater.fit_transform(X_train_impulation),
    columns=X_train_impulation.columns,
    index=X_train_impulation.index,
)
ext_X_valid_impulation = pd.DataFrame(
    my_impulater.transform(X_valid_impulation),
    columns=X_valid_impulation.columns,
    index=X_valid_impulation.index,
)

In [None]:
# Score for method 3: validation MAE after imputation with missing-value indicators.
extended_mae = score_dataset(
    ext_X_train_impulation,
    ext_X_valid_impulation,
    y_train,
    y_valid,
)
print(extended_mae)