In [76]:
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error


In [89]:
# Path of the file to read
iowa_file_path = 'housing_price_iowa.csv'
#import the data
home_data = pd.read_csv(iowa_file_path)

target_y = home_data.SalePrice
#print(target_y)

#get all other column excpet the target column  
features_X = home_data.drop(['SalePrice'], axis=1)
#print(features_X)


def calculate_MAE(train_X, val_X, train_y, val_y):
    # Define the model
    rf_model = RandomForestRegressor()

    # fit your model
    rf_model.fit(train_X, train_y)
    #make predictions
    rf_predictions = rf_model.predict(val_X)
    # Calculate the mean absolute error of your Random Forest model on the validation data
    rf_val_mae = mean_absolute_error(val_y,rf_predictions)
    
    return rf_val_mae


# Ways of Handling missing/nan values 

### Drop columns with NAN values(missing values) 

In [93]:
def drop_nan_values(train_X, val_X, train_y, val_y):
    #get the column with nan values 
    cols_with_nan = [col for col in train_X.columns if train_X[col].isnull().any()]

    #drop the columns with nan values
    print(cols_with_nan)
    modified_train_X = train_X.drop(cols_with_nan, axis=1)
    modified_val_X = val_X.drop(cols_with_nan, axis=1)

    drop_nan_MAE=calculate_MAE(modified_train_X, modified_val_X, train_y, val_y)
    return drop_nan_MAE


### Imputation - fills in the missing value with some number. 

In [91]:
from sklearn.preprocessing import Imputer
def impute_values(train_X, val_X, train_y, val_y):

    my_imputer = Imputer(missing_values='NaN', strategy='mean', axis=1)
    #my_imputer = SimpleImputer()

    #Impute values 
    imputed_train_X = my_imputer.fit_transform(train_X)
    imputed_val_X = my_imputer.fit_transform(val_X)

    impute_MAE=calculate_MAE(imputed_train_X, imputed_val_X, train_y, val_y)
    return impute_MAE



### Imputation with Extra Columns Showing What Was Imputed

In [92]:
def extended_impute_values(train_X, val_X, train_y, val_y):
    new_imputed_train_X = train_X.copy()
    new_imputed_val_X = val_X.copy()

    for col in cols_with_nan:
        new_imputed_train_X[col + 'missing'] = new_imputed_train_X[col].isnull()
        new_imputed_val_X[col + 'missing'] = new_imputed_val_X[col].isnull()

    new_imputed_train_X = my_imputer.fit_transform(new_imputed_train_X)
    new_imputed_val_X = my_imputer.fit_transform(new_imputed_val_X)

    new_impute_MAE=calculate_MAE(new_imputed_train_X, new_imputed_val_X, train_y, val_y)
    
    return new_impute_MAE


# Handling catergorical data that is column with object type (not float values)

### Drop columns with object type 

In [94]:
#print(features_X.dtypes)

#Remove columns that are non float from the features
drop_catergorical_X = features_X.select_dtypes(exclude=['object'])
#Split the data to train and text set 
train_X, val_X, train_y, val_y = train_test_split(drop_catergorical_X, target_y, random_state = 1)

drop_nan_MAE = drop_nan_values(train_X, val_X, train_y, val_y)
impute_MAE = impute_values(train_X, val_X, train_y, val_y)
extended_impute_MAE = extended_impute_values(train_X, val_X, train_y, val_y)

print("MAE from dropping columns with Missing Values is {}".format(drop_nan_MAE))
print("MAE from Imputation is {}".format(impute_MAE))
print("MAE from Imputation while Track What Was Imputed is {}".format(extended_impute_MAE))

['LotFrontage', 'MasVnrArea', 'GarageYrBlt']
MAE from dropping columns with Missing Values is 17720.05506849315
MAE from Imputation is 18286.271232876712
MAE from Imputation while Track What Was Imputed is 18544.07123287671


### One-Hot Encoding : The Standard Approach for Categorical Data

In [98]:
#Handling non float columns with one-hot-encoding
one_hot_features_X = pd.get_dummies(features_X)

train_X, val_X, train_y, val_y = train_test_split(one_hot_features_X, target_y, random_state = 1)

 
#one_hot_train_X = pd.get_dummies(train_X)
#one_hot_val_X = pd.get_dummies(val_X)

drop_nan_MAE = drop_nan_values(train_X, val_X, train_y, val_y)
impute_MAE = impute_values(train_X, val_X, train_y, val_y)
extended_impute_MAE = extended_impute_values(train_X, val_X, train_y, val_y)

print("Using One hot encoder MAE from dropping columns with Missing Values is {}".format(drop_nan_MAE))
print("Using One hot encoder  MAE from Imputation is {}".format(impute_MAE))
print("Using One hot encoder MAE from Imputation while Track What Was Imputed is {}".format(extended_impute_MAE))


['LotFrontage', 'MasVnrArea', 'GarageYrBlt']
Using One hot encoder MAE from dropping columns with Missing Values is 17641.221369863015
Using One hot encoder  MAE from Imputation is 17974.474520547945
Using One hot encoder MAE from Imputation while Track What Was Imputed is 18490.50383561644
