# Setup

In [26]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder

# Data

In [2]:
melb_data = pd.read_csv('melb_data.csv')

# Defining target Variable and features

In [3]:
# Price column will be the target variable
y = melb_data.Price

In [4]:
# Now taking features with numeric values and dropping Price
mel_demo = melb_data.drop(['Price'], axis=1)
X = mel_demo.select_dtypes(exclude=['object'])

# Train and Test data

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X,y,train_size=0.8, random_state=0)

In [7]:
# Defining a function that shows quality of each approach
def score_data(X_train, X_test, y_train, y_test):
    model = RandomForestRegressor(n_estimators=10, random_state=0)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return mean_absolute_error(y_test, y_pred)

# Approach 1

## removing null columns

In [8]:
# using the data after removing null values
col_null=[col for col in X_train.columns if X_train[col].isnull().any()]

# dropping those columns in train and test data
modi_X_train = X_train.drop(col_null, axis=1)
modi_X_test = X_test.drop(col_null, axis=1)

# now calling the function
print('MAE with first aprroach')
print(score_data(modi_X_train, modi_X_test, y_train, y_test))

MAE with first aprroach
183550.22137772635


# Approach 2

## Imputation

In [10]:
# definin imputer
imputer = SimpleImputer()

# applying imputation to train and test data
imputed_X_train = pd.DataFrame(imputer.fit_transform(X_train))
imputed_X_test = pd.DataFrame(imputer.fit_transform(X_test))

# adding columns removed by imputation
imputed_X_train.columns = X_train.columns
imputed_X_test.columns = X_test.columns

c

MAE with Imputation(Approach 2)
179816.89508731329


# Approach 3

## Extension to Imputation

In [12]:
# making copy of the original data
X_train_plus = X_train.copy()
X_test_plus = X_test.copy()

# adding columns to indicate imputed data
for col in col_null:
    X_train_plus[col + '_missing'] = X_train_plus[col].isnull()
    X_test_plus[col + '_missing'] = X_test_plus[col].isnull()
    
# imputer
imputer = SimpleImputer()

#applying
imputed_X_train_plus = pd.DataFrame(imputer.fit_transform(X_train_plus))
imputed_X_test_plus = pd.DataFrame(imputer.fit_transform(X_test_plus))

# adding columns removed by imputation
imputed_X_train_plus.columns = X_train_plus.columns
imputed_X_test_plus.columns = X_test_plus.columns

# printing the results using this approach
print("MAE with Extension toImputation(Approach 3)")
print(score_data(imputed_X_train_plus, imputed_X_test_plus, y_train, y_test))



MAE with Extension toImputation(Approach 3)
179986.2708570026


# Categorical Variables

In [13]:
# new features including categorical values
X_new = melb_data.drop(['Price'], axis=1)

In [14]:
# Train and Test
X2_train, X2_test, y_train,y_test = train_test_split(X_new, y, test_size=0.2, random_state=0)


In [15]:
# dropping columns with missing values
col_miss  = [col for col in X2_train.columns if X2_train[col].isnull().any()]
X2_train.drop(col_miss, axis=1, inplace=True)
X2_test.drop(col_miss, axis=1, inplace=True)

In [17]:
# selecting low number of unique values in a object column
low_card = [cname for cname in X2_train.columns if X2_train[cname].nunique()<10
           and X2_train[cname].dtype == 'object']

# selecting low number of unique values in a numeric column
num_cols = [cname for cname in X2_train.columns 
            if X2_train[cname].dtype in ['int64', 'float64']]

# keeping low cardinality and numerical columns only
keep_cols = low_card + num_cols
X_new_train = X2_train[keep_cols].copy()
X_new_test = X2_test[keep_cols].copy()

In [18]:
X_new_train

Unnamed: 0,Type,Method,Regionname,Rooms,Distance,Postcode,Bedroom2,Bathroom,Landsize,Lattitude,Longtitude,Propertycount
12167,u,S,Southern Metropolitan,1,5.0,3182.0,1.0,1.0,0.0,-37.85984,144.98670,13240.0
6524,h,SA,Western Metropolitan,2,8.0,3016.0,2.0,2.0,193.0,-37.85800,144.90050,6380.0
8413,h,S,Western Metropolitan,3,12.6,3020.0,3.0,1.0,555.0,-37.79880,144.82200,3755.0
2919,u,SP,Northern Metropolitan,3,13.0,3046.0,3.0,1.0,265.0,-37.70830,144.91580,8870.0
6043,h,S,Western Metropolitan,3,13.3,3020.0,3.0,1.0,673.0,-37.76230,144.82720,4217.0
...,...,...,...,...,...,...,...,...,...,...,...,...
13123,h,SP,Northern Metropolitan,3,5.2,3056.0,3.0,1.0,212.0,-37.77695,144.95785,11918.0
3264,h,S,Eastern Metropolitan,3,10.5,3081.0,3.0,1.0,748.0,-37.74160,145.04810,2947.0
9845,h,PI,Northern Metropolitan,4,6.7,3058.0,4.0,2.0,441.0,-37.73572,144.97256,11204.0
10799,h,S,Northern Metropolitan,3,12.0,3073.0,3.0,1.0,606.0,-37.72057,145.02615,21650.0


In [21]:
# getting list of categorical vars
obj  = (X_new_train.dtypes == 'object')
cat_vars = list(obj[obj].index)

print('Categorical variables are:')
print(cat_vars)

Categorical variables are:
['Type', 'Method', 'Regionname']


# Categorical data approaches


## Approach 1 (removing categorical values)

In [24]:
# defining a function
def score_dataset(X_train, X_test, y_train, y_test):
    model = RandomForestRegressor(n_estimators=100, random_state=0)
    model.fit(X_train, y_train)
    y_preds = model.predict(X_test)
    return mean_absolute_error(y_test, y_preds)

In [25]:
# dropping categorical columns
drop_X_new_train = X_new_train.select_dtypes(exclude=['object'])
drop_X_new_test = X_new_test.select_dtypes(exclude=['object'])

# printing the result
print('MAE after dropping categorical values')
print(score_dataset(drop_X_new_train, drop_X_new_test, y_train, y_test))

MAE after dropping categorical values
175703.48185157913


## Approach 2 (ordinal Encoding the categorical values)

In [27]:
# copying th values
label_X_train = X_new_train.copy()
label_X_test = X_new_test.copy()

# defining encoder
ordinal_encoder = OrdinalEncoder()
label_X_train[cat_vars] = ordinal_encoder.fit_transform(X_new_train[cat_vars])
label_X_test[cat_vars] = ordinal_encoder.fit_transform(X_new_test[cat_vars])

#printing the result
print('MAE after ordinal encoding categorical values')
print(score_dataset(label_X_train, label_X_test, y_train, y_test))



MAE after ordinal encoding categorical values
165936.40548390493
