In [33]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

In [13]:
# Load the Melbourne housing data from the notebook's working directory.
data = pd.read_csv('./melb_data.csv')

# Separate target from predictors
y = data.Price
predictors = data.drop(columns=['Price'])

# Drop every predictor that has at least one missing value, so no imputation
# is needed later. A new frame is built instead of mutating in place, which
# keeps this cell idempotent when re-run.
cols_missing = [col for col in predictors.columns if predictors[col].isnull().any()]
X = predictors.drop(columns=cols_missing)

# Keep text columns with fewer than 10 distinct values, plus the int64/float64 columns.
low_card_cols = [col for col in X.columns if X[col].nunique() < 10 and X[col].dtype == "object"]
num_cols = [col for col in X.columns if X[col].dtype in ['int64', 'float64']]

cols = low_card_cols + num_cols
x = X[cols].copy()

# Fixed seed: without it each run produces a different split, so the scores
# printed by the cells below would not be reproducible or comparable.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)


In [16]:
# Boolean mask over the training columns: True where the dtype is 'object'.
s = x_train.dtypes == 'object'
# Collect the names of the columns flagged by the mask, in column order.
object_cols = [column for column, is_object in s.items() if is_object]
print("Categorical Variables", object_cols, sep="\n")

Categorical Variables
['Type', 'Method', 'Regionname']


In [20]:
def score_dataset(X_train, x_test, y_train, y_valid):
    """Fit a random forest and return its mean absolute error.

    A RandomForestRegressor with 100 trees and random_state=0 is fitted on
    (X_train, y_train); its predictions for x_test are compared against
    y_valid with mean_absolute_error.

    Parameters:
        X_train: training features.
        x_test: features to predict on.
        y_train: training targets.
        y_valid: true targets corresponding to x_test.

    Returns:
        The mean absolute error of the predictions (lower is better).
    """
    forest = RandomForestRegressor(n_estimators=100, random_state=0)
    forest.fit(X_train, y_train)
    return mean_absolute_error(y_valid, forest.predict(x_test))

# Drop Categorical Columns

In [29]:
# Baseline approach: discard every object-typed column from both halves of
# the split, then score the model on what remains.
drop_x_train, drop_x_test = (
    frame.select_dtypes(exclude=['object']) for frame in (x_train, x_test)
)
score = score_dataset(drop_x_train, drop_x_test, y_train, y_test)
print(f'Score for Dropping Categorical Columns: {score}')

Score for Dropping Categorical Columns: 181602.13464148482


# Ordinal Encoder

In [31]:
# Work on copies so the original split stays untouched for the other approaches.
label_x_train = x_train.copy()
label_x_test = x_test.copy()

# Categories are learned from the training split only. A category that shows
# up only in the test split would make a default OrdinalEncoder raise
# ValueError on transform; map such values to -1 instead, mirroring the
# handle_unknown='ignore' choice made for the one-hot encoder.
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
label_x_train[object_cols] = ordinal_encoder.fit_transform(x_train[object_cols])
label_x_test[object_cols] = ordinal_encoder.transform(x_test[object_cols])

score_label = score_dataset(label_x_train, label_x_test, y_train, y_test)
print(f'Score for Ordinal Encoding Categorical Columns: {score_label}')

Score for Ordinal Encoding Categorical Columns: 171054.69997352548


# One-Hot Encoder

In [58]:
# handle_unknown='ignore' encodes categories unseen during fit as all zeros;
# sparse_output=False returns a dense array that can be wrapped in a DataFrame.
OH_ENCODER = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# Learn the categories from the training split only.
OH_ENCODER.fit(x_train[object_cols])

# Name the encoded columns after their source feature and category rather
# than leaving them as positional integers (0, 1, 2, ...).
oh_feature_names = OH_ENCODER.get_feature_names_out()

# transform() returns a bare array, so the original row index is restored
# here; otherwise the concat below would misalign the rows.
train_cols = pd.DataFrame(
    OH_ENCODER.transform(x_train[object_cols]),
    columns=oh_feature_names,
    index=x_train.index,
)
test_cols = pd.DataFrame(
    OH_ENCODER.transform(x_test[object_cols]),
    columns=oh_feature_names,
    index=x_test.index,
)

# Non-object columns are carried over as they are.
num_cols_train = x_train.select_dtypes(exclude=['object'])
num_cols_test = x_test.select_dtypes(exclude=['object'])

# Every column name is already a string, so no cast of the column labels is
# needed before handing the frames to scikit-learn.
x_train_oh = pd.concat([num_cols_train, train_cols], axis=1)
x_test_oh = pd.concat([num_cols_test, test_cols], axis=1)

OH_SCORE = score_dataset(x_train_oh, x_test_oh, y_train, y_test)
print(f" Score For One-Hot Encoding: {OH_SCORE}")


 Score For One-Hot Encoding: 169818.55641173993
