In [12]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor

# Create some syntetic data
size = 10000

Year = np.random.randint(1998, high=2021, size=size, dtype=int)
LotArea = np.random.randint(50, high=2001, size=size, dtype=int)
nFlor = np.random.randint(1, high=11, size=size, dtype=int)
BedAbv = np.random.normal(loc=50.0, scale=5.0, size=size)
county = np.random.randint(1, high=51, size=size, dtype=int)

# Syntetic categorical data
locs = np.array(['north', 'south', 'center','east','west'])
Location = np.tile(locs, int(size/5))
np.random.shuffle(Location)

neigh = np.array(['Croxton','Croxton East','Dennis','Merri','Northcote South','Ruckers Hill','Westgarth',
'Preston','Bell','Darebin Park','Gilberton','Northland','Preston Lake','Regent','Regent West','South Preston',
'Sylvester','Reservoir','Gilbank','Regent','Reservoir North','Ruthven','Summerhill','Thornbury','Thornbury North'])
Neighborhood = np.tile(neigh, int(size/25))
np.random.shuffle(Location)


# Define a function to obtain prices for later testing the model
def fun_price(y,lot,flor,bed,county):    
    return np.sin(y/2020)*lot*flor + bed - county/2

def col_price(row):
    y = row.Year
    flor = row.nFlor
    lot = row.LotArea
    bed = row.BedAbv
    county = row.county
    return int(fun_price(y,lot,flor,bed,county))

home_data = pd.DataFrame({'Year':Year, 'LotArea':LotArea, 'nFlor':nFlor,'BedAbv':BedAbv, 
                          'county':county, 'Location':Location,'Neighborhood':Neighborhood})
home_data['Price'] = home_data.apply(col_price,axis=1)
home_data

In [2]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

# Regression Tree Model Fit
y = home_data.Price
feature_columns = ['Year', 'LotArea', 'nFlor', 'BedAbv', 'county','Location','Neighborhood']
X = home_data[feature_columns]
# split data into training and validation data, for both features and target
X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state = 0)

# Function for comparing different approaches
def score_dataset(X_train, X_valid, y_train, y_valid):
    model = RandomForestRegressor(n_estimators=100, random_state=0)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return mean_absolute_error(y_valid, preds)

In [3]:
# 1. Drop columns with catrgorical values
drop_X_train = X_train.select_dtypes(exclude=['object'])
drop_X_valid = X_valid.select_dtypes(exclude=['object'])

print("MAE from Approach 1 (Drop categorical variables):")
print(score_dataset(drop_X_train, drop_X_valid, y_train, y_valid))

MAE from Approach 1 (Drop categorical variables):
14.642668000000002


In [4]:
# 2. Ordinal Encoding

# Before jumping, sometimes there are categorical values in the X_valid but not in the X_train, 
# The easiest way is to drop these columns

# Categorical columns in the training data
object_cols = [col for col in X_train.columns if X_train[col].dtype == "object"]

# Columns that can be safely ordinal encoded
good_label_cols = [col for col in object_cols if 
                   set(X_valid[col]).issubset(set(X_train[col]))]
        
# Problematic columns that will be dropped from the dataset
bad_label_cols = list(set(object_cols)-set(good_label_cols))
        
print('Categorical columns that will be ordinal encoded:', good_label_cols)
print('\nCategorical columns that will be dropped from the dataset:', bad_label_cols)

Categorical columns that will be ordinal encoded: ['Location', 'Neighborhood']

Categorical columns that will be dropped from the dataset: []


In [5]:
# 2.1 Ordinal Encoding

from sklearn.preprocessing import OrdinalEncoder

# Drop categorical columns that will not be encoded (in this case does nothing)
label_X_train = X_train.drop(bad_label_cols, axis=1)
label_X_valid = X_valid.drop(bad_label_cols, axis=1)

# Apply ordinal encoder to each column with categorical data
ordinal_encoder = OrdinalEncoder()
label_X_train[good_label_cols] = ordinal_encoder.fit_transform(X_train[good_label_cols])
label_X_valid[good_label_cols] = ordinal_encoder.transform(X_valid[good_label_cols])

print("MAE from Approach 2 (Ordinal Encoding):") 
print(score_dataset(label_X_train, label_X_valid, y_train, y_valid))

MAE from Approach 2 (Ordinal Encoding):
15.125080000000004


In [6]:
label_X_train

Unnamed: 0,Year,LotArea,nFlor,BedAbv,county,Location,Neighborhood
2967,2007,607,10,50.352921,28,2.0,14.0
700,2005,702,5,44.468571,31,4.0,1.0
3481,2002,309,5,45.485570,22,1.0,23.0
1621,2008,1920,9,49.558706,46,0.0,17.0
800,2010,762,8,46.206240,36,1.0,1.0
...,...,...,...,...,...,...,...
9225,2001,1821,9,46.605351,29,1.0,1.0
4859,2019,1107,5,51.169145,45,2.0,3.0
3264,2005,75,8,46.168854,29,0.0,13.0
9845,2001,935,4,53.588139,17,2.0,15.0


In [7]:
# 3.0 One-hot encoding

# Typically will only one-hot encode columns with relatively low cardinality. 
# One-hot encoding can greatly expand the size of the dataset.

# Columns that will be one-hot encoded
low_cardinality_cols = [col for col in object_cols if X_train[col].nunique() < 10]

# Columns that will be dropped from the dataset
high_cardinality_cols = list(set(object_cols)-set(low_cardinality_cols))

print('Categorical columns that will be one-hot encoded:', low_cardinality_cols)
print('\nCategorical columns that will be dropped from the dataset:', high_cardinality_cols)

Categorical columns that will be one-hot encoded: ['Location']

Categorical columns that will be dropped from the dataset: ['Neighborhood']


In [8]:
# 3.1 One-hot encoding

from sklearn.preprocessing import OneHotEncoder

# Apply one-hot encoder to each column with categorical data
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[low_cardinality_cols]))
OH_cols_valid = pd.DataFrame(OH_encoder.transform(X_valid[low_cardinality_cols]))

# One-hot encoding removed index; put it back
OH_cols_train.index = X_train.index
OH_cols_valid.index = X_valid.index

# Remove categorical columns (will replace with one-hot encoding)
num_X_train = X_train.drop(object_cols, axis=1)
num_X_valid = X_valid.drop(object_cols, axis=1)

# Add one-hot encoded columns to numerical features
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)

print("MAE from Approach 3 (One-Hot Encoding):") 
print(score_dataset(OH_X_train, OH_X_valid, y_train, y_valid))

MAE from Approach 3 (One-Hot Encoding):




14.837995999999999




In [9]:
# Watch extra columns created by OneHotEncoder
OH_X_train

Unnamed: 0,Year,LotArea,nFlor,BedAbv,county,0,1,2,3,4
2967,2007,607,10,50.352921,28,0.0,0.0,1.0,0.0,0.0
700,2005,702,5,44.468571,31,0.0,0.0,0.0,0.0,1.0
3481,2002,309,5,45.485570,22,0.0,1.0,0.0,0.0,0.0
1621,2008,1920,9,49.558706,46,1.0,0.0,0.0,0.0,0.0
800,2010,762,8,46.206240,36,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
9225,2001,1821,9,46.605351,29,0.0,1.0,0.0,0.0,0.0
4859,2019,1107,5,51.169145,45,0.0,0.0,1.0,0.0,0.0
3264,2005,75,8,46.168854,29,1.0,0.0,0.0,0.0,0.0
9845,2001,935,4,53.588139,17,0.0,0.0,1.0,0.0,0.0


In [10]:
# BOTH METHODS
Both_X_train = OH_X_train.copy()
Both_X_valid = OH_X_valid.copy()

Both_X_train['Neighborhood'] = label_X_train['Neighborhood']
Both_X_valid['Neighborhood'] = label_X_valid['Neighborhood']

print("MAE from Approach 4 (Both Encodings):") 
print(score_dataset(Both_X_train, Both_X_valid, y_train, y_valid))



MAE from Approach 4 (Both Encodings):
15.174712




In [11]:
Both_X_train

Unnamed: 0,Year,LotArea,nFlor,BedAbv,county,0,1,2,3,4,Neighborhood
2967,2007,607,10,50.352921,28,0.0,0.0,1.0,0.0,0.0,14.0
700,2005,702,5,44.468571,31,0.0,0.0,0.0,0.0,1.0,1.0
3481,2002,309,5,45.485570,22,0.0,1.0,0.0,0.0,0.0,23.0
1621,2008,1920,9,49.558706,46,1.0,0.0,0.0,0.0,0.0,17.0
800,2010,762,8,46.206240,36,0.0,1.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
9225,2001,1821,9,46.605351,29,0.0,1.0,0.0,0.0,0.0,1.0
4859,2019,1107,5,51.169145,45,0.0,0.0,1.0,0.0,0.0,3.0
3264,2005,75,8,46.168854,29,1.0,0.0,0.0,0.0,0.0,13.0
9845,2001,935,4,53.588139,17,0.0,0.0,1.0,0.0,0.0,15.0
