In [2]:
import pandas as pd

mel_train_data = pd.read_csv("../datasets/mel_house_train.csv")
mel_test_data = pd.read_csv("../datasets/mel_house_test.csv")

mel_train_data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [5]:
# Drop house records where target value is missing
mel_train_data.dropna(axis=0, subset=['SalePrice'], inplace=True)

target = mel_train_data.SalePrice

### Getting the predictors

In [10]:
missing_cols = [col for col in mel_train_data.columns if mel_train_data[col].isnull().any()]

# Candidate predictors for train/test
candidate_train_predictors = mel_train_data.drop(['Id', 'SalePrice']+ missing_cols, axis=1)
candidate_test_predictors = mel_test_data.drop(['Id'] + missing_cols, axis=1)


### Identifying the cardianility of the column. 

Cardianility means how many unique values are there in a column.


In [17]:
low_cardianility_cols = [col for col in candidate_train_predictors.columns if
                            candidate_train_predictors[col].nunique() < 10 and 
                            candidate_train_predictors[col].dtype == 'object']

numeric_cols = [col for col in candidate_train_predictors.columns if 
                candidate_train_predictors[col].dtype in ['int64', 'float64']]

my_cols = low_cardianility_cols + numeric_cols

train_predictors = candidate_train_predictors[my_cols]
test_predictors = candidate_test_predictors[my_cols]

In [21]:
# Random sample selection
train_predictors.dtypes.sample(10)

3SsnPorch        int64
LowQualFinSF     int64
HeatingQC       object
PavedDrive      object
BsmtUnfSF        int64
YearBuilt        int64
RoofMatl        object
Utilities       object
MSSubClass       int64
ExterCond       object
dtype: object

In [22]:
ohe_train_predictors = pd.get_dummies(train_predictors)
ohe_train_predictors

Unnamed: 0,MSSubClass,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,60,8450,7,5,2003,2003,706,0,150,856,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,20,9600,6,8,1976,1976,978,0,284,1262,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,60,11250,7,5,2001,2002,486,0,434,920,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,70,9550,7,5,1915,1970,216,0,540,756,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
4,60,14260,8,5,2000,2000,655,0,490,1145,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
5,50,14115,5,5,1993,1995,732,0,64,796,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
6,20,10084,8,5,2004,2005,1369,0,317,1686,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
7,60,10382,7,6,1973,1973,859,32,216,1107,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
8,50,6120,7,5,1931,1950,0,0,952,952,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
9,190,7420,5,6,1939,1950,851,0,140,991,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


### Evaluating with Classifier

Calculate MAE on both type of predictors

1. One-Hot Encoded Predictors and numeric predictors
2. Numeric predictors

In [23]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

# Function to calculate MAE
def get_mae(X, y):
    
    # multiple with -1 incase if it returns a negative value
    return -1 * cross_val_score(RandomForestRegressor(50), X, y, scoring='neg_mean_absolute_error').mean()

In [25]:
predictors_without_cat = train_predictors.select_dtypes(exclude=['object'])

mae_without_categorical = get_mae(predictors_without_cat, target)
mae_one_hot_encoded = get_mae(ohe_train_predictors, target)

In [27]:
print("Mean absolute error when dropping categories : "+ str(int(mae_without_categorical)))
print("Mean absolute error when one-hot encoding is done : "+ str(int(mae_one_hot_encoded)))

Mean absolute error when dropping categories : 18268
Mean absolute error when one-hot encoding is done : 18064


### Align the columns when multiple data files are used

If the training datasets and test datasets get misaligned, the results will be wrong. This could happen if the categorical data has different number of values in the training data vs test data. 

In [30]:
ohe_test_predictors = pd.get_dummies(test_predictors)

final_train, final_test = ohe_train_predictors.align(ohe_test_predictors, join='left', axis=1)
final_test

Unnamed: 0,MSSubClass,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,20,11622,5,6,1961,1961,468.0,144.0,270.0,882.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,20,14267,6,6,1958,1958,923.0,0.0,406.0,1329.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,60,13830,5,5,1997,1998,791.0,0.0,137.0,928.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,60,9978,6,6,1998,1998,602.0,0.0,324.0,926.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,120,5005,8,5,1992,1992,263.0,0.0,1017.0,1280.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
5,60,10000,6,5,1993,1994,0.0,0.0,763.0,763.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
6,20,7980,6,7,1992,2007,935.0,0.0,233.0,1168.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
7,60,8402,6,5,1998,1998,0.0,0.0,789.0,789.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
8,20,10176,7,5,1990,1990,637.0,0.0,663.0,1300.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
9,20,8400,4,5,1970,1970,804.0,78.0,0.0,882.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
