# Summary

I will perform the following feature engineering tasks to see their impact on prediction. Data in this notebook is from Kaggle house price prediction (https://www.kaggle.com/c/house-prices-advanced-regression-techniques).

+ try different encoding for categorical variables
+ feature selection

In [3]:
import os

# Scientific libraries
import numpy as np
# import scipy
import pandas as pd

# Graphic libraries
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set(style='ticks', context='talk')

# Creating alias for magic commands
%alias_magic t time

#tools for modeling
from sklearn.linear_model import LinearRegression
import sklearn.metrics as metrics
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV

Created `%t` as an alias for `%time`.
Created `%%t` as an alias for `%%time`.


## Prepare data

In [4]:
# folder = '/kaggle/input/house-prices-advanced-regression-techniques/'
folder = 'data'

train = pd.read_csv(os.path.join(folder, 'train.csv'))
test = pd.read_csv(os.path.join(folder, 'test.csv'))

print(train.shape)
print(test.shape)

(1460, 81)
(1459, 80)


In [5]:
train.columns = [str.lower(cc) for cc in train.columns]
test.columns = [str.lower(cc) for cc in test.columns]

In [6]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             1460 non-null   int64  
 1   mssubclass     1460 non-null   int64  
 2   mszoning       1460 non-null   object 
 3   lotfrontage    1201 non-null   float64
 4   lotarea        1460 non-null   int64  
 5   street         1460 non-null   object 
 6   alley          91 non-null     object 
 7   lotshape       1460 non-null   object 
 8   landcontour    1460 non-null   object 
 9   utilities      1460 non-null   object 
 10  lotconfig      1460 non-null   object 
 11  landslope      1460 non-null   object 
 12  neighborhood   1460 non-null   object 
 13  condition1     1460 non-null   object 
 14  condition2     1460 non-null   object 
 15  bldgtype       1460 non-null   object 
 16  housestyle     1460 non-null   object 
 17  overallqual    1460 non-null   int64  
 18  overallc

A quick scan tells us that there are features with very few data: `poolqc`, `miscfeature`, `fence` and `alley`. As these features are likely unimportant, we should drop them.

In [10]:
# concat train, test to a single dataset to ensure transformations are always done on both
test['saleprice'] = 0
data = pd.concat([train, test])

In [11]:
ids = data['id']
data = data.drop(columns=['id', 'poolqc', 'miscfeature', 'fence', 'alley'])

In [102]:
num_vars = 
check_na(data)

Unnamed: 0,column,na_count
55,fireplacequ,1420
2,lotfrontage,486
58,garagefinish,159
62,garagecond,159
61,garagequal,159
57,garageyrblt,159
56,garagetype,157
29,bsmtcond,82
30,bsmtexposure,82
28,bsmtqual,81


In [None]:
data[to_fill] = data[to_fill].fillna(data[to_fill].mean())

In [12]:
# as type of dwelling is encoded as numeric, we should declare it as category
data['mssubclass'] = data['mssubclass'].astype('category')

In [14]:
# identify categorical variables
cat_vars = list(data.select_dtypes(include=['object', 'category']).columns)
print('# categorical variables:', len(cat_vars))

# categorical variables: 40


In [49]:
# declare target
target = 'saleprice'

# Encode categorical variables

In [15]:
import category_encoders as ce

## Onehot encoding as baseline

In [94]:
def onehot_encode(cat_feat, data, dummy_na=False):
    # given a categorical column,
    # perform onehot encode and return encoded DF together with names of new binary columns
    categories = data[cat_feat].unique()
    print('there are', len(categories), 'categories as follows:')
    print(categories)
    
    encoded = pd.get_dummies(data[cat_feat], prefix=cat_feat, dummy_na=dummy_na)
    res = pd.concat([data.drop(columns=[cat_feat]), encoded], axis='columns')
    new_feat_names = ['_'.join([cat_feat, str(cc)]) for cc in categories]
    return res, new_feat_names

def encode_cat_feats(data, cat_feats, dummy_na=False):
    print('Onehot encode categorical features: ', cat_feats)

    encoded_df = data.copy()
    
    # encode 1 cat feature at a time
    new_feat_names = []
    for cf in cat_feats:
        encoded_df, new_names = onehot_encode(cf, encoded_df, dummy_na=dummy_na)
        new_feat_names += new_names

    return encoded_df, new_feat_names

In [50]:
def get_train_tests(data, target):
    train_part = data.loc[data[target] > 0]
    test_part = data.loc[data[target] == 0]
    return train_part, test_part

In [73]:
def check_na(data):
    # return the columns having NAs, sorted descendingly by their number of NAs
    na_count = [sum(data[ff].isnull()) for ff in data.columns]
    return pd.DataFrame({'column': data.columns, 'na_count': na_count}).\
              query('na_count > 0').sort_values('na_count', ascending=False)  

In [77]:
## reuse base features from predictive model notebook
# %store -r feats
# print('# base features:', len(feats))
# print(feats)

# # drop some
# feats = list(np.setdiff1d(feats, ['exterqual', 'kitchenqual', 'bsmtqual', 'masvnrarea', 'totalbsmtsf'])) # last 2 cols have NAs

# base features: 50
['overallqual', 'yearbuilt', 'mosold', 'yrsold', 'grlivarea', 'lotarea', 'bedroomabvgr', 'fullbath', 'halfbath', 'kitchenabvgr', 'totrmsabvgrd', 'totalbsmtsf', '1stflrsf', '2ndflrsf', 'lowqualfinsf', 'masvnrarea', 'exterqual', 'kitchenqual', 'bsmtqual', 'neighborhood_CollgCr', 'neighborhood_Veenker', 'neighborhood_Crawfor', 'neighborhood_NoRidge', 'neighborhood_Mitchel', 'neighborhood_Somerst', 'neighborhood_NWAmes', 'neighborhood_OldTown', 'neighborhood_BrkSide', 'neighborhood_Sawyer', 'neighborhood_NridgHt', 'neighborhood_NAmes', 'neighborhood_SawyerW', 'neighborhood_IDOTRR', 'neighborhood_MeadowV', 'neighborhood_Edwards', 'neighborhood_Timber', 'neighborhood_Gilbert', 'neighborhood_StoneBr', 'neighborhood_ClearCr', 'neighborhood_NPkVill', 'neighborhood_Blmngtn', 'neighborhood_BrDale', 'neighborhood_SWISU', 'neighborhood_Blueste', 'foundation_PConc', 'foundation_CBlock', 'foundation_BrkTil', 'foundation_Wood', 'foundation_Slab', 'foundation_Stone']


In this set of features, I already use onehot encode for `neighborhood` and `foundation`

In [96]:
encoded_data, new_feat_names = encode_cat_feats( data, cat_feats=cat_vars, dummy_na=True)
print(encoded_data.shape)

Onehot encode categorical features:  ['mssubclass', 'mszoning', 'street', 'lotshape', 'landcontour', 'utilities', 'lotconfig', 'landslope', 'neighborhood', 'condition1', 'condition2', 'bldgtype', 'housestyle', 'roofstyle', 'roofmatl', 'exterior1st', 'exterior2nd', 'masvnrtype', 'exterqual', 'extercond', 'foundation', 'bsmtqual', 'bsmtcond', 'bsmtexposure', 'bsmtfintype1', 'bsmtfintype2', 'heating', 'heatingqc', 'centralair', 'electrical', 'kitchenqual', 'functional', 'fireplacequ', 'garagetype', 'garagefinish', 'garagequal', 'garagecond', 'paveddrive', 'saletype', 'salecondition']
there are 16 categories as follows:
[60, 20, 70, 50, 190, ..., 160, 75, 180, 40, 150]
Length: 16
Categories (16, int64): [60, 20, 70, 50, ..., 75, 180, 40, 150]
there are 6 categories as follows:
['RL' 'RM' 'C (all)' 'FV' 'RH' nan]
there are 2 categories as follows:
['Pave' 'Grvl']
there are 4 categories as follows:
['Reg' 'IR1' 'IR2' 'IR3']
there are 4 categories as follows:
['Lvl' 'Bnk' 'Low' 'HLS']
there a

In [97]:
train, test = get_train_tests(encoded_data, target)
print(train.shape)
print(test.shape)

(1460, 331)
(1459, 331)


In [100]:
X_train = train.drop(columns= [target]) ; y_train = train[target]
X_test = test.drop(columns= [target])

In [101]:
print(check_na(X_train))
print(check_na(X_test))

         column  na_count
0   lotfrontage       259
23  garageyrblt        81
6    masvnrarea         8
          column  na_count
0    lotfrontage       227
23   garageyrblt        78
6     masvnrarea        15
15  bsmtfullbath         2
16  bsmthalfbath         2
7     bsmtfinsf1         1
8     bsmtfinsf2         1
9      bsmtunfsf         1
10   totalbsmtsf         1
24    garagecars         1
25    garagearea         1


In [None]:
from sklearn.ensemble import RandomForestRegressor
base_rf = RandomForestRegressor(n_estimators=100, max_features=1.0, n_jobs=-1,
                               random_state=1,
                               )

In [80]:
base_rf.fit(X_train, y_train)
base_rf.score(X_train, y_train)

0.9791964728783473

## Target encoding

In [85]:
train, test = get_train_tests(data, target)

In [86]:
target_enc = ce.TargetEncoder(cols=to_encode)
target_enc.fit(train[to_encode] , train[target])
print(target_enc.transform(train[to_encode]).head())

    neighborhood     foundation
0  197965.773333  225230.442040
1  238770.100937  149805.714511
2  197965.773333  225230.442040
3  210624.725490  132291.075342
4  335295.317073  225230.442040


In [87]:
# Transform the features, 
# rename the columns with _target suffix, and join to dataframe
# also remove old categ vars
train_TE = train.join(target_enc.transform(train[to_encode]).add_suffix('_target')).drop(columns=to_encode)
test_TE = test.join(target_enc.transform(test[to_encode]).add_suffix('_target')).drop(columns=to_encode)

In [89]:
features = train_TE.columns.drop([target])
print(features)

Index(['mssubclass', 'mszoning', 'lotfrontage', 'lotarea', 'street',
       'lotshape', 'landcontour', 'utilities', 'lotconfig', 'landslope',
       'condition1', 'condition2', 'bldgtype', 'housestyle', 'overallqual',
       'overallcond', 'yearbuilt', 'yearremodadd', 'roofstyle', 'roofmatl',
       'exterior1st', 'exterior2nd', 'masvnrtype', 'masvnrarea', 'exterqual',
       'extercond', 'bsmtqual', 'bsmtcond', 'bsmtexposure', 'bsmtfintype1',
       'bsmtfinsf1', 'bsmtfintype2', 'bsmtfinsf2', 'bsmtunfsf', 'totalbsmtsf',
       'heating', 'heatingqc', 'centralair', 'electrical', '1stflrsf',
       '2ndflrsf', 'lowqualfinsf', 'grlivarea', 'bsmtfullbath', 'bsmthalfbath',
       'fullbath', 'halfbath', 'bedroomabvgr', 'kitchenabvgr', 'kitchenqual',
       'totrmsabvgrd', 'functional', 'fireplaces', 'fireplacequ', 'garagetype',
       'garageyrblt', 'garagefinish', 'garagecars', 'garagearea', 'garagequal',
       'garagecond', 'paveddrive', 'wooddecksf', 'openporchsf',
       'enclosedporc

In [91]:
X_train = train_TE[features]; y_train = train_TE[target]
X_test = test_TE[features]

In [92]:
base_rf.fit(X_train, y_train)
base_rf.score(X_train, y_train)

ValueError: could not convert string to float: 'RL'

## Catboost encoding