I will start with a standard random forest (RF) as base model. Then we will tune hyperparams of RF to achieve a better model. Finally, I will try XGBoost.

We will pick features based on analysis from EDA notebook.

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

from sklearn.model_selection import GridSearchCV, train_test_split, RandomizedSearchCV
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import  mean_absolute_error, mean_squared_error

In [3]:
!pip install -U importlib

Collecting importlib
  Downloading importlib-1.0.4.zip (7.1 kB)
Building wheels for collected packages: importlib
  Building wheel for importlib (setup.py): started
  Building wheel for importlib (setup.py): finished with status 'done'
  Created wheel for importlib: filename=importlib-1.0.4-py3-none-any.whl size=5862 sha256=3fca76519f9cf799e42885cb1aa470a5ca44847a55bcebae5623310ee67d7bda
  Stored in directory: c:\users\victor\appdata\local\pip\cache\wheels\86\e4\cb\62b0e9efd7da1e984baec0c0ded0b727a7ed25e1904ed51fca
Successfully built importlib
Installing collected packages: importlib
Successfully installed importlib-1.0.4


In [5]:
# folder = '/kaggle/input/house-prices-advanced-regression-techniques/'
folder = 'data'

train = pd.read_csv(os.path.join(folder, 'train.csv'))
test = pd.read_csv(os.path.join(folder, 'test.csv'))

print(train.shape)
print(test.shape)

train.head()

(1460, 81)
(1459, 80)


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [6]:
# concat train and test sets st we always perform transformation on both sets
test['SalePrice'] = 0
data = pd.concat([train, test])

print(data.shape)

# lowercase all column names for convenience
data.columns = [str.lower(cc) for cc in data.columns]

# sale price in thousands is better for plotting
data['sale_price_in_thousand'] = data['saleprice']/(10**3)

# sale price per square feet is also interested
data['sale_price_per_sf'] = data['saleprice'] / data['grlivarea']

(2919, 81)


There are two approaches:
+ predict directly sale price
+ predict price per SF, then multiply with living area to estimate sale price

I will try both, but first we need some helpers.

### Helper methods

In [7]:
def cal_age_from_built(row):
    return row['yrsold'] - row['yearbuilt']

def cal_age_from_remodel(row):
    return row['yrsold'] - row['yearremodadd']


def fold_zone_type(ms_zone):
    if ms_zone in ['FV', 'RH', 'C (all)']:
        return 'Other'
    else:
        return ms_zone
#         return {'RL': 'Residential Low Density'.lower() , 
#                 'RM': 'Residential Medium Density'.lower(),
#                 None: 'NA'
#                }[ms_zone]    

def to_adjacency(cond):
    if 'RR' in cond:
        return 'Railroad'
    if 'Pos' in cond:
        return 'Positive feature'
    return {
        'Artery': 'Arterial street',
        'Feedr': 'Feeder street',
        'Norm': 'Normal'    
        }[cond]

In [8]:
def onehot_encode(cat_feat, data, dummy_na=False):
    encoded = pd.get_dummies(data[cat_feat], prefix=cat_feat, dummy_na=dummy_na)
    res = pd.concat([data.drop(columns=[cat_feat]), encoded], axis='columns')
    return res

def encode_cat_feats(data, cat_feats, dummy_na=False):
    print('Onehot encode categorical features: ', cat_feats)

    encoded_df = data.copy()
    # encode 1 cat feature at a time
    for cf in cat_feats:
        encoded_df = onehot_encode(cf, encoded_df, dummy_na=dummy_na)

    return encoded_df

In [11]:
def list_numeric_columns(data):
    return list(data.columns[np.where(data.dtypes != 'object')])

def list_string_columns(data):
    return list(data.columns[np.where(data.dtypes == 'object')])

def split_train_valid(data, target):
    y = data.pop(target)
    X = data
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=.1, 
                                                          random_state=1
                                                         )
    return X_train, X_valid, y_train, y_valid

def check_na(data):
    # check if any NA left
    na_count = [sum(data[ff].isnull()) for ff in data.columns]
    return pd.DataFrame({'column': data.columns, 'na_count': na_count}).\
              query('na_count > 0')   

In [12]:
def to_quantitative(text_feat, df, scoring):
    '''
    Given a feature stored in data as text but actually a quantitative feat, convert it to numerical values
    via given encoding
    :param scoring:
    :param text_feat:
    :return:
    '''
    n_na = sum(df[text_feat].isnull())
    print('\t Feature {0} has {1} NAs, they will be filled by 0'.format(text_feat, n_na))

    res = df.copy()
    res[text_feat].fillna("NA", inplace=True)
    res[text_feat] = res[text_feat].apply(lambda form: scoring[form])
    return res

def quant_to_scores(quant_feats, data, scorings):
    print('\n Converting quantitative text features to scores...')
    score_dict = dict(zip(quant_feats, scorings))
    
    for tf in quant_feats:  
        data = to_quantitative(text_feat=tf, df=data, scoring=score_dict[tf])

    return data

In [43]:
def make_output(y_pred):
    test_index = range(len(train_part) + 1, len(data) + 1)
    return pd.DataFrame({'Id': test_index, 'SalePrice': y_pred})

## Preprocessing

In [70]:
na_checker = check_na(data)
na_checker = na_checker.sort_values('na_count', ascending=False)
na_checker

Unnamed: 0,column,na_count
72,poolqc,2909
74,miscfeature,2814
6,alley,2721
73,fence,2348
57,fireplacequ,1420
3,lotfrontage,486
64,garagecond,159
59,garageyrblt,159
63,garagequal,159
60,garagefinish,159


In [80]:
data['totalbsmtsf'].fillna(data['totalbsmtsf'].mean(), inplace=True)
data['bsmtunfsf'].fillna(data['bsmtunfsf'].mean(), inplace=True)

In [74]:
target = 'saleprice'
train_part = data.loc[data[target] > 0]
test_part = data.loc[data[target] == 0]

## Simple model
LInear regressors with no derived features.

In [55]:
# total bathroom
data['total_bath'] = data['fullbath'] + data['halfbath']

In [None]:
target = 'saleprice'

In [84]:
def get_train_tests(data, target):
    train_part = data.loc[data[target] > 0]
    test_part = data.loc[data[target] == 0]
    return train_part, test_part

In [85]:
train_part, test_part = get_train_tests(data, target)

In [46]:
feats0 = ['overallqual', 'yearbuilt', 'mosold', 'yrsold', 'grlivarea', 'lotarea']

In [47]:
y_train = train_part[target]
X_train = train_part[feats0]
X_test = test_part[feats0]

In [48]:
lr = LinearRegression()
lr.fit(X_train, y_train)
print('score of linear regressor', lr.score(X_train, y_train))

alphas = np.linspace(-3, 0, 4)
ridge = RidgeCV(alphas=alphas, cv=5)
ridge.fit(X_train, y_train)
print('score of ridge regressor', ridge.score(X_train, y_train))

score of linear regressor 0.7497036340620418
score of ridge regressor 0.7497036340620418


Ridge and base linear regressors have same score.

In [37]:
y_pred = ridge.predict(X_test)

In [44]:
ridge_res = make_output(ridge.predict(X_test))
ridge_res.head()

Unnamed: 0,Id,SalePrice
1454,2915,92308.551102
1455,2916,92462.926766
1456,2917,125227.592449
1457,2918,125937.080425
1458,2919,243743.740786


In [45]:
ridge_res.to_csv('output/ridge_pred.csv', index=False)

## Incrementally add features

In [57]:
# features for bathrooms, bedrooms
room_feats = ['bedroomabvgr', 'fullbath', 'halfbath', 'total_bath',
              'kitchenabvgr', 'totrmsabvgrd'
             ]

In [73]:
X_train = train_part[feats0 + room_feats]
X_test = test_part[feats0 + room_feats]

In [59]:
ridge.fit(X_train, y_train)
ridge.score(X_train, y_train)

  overwrite_a=True).T


0.7668613411201849

In [60]:
check_na(X_train)

Unnamed: 0,column,na_count


In [52]:
ridge_res = make_output(ridge.predict(X_test))
ridge_res.head()

Unnamed: 0,Id,SalePrice
0,1461,121729.187236
1,1462,155098.899738
2,1463,168843.642517
3,1464,192426.914222
4,1465,219777.022163


In [53]:
ridge_res.to_csv('output/ridge_pred_2.csv', index=False)

In [61]:
lr = LinearRegression()
lr.fit(X_train, y_train)
print('score of linear regressor', lr.score(X_train, y_train))

score of linear regressor 0.7668627922722758


In [94]:
# basement features
# a potential feature is ratio between unfinished basement area and total area

# data['bsmt_unfinished_ratio'] = data['bsmtunfsf'] / data['totalbsmtsf']
bsmt_feats = ['totalbsmtsf', ] 


+ adding bsmtunfsf pull down perf
+ bsmt_unfinished_ratio has NA

In [95]:
train_part, test_part = get_train_tests(data, target)

In [96]:
X_train = train_part[feats0 + room_feats + bsmt_feats]
X_test = test_part[feats0 + room_feats + bsmt_feats]

In [97]:
print(check_na(X_train))
print(check_na(X_test))

Empty DataFrame
Columns: [column, na_count]
Index: []
Empty DataFrame
Columns: [column, na_count]
Index: []


In [98]:
lr.fit(X_train, y_train)
lr.score(X_train, y_train)

0.7834926493506555

In [99]:
lin_res = make_output(lr.predict(X_test))
lin_res.head()

Unnamed: 0,Id,SalePrice
0,1461,122864.405726
1,1462,171362.088712
2,1463,173757.455656
3,1464,195865.8502
4,1465,215198.228143


In [100]:
lin_res.to_csv('output/lin_res.csv', index=False)

Drop columns with lots of NAs.

In [157]:
columns_with_lots_of_na = na_checker.head(11)['column']
print(columns_with_lots_of_na)
data = data.drop(columns=columns_with_lots_of_na)

72          poolqc
74     miscfeature
6            alley
73           fence
57     fireplacequ
3      lotfrontage
60    garagefinish
63      garagequal
64      garagecond
59     garageyrblt
58      garagetype
Name: column, dtype: object


### For numeric columns, fill NAs by mean

In [158]:
num_vars = list_numeric_columns(data)
data[num_vars].mean()
data[num_vars] = data[num_vars].fillna(data[num_vars].mean())

### Convert quantitative text columns to scores

In [145]:
six_scale = {"Ex": 5, "Gd": 4, "TA": 3, "Fa": 2, "Po": 1, "NA": 0}
quant_feats = ['utilities',
 'exterqual',
 'extercond',
 'heatingqc',
 'bsmtqual',
 'bsmtcond',
 'kitchenqual',
 'bsmtexposure',
 'bsmtfintype1'
]
scorings = [{"AllPub": 4, "NoSewr": 3, "NoSeWa": 2, "ELO": 1, "NA": 0},
            six_scale,
            six_scale,
            six_scale,
            six_scale,
            six_scale,
            six_scale,
            {"Gd": 4, "Av": 3, "Mn": 2, "No": 1, "NA": 0},
            {"GLQ": 6, "ALQ": 5, "BLQ": 4, "Rec": 3, "LwQ": 2, "Unf": 1, "NA": 0},
            ]
len(quant_feats) == len(scorings)

True

In [168]:
len(quant_feats)

9

In [159]:
data = quant_to_scores(quant_feats, data, scorings)


 Converting quantitative text features to scores...
	 Feature utilities has 2 NAs, they will be filled by 0
	 Feature exterqual has 0 NAs, they will be filled by 0
	 Feature extercond has 0 NAs, they will be filled by 0
	 Feature heatingqc has 0 NAs, they will be filled by 0
	 Feature bsmtqual has 81 NAs, they will be filled by 0
	 Feature bsmtcond has 82 NAs, they will be filled by 0
	 Feature kitchenqual has 1 NAs, they will be filled by 0
	 Feature bsmtexposure has 82 NAs, they will be filled by 0
	 Feature bsmtfintype1 has 79 NAs, they will be filled by 0


In [160]:
data[quant_feats].describe()

Unnamed: 0,utilities,exterqual,extercond,heatingqc,bsmtqual,bsmtcond,kitchenqual,bsmtexposure,bsmtfintype1
count,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0
mean,3.996574,3.396711,3.085646,4.151764,3.477561,2.918465,3.509764,1.623844,3.541624
std,0.11102,0.580293,0.372361,0.957952,0.905448,0.57495,0.665273,1.070026,2.113851
min,0.0,2.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
25%,4.0,3.0,3.0,3.0,3.0,3.0,3.0,1.0,1.0
50%,4.0,3.0,3.0,5.0,4.0,3.0,3.0,1.0,4.0
75%,4.0,4.0,3.0,5.0,4.0,3.0,4.0,2.0,6.0
max,4.0,5.0,5.0,5.0,5.0,4.0,5.0,4.0,6.0


### Compute derived features

From EDA, following derived features have decent/high correlation with target:
+ age from built
+ 

In [163]:
data['age_from_built'] = data.apply(axis=1, func=cal_age_from_built)

In [164]:
# for plotting
data['zone_type'] = data['mszoning'].apply(fold_zone_type)
data['adjacency'] = data['condition1'].apply(to_adjacency)

### Encode cat features

In [165]:
cat_feats = list_string_columns(data)
print('# cat feats: ', len(cat_feats))
print(cat_feats)

# cat feats:  27
['mszoning', 'street', 'lotshape', 'landcontour', 'lotconfig', 'landslope', 'neighborhood', 'condition1', 'condition2', 'bldgtype', 'housestyle', 'roofstyle', 'roofmatl', 'exterior1st', 'exterior2nd', 'masvnrtype', 'foundation', 'bsmtfintype2', 'heating', 'centralair', 'electrical', 'functional', 'paveddrive', 'saletype', 'salecondition', 'zone_type', 'adjacency']


In [166]:
dump_data = encode_cat_feats(data, cat_feats=cat_feats, dummy_na=True)
print(dump_data.shape)
dump_data.head()

Onehot encode categorical features:  ['mszoning', 'street', 'lotshape', 'landcontour', 'lotconfig', 'landslope', 'neighborhood', 'condition1', 'condition2', 'bldgtype', 'housestyle', 'roofstyle', 'roofmatl', 'exterior1st', 'exterior2nd', 'masvnrtype', 'foundation', 'bsmtfintype2', 'heating', 'centralair', 'electrical', 'functional', 'paveddrive', 'saletype', 'salecondition', 'zone_type', 'adjacency']
(2919, 260)


Unnamed: 0,id,mssubclass,lotarea,utilities,overallqual,overallcond,yearbuilt,yearremodadd,masvnrarea,exterqual,...,zone_type_Other,zone_type_RL,zone_type_RM,zone_type_nan,adjacency_Arterial street,adjacency_Feeder street,adjacency_Normal,adjacency_Positive feature,adjacency_Railroad,adjacency_nan
0,1,60,8450,4,7,5,2003,2003,196.0,4,...,0,1,0,0,0,0,1,0,0,0
1,2,20,9600,4,6,8,1976,1976,0.0,3,...,0,1,0,0,0,1,0,0,0,0
2,3,60,11250,4,7,5,2001,2002,162.0,4,...,0,1,0,0,0,0,1,0,0,0
3,4,70,9550,4,7,5,1915,1970,0.0,3,...,0,1,0,0,0,0,1,0,0,0
4,5,60,14260,4,8,5,2000,2000,350.0,4,...,0,1,0,0,0,0,1,0,0,0


In [None]:
# TODO: drop non-important feats
to_drop = ['saletype', 'salecondition']

## Base model

No feature selection, no tuning, just dump in all features.

In [167]:
target = 'saleprice'

encoded_train = dump_data.loc[dump_data[target] != 0].copy()
na_checker = check_na(encoded_train)
if not na_checker.empty:
    print(na_checker)

X_train, X_valid, y_train, y_valid = split_train_valid(encoded_train, target)

In [169]:
base_rf = RandomForestRegressor(n_estimators=100, max_features=1.0, n_jobs=-1,
                               random_state=1,
                               )
base_rf.fit(X_train, y_train)

RandomForestRegressor(max_features=1.0, n_jobs=-1, random_state=1)

In [170]:
y_pred = base_rf.predict(X_valid)
base_rmse = np.sqrt(mean_squared_error(y_valid, y_pred)) 
print('If predict directly sale price, base RMSE: ', round(base_rmse, 2))

If predict directly sale price, base RMSE:  1850.62


### Pick a small set of features

BAsed on EDA.

In [87]:
interest_cat_vars = ['adjacency', 'mszoning', 'neighborhood',
                     'bldgtype', 'housestyle', 
                     'heatingqc', 'centralair', 'electrical', 'kitchenqual',
                     'bsmtqual',
                    ]

interest_num_vars = ['age_from_built', 'overallqual']

base_features = interest_cat_vars + interest_num_vars
print(base_features)

['adjacency', 'mszoning', 'neighborhood', 'bldgtype', 'housestyle', 'heatingqc', 'centralair', 'electrical', 'kitchenqual', 'bsmtqual', 'age_from_built', 'overallqual']


In [113]:
picked_data = data[base_features]
print(picked_data.shape)
# picked_data.head()

Onehot encode categorical features:  ['adjacency', 'mszoning', 'neighborhood', 'bldgtype', 'housestyle', 'heatingqc', 'centralair', 'electrical', 'kitchenqual', 'bsmtqual']
(2919, 70)


### Predict sale price directly

In [115]:
target = 'saleprice'
ready_data_1 = pd.concat([picked_data, 
                        data[['grlivarea', 'garagearea']]],
                       axis=1
                      )
ready_data_1[target] = data[target]

encoded_train_1 = ready_data_1.loc[ready_data_1[target] != 0].copy()
na_checker = check_na(encoded_train_1)
if not na_checker.empty:
    print(na_checker)

X_train, X_valid, y_train, y_valid = split_train_valid(encoded_train_1, target)

In [116]:
base_rf = RandomForestRegressor(n_estimators=100, max_features=1.0, n_jobs=-1,
                               random_state=1,
                               )
base_rf.fit(X_train, y_train)

RandomForestRegressor(max_features=1.0, n_jobs=-1, random_state=1)

In [118]:
y_pred = base_rf.predict(X_valid)
base_rmse = np.sqrt(mean_squared_error(y_valid, y_pred)) 
print('If predict directly sale price, base RMSE: ', round(base_rmse, 2))

If predict directly sale price, base RMSE:  25119.73


### Predict price per square feet

I predict price per square feet, then multiply with living area to estimate sale price.

In [126]:
target = 'sale_price_per_sf'

# below columns are not needed for prediction, but needed later to compute rmse
additional_cols = ['grlivarea', 'saleprice'] 
ready_data_2 = pd.concat([picked_data, 
                          data[additional_cols]
                         ], 
                         axis=1
                        )

ready_data_2[target] = data[target]

encoded_train = ready_data_2.loc[ready_data_2[target] != 0].copy()
na_checker = check_na(encoded_train)
if not na_checker.empty:
    print(na_checker)
    encoded_train.dropna(inplace=True)

X_train, X_valid, y_train, y_valid = split_train_valid(encoded_train, target)

base_rf = RandomForestRegressor(n_estimators=100, max_features=1.0, n_jobs=-1,
                               random_state=1,
                               )
base_rf.fit(X_train.drop(columns=additional_cols), 
            y_train
           )

y_pred = base_rf.predict(X_valid.drop(columns=additional_cols))
base_rmse = np.sqrt(mean_squared_error(y_valid, y_pred)) 
print('Base RMSE when predict price per SF: ', round(base_rmse, 2))

Base RMSE:  17.26


In [127]:
# estimate sale price
X_valid['pred_price_per_sf'] = y_pred
X_valid['pred_sale_price'] = X_valid['pred_price_per_sf'] * X_valid['grlivarea']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [128]:
base_rmse2 = np.sqrt(mean_squared_error(X_valid.saleprice, 
                                        X_valid.pred_sale_price))

print('If predict via price per square feet, then RMSE: ', base_rmse2)

If predict via price per square feet, then RMSE:  29249.019800106955


## Tuning RF via grid search

In [129]:
?RandomForestRegressor

In [None]:
param_dict = 

In [None]:
X_test = dump_data.loc[dump_data[target] != 0]