In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.model_selection import GridSearchCV #cross_val_score#, train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

%matplotlib inline
plt.style.use('fivethirtyeight')

df_train = pd.read_csv('./data/iowa_housing/train.csv')
df_test = pd.read_csv('./data/iowa_housing/test.csv')

df_train.drop('Id', axis = 1, inplace = True)
df_test.drop('Id', axis = 1, inplace = True)

In [2]:
def rmse(estimator, X, y):
    preds = estimator.predict(X)
    error = np.sqrt(np.mean((preds - y)**2))
    return error

In [3]:
num_col = ['MSSubClass',
           'LotFrontage',
           'LotArea',
           'LotConfig',
           'LandSlope',
           'OverallCond',
           'OverallQual',
           'YearBuilt',
           'YearRemodAdd',
           'MasVnrArea',
           'BsmtFinSF1',
           'BsmtFinSF2',
           'BsmtUnfSF',
           'TotalBsmtSF',
           '1stFlrSF',
           '2ndFlrSF',
           'LowQualFinSF',
           'GrLivArea',
           'BsmtFullBath',
           'BsmtHalfBath',
           'FullBath',
           'HalfBath',
           'KitchenQual',
           'TotRmsAbvGrd',
           'Fireplaces',
           'GarageYrBlt',
           'GarageCars',
           'GarageArea',
           'WoodDeckSF',
           'OpenPorchSF',
           'EnclosedPorch',
           '3SsnPorch',
           'ScreenPorch',
           'PoolArea',
           'MiscVal',
           'MoSold',
           'YrSold'
          ]

cat_ord_col = ['Utilities', 
               'ExterQual', 
               'ExterCond',
               'BsmtQual',
               'BsmtCond',
               'BsmtExposure',
               'BsmtFinType1',
               'BsmtFinType2',
               'HeatingQC',
               'CentralAir',
               'Electrical',
               'Functional',
               'FireplaceQu',
               'GarageType',
               'GarageFinish',
               'GarageQual',
               'GarageCond',
               'PavedDrive',
               'PoolQC',
               'Fence',
               'SaleType'
              ]

cat_not_ord = ['MSZoning',
               'Street',
               'Alley',
               'LotShape',
               'LandContour',
               'Neighborhood',
               'Condition1',
               'Condition2',
               'BldgType',
               'HouseStyle',
               'RoofStyle',
               'RoofMatl',
               'Exterior1st', 
               'Exterior2nd',
               'MasVnrType',
               'Foundation',
               'Heating',
               'MiscFeature',
               'SaleCondition'
              ]

In [4]:
df_train[num_col].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 37 columns):
MSSubClass       1460 non-null int64
LotFrontage      1201 non-null float64
LotArea          1460 non-null int64
LotConfig        1460 non-null object
LandSlope        1460 non-null object
OverallCond      1460 non-null int64
OverallQual      1460 non-null int64
YearBuilt        1460 non-null int64
YearRemodAdd     1460 non-null int64
MasVnrArea       1452 non-null float64
BsmtFinSF1       1460 non-null int64
BsmtFinSF2       1460 non-null int64
BsmtUnfSF        1460 non-null int64
TotalBsmtSF      1460 non-null int64
1stFlrSF         1460 non-null int64
2ndFlrSF         1460 non-null int64
LowQualFinSF     1460 non-null int64
GrLivArea        1460 non-null int64
BsmtFullBath     1460 non-null int64
BsmtHalfBath     1460 non-null int64
FullBath         1460 non-null int64
HalfBath         1460 non-null int64
KitchenQual      1460 non-null object
TotRmsAbvGrd     1460 non-null int6

### Attempting to improve upon the previous results. First order of business is to address the null columns

### Preceding are the columns with null values.
It's clear that we need some feature transformation to clean up the dataset, including null value handling (likely imputation). We can break these down into numeric types, and categorical, the latter we'll determine ordinality.

In [5]:
num_cols = df_train.select_dtypes(include=np.number).columns.tolist()
num_cols.remove('SalePrice')

In [6]:
df_train.columns

Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley',
       'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
       'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea',
       'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC',
       'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
       'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
       'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond',
       'PavedDrive', 'Wo

In [7]:
obj_cols = list(set(df_train.columns.tolist()) - set (num_cols))
obj_cols.remove('SalePrice')

In [8]:
for x in cat_not_ord:
    print(df_train[x].value_counts(dropna = False))

RL         1151
RM          218
FV           65
RH           16
C (all)      10
Name: MSZoning, dtype: int64
Pave    1454
Grvl       6
Name: Street, dtype: int64
NaN     1369
Grvl      50
Pave      41
Name: Alley, dtype: int64
Reg    925
IR1    484
IR2     41
IR3     10
Name: LotShape, dtype: int64
Lvl    1311
Bnk      63
HLS      50
Low      36
Name: LandContour, dtype: int64
NAmes      225
CollgCr    150
OldTown    113
Edwards    100
Somerst     86
Gilbert     79
NridgHt     77
Sawyer      74
NWAmes      73
SawyerW     59
BrkSide     58
Crawfor     51
Mitchel     49
NoRidge     41
Timber      38
IDOTRR      37
ClearCr     28
StoneBr     25
SWISU       25
Blmngtn     17
MeadowV     17
BrDale      16
Veenker     11
NPkVill      9
Blueste      2
Name: Neighborhood, dtype: int64
Norm      1260
Feedr       81
Artery      48
RRAn        26
PosN        19
RRAe        11
PosA         8
RRNn         5
RRNe         2
Name: Condition1, dtype: int64
Norm      1445
Feedr        6
Artery       2
P

In [9]:
def cat_not_ord_dealer(df):
    dropped = []
    for c in cat_not_ord:
        if c not in df.columns:
            break
        if df[c].isnull().any():
            df[c].fillna('other', inplace = True)
        sizes = df[c].value_counts()
        if (sizes[0] > df.shape[0] - 30):
            df.drop(c, axis = 1, inplace = True)
            dropped.append(c)
        else:
            bad = sizes.index[sizes < 30]
            df.loc[df[c].isin(bad), c] = 'other'
    return dropped

In [10]:
dropped_train = cat_not_ord_dealer(df_train)
dropped_train

['Street', 'Condition2', 'RoofMatl']

In [11]:
dropped_test = cat_not_ord_dealer(df_test)
dropped_test

['Street', 'Condition2', 'RoofMatl', 'Heating']

In [12]:
set(df_train.columns) - set(df_test.columns)

{'Heating', 'SalePrice'}

In [13]:
df_train.drop('Heating', axis = 1, inplace = True)

In [14]:
def num_col_handler(df):
    numeric_cols = df.select_dtypes(include=np.number).columns.tolist()
    if 'SalePrice' in numeric_cols:
        numeric_cols.remove('SalePrice')
    for c in numeric_cols:
        if df[c].isnull().any():
            df[c].fillna(df[c].mean(), inplace = True)
    return 'done'

In [15]:
num_col_handler(df_train)

'done'

In [16]:
num_col_handler(df_test)

'done'

In [17]:
set(df_train.columns.tolist()) - set(df_test.columns.tolist())

{'SalePrice'}

In [19]:
num_cols = df_train.select_dtypes(include=np.number).columns.tolist()
num_cols.remove('SalePrice')

In [40]:
cols_drop = list(set(df_train.columns) - set(df_test.columns)) + list(set(df_test.columns) - set(df_train.columns))
cols_drop.remove('SalePrice')

In [41]:
for c in cols_drop:
    if c in df_train.columns: df_train.drop(c, axis = 1, inplace = True)
    if c in df_test.columns: df_test.drop(c, axis = 1, inplace = True)

In [42]:
non_num_cols = list(set(df_train.columns) - set(num_cols))

In [43]:
train_mean = df_train[num_cols].mean()
train_std = df_train[num_cols].std()

In [44]:
df_train[num_cols] = (df_train[num_cols] - train_mean) / train_std
df_test[num_cols] = (df_test[num_cols] - train_mean) / train_std

In [45]:
df_train = pd.get_dummies(df_train, drop_first = True)
df_test = pd.get_dummies(df_test, drop_first = True)

In [46]:
cols_drop = list(set(df_train.columns) - set(df_test.columns)) + list(set(df_test.columns) - set(df_train.columns))
cols_drop.remove('SalePrice')

In [47]:
for c in cols_drop:
    if c in df_train.columns: df_train.drop(c, axis = 1, inplace = True)
    if c in df_test.columns: df_test.drop(c, axis = 1, inplace = True)

In [48]:
X_train = df_train.loc[:, df_train.columns != 'SalePrice']
y_train = np.log(df_train['SalePrice'])
X_test = df_test.loc[:, df_test.columns != 'SalePrice']

Ready to Fit

In [49]:
rfc = RandomForestRegressor()

In [50]:
rfc_param_grid = {
    'max_features': [1,2,0.5,'sqrt','log2'],
    'min_samples_leaf': [1,2,3,5,10],
    'n_estimators': [1,5,10,25,100,1000]
}

In [51]:
scorer = make_scorer(mean_squared_error, greater_is_better = False)

In [52]:
grid = GridSearchCV(estimator = rfc, param_grid = rfc_param_grid, cv = 10, scoring = scorer)

In [66]:
grid.fit(X_train, y_train)

GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=0.5, max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'max_features': [1, 2, 0.5, 'sqrt', 'log2'], 'min_samples_leaf': [1, 2, 3, 5, 10], 'n_estimators': [1, 5, 10, 25, 100, 1000]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=make_scorer(mean_squared_error, greater_is_better=False),
       verbose=0)

In [67]:
grid.score(X_train, y_train)

-0.0025745271972179014

In [68]:
grid.best_params_

{'max_features': 0.5, 'min_samples_leaf': 1, 'n_estimators': 1000}

In [53]:
rfc.set_params(n_estimators = 1000, 
              max_features = 0.5, 
              min_samples_leaf = 1)
rfc.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=0.5, max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [54]:
y_test = np.expm1(rfc.predict(X_test))

In [56]:
df_submission = pd.read_csv('./data/iowa_housing/sample_predictions.csv')
df_submission['SalePrice'] = pd.Series(y_test)

In [57]:
df_submission.to_csv('./data/iowa_housing/ts_predictions_2.csv', index=False)

In [58]:
df_submission

Unnamed: 0,Id,SalePrice
0,1461,124046.010191
1,1462,152941.335770
2,1463,178682.722523
3,1464,182991.903583
4,1465,196141.730901
5,1466,183778.361753
6,1467,170138.665246
7,1468,175492.775121
8,1469,182793.567654
9,1470,122661.428239


In [None]:
# old score: 0.16798
# first round: 0.14225
# second round: 0.14192

In [63]:
sorted(list(zip(rfc.feature_importances_, X_train.columns)), reverse = True)

[(0.3358257714851593, 'OverallQual'),
 (0.1613311354898265, 'GrLivArea'),
 (0.07318595986094313, 'YearBuilt'),
 (0.04425295848742036, 'TotalBsmtSF'),
 (0.0435606078052571, 'GarageCars'),
 (0.038594421014799506, 'ExterQual_TA'),
 (0.032939909180598345, '1stFlrSF'),
 (0.032143881496938524, 'GarageArea'),
 (0.019846601508318948, 'BsmtFinSF1'),
 (0.01720665064586645, 'LotArea'),
 (0.016758847513766634, 'FullBath'),
 (0.012616633553219282, 'Fireplaces'),
 (0.01203103230061818, 'CentralAir_Y'),
 (0.010672200529539702, '2ndFlrSF'),
 (0.010465427064154478, 'YearRemodAdd'),
 (0.010323817447840233, 'OverallCond'),
 (0.009363186932242175, 'GarageYrBlt'),
 (0.007812465096475638, 'LotFrontage'),
 (0.005638046682409965, 'BsmtUnfSF'),
 (0.00465751317204866, 'TotRmsAbvGrd'),
 (0.004412764505529028, 'OpenPorchSF'),
 (0.003772895582416073, 'GarageType_Attchd'),
 (0.0036096902199339817, 'MasVnrArea'),
 (0.0033750995719894184, 'MoSold'),
 (0.0032112310641367607, 'MSZoning_RM'),
 (0.0031471965836505546, 'M

### Old Code

In [None]:
# to conveniently optimize the random forest model, use three nested loops to tune 3 model parameters
rf_scores = []

n_tr = [1, 10, 25, 50, 100, 1000]
spl = [1, 2, 4, 8, 16, 32]
mx_ft = [1, 0.5, 'sqrt', 'log2', len(X_train.columns) - 1]

for t in n_tr:
    print("# Trees: ", t)
    for m in mx_ft:
        for s in spl:
            rf.set_params(n_estimators = t,
                          max_features = m, 
                          min_samples_leaf = s,
                          n_jobs = -1)
            scores = cross_val_score(estimator = rf, X = X_train, y = y_train, scoring = rmse, cv = 10)
            rf_scores.append((np.mean(scores), t, m, s))

In [None]:
min(rf_scores)

In [None]:
rf.set_params(n_estimators = 1000, 
              max_features = 0.5, 
              min_samples_leaf = 1, 
              n_jobs = -1)
rf.fit(X_train, y_train)

In [None]:
to_drop_train = list(set(X_train.columns) - set(X_test.columns))

In [None]:
if to_drop_train: 
    df_train.drop(to_drop_train, axis = 1,  inplace = True)

In [None]:
to_drop_test = list(set(X_test.columns) - set(X_train.columns))

In [None]:
if to_drop_test: 
    df_test.drop(to_drop_test, axis = 1,  inplace = True)

In [None]:
y_test = np.expm1(rfc.predict(X_test))

In [None]:
rf_scores = []
spl = [1, 2, 4, 8]
mx_ft = [1, 0.5, 'sqrt', 'log2']

for m in mx_ft:
    for s in spl:
        rf.set_params(max_features = m, min_samples_leaf = s, n_jobs = -1)
        scores = cross_val_score(estimator = rf, X = X_train, y = y_train, scoring = rmse, cv = 10)
        rf_scores.append((np.mean(scores), m, s))

In [None]:
min(rf_scores)

In [None]:
rf.set_params(max_features = 'log2', min_samples_leaf = 1, n_jobs = -1)

In [None]:
rf.fit(X_train, y_train)

In [None]:
df_test['prediction'] = rf.predict(df_test)

In [None]:
df_submission['SalePrice'] = pd.Series(y_test)

In [None]:
# df_submission['SalePrice'] = np.exp(df_test['prediction'])
# old score: 0.16798