In [158]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV#cross_val_score#, train_test_split

%matplotlib inline
plt.style.use('fivethirtyeight')

df_train = pd.read_csv('./data/iowa_housing/train.csv')#, usecols = [1,17, 18, 19, 43,44,46,49,50,70,77,80])
df_test = pd.read_csv('./data/iowa_housing/test.csv')#, usecols = [1,17, 18, 19, 43,44,46,49,50,70,77])

df_train.drop('Id', axis = 1, inplace = True)
df_test.drop('Id', axis = 1, inplace = True)

In [159]:
def rmse(estimator, X, y):
    preds = estimator.predict(X)
    error = np.sqrt(np.mean((preds - y)**2))
    return error

In [160]:
num_col = ['MSSubClass',
           'LotFrontage',
           'LotArea',
           'LotConfig',
           'LandSlope',
           'OverallCond',
           'OverallQual',
           'YearBuilt',
           'YearRemodAdd',
           'MasVnrArea',
           'BsmtFinSF1',
           'BsmtFinSF2',
           'BsmtUnfSF',
           'TotalBsmtSF',
           '1stFlrSF',
           '2ndFlrSF',
           'LowQualFinSF',
           'GrLivArea',
           'BsmtFullBath',
           'BsmtHalfBath',
           'FullBath',
           'HalfBath',
           'KitchenQual',
           'TotRmsAbvGrd',
           'Fireplaces',
           'GarageYrBlt',
           'GarageCars',
           'GarageArea',
           'WoodDeckSF',
           'OpenPorchSF',
           'EnclosedPorch',
           '3SsnPorch',
           'ScreenPorch',
           'PoolArea',
           'MiscVal',
           'MoSold',
           'YrSold'
          ]

cat_ord_col = ['Utilities', 
               'ExterQual', 
               'ExterCond',
               'BsmtQual',
               'BsmtCond',
               'BsmtExposure',
               'BsmtFinType1',
               'BsmtFinType2',
               'HeatingQC',
               'CentralAir',
               'Electrical',
               'Functional',
               'FireplaceQu',
               'GarageType',
               'GarageFinish',
               'GarageQual',
               'GarageCond',
               'PavedDrive',
               'PoolQC',
               'Fence',
               'SaleType'
              ]

cat_not_ord = ['MSZoning',
               'Street',
               'Alley',
               'LotShape',
               'LandContour',
               'Neighborhood',
               'Condition1',
               'Condition2',
               'BldgType',
               'HouseStyle',
               'RoofStyle',
               'RoofMatl',
               'Exterior1st', 
               'Exterior2nd',
               'MasVnrType',
               'Foundation',
               'Heating',
               'MiscFeature',
               'SaleCondition'
              ]

In [161]:
df_train[num_col].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 37 columns):
MSSubClass       1460 non-null int64
LotFrontage      1201 non-null float64
LotArea          1460 non-null int64
LotConfig        1460 non-null object
LandSlope        1460 non-null object
OverallCond      1460 non-null int64
OverallQual      1460 non-null int64
YearBuilt        1460 non-null int64
YearRemodAdd     1460 non-null int64
MasVnrArea       1452 non-null float64
BsmtFinSF1       1460 non-null int64
BsmtFinSF2       1460 non-null int64
BsmtUnfSF        1460 non-null int64
TotalBsmtSF      1460 non-null int64
1stFlrSF         1460 non-null int64
2ndFlrSF         1460 non-null int64
LowQualFinSF     1460 non-null int64
GrLivArea        1460 non-null int64
BsmtFullBath     1460 non-null int64
BsmtHalfBath     1460 non-null int64
FullBath         1460 non-null int64
HalfBath         1460 non-null int64
KitchenQual      1460 non-null object
TotRmsAbvGrd     1460 non-null int6

### Attempting to improve upon the previous results. First order of business is to address the null columns

### Preceding are the columns with null values.
It's clear that we need some feature transformation to clean up the dataset, including null value handling (likely imputation). We can break these down into numeric types, and categorical, the latter we'll determine ordinality.

In [163]:
num_cols = df_train.select_dtypes(include=np.number).columns.tolist()
num_cols.remove('SalePrice')

In [67]:
df_train.columns

Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley',
       'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
       'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea',
       'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC',
       'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
       'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
       'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond',
       'PavedDrive', 'Wo

In [164]:
obj_cols = list(set(df_train.columns.tolist()) - set (num_cols))
obj_cols.remove('SalePrice')

In [111]:
for x in cat_not_ord:
    print(df_train[x].value_counts(dropna = False))
#    print(df_test[x].value_counts(dropna = False))

RL         1151
RM          218
FV           65
RH           16
C (all)      10
Name: MSZoning, dtype: int64
Pave    1454
Grvl       6
Name: Street, dtype: int64
NaN     1369
Grvl      50
Pave      41
Name: Alley, dtype: int64
Reg    925
IR1    484
IR2     41
IR3     10
Name: LotShape, dtype: int64
Lvl    1311
Bnk      63
HLS      50
Low      36
Name: LandContour, dtype: int64
NAmes      225
CollgCr    150
OldTown    113
Edwards    100
Somerst     86
Gilbert     79
NridgHt     77
Sawyer      74
NWAmes      73
SawyerW     59
BrkSide     58
Crawfor     51
Mitchel     49
NoRidge     41
Timber      38
IDOTRR      37
ClearCr     28
StoneBr     25
SWISU       25
Blmngtn     17
MeadowV     17
BrDale      16
Veenker     11
NPkVill      9
Blueste      2
Name: Neighborhood, dtype: int64
Norm      1260
Feedr       81
Artery      48
RRAn        26
PosN        19
RRAe        11
PosA         8
RRNn         5
RRNe         2
Name: Condition1, dtype: int64
Norm      1445
Feedr        6
RRNn         2
A

In [120]:
def cat_not_ord_dealer(df):
    dropped = []
    for c in cat_not_ord:
        if c not in df.columns:
            break
        if df[c].isnull().any():
            df[c].fillna('other', inplace = True)
        sizes = df[c].value_counts()
        if (sizes[0] > df.shape[0] - 30):
            df.drop(c, axis = 1, inplace = True)
            dropped.append(c)
        else:
            bad = sizes.index[sizes < 30]
            df.loc[df[c].isin(bad), c] = 'other'
    return dropped

In [167]:
dropped = cat_not_ord_dealer(df_test)

In [168]:
dropped

['Street', 'Condition2', 'RoofMatl', 'Heating']

In [169]:
for x in cat_not_ord:
    if x in dropped:
        pass
    else:
        print(df_train[x].value_counts(dropna = False))

RL       1151
RM        218
FV         65
other      26
Name: MSZoning, dtype: int64
other    1369
Grvl       50
Pave       41
Name: Alley, dtype: int64
Reg      925
IR1      484
IR2       41
other     10
Name: LotShape, dtype: int64
Lvl    1311
Bnk      63
HLS      50
Low      36
Name: LandContour, dtype: int64
NAmes      225
other      150
CollgCr    150
OldTown    113
Edwards    100
Somerst     86
Gilbert     79
NridgHt     77
Sawyer      74
NWAmes      73
SawyerW     59
BrkSide     58
Crawfor     51
Mitchel     49
NoRidge     41
Timber      38
IDOTRR      37
Name: Neighborhood, dtype: int64
Norm      1260
Feedr       81
other       71
Artery      48
Name: Condition1, dtype: int64
1Fam      1220
TwnhsE     114
Duplex      52
Twnhs       43
2fmCon      31
Name: BldgType, dtype: int64
1Story    726
2Story    445
1.5Fin    154
SLvl       65
SFoyer     37
other      33
Name: HouseStyle, dtype: int64
Gable    1141
Hip       286
other      33
Name: RoofStyle, dtype: int64
VinylSd    515
H

In [170]:
set(df_train.columns) - set(df_test.columns)

{'Heating', 'SalePrice'}

In [171]:
df_train.drop('Heating', axis = 1, inplace = True)

In [144]:
def num_col_handler(df):
    numeric_cols = df.select_dtypes(include=np.number).columns.tolist()
    if 'SalePrice' in numeric_cols:
        numeric_cols.remove('SalePrice')
    for c in numeric_cols:
        if df[c].isnull().any():
            df[c].fillna(df[c].mean(), inplace = True)
    return 'done'

In [172]:
num_col_handler(df_train)

'done'

In [173]:
num_col_handler(df_test)

'done'

In [174]:
set(df_train.columns.tolist()) - set(df_test.columns.tolist())

{'SalePrice'}

In [175]:
num_cols = df_train.select_dtypes(include=np.number).columns.tolist()
num_cols.remove('SalePrice')

In [176]:
non_num_cols = list(set(df_train.columns) - set(num_cols))

In [177]:
train_mean = df_train[num_cols].mean()
train_std = df_train[num_cols].std()

In [178]:
df_train[num_cols] = (df_train[num_cols] - train_mean) / train_std
df_test[num_cols] = (df_test[num_cols] - train_mean) / train_std

In [179]:
df_train = pd.get_dummies(df_train, drop_first = True)
df_test = pd.get_dummies(df_test, drop_first = True)

In [180]:
X_train = df_train.loc[:, df_train.columns != 'SalePrice']
y_train = np.log(df_train['SalePrice'])
X_test = df_test.loc[:, df_test.columns != 'SalePrice']

In [185]:
rfc = RandomForestRegressor()

In [186]:
rfc_param_grid = {
    'max_features': [1,2,0.5,'sqrt','log2'],
    'min_samples_leaf': [1,2,3,5,10],
    'n_estimators': [1,5,10,25,100,1000]
}

In [189]:
grid = GridSearchCV(estimator = rfc, param_grid = rfc_param_grid, cv = 10, scoring = rmse)

In [190]:
grid.fit(X_train, y_train)

KeyboardInterrupt: 

In [184]:
grid.best_params_

In [None]:
rf.set_params()

In [34]:
rf.fit(X_train, y_train)



RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [35]:
# to conveniently optimize the random forest model, use three nested loops to tune 3 model parameters
rf_scores = []

n_tr = [1, 10, 25, 50, 100, 1000]
spl = [1, 2, 4, 8, 16, 32]
mx_ft = [1, 0.5, 'sqrt', 'log2', len(X_train.columns) - 1]

for t in n_tr:
    print("# Trees: ", t)
    for m in mx_ft:
        for s in spl:
            rf.set_params(n_estimators = t,
                          max_features = m, 
                          min_samples_leaf = s,
                          n_jobs = -1)
            scores = cross_val_score(estimator = rf, X = X_train, y = y_train, scoring = rmse, cv = 10)
            rf_scores.append((np.mean(scores), t, m, s))

# Trees:  1
# Trees:  10
# Trees:  25
# Trees:  50
# Trees:  100
# Trees:  1000


In [36]:
min(rf_scores)

(0.1364516759749105, 1000, 0.5, 1)

In [37]:
rf.set_params(n_estimators = 1000, 
              max_features = 0.5, 
              min_samples_leaf = 1, 
              n_jobs = -1)
rf.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=0.5, max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [51]:
for c in X_test.columns[X_test.isnull().any()]:
    X_test[c].fillna(X_test[c].mean(), inplace = True)

In [54]:
to_drop = list(set(X_train.columns) - set(X_test.columns))

In [56]:
X_train.drop(to_drop, axis = 1, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [60]:
rf.fit(X_train, y_train)
y_test = np.expm1(rf.predict(X_test))

array([127200.2038793 , 153010.68867714, 182443.54714521, ...,
       155847.73046676, 113913.12236423, 234719.03292445])

In [None]:
rf_scores = []
spl = [1, 2, 4, 8]
mx_ft = [1, 0.5, 'sqrt', 'log2']

for m in mx_ft:
    for s in spl:
        rf.set_params(max_features = m, min_samples_leaf = s, n_jobs = -1)
        scores = cross_val_score(estimator = rf, X = X_train, y = y_train, scoring = rmse, cv = 10)
        rf_scores.append((np.mean(scores), m, s))

In [None]:
min(rf_scores)

In [None]:
rf.set_params(max_features = 'log2', min_samples_leaf = 1, n_jobs = -1)

In [None]:
rf.fit(X_train, y_train)

In [None]:
df_test['prediction'] = rf.predict(df_test)

In [62]:
df_submission = pd.read_csv('../data/iowa_housing/sample_predictions.csv')

In [64]:
df_submission['SalePrice'] = pd.Series(y_test)

In [None]:
# df_submission['SalePrice'] = np.exp(df_test['prediction'])
# old score: 0.16798

In [66]:
df_submission.to_csv('../data/iowa_housing/ts_predictions_1.csv', index=False)

In [65]:
df_submission

Unnamed: 0,Id,SalePrice
0,1461,127200.203879
1,1462,153010.688677
2,1463,182443.547145
3,1464,183996.776800
4,1465,192793.995186
5,1466,185105.204392
6,1467,169942.143743
7,1468,176715.918429
8,1469,182947.284386
9,1470,122039.451187
