In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
# from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score#, train_test_split

%matplotlib inline
plt.style.use('fivethirtyeight')

df_train = pd.read_csv('../data/iowa_housing/train.csv')#, usecols = [1,17, 18, 19, 43,44,46,49,50,70,77,80])
df_test = pd.read_csv('../data/iowa_housing/test.csv')#, usecols = [1,17, 18, 19, 43,44,46,49,50,70,77])

df_train.drop('Id', axis = 1, inplace = True)
df_test.drop('Id', axis = 1, inplace = True)

In [31]:
def rmse(estimator, X, y):
    preds = estimator.predict(X)
    error = np.sqrt(np.mean((preds - y)**2))
    return error

In [2]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 80 columns):
MSSubClass       1460 non-null int64
MSZoning         1460 non-null object
LotFrontage      1201 non-null float64
LotArea          1460 non-null int64
Street           1460 non-null object
Alley            91 non-null object
LotShape         1460 non-null object
LandContour      1460 non-null object
Utilities        1460 non-null object
LotConfig        1460 non-null object
LandSlope        1460 non-null object
Neighborhood     1460 non-null object
Condition1       1460 non-null object
Condition2       1460 non-null object
BldgType         1460 non-null object
HouseStyle       1460 non-null object
OverallQual      1460 non-null int64
OverallCond      1460 non-null int64
YearBuilt        1460 non-null int64
YearRemodAdd     1460 non-null int64
RoofStyle        1460 non-null object
RoofMatl         1460 non-null object
Exterior1st      1460 non-null object
Exterior2nd      1460 non-

### Attempting to improve upon the previous results. First order of business is to address the null columns

In [3]:
null_columns = df_train.columns[df_train.isnull().any()]
null_columns

Index(['LotFrontage', 'Alley', 'MasVnrType', 'MasVnrArea', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Electrical', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
       'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence',
       'MiscFeature'],
      dtype='object')

In [4]:
df_train[null_columns].isnull().sum()

LotFrontage      259
Alley           1369
MasVnrType         8
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64

In [5]:
# 'LotFrontage' - fill with mean
df_train['LotFrontage'].fillna(df_train['LotFrontage'].mean(), inplace = True)
df_test['LotFrontage'].fillna(df_test['LotFrontage'].mean(), inplace = True)

In [6]:
# 'Alley' - mostly null, but some different types of alleys. Binarize for has or does not have Alley
df_train['Alley'].where(pd.isnull(df_train['Alley']),1, inplace = True)
df_train['Alley'].fillna(0, inplace = True)

df_test['Alley'].where(pd.isnull(df_test['Alley']),1, inplace = True)
df_test['Alley'].fillna(0, inplace = True)

In [7]:
df_train['MasVnrType'].fillna('None', inplace = True)
df_test['MasVnrType'].fillna('None', inplace = True)

In [8]:
df_train['MasVnrArea'].fillna(0, inplace = True)
df_test['MasVnrArea'].fillna(0, inplace = True)

In [9]:
# Filling missing categorical data with most common value in each column

In [10]:
df_train['BsmtQual'].fillna('TA', inplace = True)
df_test['BsmtQual'].fillna('TA', inplace = True)

In [11]:
df_train['BsmtCond'].fillna('TA', inplace = True)
df_test['BsmtCond'].fillna('TA', inplace = True)

In [12]:
df_train['BsmtExposure'].fillna('No', inplace = True)
df_test['BsmtExposure'].fillna('No', inplace = True)

In [13]:
df_train['BsmtFinType1'].fillna('Unf', inplace = True)
df_test['BsmtFinType1'].fillna('Unf', inplace = True)

In [14]:
df_train['BsmtFinType2'].fillna('Unf', inplace = True)
df_test['BsmtFinType2'].fillna('Unf', inplace = True)

In [15]:
df_train['Electrical'].fillna('SBrkr', inplace = True)
df_test['Electrical'].fillna('SBrkr', inplace = True)

In [16]:
df_train['FireplaceQu'].fillna('Gd', inplace = True)
df_test['FireplaceQu'].fillna('Gd', inplace = True)

In [17]:
df_train['GarageType'].fillna('Attchd', inplace = True)
df_test['GarageType'].fillna('Attchd', inplace = True)

In [18]:
df_train['GarageYrBlt'].fillna(df_train['GarageYrBlt'].mean(), inplace = True)
df_test['GarageYrBlt'].fillna(df_test['GarageYrBlt'].mean(), inplace = True)

In [19]:
df_train['GarageFinish'].fillna('Unf', inplace = True)
df_test['GarageFinish'].fillna('Unf', inplace = True)

In [20]:
df_train['GarageQual'].fillna('TA', inplace = True)
df_test['GarageQual'].fillna('TA', inplace = True)

In [21]:
df_train['GarageCond'].fillna('TA', inplace = True)
df_test['GarageCond'].fillna('TA', inplace = True)

In [22]:
df_train['PoolQC'].where(pd.isnull(df_train['PoolQC']),1, inplace = True)
df_train['PoolQC'].fillna(0, inplace = True)
df_test['PoolQC'].where(pd.isnull(df_test['PoolQC']),1, inplace = True)
df_test['PoolQC'].fillna(0, inplace = True)

In [23]:
df_train['Fence'].where(pd.isnull(df_train['Fence']),1, inplace = True)
df_train['Fence'].fillna(0, inplace = True)
df_test['Fence'].where(pd.isnull(df_test['Fence']),1, inplace = True)
df_test['Fence'].fillna(0, inplace = True)

In [24]:
df_train['MiscFeature'].where(pd.isnull(df_train['MiscFeature']),1, inplace = True)
df_train['MiscFeature'].fillna(0, inplace = True)
df_test['MiscFeature'].where(pd.isnull(df_test['MiscFeature']),1, inplace = True)
df_test['MiscFeature'].fillna(0, inplace = True)

In [25]:
num_cols = df_train.select_dtypes(include=np.number).columns.tolist()[1:]

In [26]:
non_num_cols = list(set(df_train.columns) - set(num_cols))

In [27]:
num_cols.remove('SalePrice')

In [28]:
df_train[num_cols] = (df_train[num_cols] - df_train[num_cols].mean()) / df_train[num_cols].std()
df_test[num_cols] = (df_test[num_cols] - df_test[num_cols].mean()) / df_test[num_cols].std()

In [29]:
df_train = pd.get_dummies(df_train, drop_first = True)
df_test = pd.get_dummies(df_test, drop_first = True)

In [30]:
X_train = df_train.loc[:, df_train.columns != 'SalePrice']
y_train = np.log(df_train['SalePrice'])
X_test = df_test.loc[:, df_test.columns != 'SalePrice']

In [None]:
X_train = df_train.loc[:, df_train.columns != 'SalePrice']
y_train = np.log(df_train['SalePrice'])

In [32]:
rf = RandomForestRegressor()

In [34]:
rf.fit(X_train, y_train)



RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [35]:
# to conveniently optimize the random forest model, use three nested loops to tune 3 model parameters
rf_scores = []

n_tr = [1, 10, 25, 50, 100, 1000]
spl = [1, 2, 4, 8, 16, 32]
mx_ft = [1, 0.5, 'sqrt', 'log2', len(X_train.columns) - 1]

for t in n_tr:
    print("# Trees: ", t)
    for m in mx_ft:
        for s in spl:
            rf.set_params(n_estimators = t,
                          max_features = m, 
                          min_samples_leaf = s,
                          n_jobs = -1)
            scores = cross_val_score(estimator = rf, X = X_train, y = y_train, scoring = rmse, cv = 10)
            rf_scores.append((np.mean(scores), t, m, s))

# Trees:  1
# Trees:  10
# Trees:  25
# Trees:  50
# Trees:  100
# Trees:  1000


In [36]:
min(rf_scores)

(0.1364516759749105, 1000, 0.5, 1)

In [37]:
rf.set_params(n_estimators = 1000, 
              max_features = 0.5, 
              min_samples_leaf = 1, 
              n_jobs = -1)
rf.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=0.5, max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [51]:
for c in X_test.columns[X_test.isnull().any()]:
    X_test[c].fillna(X_test[c].mean(), inplace = True)

In [54]:
to_drop = list(set(X_train.columns) - set(X_test.columns))

In [56]:
X_train.drop(to_drop, axis = 1, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [60]:
rf.fit(X_train, y_train)
y_test = np.expm1(rf.predict(X_test))

array([127200.2038793 , 153010.68867714, 182443.54714521, ...,
       155847.73046676, 113913.12236423, 234719.03292445])

In [None]:
rf_scores = []
spl = [1, 2, 4, 8]
mx_ft = [1, 0.5, 'sqrt', 'log2']

for m in mx_ft:
    for s in spl:
        rf.set_params(max_features = m, min_samples_leaf = s, n_jobs = -1)
        scores = cross_val_score(estimator = rf, X = X_train, y = y_train, scoring = rmse, cv = 10)
        rf_scores.append((np.mean(scores), m, s))

In [None]:
min(rf_scores)

In [None]:
rf.set_params(max_features = 'log2', min_samples_leaf = 1, n_jobs = -1)

In [None]:
rf.fit(X_train, y_train)

In [None]:
df_test['prediction'] = rf.predict(df_test)

In [62]:
df_submission = pd.read_csv('../data/iowa_housing/sample_predictions.csv')

In [64]:
df_submission['SalePrice'] = pd.Series(y_test)

In [None]:
# df_submission['SalePrice'] = np.exp(df_test['prediction'])
# old score: 0.16798

In [66]:
df_submission.to_csv('../data/iowa_housing/ts_predictions_1.csv', index=False)

In [65]:
df_submission

Unnamed: 0,Id,SalePrice
0,1461,127200.203879
1,1462,153010.688677
2,1463,182443.547145
3,1464,183996.776800
4,1465,192793.995186
5,1466,185105.204392
6,1467,169942.143743
7,1468,176715.918429
8,1469,182947.284386
9,1470,122039.451187
