In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Load the training and test tables, using the 'Id' column as the row index
X_full = pd.read_csv('home-data-for-ml-course/train.csv', index_col='Id')
X_test_full = pd.read_csv('home-data-for-ml-course/test.csv', index_col='Id')

# Predictor columns selected for modelling
features = [
    'LotArea',
    'YearBuilt',
    '1stFlrSF',
    '2ndFlrSF',
    'FullBath',
    'BedroomAbvGr',
    'TotRmsAbvGrd',
]

# Target is the sale price; the predictor frames are copies so that later
# modifications to them cannot write back into the full tables
y = X_full['SalePrice']
X = X_full[features].copy()
X_test = X_test_full[features].copy()

# Hold out 20% of the labelled rows for validation (seeded so the split is repeatable)
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)

In [17]:
# Preview the first rows of the training predictors to sanity-check the split
X_train.head()

Unnamed: 0_level_0,LotArea,YearBuilt,1stFlrSF,2ndFlrSF,FullBath,BedroomAbvGr,TotRmsAbvGrd
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
619,11694,2007,1828,0,2,3,9
871,6600,1962,894,0,1,2,5
93,13360,1921,964,0,1,2,5
818,13265,2002,1689,0,2,3,7
303,13704,2001,1541,0,2,3,6


In [18]:
# Preview the first rows of the full training table (all columns, including SalePrice)
X_full.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
import sklearn
from sklearn.ensemble import RandomForestRegressor

# scikit-learn 1.0 renamed the 'mae' split criterion to 'absolute_error' and
# 1.2 removed the old spelling, so pick the name the installed version accepts.
# Only the major version is parsed, so suffixes such as 'rc1' or 'dev0' are harmless.
_SKLEARN_MAJOR = int(sklearn.__version__.split('.')[0])
MAE_CRITERION = 'absolute_error' if _SKLEARN_MAJOR >= 1 else 'mae'

# Candidate random forests to compare. Every model uses random_state=0 so that
# repeated runs build the same trees.
model_1 = RandomForestRegressor(n_estimators=50, random_state=0)
model_2 = RandomForestRegressor(n_estimators=100, random_state=0)
# Same size as model_2 but splits on mean absolute error instead of the default criterion
model_3 = RandomForestRegressor(n_estimators=100, criterion=MAE_CRITERION, random_state=0)
# More trees, but a node needs at least 20 samples before it may be split
model_4 = RandomForestRegressor(n_estimators=200, min_samples_split=20, random_state=0)
# Same size as model_2 with tree depth capped at 7
model_5 = RandomForestRegressor(n_estimators=100, max_depth=7, random_state=0)

# Order matters: the comparison loop reports models by their 1-based position here
models = [model_1, model_2, model_3, model_4, model_5]

In [6]:
from sklearn.metrics import mean_absolute_error

# Function for comparing different models
def score_model(model, X_t=X_train, X_v=X_valid, y_t=y_train, y_v=y_valid):
    """Fit ``model`` on the training data and return its validation MAE.

    Parameters
    ----------
    model
        Estimator exposing ``fit`` and ``predict``; it is fitted in place.
    X_t, y_t
        Training predictors and target. Default to the notebook's ``X_train``
        and ``y_train`` as they were when this function was defined.
    X_v, y_v
        Validation predictors and target. Default to ``X_valid`` and
        ``y_valid``, likewise captured at definition time.

    Returns
    -------
    The mean absolute error between ``y_v`` and the predictions for ``X_v``.
    """
    model.fit(X_t, y_t)
    return mean_absolute_error(y_v, model.predict(X_v))

# Fit and score every candidate, numbering the models from 1 in list order.
# '%d' truncates the MAE to a whole number for display.
for model_number, candidate in enumerate(models, start=1):
    mae = score_model(candidate)
    print("Model %d MAE: %d" % (model_number, mae))

Model 1 MAE: 24015
Model 2 MAE: 23740
Model 3 MAE: 23528
Model 4 MAE: 23996
Model 5 MAE: 23706


# Model 3 is best: it has the lowest validation MAE (23528)

In [7]:
# Choose model_3 for the final fit: it had the lowest validation MAE (23528)
# in the recorded comparison output. This is a manual choice; revisit it if
# the comparison is re-run with different results.
best_model = model_3

In [8]:
# Refit the chosen model on all labelled rows (X, y) rather than only the
# training split; this replaces the fit made during the model comparison.
best_model.fit(X,y)

RandomForestRegressor(bootstrap=True, criterion='mae', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100,
                      n_jobs=None, oob_score=False, random_state=0, verbose=0,
                      warm_start=False)

In [9]:
# Predict sale prices for the test-set rows with the refitted model
prediction_test = best_model.predict(X_test)

In [12]:
# Check how many predictions were produced (expected: one per row of X_test)
prediction_test.shape

(1459,)

In [13]:
# Inspect the first 100 predicted prices
prediction_test[:100]

array([119433.08, 158367.5 , 185351.21, 178343.12, 192898.29, 185013.05,
       173691.32, 173571.12, 202335.34, 118837.26, 191862.15,  93880.5 ,
        88924.  , 145176.84, 125653.6 , 335453.76, 247129.59, 277390.44,
       330092.24, 441544.66, 316562.04, 202914.73, 200106.27, 163517.93,
       174293.6 , 210989.65, 290504.38, 249310.77, 203039.68, 238233.9 ,
       191605.  , 107714.3 , 185815.5 , 284441.27, 217122.56, 219151.34,
       185429.9 , 154851.03, 153944.45, 154245.2 , 173641.09, 150178.43,
       215481.19, 231127.4 , 209578.22, 165568.  , 281903.83, 205866.31,
       163738.5 , 150830.37, 149980.  , 169381.5 , 158883.52, 143279.39,
       197395.  , 125036.32, 147546.68, 140108.64, 208845.  , 140697.37,
       143670.  , 162171.  , 121434.56, 126644.99, 109471.08, 141236.  ,
       103212.83, 121404.83, 138488.87, 149003.  , 129823.84, 100321.4 ,
       118869.71, 121401.5 , 140012.  , 102665.6 , 102273.25, 120451.5 ,
       214145.08, 170927.62, 137663.34, 136613.05, 

In [15]:
# Pair each test-set row identifier with its predicted sale price.
# The identifier column is named 'Id' (not 'ID') so that it matches the
# identifier column of the source CSVs, which are read with index_col='Id'.
# NOTE(review): presumably this frame is written out as a submission file whose
# header must match the input naming - confirm against the expected format.
output = pd.DataFrame({'Id': X_test.index,
                       'SalePrice': prediction_test})

In [16]:
# Show the first 10 rows of the output frame as a final sanity check
output.head(10)

Unnamed: 0,ID,SalePrice
0,1461,119433.08
1,1462,158367.5
2,1463,185351.21
3,1464,178343.12
4,1465,192898.29
5,1466,185013.05
6,1467,173691.32
7,1468,173571.12
8,1469,202335.34
9,1470,118837.26
