## About

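Compare several random forest configurations on a held-out validation split, then train a model with the best-scoring configuration on all of the training data and write its test-set predictions to a submission file.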

This notebook is an exercise in the [Intermediate Machine Learning](https://www.kaggle.com/learn/intermediate-machine-learning) course.  You can reference the tutorial at [this link](https://www.kaggle.com/alexisbcook/introduction).

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [13]:
import numpy as np

In [2]:
# Load the training and test CSV files, indexing rows by their `Id` column
X_full = pd.read_csv(
    './home-data-for-ml-course/train.csv',
    index_col='Id',
)
X_test_full = pd.read_csv(
    './home-data-for-ml-course/test.csv',
    index_col='Id',
)

In [6]:
# Preview the first three rows of the test data
X_test_full.head(3)

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,Inside,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,Gar2,12500,6,2010,WD,Normal
1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,MnPrv,,0,3,2010,WD,Normal


In [7]:
# Predictor columns shared by the training and test tables
features = [
    'LotArea',
    'YearBuilt',
    '1stFlrSF',
    '2ndFlrSF',
    'FullBath',
    'BedroomAbvGr',
    'TotRmsAbvGrd',
]

# Target: the sale price column of the training table
y = X_full['SalePrice']

# Independent copies of the predictor columns, so later changes do not touch the raw tables
X = X_full[features].copy()
X_test = X_test_full[features].copy()

In [8]:
# Hold out 20% of the training rows for validation; the fixed seed makes the split repeatable
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

In [9]:
# Create various random forest models
from sklearn.ensemble import RandomForestRegressor

# Candidate forests: they differ in tree count, split criterion, and tree-size limits
models = [
    RandomForestRegressor(n_estimators=50, random_state=0),
    RandomForestRegressor(n_estimators=100, random_state=0),
    RandomForestRegressor(n_estimators=100, criterion='absolute_error', random_state=0),
    RandomForestRegressor(n_estimators=200, min_samples_split=20, random_state=0),
    RandomForestRegressor(n_estimators=100, max_depth=7, random_state=0),
]

# Keep each candidate reachable by its own name as well
model_1, model_2, model_3, model_4, model_5 = models

In [10]:
from sklearn.metrics import mean_absolute_error

# Helper for comparing models on a held-out portion of the training data
def score_model(model, X_t, X_v, y_t, y_v):
    """Fit `model` on (X_t, y_t) and return its mean absolute error on (X_v, y_v).

    `model.fit` is called on the object passed in, so the caller's model is
    fitted as a side effect.
    """
    model.fit(X_t, y_t)
    validation_preds = model.predict(X_v)
    return mean_absolute_error(y_v, validation_preds)



In [21]:
# Score every candidate on the validation split and keep the one with the lowest MAE
best_model_score = np.inf
best_model = None  # no candidate selected yet
for model_number, candidate in enumerate(models, start=1):
    mae = score_model(candidate, X_train, X_valid, y_train, y_valid)
    # %d prints only the integer part of the MAE
    print("Model %d MAE: %d" % (model_number, mae))
    if mae < best_model_score:
        best_model_score = mae
        best_model = candidate
print("best_model: ", best_model)

Model 1 MAE: 24015
Model 2 MAE: 23740
Model 3 MAE: 23528
Model 4 MAE: 23996
Model 5 MAE: 23706
best_model:  RandomForestRegressor(criterion='absolute_error', random_state=0)


In [22]:
# Create a new, unfitted model with the same parameters as best_model.
# Cloning (instead of re-typing the hyperparameters) keeps the final model in
# sync with whichever candidate won the comparison above.
# This model will be trained on all of the training data.
from sklearn.base import clone

my_model = clone(best_model)


In [24]:
# Fit the final model on ALL of the training data (X and y before the train/validation split)
my_model.fit(X, y)

In [25]:
# Generate predictions for the test rows, using the same feature columns as in training
preds_test = my_model.predict(X_test)

In [27]:
# Show the first ten predictions as a quick sanity check
print(preds_test[:10])

[119433.08 158367.5  185351.21 178343.12 192898.29 185013.05 173691.32
 173571.12 202335.34 118837.26]


In [28]:
# Build the two-column (Id, SalePrice) table used for competition scoring
# and write it to disk without the DataFrame's own index
output = pd.DataFrame(
    {
        'Id': X_test.index,
        'SalePrice': preds_test,
    }
)
output.to_csv('submission2.csv', index=False)

In [30]:
# Read the saved submission back to check its shape and first rows
output_csv = pd.read_csv("./submission2.csv")
print(output_csv.shape)
output_csv.head()


(1459, 2)


Unnamed: 0,Id,SalePrice
0,1461,119433.08
1,1462,158367.5
2,1463,185351.21
3,1464,178343.12
4,1465,192898.29
