In [1]:
# This file contains an intermediate approach to machine learning problems to 
# refer back to when writing new code

import pandas as pd

# load the training and test CSVs (in that order), using the "Id" column as the row index
X_full, X_test_full = (
    pd.read_csv(csv_path, index_col = "Id")
    for csv_path in ("data/train.csv", "data/test.csv")
)

In [2]:
# create test features and prediction targets
from sklearn.model_selection import train_test_split

# columns used as predictors
features = [
    "LotArea",
    "YearBuilt",
    "1stFlrSF",
    "2ndFlrSF",
    "BedroomAbvGr",
    "TotRmsAbvGrd",
]

# target is the SalePrice column; predictors are the selected feature columns
y = X_full["SalePrice"]
X = X_full[features]
X_test = X_test_full[features]

# hold out 20% of the labelled rows as a validation set (fixed random_state)
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size = 0.8,
    test_size = 0.2,
    random_state = 0,
)

In [3]:
# preview the first five rows of the training features

X_train.head(n = 5)

Unnamed: 0_level_0,LotArea,YearBuilt,1stFlrSF,2ndFlrSF,BedroomAbvGr,TotRmsAbvGrd
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
619,11694,2007,1828,0,3,9
871,6600,1962,894,0,2,5
93,13360,1921,964,0,2,5
818,13265,2002,1689,0,3,7
303,13704,2001,1541,0,3,6


In [7]:
# create 5 candidate random forest models; the one with the lowest validation MAE (mean absolute error) is treated as the best

from sklearn.ensemble import RandomForestRegressor

# define models (every model uses random_state = 0)

model_1 = RandomForestRegressor(n_estimators = 50, random_state = 0)
model_2 = RandomForestRegressor(n_estimators = 100, random_state = 0)
# "absolute_error" is the current name of the mean-absolute-error split criterion;
# the old "mae" spelling was deprecated in scikit-learn 1.0 and removed in 1.2,
# so it raises an error on current releases (this spelling needs scikit-learn >= 1.0)
model_3 = RandomForestRegressor(n_estimators = 100, criterion = "absolute_error", random_state = 0)
model_4 = RandomForestRegressor(n_estimators = 200, min_samples_split = 20, random_state = 0)
model_5 = RandomForestRegressor(n_estimators = 50, max_depth = 7, random_state = 0)

# kept in numeric order so that models[i] is model_(i+1)
models = [model_1, model_2, model_3, model_4, model_5]

In [9]:
# This function returns the mean absolute error (MAE) from the validation set.

from sklearn.metrics import mean_absolute_error

def score_model(model, X_t = X_train, X_v = X_valid, y_t = y_train, y_v = y_valid):
    """Fit ``model`` and return its mean absolute error on the validation data.

    Parameters
    ----------
    model : object exposing ``fit(X, y)`` and ``predict(X)``
    X_t, y_t : features and target passed to ``model.fit``
    X_v, y_v : features passed to ``model.predict`` and the true values
        the predictions are compared against

    The defaults are the ``X_train`` / ``X_valid`` / ``y_train`` / ``y_valid``
    objects that exist when this ``def`` is executed.

    Returns
    -------
    The value of ``mean_absolute_error(y_v, predictions)``.
    """
    model.fit(X_t, y_t)
    validation_predictions = model.predict(X_v)
    validation_mae = mean_absolute_error(y_v, validation_predictions)
    return validation_mae

# score every candidate and report it under its 1-based model number
# (%d prints the MAE with its fractional part dropped)
for i, candidate in enumerate(models):
    mae = score_model(candidate)
    print("Model %d MAE: %d" % (i + 1, mae))

Model 1 MAE: 23690
Model 2 MAE: 23635
Model 3 MAE: 23549
Model 4 MAE: 24024
Model 5 MAE: 23794


In [10]:
# use the best model (model_3, lowest validation MAE) to make predictions

# refit the model on all of the labelled data (X, y), not only the training split
model_3.fit(X, y)

# generate predictions for the test set
preds_test = model_3.predict(X_test)

# save predictions in the format used for Kaggle competition scoring:
# one "Id" column taken from the test index and one "SalePrice" column
output = pd.DataFrame(
    {
        "Id": X_test.index,
        "SalePrice": preds_test,
    }
)
output.to_csv("data/submission_1.csv", index = False)