### Simple Decision Tree example

In [1]:
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

# Path of the file to read
file_path = './input/iowa_train.csv'

# create variable to hold the data
home_data = pd.read_csv(file_path)
# identify the prediction target y
y = home_data.SalePrice
# create variable to hold the feature columns (columns to use to predict)
feature_columns = ['LotArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF', 'FullBath', 'BedroomAbvGr', 'TotRmsAbvGrd']
# load the feature data into X
X = home_data[feature_columns]

# Split into validation and training data
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

# Specify Model
model = DecisionTreeRegressor(random_state=1)
# Fit Model
model.fit(train_X, train_y)

# Make validation predictions and calculate mean absolute error
val_predictions = model.predict(val_X)
val_mae = mean_absolute_error(val_predictions, val_y)

print("First in-sample predictions: \t\t" , model.predict(val_X.head()))
print("Actual target values for those homes:\t", val_y.head().tolist())
print("Validation MAE: {:,.0f}".format(val_mae))


First in-sample predictions: 		 [186500. 184000. 130000.  92000. 164500.]
Actual target values for those homes:	 [231500, 179500, 122000, 84500, 142000]
Validation MAE: 29,653


### Use a loop to determine the optimal tree size from a list of static choices

In [2]:
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    model = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    model.fit(train_X, train_y)
    preds_val = model.predict(val_X)
    mae = mean_absolute_error(val_y, preds_val)
    return(mae)

candidate_max_leaf_nodes = [5, 25, 50, 100, 250, 500]
# loop to find the ideal tree size from candidate_max_leaf_nodes
mae_list = {}
for max_leaf_nodes in candidate_max_leaf_nodes:
    mae_list[max_leaf_nodes] = get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y)
# Store the best value of max_leaf_nodes (it will be either 5, 25, 50, 100, 250 or 500)
best_tree_size = min(mae_list, key=mae_list.get)
print("The best tree size is: " + str(best_tree_size) + "\n with a MAE: {:,.0f}".format(mae_list[best_tree_size]))

The best tree size is: 100
 with a MAE: 27,283


In [5]:
# Fit the model with best_tree_size. Fill in argument to make optimal size
final_model = DecisionTreeRegressor(max_leaf_nodes=best_tree_size, random_state=0)

# fit the final model
final_model.fit(X, y)

# Make predictions and calculate mean absolute error
predictions = model.predict(X)
mae = mean_absolute_error(predictions, y)

print("First few predictions: \t\t" , final_model.predict(X.head()))
print("Actual target values for those homes:\t", y.head().tolist())
print("MAE: {:,.0f}".format(mae))


First few predictions: 		 [209133.65384615 146415.0075188  209133.65384615 143297.46666667
 270325.        ]
Actual target values for those homes:	 [208500, 181500, 223500, 140000, 250000]
MAE: 7,460


### Same thing now using a Random Forest model
#### first, the validation

In [8]:
from sklearn.ensemble import RandomForestRegressor

# Define the model. Set random_state to 1
rf_model = RandomForestRegressor(random_state=1)

# fit your model
rf_model.fit(train_X, train_y)

# Calculate the mean absolute error of your Random Forest model on the validation data
rf_val_mae = mean_absolute_error(val_y, rf_model.predict(val_X))

print("Validation MAE for Random Forest Model: {}".format(rf_val_mae))


Validation MAE for Random Forest Model: 22762.42931506849


#### then, running the final model

In [12]:
from sklearn.ensemble import RandomForestRegressor

# Define the model. 
final_rf_model = RandomForestRegressor(random_state=0)

# fit your model
final_rf_model.fit(X, y)

# Calculate the mean absolute error of your Random Forest model on the validation data
rf_final_mae = mean_absolute_error(y, final_rf_model.predict(X))

print("First few predictions: \t\t\t" , final_rf_model.predict(X.head()))
print("Actual target values for those homes:\t", y.head().tolist())
print("MAE for Random Forest Model: {}".format(rf_final_mae))



First few predictions: 			 [210350. 175490. 220150. 140850. 250590.]
Actual target values for those homes:	 [208500, 181500, 223500, 140000, 250000]
MAE for Random Forest Model: 9445.384355838227
