In [1]:
# A decision tree is built from a series of splits.
# Each split divides the data passed to it into two groups (not necessarily equal halves),
#  so each additional level of splits can double the number of leaves.
# If there are too many splits, many leaves end up holding only a few houses each.
# This is a disadvantage because each prediction is then based on very little data.
# This is called overfitting: the model matches the training data almost perfectly
#  but performs poorly on new, unseen data.
# If the tree has too few splits, each leaf is left holding a large and very diverse collection of houses.
# This is called underfitting: the model fails to capture important patterns, so it gives
#  poor results on both the training data and the test data.

In [4]:
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split

In [5]:
# Read the Melbourne housing data from disk.
melbourne_data_path = "../Data/melb_data.csv"
melbourne_data = pd.read_csv(melbourne_data_path)

# Predictor columns fed to the model.
# NOTE: 'Lattitude' / 'Longtitude' are presumably spelled this way in the CSV header
# (the recorded run below succeeded with them), so do not "correct" the spelling.
melbourne_features = ['Rooms', 'Bathroom', 'Landsize', 'Lattitude', 'Longtitude']
X = melbourne_data[melbourne_features]

# Prediction target: the sale price column.
y = melbourne_data["Price"]

# Split into training and validation sets; the fixed random_state keeps the split repeatable.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

In [6]:
# Score one tree size: fit on the training split, report the MAE on the validation split.
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    """Return the validation mean absolute error for a tree limited to ``max_leaf_nodes``.

    Parameters
    ----------
    max_leaf_nodes
        Forwarded as ``max_leaf_nodes`` to ``DecisionTreeRegressor``.
    train_X, train_y
        Features and target used to fit the regressor.
    val_X, val_y
        Features to predict on and the true target values the predictions
        are compared against.

    Returns
    -------
    The value computed by ``mean_absolute_error(val_y, predictions)``.
    """
    # random_state is pinned to 0 so repeated calls build the same tree.
    regressor = DecisionTreeRegressor(random_state=0, max_leaf_nodes=max_leaf_nodes)
    regressor.fit(train_X, train_y)
    return mean_absolute_error(val_y, regressor.predict(val_X))

In [7]:
# Sweep several tree sizes and print the validation MAE obtained with each one.
candidate_leaf_counts = [5, 50, 500, 5000]
for max_leaf_nodes in candidate_leaf_counts:
    mae = get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y)
    # %d truncates the MAE to a whole number for display.
    print("Maximum leaf nodes: %d \t\t Mean Absolute Error: %d" % (max_leaf_nodes, mae))

Maximum leaf nodes: 5 		 Mean Absolute Error: 356157
Maximum leaf nodes: 50 		 Mean Absolute Error: 264538
Maximum leaf nodes: 500 		 Mean Absolute Error: 222979
Maximum leaf nodes: 5000 		 Mean Absolute Error: 240506


In [None]:
# Of the values tried, 500 leaf nodes gives the lowest validation MAE (222,979),
# so it is the best choice for max_leaf_nodes among these options.