In [9]:
from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor
import numpy as np
import pandas as pd

def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    """Fit a size-limited decision tree and score it on validation data.

    Args:
        max_leaf_nodes: Passed to ``DecisionTreeRegressor`` as its
            ``max_leaf_nodes`` setting.
        train_X: Features used to fit the tree.
        val_X: Features the fitted tree predicts on.
        train_y: Targets used to fit the tree.
        val_y: Targets the predictions are compared against.

    Returns:
        The value of ``mean_absolute_error(val_y, predictions)``.
    """
    # random_state is pinned to 0 so repeated calls with the same inputs
    # build the same tree.
    tree = DecisionTreeRegressor(
        max_leaf_nodes=max_leaf_nodes,
        random_state=0,
    ).fit(train_X, train_y)
    return mean_absolute_error(val_y, tree.predict(val_X))

In [10]:
# Create some synthetic data
size = 10000
np.random.seed(0)  # fixed seed: every column below is reproducible

# NOTE: the draws consume the global random stream in this exact order;
# reordering or inserting a draw changes every column that follows.
# randint's upper bound is exclusive and its default dtype is int.
Year = np.random.randint(1998, 2021, size)      # 1998 .. 2020
LotArea = np.random.randint(50, 2001, size)     # 50 .. 2000
nFlor = np.random.randint(1, 11, size)          # 1 .. 10
BedAbv = np.random.normal(50.0, 5.0, size)      # mean 50.0, std 5.0
county = np.random.randint(1, 51, size)         # 1 .. 50

# Ground-truth pricing rule used to label the synthetic data, so the model
# can later be tested against a known function.
def fun_price(y, lot, flor, bed, county):
    """Return ``sin(y / 2020) * lot * flor + bed - county / 2``.

    Only NumPy and arithmetic operators are used, so the arguments may be
    scalars or array-likes that support those operations.

    Args:
        y: Year value(s); divided by 2020 before taking the sine.
        lot: Lot-area value(s).
        flor: Floor-count value(s).
        bed: Additive term.
        county: County code(s); half of it is subtracted.
    """
    year_factor = np.sin(y / 2020)
    area_term = year_factor * lot * flor
    return area_term + bed - county / 2

def col_price(row):
    """Row-wise adapter around ``fun_price`` that returns an integer price.

    Args:
        row: Object exposing ``Year``, ``nFlor``, ``LotArea``, ``BedAbv`` and
            ``county`` attributes (e.g. one row handed over by
            ``DataFrame.apply(..., axis=1)``).

    Returns:
        ``int(...)`` of the ``fun_price`` result, i.e. the price with its
        fractional part dropped.
    """
    raw_price = fun_price(
        y=row.Year,
        flor=row.nFlor,
        lot=row.LotArea,
        bed=row.BedAbv,
        county=row.county,
    )
    return int(raw_price)

# Assemble the synthetic feature columns into one DataFrame.
home_data = pd.DataFrame({'Year':Year, 'LotArea':LotArea, 'nFlor':nFlor, 'BedAbv':BedAbv, 'county':county})

# Label every row in a single vectorized call instead of a Python-level
# row-by-row ``apply(..., axis=1)``: fun_price is built only from NumPy and
# arithmetic operations, so it accepts whole columns at once.
# astype('int64') drops the fractional part (truncation toward zero), the
# same way int() does for a single value.
home_data['Price'] = fun_price(
    home_data['Year'],
    home_data['LotArea'],
    home_data['nFlor'],
    home_data['BedAbv'],
    home_data['county'],
).astype('int64')

In [11]:
# Hold out a validation set: split both the features and the target.
from sklearn.model_selection import train_test_split

feature_columns = ['Year', 'LotArea', 'nFlor', 'BedAbv', 'county']
X = home_data[feature_columns]
y = home_data.Price

# random_state pins the shuffle so the split is the same on every run.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

# Compare validation MAE across different max_leaf_nodes settings.
# NOTE: the %d conversion prints the MAE with its fractional part dropped,
# so nearby scores can show up as the same integer.
for max_leaf_nodes in (10, 100, 1000, 5000, 10000):
    my_mae = get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y)
    print("Max leaf nodes: %d  \t\t Mean Absolute Error:  %d" %(max_leaf_nodes, my_mae))

Max leaf nodes: 10  		 Mean Absolute Error:  923
Max leaf nodes: 100  		 Mean Absolute Error:  192
Max leaf nodes: 1000  		 Mean Absolute Error:  26
Max leaf nodes: 5000  		 Mean Absolute Error:  20
Max leaf nodes: 10000  		 Mean Absolute Error:  20
