In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor

# Create some syntetic data
size = 10000

Year = np.random.randint(1998, high=2021, size=size, dtype=int)
LotArea = np.random.randint(50, high=2001, size=size, dtype=int)
nFlor = np.random.randint(1, high=11, size=size, dtype=int)
BedAbv = np.random.normal(loc=50.0, scale=5.0, size=size)
county = np.random.randint(1, high=51, size=size, dtype=int)

# Define a function to obtain prices for later testing the model
def fun_price(y,lot,flor,bed,county):    
    return np.sin(y/2020)*lot*flor + bed - county/2

def col_price(row):
    y = row.Year
    flor = row.nFlor
    lot = row.LotArea
    bed = row.BedAbv
    county = row.county
    return int(fun_price(y,lot,flor,bed,county))

home_data = pd.DataFrame({'Year':Year, 'LotArea':LotArea, 'nFlor':nFlor, 'BedAbv':BedAbv, 'county':county})
home_data['Price'] = home_data.apply(col_price,axis=1)

In [2]:
home_data

Unnamed: 0,Year,LotArea,nFlor,BedAbv,county,Price
0,2007,1431,6,44.950340,12,7233
1,2013,576,10,53.181942,21,4878
2,2013,796,8,43.276537,2,5388
3,2004,594,5,55.654062,47,2518
4,2010,786,5,49.089851,12,3339
...,...,...,...,...,...,...
9995,2018,1655,9,45.864986,29,12557
9996,2005,140,2,60.154790,17,286
9997,2007,820,9,48.986588,47,6209
9998,2019,728,1,48.771607,45,638


In [3]:
# Regression Tree Model Fit
y = home_data.Price
feature_columns = ['Year', 'LotArea', 'nFlor', 'BedAbv', 'county']
X = home_data[feature_columns]

# Specify Model
iowa_model = DecisionTreeRegressor()
# Fit Model
iowa_model.fit(X, y)

print("First in-sample predictions:", iowa_model.predict(X.head()))
print("Actual target values for those homes:", y.head().tolist())

First in-sample predictions: [7233. 4878. 5388. 2518. 3339.]
Actual target values for those homes: [7233, 4878, 5388, 2518, 3339]


In [8]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error

# Create more sysntetic data to predict
size = 20
Year2 = np.random.randint(1998, high=2021, size=size, dtype=int)
LotArea2 = np.random.randint(50, high=2001, size=size, dtype=int)
nFlor2 = np.random.randint(1, high=11, size=size, dtype=int)
BedAbv2 = np.random.normal(loc=50.0, scale=5.0, size=size)
county2 = np.random.randint(1, high=51, size=size, dtype=int)

data_test = pd.DataFrame({'Year':Year2, 'LotArea':LotArea2, 'nFlor':nFlor2, 'BedAbv':BedAbv2, 'county':county2})
data_test['Price'] = home_data.apply(col_price,axis=1)
X_test = data_test[feature_columns]
y_test = data_test.Price

predictions = iowa_model.predict(X_test).tolist()
values = y_test.tolist()
err = mean_absolute_error(predictions,values)
errp =  mean_absolute_percentage_error(predictions,values)

print('Predictions:', predictions[:10])
print('Targets    :', values[:10])
print()
print(f'Absolute Mean Error:', err)
print()
print(f'Absolute Mean Error %:', errp)

Predictions: [1443.0, 7600.0, 1294.0, 5248.0, 803.0, 14806.0, 1016.0, 1695.0, 15862.0, 1988.0]
Targets    : [7233, 4878, 5388, 2518, 3339, 3236, 8816, 2080, 13808, 4771]

Absolute Mean Error: 4216.85

Absolute Mean Error %: 2.2620598511964087


In [10]:
# Actual expresion for the Abs mean error
erre = abs((np.array(predictions) - np.array(values))).mean()
print(erre)
# Dunno?
errep = abs(1 - np.array(predictions)/ np.array(values)).mean()
print(errep)

4216.85
1.9261421681073994


In [None]:
###########################################################################################################
## Conclusion: Creating Syntetic data from Random Distributions is not a good idea for testing models !! ##
###########################################################################################################
# Its better to Split the Original data

In [11]:
# split data into training and validation data, for both features and target

from sklearn.model_selection import train_test_split

train_X, val_X, train_y, val_y = train_test_split(X, y, random_state = 1)


# Specify Model
iowa2_model = DecisionTreeRegressor()
# Fit Model
iowa2_model.fit(train_X,train_y)

predictions2 = iowa2_model.predict(val_X).tolist()
values2 = val_y.tolist()

# -------------------------------------------------------------------------
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error

# Error
err = mean_absolute_error(predictions2,values2)
errp =  mean_absolute_percentage_error(predictions2,values2)

print("First in-sample predictions:", predictions2[:10])
print("Actual target values for those homes:", values2[:10])
print()
print(f'Absolute Mean Error:', err)
print()
print(f'Absolute Mean Error:', errp)

First in-sample predictions: [1372.0, 13100.0, 7101.0, 6577.0, 5620.0, 3858.0, 1385.0, 9018.0, 2172.0, 2167.0]
Actual target values for those homes: [1387, 13162, 7114, 6565, 5583, 3860, 1368, 9024, 2124, 2196]

Absolute Mean Error: 21.704

Absolute Mean Error: 0.008571092865915711
