In [5]:
import math
import numpy as np
import pandas as pd
from tqdm import tqdm

from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split

# Load Data

In [2]:
# Load the Boston housing dataset bundled with scikit-learn and show its description.
# NOTE: `load_boston` was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell needs an older scikit-learn release to run.
boston = load_boston()
print(boston.DESCR)

Boston House Prices dataset

Notes
------
Data Set Characteristics:  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive
    
    :Median Value (attribute 14) is usually the target

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pupil-teacher ratio by town
      

# Process Data

In [17]:
# One row per instance: the predictive attributes plus the regression target.
df_boston = pd.DataFrame(boston['data'], columns=boston['feature_names'])
df_boston['MEDV'] = boston['target'] # target median value of home in $1000's

# Feature engineering: bucket the average number of rooms per dwelling (RM)
# into a whole number. The cast truncates the fractional part (e.g. 6.998 -> 6),
# it does not round. A vectorized cast replaces the per-row Python lambda.
df_boston['num_rooms'] = df_boston['RM'].astype('int64')

print(f'Dataset size: {df_boston.shape}')
df_boston.head()

Dataset size: (506, 15)


Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV,num_rooms
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0,6
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6,6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7,7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4,6
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2,7


In [30]:
# Hold out 20% of the rows as the test set; the fixed random_state keeps the split repeatable.
# reset_index() is called without drop=True, so each split is renumbered from 0
# and the original row label is kept in a new 'index' column.
df_train, df_test = (
    part.reset_index()
    for part in train_test_split(df_boston, test_size=0.2, random_state=42)
)
print(f'Size of train set: {df_train.shape}')
print(f'Size of test set: {df_test.shape}')

Size of train set: (404, 16)
Size of test set: (102, 16)


In [31]:
# Distribution of train: summary statistics (count, mean, std, min, quartiles, max)
# for every column of the train split
df_train.describe()

Unnamed: 0,index,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV,num_rooms
count,404.0,404.0,404.0,404.0,404.0,404.0,404.0,404.0,404.0,404.0,404.0,404.0,404.0,404.0,404.0,404.0
mean,250.428218,3.584372,11.569307,10.98505,0.071782,0.556484,6.315891,68.556436,3.808195,9.356436,404.032178,18.318317,356.278342,12.457351,22.796535,5.804455
std,142.332893,8.869255,23.152481,6.894618,0.258447,0.117704,0.709452,27.994922,2.131226,8.589721,166.172655,2.228701,91.566533,7.110381,9.332147,0.790062
min,1.0,0.00906,0.0,0.74,0.0,0.385,3.863,2.9,1.1296,1.0,187.0,12.6,0.32,1.73,5.0,3.0
25%,132.25,0.081437,0.0,5.13,0.0,0.452,5.8905,45.55,2.087875,4.0,279.0,16.8,375.4725,6.7725,16.95,5.0
50%,249.5,0.26139,0.0,8.56,0.0,0.538,6.21,77.7,3.17575,5.0,330.0,18.7,391.305,10.925,21.6,6.0
75%,369.25,2.9839,20.0,18.1,0.0,0.631,6.63675,93.65,5.4008,12.0,666.0,20.2,395.755,16.3725,26.4,6.0
max,505.0,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97,50.0,8.0


In [32]:
# Distribution of test: summary statistics (count, mean, std, min, quartiles, max)
# for every column of the test split, for comparison with the train split
df_test.describe()

Unnamed: 0,index,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV,num_rooms
count,102.0,102.0,102.0,102.0,102.0,102.0,102.0,102.0,102.0,102.0,102.0,102.0,102.0,102.0,102.0,102.0
mean,260.705882,3.630947,10.54902,11.737745,0.058824,0.547609,6.160833,68.648039,3.742949,10.313725,424.892157,18.99902,358.241275,13.428235,21.488235,5.666667
std,161.162337,7.459607,24.083242,6.722576,0.236456,0.108602,0.663825,28.890866,2.010749,9.161889,177.462472,1.800577,90.641872,7.244601,8.605804,0.722116
min,0.0,0.00632,0.0,0.46,0.0,0.392,3.561,6.2,1.1691,1.0,188.0,13.0,6.68,2.88,5.0,3.0
25%,90.75,0.087368,0.0,6.1025,0.0,0.44825,5.87025,43.8,2.26395,4.0,281.75,18.4,375.02,7.6875,17.125,5.0
50%,272.5,0.209885,0.0,10.59,0.0,0.532,6.1705,74.7,3.3534,5.0,345.0,19.2,392.205,12.335,20.15,6.0
75%,410.25,4.522473,0.0,18.1,0.0,0.6215,6.47925,95.225,4.764825,24.0,666.0,20.2,396.9,18.0175,24.075,6.0
max,501.0,45.7461,95.0,27.74,1.0,0.871,8.034,100.0,10.7103,24.0,711.0,22.0,396.9,36.98,50.0,8.0


# Baseline Models

1. Predict the overall mean value of all houses in the train set for every test house
2. Predict the mean value of the train-set houses that have the same number of rooms

In [33]:
# Baseline 1: the mean target over the train split. MEDV is stored in $1000's,
# so scale it to dollars before printing.
overall_mean_usd = df_train['MEDV'].mean() * 1000
print('Overall mean value: $%.2f' % overall_mean_usd)

Overall mean value: $22796.53


In [34]:
# Per room count: the mean MEDV of the train split and the number of samples
# behind that mean (the count shows whether the mean is backed by enough data)
df_num_rooms = (
    df_train.groupby('num_rooms')['MEDV']
    .agg(['mean', 'count'])
    .rename(columns={'mean': 'MEDV mean', 'count': 'MEDV count'})
    .reset_index()
)
df_num_rooms

Unnamed: 0,num_rooms,MEDV mean,MEDV count
0,3,23.1,1
1,4,17.32,10
2,5,17.6544,125
3,6,21.926066,211
4,7,36.793333,45
5,8,43.716667,12


Since there's only one sample of 3-room dwellings in the train set, its mean is not reliable, so we'll fall back to the overall mean in this case.

In [35]:
# Store mean value by number of rooms in a dictionary.
# A group mean backed by too few train samples is unreliable, so such groups
# fall back to the overall train mean. The fallback is decided by the sample
# count rather than by a hard-coded room number, so it stays correct if the
# split (and therefore the sparse group) changes.
MIN_SAMPLES_PER_GROUP = 2  # a single sample is not enough to trust a group mean
overall_mean_medv = df_train['MEDV'].mean()  # computed once, reused for every sparse group

num_rooms_value_dict = {
    rooms: (group_mean if n_samples >= MIN_SAMPLES_PER_GROUP else overall_mean_medv)
    for rooms, group_mean, n_samples in zip(
        df_num_rooms['num_rooms'],
        df_num_rooms['MEDV mean'],
        df_num_rooms['MEDV count'],
    )
}

6it [00:00, 1996.18it/s]


In [36]:
# Augment test dataframe with baseline model results
overall_mean_medv = df_train['MEDV'].mean()

# Baseline 1: every test row gets the overall train mean.
df_test['overall_mean_val'] = overall_mean_medv

# Baseline 2: look up the train mean for the row's room count. Plain dict
# indexing would raise KeyError for a room count that never occurs in the
# train split, so such rows fall back to the overall mean instead.
df_test['rooms_mean_val'] = df_test['num_rooms'].map(
    lambda rooms: num_rooms_value_dict.get(rooms, overall_mean_medv)
)

# Evaluating Model Goodness

In [44]:
def abs_error(estimate, target):
    """Return the absolute difference between an estimate and its target."""
    return abs(estimate - target)


def abs_pc_error(estimate, target):
    """Return the absolute error as a percentage of the target.

    The error is divided by ``target``, so a target of zero is not handled here.
    """
    return abs(estimate - target) / target * 100


def sq_error(estimate, target):
    """Return the squared difference between an estimate and its target."""
    return (estimate - target) ** 2

# Per-row error columns for both baselines ('om' = overall mean, 'rm' = rooms mean).
# Each metric is applied to whole columns at once instead of row by row, and the
# two nested loops replace six copy-pasted assignments. The outer loop runs over
# metrics so the columns are created in the same order as before:
# om_ae, rm_ae, om_ape, rm_ape, om_se, rm_se.
for suffix, error_fn in (('ae', abs_error), ('ape', abs_pc_error), ('se', sq_error)):
    for prefix, estimate_col in (('om', 'overall_mean_val'), ('rm', 'rooms_mean_val')):
        df_test[f'{prefix}_{suffix}'] = error_fn(df_test[estimate_col], df_test['MEDV'])

In [48]:
def print_perf(df, col_ae, col_ape, col_se, header):
    """Print a short error report for one model.

    Args:
        df: DataFrame holding one error value per row in the given columns.
        col_ae: Name of the absolute-error column.
        col_ape: Name of the absolute-percentage-error column.
        col_se: Name of the squared-error column.
        header: Line printed before the metrics.

    The absolute-error and RMSE figures are multiplied by 1000 and printed
    with a dollar sign; the percentage figures are printed as they are.
    """
    # Each metric is evaluated lazily, right before its line is printed.
    report = (
        ('\tMean AE: $%.2f', lambda: df[col_ae].mean() * 1000),
        ('\tMedian AE: $%.2f', lambda: df[col_ae].median() * 1000),
        ('\tMean APE: %.2f%%', lambda: df[col_ape].mean()),
        ('\tMedian APE: %.2f%%', lambda: df[col_ape].median()),
        ('\tRMSE: $%.2f', lambda: math.sqrt(df[col_se].mean()) * 1000),
    )
    print(header)
    for template, compute in report:
        print(template % compute())
# Report both baselines on the test split; the leading newline in the second
# header separates the two reports with a blank line.
print_perf(
    df_test,
    col_ae='om_ae',
    col_ape='om_ape',
    col_se='om_se',
    header='Baseline 1 Performance',
)
print_perf(
    df_test,
    col_ae='rm_ae',
    col_ape='rm_ape',
    col_se='rm_se',
    header='\nBaseline 2 Performance',
)

Baseline 1 Performance
	Mean AE: $6255.84
	Median AE: $4346.53
	Mean APE: 37.67%
	Median APE: 20.94%
	RMSE: $8662.88

Baseline 2 Performance
	Mean AE: $4651.28
	Median AE: $3073.93
	Mean APE: 27.87%
	Median APE: 14.96%
	RMSE: $6402.08
