In [34]:
import math
import numpy as np
import pandas as pd
from tqdm import tqdm

from sklearn.datasets import load_boston

In [11]:
# Load the dataset bundle and print the description text stored under 'DESCR'.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2 -- confirm the environment pins an older scikit-learn, otherwise the
# import of this loader fails before this cell is reached.
boston = load_boston()
print(boston['DESCR'])

Boston House Prices dataset

Notes
------
Data Set Characteristics:  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive
    
    :Median Value (attribute 14) is usually the target

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pupil-teacher ratio by town
      

In [9]:
# Combine the feature matrix and the target vector into one frame: one column
# per entry of boston['feature_names'], plus a trailing 'target' column.
df_boston = (
    pd.DataFrame(boston['data'], columns=boston['feature_names'])
    .assign(target=boston['target'])
)
df_boston.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,target
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [20]:
# Print the mean and median of the target column (two decimals), then show the
# full describe() summary as the cell's displayed result. The same mean and
# median are reused further down as the "overall" baseline predictions.
print('Overall mean value: %.2f' % (df_boston['target'].mean()))
print('Overall median value: %.2f' % (df_boston['target'].median()))
df_boston['target'].describe()

Overall mean value: 22.53
Overall median value: 21.20


count    506.000000
mean      22.532806
std        9.197104
min        5.000000
25%       17.025000
50%       21.200000
75%       25.000000
max       50.000000
Name: target, dtype: float64

In [23]:
# Bucket every row by the whole-number part of RM. astype('int64') truncates
# toward zero exactly like the per-element int(x) call, but is vectorised.
df_boston['num_rooms'] = df_boston['RM'].astype('int64')

# Per-bucket mean / median / count of the target.
df_num_rooms = df_boston.groupby('num_rooms').agg({'target': ['mean', 'median', 'count']}).reset_index()
# Flatten the two-level column index into 'target mean', 'target median',
# 'target count'; strip() keeps 'num_rooms' free of a trailing space.
df_num_rooms.columns = [' '.join(col).strip() for col in df_num_rooms.columns.values]

# Lookup table: bucket -> {'mean': ..., 'median': ...}.
# Built straight from the columns instead of iterrows(): iterrows() upcasts
# each row to float (so the keys became 3.0, 4.0, ...) and a progress bar over
# a handful of rows only adds noise. Integer keys hash and compare equal to
# their float counterparts, so lookups with either 6 or 6.0 succeed.
num_rooms_value_dict = {
    int(rooms): {'mean': mean_val, 'median': median_val}
    for rooms, mean_val, median_val in zip(
        df_num_rooms['num_rooms'],
        df_num_rooms['target mean'],
        df_num_rooms['target median'],
    )
}
df_num_rooms

6it [00:00, 2001.90it/s]


Unnamed: 0,num_rooms,target mean,target median,target count
0,3,25.3,25.3,2
1,4,16.023077,13.8,13
2,5,17.487342,18.55,158
3,6,22.015985,22.2,269
4,7,36.917647,35.2,51
5,8,44.2,48.3,13


In [26]:
# Baseline 1: predict every row with the global mean / median of the target
# (the scalar is broadcast to the whole column).
df_boston['overall_mean_val'] = df_boston['target'].mean()
df_boston['overall_median_val'] = df_boston['target'].median()

# Baseline 2: predict every row with the mean / median target of its
# 'num_rooms' bucket. groupby(...).transform(...) broadcasts the per-bucket
# statistic back onto the original rows, replacing a row-wise apply() that
# looked each row up in a dict built by another cell.
# NOTE(review): the statistics are computed on the same rows they are scored
# against below, so these are in-sample baselines.
df_boston['rooms_mean_val'] = df_boston.groupby('num_rooms')['target'].transform('mean')
df_boston['rooms_median_val'] = df_boston.groupby('num_rooms')['target'].transform('median')

In [31]:
def abs_error(x, y):
    """Return the absolute difference |x - y|."""
    return abs(x - y)


def sq_error(x, y):
    """Return the squared difference (x - y) ** 2."""
    return (x - y) ** 2
# Per-row errors of each baseline prediction against the target.
# Column prefixes: om = overall mean, omd = overall median, rm = rooms mean,
# rmd = rooms median; suffix _ae = absolute error, _se = squared error.
# The helpers are applied to whole columns at once (abs() and ** work
# element-wise on Series), which replaces eight row-by-row apply(axis=1) passes
# and yields the same columns in the same order.
df_boston['om_ae'] = abs_error(df_boston['overall_mean_val'], df_boston['target'])
df_boston['omd_ae'] = abs_error(df_boston['overall_median_val'], df_boston['target'])
df_boston['rm_ae'] = abs_error(df_boston['rooms_mean_val'], df_boston['target'])
df_boston['rmd_ae'] = abs_error(df_boston['rooms_median_val'], df_boston['target'])
df_boston['om_se'] = sq_error(df_boston['overall_mean_val'], df_boston['target'])
df_boston['omd_se'] = sq_error(df_boston['overall_median_val'], df_boston['target'])
df_boston['rm_se'] = sq_error(df_boston['rooms_mean_val'], df_boston['target'])
df_boston['rmd_se'] = sq_error(df_boston['rooms_median_val'], df_boston['target'])

In [45]:
def print_perf(df, col_ae, col_se, header):
    """Print a three-line error summary for one set of predictions.

    Args:
        df: Frame holding the per-row error columns.
        col_ae: Name of the absolute-error column.
        col_se: Name of the squared-error column.
        header: Line printed before the metrics.

    Prints the mean and median of ``col_ae`` and the root of the mean of
    ``col_se``, each multiplied by 1000 and formatted as a dollar amount
    with two decimals. Returns None.
    """
    print(header)
    abs_errors = df[col_ae]
    print('\tMean AE: $%.2f' % (1000 * abs_errors.mean()))
    print('\tMedian AE: $%.2f' % (1000 * abs_errors.median()))
    rmse = math.sqrt(df[col_se].mean())
    print('\tRMSE: $%.2f' % (1000 * rmse))
# Report the two mean-based baselines: overall mean (om_*) and per-bucket
# rooms mean (rm_*). The leading '\n' in the second header separates the two
# reports with a blank line.
print_perf(df_boston, 'om_ae', 'om_se', 'Overall Mean Performance (Baseline 1)')
print_perf(df_boston, 'rm_ae', 'rm_se', '\nRoom Mean Performance (Baseline 2)')

Overall Mean Performance (Baseline 1)
	Mean AE: $6647.21
	Median AE: $4732.81
	RMSE: $9188.01

Room Mean Performance (Baseline 2)
	Mean AE: $4737.21
	Median AE: $3314.32
	RMSE: $6503.25
