In [1]:
from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

import pandas as pd
import numpy as np

In [21]:
# Load the Boston housing data into a DataFrame, one column per feature.
# NOTE(review): load_boston is reported as removed in scikit-learn 1.2 —
# confirm the pinned version still ships it.
boston_data = load_boston()
df = pd.DataFrame(boston_data.data, columns=boston_data.feature_names)

# The model uses every column except INDUS and AGE.
features = df.drop(columns=['INDUS', 'AGE'])

# The regression target is the natural log of the price, kept as a one-column frame.
log_price = pd.DataFrame({'PRICE': np.log(boston_data.target)})

features

Unnamed: 0,CRIM,ZN,CHAS,NOX,RM,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,0.0,0.538,6.575,4.0900,1.0,296.0,15.3,396.90,4.98
1,0.02731,0.0,0.0,0.469,6.421,4.9671,2.0,242.0,17.8,396.90,9.14
2,0.02729,0.0,0.0,0.469,7.185,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,0.0,0.458,6.998,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,0.0,0.458,7.147,6.0622,3.0,222.0,18.7,396.90,5.33
...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,0.0,0.573,6.593,2.4786,1.0,273.0,21.0,391.99,9.67
502,0.04527,0.0,0.0,0.573,6.120,2.2875,1.0,273.0,21.0,396.90,9.08
503,0.06076,0.0,0.0,0.573,6.976,2.1675,1.0,273.0,21.0,396.90,5.64
504,0.10959,0.0,0.0,0.573,6.794,2.3889,1.0,273.0,21.0,393.45,6.48


In [54]:
# Column position of each feature inside `features` (and therefore inside a
# property_stats row), in the order the columns appear once INDUS and AGE
# have been dropped.
(
    CRIM_INX,
    ZN_INX,
    CHAS_INX,
    NOX_INX,
    RM_INX,
    DIS_INX,
    RAD_INX,
    TAX_INX,
    PTRATIO_INX,
    B_INX,
    LSTAT_INX,
) = range(11)

# The row of average feature values is derived from features.mean() rather
# than by filling an empty (1, 11) array one column at a time.

In [47]:
# A Series exposes its underlying data as an ndarray through the `.values` attribute.
mean_values = features.mean().values
print(type(features.mean()))  # the per-column means come back as a Series
print(type(mean_values), 'has shape', mean_values.shape)  # a flat vector, so it still has to be reshaped
np.reshape(mean_values, (11, 1))  # column layout: 11 rows, 1 column

<class 'pandas.core.series.Series'>
<class 'numpy.ndarray'> has shape (11,)


array([[3.61352356e+00],
       [1.13636364e+01],
       [6.91699605e-02],
       [5.54695059e-01],
       [6.28463439e+00],
       [3.79504269e+00],
       [9.54940711e+00],
       [4.08237154e+02],
       [1.84555336e+01],
       [3.56674032e+02],
       [1.26530632e+01]])

In [48]:
# One row, eleven columns: the average property used as the input row for predictions.
property_stats = np.reshape(features.mean().values, (1, 11))
property_stats

array([[3.61352356e+00, 1.13636364e+01, 6.91699605e-02, 5.54695059e-01,
        6.28463439e+00, 3.79504269e+00, 9.54940711e+00, 4.08237154e+02,
        1.84555336e+01, 3.56674032e+02, 1.26530632e+01]])

In [49]:
# Fit a linear regression of the log prices on the features, then keep the
# in-sample predictions.
reg = LinearRegression()
reg.fit(features, log_price)
fitted_values = reg.predict(features)

In [67]:
# Goodness of fit on the training data: R-squared, mean squared error and its
# square root. The error compares the actual log prices with the predicted ones.
rsquare = reg.score(features, log_price)
mse = mean_squared_error(log_price, reg.predict(features))
rmse = np.sqrt(mse)

metrics_report = '\n'.join([
    'r-square : ' + str(rsquare),
    'mse : ' + str(mse),
    'rmse : ' + str(rmse),
])
print(metrics_report)

r-square : 0.7891431041340249
mse : 0.03516080084618688
rmse : 0.18751213519713034


In [68]:
def get_log_estimate(number_room,
                     student_per_classroom,
                     is_next_to_river=False,
                     high_confidence=True):
    """Predict the log price of a property plus a range around the prediction.

    The property starts as a copy of the notebook-level ``property_stats`` row;
    three of its columns are then overridden with the arguments before the
    fitted model ``reg`` is asked for a prediction.

    Keyword arguments:
    number_room -- value written into the RM column
    student_per_classroom -- value written into the PTRATIO column
    is_next_to_river -- the CHAS column is set to 1 when true, otherwise 0
    high_confidence -- True puts the bounds 2 * rmse either side of the
                       estimate; False uses 1 * rmse

    Returns a tuple ``(log_estimate, upper_bound, lower_bound)``, in that order.
    """
    # Work on a private copy so a call never alters the shared template row.
    stats = property_stats.copy()
    stats[0][RM_INX] = number_room
    stats[0][PTRATIO_INX] = student_per_classroom
    stats[0][CHAS_INX] = 1 if is_next_to_river else 0

    # The prediction is indexed as a 2-D result; [0][0] pulls out the single value.
    log_estimate = reg.predict(stats)[0][0]

    # Half-width of the range: two rmse for high confidence, one otherwise.
    margin = (2 if high_confidence else 1) * rmse

    return log_estimate, log_estimate + margin, log_estimate - margin

In [71]:
# Log-price estimate with its 2 * rmse range for a riverside property
# with RM=93 and PTRATIO=20.
get_log_estimate(93, 20, is_next_to_river=True, high_confidence=True)

(10.937309436437314, 11.312333706831575, 10.562285166043052)

In [72]:
# Median target value in the dataset. For comparison, a house in Boston
# nowadays costs about half a million dollars.
np.median(boston_data['target'])

21.2

In [84]:
# Convert the 1970s log-price estimate, together with its upper and lower
# bounds, to today's prices, rounded to the nearest 1000 dollars.
ZILLOW_MEDIAN_PRICE = 583.3
scale = ZILLOW_MEDIAN_PRICE / np.median(boston_data.target)

log_estimate, upper_bound, lowwer_bound = get_log_estimate(
    9, 15, is_next_to_river=False, high_confidence=True)

# Undo the log, rescale to today's price level, then round to whole thousands.
dollar_estimate, upper_bound, lowwer_bound = (
    (np.power(np.e, log_value) * 1000 * scale).round(-3)
    for log_value in (log_estimate, upper_bound, lowwer_bound)
)

print('dollar estimate : {} $\nupper bound : {} $\nlowwer bound : {} $'.format(dollar_estimate, upper_bound, lowwer_bound))

dollar estimate : 827000.0 $
upper bound : 1203000.0 $
lowwer bound : 568000.0 $


In [100]:
def get_dollar_estimate(number_room, student_per_classroom, is_next_to_river=False, high_confidence=True):
    """Print a price estimate for a Boston property in today's dollars.

    The log-price estimate and its bounds come from ``get_log_estimate``. Each
    value is exponentiated, multiplied by 1000 and by the ratio of
    ZILLOW_MEDIAN_PRICE to the dataset's median target, then rounded to the
    nearest 1000 dollars.

    Keyword arguments:
    number_room -- number of rooms in the property; must be at least 1
    student_per_classroom -- number of students per teacher in the classroom
                             for the school area; must be at least 1
    is_next_to_river -- True if the property is next to the river
    high_confidence -- True for the wider range around the estimate,
                       False for the narrower one

    Returns None in every case: the result is printed, not returned. When an
    argument is below 1 a short message is printed instead of an estimate.
    """
    if number_room < 1 or student_per_classroom < 1:
        # Say why no estimate is produced instead of returning silently.
        print('invalid input : number_room and student_per_classroom must both be at least 1')
        return

    # presumably today's median price in thousands of dollars — TODO confirm source
    ZILLOW_MEDIAN_PRICE = 583.3
    scale = ZILLOW_MEDIAN_PRICE / np.median(boston_data.target)

    log_values = get_log_estimate(number_room=number_room,
                                  student_per_classroom=student_per_classroom,
                                  is_next_to_river=is_next_to_river,
                                  high_confidence=high_confidence)

    # Same conversion for the estimate and both bounds: undo the log, rescale
    # to today's price level, round to whole thousands.
    dollar_estimate, upper_bound, lowwer_bound = (
        np.round(np.exp(log_value) * 1000 * scale, -3) for log_value in log_values
    )

    print('dollar estimate : {} $\nupper bound : {} $\nlowwer bound : {} $'.format(dollar_estimate, upper_bound, lowwer_bound))

In [99]:
# number_room=0 is below the minimum the function accepts, so no estimate is produced.
get_dollar_estimate(0, 200, is_next_to_river=True, high_confidence=True)

In [101]:
# Call the estimator through the boston_house_price module instead of the
# function defined in this notebook.
# NOTE(review): presumably that module packages the functions above — confirm
# its get_dollar_estimate takes the same positional arguments.
import boston_house_price as val

# Arguments are passed positionally: 6, 12, True.
val.get_dollar_estimate(6, 12, True)

dollar estimate : 783000.0 $
upper bound : 1139000.0 $
lowwer bound : 538000.0 $
