In [56]:
# A __future__ import has to be the first statement of a module; keeping it
# at the top lets this cell run unchanged when the notebook is exported to a
# plain .py script. It enables true division for the ratios computed later.
from __future__ import division

import math

import graphlab as gl
import graphlab.aggregate as agg

# Load the SFrame stored in the local 'home_data.gl/' directory; every later
# cell reads from `sales`.
sales = gl.SFrame('home_data.gl/')

### Determine the average price in the most expensive area (zip code)

In [32]:
# Mean sale price per zip code. The output column is named explicitly so the
# lookup below does not depend on an auto-generated column name.
by_avg_price = sales.groupby('zipcode', {'Avg of price': agg.AVG('price')})

# Take the maximum of the column directly instead of sorting the whole
# grouped table just to read its first row.
max_avg_price = by_avg_price['Avg of price'].max()

print("The most expensive area's avg price: {0:.2f}$".format(max_avg_price))

The most expensive area's avg price: 2160606.60$


### Playing with filtering

In [41]:
# Keep the rows whose 'sqft_living' is strictly between 2000 and 4000
# (both bounds are exclusive).
living_area = sales['sqft_living']
filtered = sales[(living_area > 2000) & (living_area < 4000)]

# Convert to float before dividing so the ratio is correct even when
# `from __future__ import division` has not been executed in this kernel
# (plain Python 2 integer division would truncate the result to 0).
fraction_filtered = float(filtered.shape[0]) / sales.shape[0]

print("Filtered fraction: {0:.2f}".format(fraction_filtered))

Filtered fraction: 0.42


### Comparing models

In [44]:
# Split `sales` into a training part and a test part. The fixed seed keeps
# the split identical between runs.
TRAIN_FRACTION = .8
SPLIT_SEED = 0
train_data, test_data = sales.random_split(TRAIN_FRACTION, seed=SPLIT_SEED)

In [60]:
def get_rmse(features, train=None, test=None):
    """Fit a linear regression on ``features`` and return its RMSE.

    Args:
        features: list of column names used as model inputs.
        train: data to fit the model on. Defaults to the notebook-level
            ``train_data`` when omitted.
        test: data to evaluate the model on. Defaults to the notebook-level
            ``test_data`` when omitted.

    Returns:
        The ``'rmse'`` entry of ``model.evaluate(test)`` for a model that
        predicts the ``'price'`` column.
    """
    # Fall back to the notebook globals so existing `get_rmse(features)`
    # calls keep working exactly as before.
    if train is None:
        train = train_data
    if test is None:
        test = test_data

    # validation_set=None is passed through unchanged from the original call.
    model = gl.linear_regression.create(
        train,
        target='price',
        features=features,
        validation_set=None,
    )
    return model.evaluate(test)['rmse']

# First model: six basic columns.
feature_set_1 = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'zipcode']
rmse_1 = get_rmse(feature_set_1)

# Second model: the same six columns followed by the additional ones below,
# in the same order as before.
extra_features = [
    'condition',      # condition of house
    'grade',          # measure of quality of construction
    'waterfront',     # waterfront property
    'view',           # type of view
    'sqft_above',     # square feet above ground
    'sqft_basement',  # square feet in basement
    'yr_built',       # the year built
    'yr_renovated',   # the year renovated
    'lat', 'long',    # the lat-long of the parcel
    'sqft_living15',  # average sq.ft. of 15 nearest neighbors
    'sqft_lot15',     # average lot size of 15 nearest neighbors
]
feature_set_2 = feature_set_1 + extra_features
rmse_2 = get_rmse(feature_set_2)

# Absolute gap between the two test errors.
rmse_gap = math.fabs(rmse_2 - rmse_1)
print("RMSE1:{0:.2f} RMSE2: {1:.2f} Diff: {2:.2f}".format(rmse_1, rmse_2, rmse_gap))

RMSE1:179542.43 RMSE2: 156831.12 Diff: 22711.32
