## Fire up GraphLab Create

In [33]:
import graphlab

## Load house data

In [34]:
# Location of the saved house-sales SFrame, relative to the notebook.
HOME_DATA_PATH = 'home_data.gl/'
sales = graphlab.SFrame(HOME_DATA_PATH)

## Question 1 Selection and summary statistics: We found the zip code with the highest average house price. What is the average house price of that zip code?

In [35]:
# Keep only the two columns needed for the per-zip-code summary.
price_zipcode = sales[['price', 'zipcode']]

# One row per zip code, with the mean sale price stored in a 'mean' column.
mean_price_by_zipcode = price_zipcode.groupby(
    key_columns='zipcode',
    operations={'mean': graphlab.aggregate.MEAN('price')},
)

# Highest average first; the bare expression is what the cell displays.
mean_price_by_zipcode.sort('mean', ascending=False)

zipcode,mean
98039,2160606.6
98004,1355927.09779
98040,1194230.00355
98112,1095499.36803
98102,901258.238095
98109,879623.623853
98105,862825.231441
98006,859684.763052
98119,849448.01087
98005,810164.880952


So the zip code with the highest average house price is 98039, and its average price is 2160606.6.

## Question 2 Filtering data: What fraction of the houses have living space between 2000 sq.ft. and 4000 sq.ft.?

In [36]:
# Houses whose living area is strictly between 2000 and 4000 sq.ft.
# (a house at exactly 2000 or 4000 is excluded by the strict comparisons).
living_area = sales['sqft_living']
filtered_house = sales[(living_area > 2000) & (living_area < 4000)]

# float() keeps this a true division even where `/` on two ints would truncate.
ratio = float(filtered_house.num_rows()) / sales.num_rows()
print(ratio)

0.421551843798


## Question 3 Building a regression model with several more features: What is the difference in RMSE between the model trained with my_features and the one trained with advanced_features?

In [37]:
# Baseline feature set used for the simpler model.
my_features = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'zipcode']

# Advanced set: the six baseline features followed by twelve extra descriptors.
# `+` builds a new list, so my_features itself is left untouched.
advanced_features = my_features + [
    'condition',      # condition of house
    'grade',          # measure of quality of construction
    'waterfront',     # waterfront property
    'view',           # type of view
    'sqft_above',     # square feet above ground
    'sqft_basement',  # square feet in basement
    'yr_built',       # the year built
    'yr_renovated',   # the year renovated
    'lat',            # latitude of the parcel
    'long',           # longitude of the parcel
    'sqft_living15',  # average sq.ft. of 15 nearest neighbors
    'sqft_lot15',     # average lot size of 15 nearest neighbors
]

In [38]:
# 80% of the rows go to training, the remainder to testing; the seed is pinned at 0.
train_data, test_data = sales.random_split(0.8, seed=0)

In [39]:
# Baseline linear regression: predict 'price' from my_features using the
# training split, with no validation set passed to the trainer.
my_features_model = graphlab.linear_regression.create(
    train_data,
    target='price',
    features=my_features,
    validation_set=None,
)

PROGRESS: Linear regression:
PROGRESS: --------------------------------------------------------
PROGRESS: Number of examples          : 17384
PROGRESS: Number of features          : 6
PROGRESS: Number of unpacked features : 6
PROGRESS: Number of coefficients    : 115
PROGRESS: Starting Newton Method
PROGRESS: --------------------------------------------------------
PROGRESS: +-----------+----------+--------------+--------------------+---------------+
PROGRESS: | Iteration | Passes   | Elapsed Time | Training-max_error | Training-rmse |
PROGRESS: +-----------+----------+--------------+--------------------+---------------+
PROGRESS: | 1         | 2        | 0.027575     | 3763208.270524     | 181908.848367 |
PROGRESS: +-----------+----------+--------------+--------------------+---------------+
PROGRESS: SUCCESS: Optimal solution found.
PROGRESS:


In [40]:
# Same setup as the baseline model, but trained on the larger
# advanced_features list so the two test errors can be compared.
my_adv_features_model = graphlab.linear_regression.create(
    train_data,
    target='price',
    features=advanced_features,
    validation_set=None,
)

PROGRESS: Linear regression:
PROGRESS: --------------------------------------------------------
PROGRESS: Number of examples          : 17384
PROGRESS: Number of features          : 18
PROGRESS: Number of unpacked features : 18
PROGRESS: Number of coefficients    : 127
PROGRESS: Starting Newton Method
PROGRESS: --------------------------------------------------------
PROGRESS: +-----------+----------+--------------+--------------------+---------------+
PROGRESS: | Iteration | Passes   | Elapsed Time | Training-max_error | Training-rmse |
PROGRESS: +-----------+----------+--------------+--------------------+---------------+
PROGRESS: | 1         | 2        | 0.048532     | 3469012.450624     | 154580.940734 |
PROGRESS: | 2         | 3        | 0.089062     | 3469012.450673     | 154580.940735 |
PROGRESS: +-----------+----------+--------------+--------------------+---------------+
PROGRESS: SUCCESS: Optimal solution found.
PROGRESS:


In [41]:
# Print the test-set metrics of the baseline model first, then the advanced one.
for fitted_model in (my_features_model, my_adv_features_model):
    print(fitted_model.evaluate(test_data))

{'max_error': 3486584.50938179, 'rmse': 179542.43331269047}
{'max_error': 3556849.413849059, 'rmse': 156831.11680200775}


In [42]:
# Take the RMSE gap straight from the fitted models instead of retyping the
# printed numbers: hand-copied literals drift from the real values and go
# stale whenever the split or the models change.
rmse_my_features = my_features_model.evaluate(test_data)['rmse']
rmse_advanced_features = my_adv_features_model.evaluate(test_data)['rmse']

# Positive when the advanced-feature model has the lower test error.
rmse_my_features - rmse_advanced_features

22711.316510780918