In [1]:
import graphlab
graphlab.canvas.set_target('ipynb')

# Put our home sales data into a SFrame we can work with

In [2]:
sales = graphlab.SFrame('home_data.gl/')

[INFO] This non-commercial license of GraphLab Create is assigned to znorris@gmail.comand will expire on October 12, 2016. For commercial licensing options, visit https://dato.com/buy/.

[INFO] Start server at: ipc:///tmp/graphlab_server-31808 - Server binary: /home/znorris/anaconda/envs/dato-env/lib/python2.7/site-packages/graphlab/unity_server - Server log: /tmp/graphlab_server_1444918467.log
[INFO] GraphLab Server Version: 1.6.1


# Selection and Summary Statistics of a Specific Zipcode

In [36]:
# Grab only the subset of data for the zipcode we want
zip98039 = sales[sales['zipcode']=='98039']

In [37]:
# Let's just checkout some general stats about our subset
zip98039.show()  # Doesn't appear to work on github

In [38]:
# Now let's grab just the mean average price for this subset
print float(sum(zip98039['price']))/len(zip98039['price'])

2160606.6


# Filtering Data
https://dato.com/products/create/docs/generated/graphlab.SFrame.html

We want to select houses that have 'sqft_living' higher than 2,000 sq. ft. but no larger than 4,000 sq. ft. then detmine the fraction of filter_range/all_houses

In [6]:
# Let's grab just the data that fits our sq. ft. filter 
filtered_sales = sales[(sales['sqft_living'] >= 2000) & (sales['sqft_living'] <= 4000)]

In [7]:
# get that ratio of filter_range/all_houses
total_filtered = len(filtered_sales)
total_sales = len(sales['sqft_living'])
print("Float: {}").format(float(total_filtered) / total_sales)
# let's be rational
from fractions import Fraction
print("Rational: {}").format(Fraction(total_filtered, total_sales))

Float: 0.426641373248
Rational: 9221/21613


# Building a regression model with more features
We'll compare the two feature lists we have created and compute the root mean squared error (RMSE) on the test_data for each model.
* When doing a train-test split, use a seed=0.
* In the module we discussed residual sum of squares (RSS) as an error metric for regression, but GraphLab Create uses root mean squared error (RMSE). These are two common measures of error regression, and RMSE is simply the square root of the RSS. RMSE can be more intuitive than RSS, since its units are the same as that of the target column in the data, in our case the unit is dollars ($).
* When calling linear_regression.create() set the parameter validation_set=None otherwise the function will set aside a small random subset of data to use in verification. We need our results to be equal to the those on the quiz.

In [11]:
# Use the advance_features list given to us in the assignment to create a model
advanced_features = [
'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'zipcode',
'condition', # condition of house				
'grade', # measure of quality of construction				
'waterfront', # waterfront property				
'view', # type of view				
'sqft_above', # square feet above ground				
'sqft_basement', # square feet in basement				
'yr_built', # the year built				
'yr_renovated', # the year renovated				
'lat', 'long', # the lat-long of the parcel				
'sqft_living15', # average sq.ft. of 15 nearest neighbors 				
'sqft_lot15', # average lot size of 15 nearest neighbors 
]

# Use the my_freatures list given to us in the assignment to create a model
my_features = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'zipcode']

In [12]:
# First we need to do a train-test split of our data
train_data, test_data = sales.random_split(.8, seed=0) # 80% training, 20% testing

In [27]:
# Build our two training models
print "My Features Model:\n"
my_features_model = graphlab.linear_regression.create(train_data, 'price', features=my_features, validation_set=None)
print "\nAdvanced Features Model:\n"
advanced_features_model = graphlab.linear_regression.create(train_data, 'price', features=advanced_features, validation_set=None)

My Features Model:

PROGRESS: Linear regression:
PROGRESS: --------------------------------------------------------
PROGRESS: Number of examples          : 17384
PROGRESS: Number of features          : 6
PROGRESS: Number of unpacked features : 6
PROGRESS: Number of coefficients    : 115
PROGRESS: Starting Newton Method
PROGRESS: --------------------------------------------------------
PROGRESS: +-----------+----------+--------------+--------------------+---------------+
PROGRESS: | Iteration | Passes   | Elapsed Time | Training-max_error | Training-rmse |
PROGRESS: +-----------+----------+--------------+--------------------+---------------+
PROGRESS: | 1         | 2        | 0.020215     | 3763208.270523     | 181908.848367 |
PROGRESS: +-----------+----------+--------------+--------------------+---------------+

Advanced Features Model:

PROGRESS: Linear regression:
PROGRESS: --------------------------------------------------------
PROGRESS: Number of examples          : 17384
PROGRESS

## Evaluate the models

In [15]:
# Setup for graphs so we can see what our models look like
from matplotlib import pyplot as plt
%matplotlib inline
# http://matplotlib.org/api/pyplot_api.html?highlight=pyplot#matplotlib.pyplot.plot

In [21]:
# Evaluate model with test_data
print('my_features_model: {}').format(my_features_model.evaluate(test_data))
print('advanced_features_model: {}').format(advanced_features_model.evaluate(test_data))

my_features_model: {'max_error': 3486584.509381705, 'rmse': 179542.4333126903}
advanced_features_model: {'max_error': 3556849.413858208, 'rmse': 156831.1168021901}


In [40]:
# Plot the models. Needs more research, not as simple with multiple features.

### Determine difference in RMSE between the two models

In [39]:
my_features_model_rmse = my_features_model.evaluate(test_data)
advanced_features_model_rmse = advanced_features_model.evaluate(test_data)
print("Diff. of models in RMSE:  ${}").format(abs(my_features_model_rmse.get('rmse') - advanced_features_model_rmse.get('rmse')))

Diff. of models in RMSE:  $22711.3165105


# Win.