In [2]:
import graphlab
import math
import numpy as np
import pdb

In [3]:
# Load the SFrame stored in the relative directory 'kc_house_data.gl/'; every later cell derives its data from `sales`.
sales = graphlab.SFrame('kc_house_data.gl/')

[INFO] This non-commercial license of GraphLab Create is assigned to kaviarasu.govindaraju@snapchat.com and will expire on February 06, 2017. For commercial licensing options, visit https://dato.com/buy/.

[INFO] Start server at: ipc:///tmp/graphlab_server-56541 - Server binary: /Users/kaviarasu.govindaraju/anaconda/envs/dato-env/lib/python2.7/site-packages/graphlab/unity_server - Server log: /tmp/graphlab_server_1457040545.log
[INFO] GraphLab Server Version: 1.8.1


In [4]:
# Build the (feature matrix, output vector) pair that the regression code consumes.
def get_numpy_data(data_sframe, features, output):
    """Convert selected SFrame columns into numpy arrays.

    Parameters:
        data_sframe: SFrame holding the raw data. NOTE: a column named
            'constant' (filled with 1) is written into this SFrame, so the
            caller's object is modified.
        features: list of column names to use as features. The list itself is
            not modified; a new list with 'constant' in front is built.
        output: name of the target column.

    Returns:
        (features_matrix, output_array): the selected columns (with 'constant'
        first) as a numpy matrix, and the target column as a numpy array.
    """
    # The all-ones column lets the first weight act as the intercept.
    data_sframe['constant'] = 1
    column_names = ['constant'] + features

    # Copy the requested columns, in order, into a fresh SFrame.
    selected = graphlab.SFrame()
    for name in column_names:
        selected[name] = data_sframe[name]

    # SFrame/SArray.to_numpy() requires GraphLab Create >= 1.7.
    features_matrix = selected.to_numpy()
    output_array = data_sframe[output].to_numpy()
    return (features_matrix, output_array)

In [5]:
# Predictions of a linear model are the product of the feature matrix and the weight vector.
def predict_outcome(feature_matrix, weights):
    """Return np.dot(feature_matrix, weights).

    Parameters:
        feature_matrix: array-like of features, one row per observation.
        weights: array-like of regression coefficients, one per feature column.

    Returns:
        The dot product of the two arguments (one prediction per row when
        feature_matrix is 2-D and weights is 1-D).
    """
    return np.dot(feature_matrix, weights)

In [6]:
# Partial derivative of the residual sum of squares with respect to one weight.
# With errors = Hw - y, d/dw_j [(y - Hw)^2] = 2 * dot(errors, H[:, j]).
def feature_derivative(errors, feature):
    """Return twice the dot product of `errors` and `feature`.

    Parameters:
        errors: array-like of residuals (the caller in this file passes
            predictions - output).
        feature: array-like holding the feature column for the weight.

    Returns:
        2 * np.dot(errors, feature).
    """
    return 2 * np.dot(errors, feature)

In [7]:
def regression_gradient_descent(feature_matrix, output, initial_weights, step_size, tolerance,
                                max_iterations=None):
    """Fit linear-regression weights by gradient descent on the residual sum of squares.

    Each pass computes the errors once from the current weights, then updates
    every weight from those same errors. The descent stops when the gradient
    magnitude drops below `tolerance`.

    Parameters:
        feature_matrix: 2-D numpy array; column i is the feature for weights[i].
        output: 1-D array of observed target values, one per row.
        initial_weights: starting weights (any array-like). They are copied, so
            the caller's object is never modified.
        step_size: multiplier applied to each derivative in the update.
        tolerance: stop once the gradient magnitude is below this value.
        max_iterations: optional cap on the number of passes. The default
            (None) means no cap, i.e. loop until converged. When the cap is
            hit the current weights are returned without printing "Converged!".

    Returns:
        1-D float numpy array of fitted weights.
    """
    # Force a float copy: an integer dtype (e.g. initial_weights=[0, 0]) would make
    # the in-place `-=` below truncate or raise a casting error.
    weights = np.array(initial_weights, dtype=float)
    iterations = 0

    while max_iterations is None or iterations < max_iterations:
        # Errors are computed once per pass, so every weight update in the
        # loop below uses the same predictions.
        predictions = predict_outcome(feature_matrix, weights)
        errors = predictions - output

        gradient_sum_squares = 0  # accumulates the squared gradient components

        for i in range(len(weights)):
            # feature_matrix[:, i] is the feature column associated with weights[i].
            derivative = feature_derivative(errors, feature_matrix[:, i])
            gradient_sum_squares += derivative * derivative
            weights[i] -= step_size * derivative

        iterations += 1

        gradient_magnitude = math.sqrt(gradient_sum_squares)
        if gradient_magnitude < tolerance:
            # Parenthesised form prints the same text on Python 2 and Python 3.
            print("Converged!")
            break

    return weights

## Apply this gradient descent to solve house prices

In [8]:
# Randomly split `sales` into two SFrames using fraction .8; seed=0 keeps the split reproducible across runs.
train_data,test_data = sales.random_split(.8,seed=0)

In [9]:
# Train a model using sqft_living as the only feature (get_numpy_data also prepends the 'constant' column).
# Side effect: get_numpy_data writes a 'constant' column into train_data.
simple_features = ['sqft_living']
my_output= 'price'
(simple_feature_matrix, output) = get_numpy_data(train_data, simple_features, my_output)
# One starting weight per matrix column: [constant, sqft_living].
initial_weights = np.array([-47000., 1.])
# step_size scales every weight update; tolerance is the gradient-magnitude threshold that ends the descent.
step_size = 7e-12
tolerance = 2.5e7

simple_weights = regression_gradient_descent(simple_feature_matrix, output, initial_weights, step_size, tolerance)
# Bare expression so the fitted weights are shown as the cell output.
simple_weights

Converged!


array([-46999.88716555,    281.91211912])

In [10]:
# Apply the fitted single-feature model (global `simple_weights`) to every row.
def predict_prices_for_houses(data_sframe):
    """Add a 'predicted_house_price' column to `data_sframe`.

    Each row's prediction is simple_weights[0] + simple_weights[1] * sqft_living,
    where `simple_weights` is the module-level array produced by the training
    cell. The SFrame is modified in place and nothing is returned.
    """
    def price_from_row(row):
        # intercept + slope * living area
        return simple_weights[0] + simple_weights[1] * row['sqft_living']

    data_sframe['predicted_house_price'] = data_sframe.apply(price_from_row)

In [11]:
# Adds the 'predicted_house_price' column to test_data in place (computed from simple_weights).
predict_prices_for_houses(test_data)

In [12]:
def calculate_squared_errors(data_sframe):
    """Add a 'squared_error' column to `data_sframe`.

    For every row the value is (price - predicted_house_price) ** 2, so the
    'predicted_house_price' column must already exist. The frame is modified
    in place and nothing is returned.
    """
    def squared_residual(row):
        residual = row['price'] - row['predicted_house_price']
        return residual ** 2

    data_sframe['squared_error'] = data_sframe.apply(squared_residual)

In [13]:
# Adds the 'squared_error' column to test_data in place; needs 'predicted_house_price' from the earlier cell.
calculate_squared_errors(test_data)

In [25]:
# Sum of the per-row squared errors of the single-feature model on test_data (its residual sum of squares).
error1 = test_data['squared_error'].sum()

In [15]:
# Train a model using two features: sqft_living and sqft_living15 (plus the prepended 'constant' column).
# NOTE: this rebinds `output`, `initial_weights`, `step_size` and `tolerance` from the single-feature cell.
model_features = ['sqft_living', 'sqft_living15']
my_output = 'price'
(feature_matrix, output) = get_numpy_data(train_data, model_features,my_output)
# One starting weight per matrix column: [constant, sqft_living, sqft_living15].
initial_weights = np.array([-100000., 1., 1.])
# step_size scales every weight update; tolerance is the gradient-magnitude threshold that ends the descent.
step_size = 4e-12
tolerance = 1e9

multiple_weights = regression_gradient_descent(feature_matrix, output, initial_weights, step_size, tolerance)
# Bare expression so the fitted weights are shown as the cell output.
multiple_weights

Converged!


array([ -9.99999688e+04,   2.45072603e+02,   6.52795277e+01])

In [19]:
def predict_prices_with_multiple_weights(data_sframe):
    """Add a 'multiple_regression_outcome' column to `data_sframe`.

    Each row's prediction uses the module-level `multiple_weights` array:
    multiple_weights[0] + multiple_weights[1] * sqft_living
                        + multiple_weights[2] * sqft_living15.
    The SFrame is modified in place and nothing is returned.
    """
    def outcome_from_row(row):
        # Same left-to-right summation order as the model definition above.
        return (multiple_weights[0] +
                multiple_weights[1] * row['sqft_living'] +
                multiple_weights[2] * row['sqft_living15'])

    data_sframe['multiple_regression_outcome'] = data_sframe.apply(outcome_from_row)

In [20]:
# Adds the 'multiple_regression_outcome' column to test_data in place (computed from multiple_weights).
predict_prices_with_multiple_weights(test_data)

In [21]:
def calculate_multiple_regression_squared_errors(data_sframe):
    """Add an 'mr_squared_error' column to `data_sframe`.

    For every row the value is (price - multiple_regression_outcome) ** 2, so
    the 'multiple_regression_outcome' column must already exist. The frame is
    modified in place and nothing is returned.
    """
    def squared_residual(row):
        residual = row['price'] - row['multiple_regression_outcome']
        return residual ** 2

    data_sframe['mr_squared_error'] = data_sframe.apply(squared_residual)

In [23]:
# Adds the 'mr_squared_error' column to test_data in place; needs 'multiple_regression_outcome' from the earlier cell.
calculate_multiple_regression_squared_errors(test_data)

In [26]:
# Sum of the per-row squared errors of the two-feature model on test_data (its residual sum of squares).
error2 = test_data['mr_squared_error'].sum()

In [29]:
# Single-feature RSS minus two-feature RSS; a positive result means the two-feature model has the lower test error.
error1 - error2

5136601127912.0625

In [31]:
# Show the first row of test_data, including the prediction and squared-error columns added by the cells above.
test_data.head(1)

id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront
114101516,2014-05-28 00:00:00+00:00,310000.0,3.0,1.0,1430.0,19901,1.5,0

view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat
0,4,7,1430,0,1927,0,98028,47.75584254

long,sqft_living15,sqft_lot15,predicted_house_price,squared_error,multiple_regression_outco me ...
-122.22874498,1780.0,12697.0,356134.443171,2128386846.69,366651.412037

mr_squared_error
3209382485.74
