## Import libraries and load data

In [4]:
from graphlab import SFrame
import numpy as np

In [22]:
# Load the house-sales dataset from the local 'kc_house_data.gl' directory into an SFrame.
sales = SFrame('./kc_house_data.gl/')

## Build component methods

In [223]:
def get_numpy_data(data_sframe, features, output):
    """
    Turn selected columns of a dataset into numpy arrays ready for regression.

    Side effect: a column named 'constant' holding the value 1 is written
    into ``data_sframe`` itself (added, or overwritten if already present).

    Parameters
    ----------
    data_sframe: The dataset (an SFrame in this notebook) that holds both the
    input feature columns and the target column

    features: A list of column names to use as model inputs; the caller's
    list object is not modified

    output: The name of the column holding the target values (y)

    Return
    ------
    A tuple (feature_matrix, output_array). The feature matrix has the
    'constant' column first, followed by the requested features in the given
    order; output_array holds the values of the target column.
    """
    # The all-ones column lets the first weight act as the intercept.
    data_sframe['constant'] = 1
    selected_columns = ['constant'] + features

    feature_matrix = data_sframe[selected_columns].to_numpy()
    target_values = data_sframe[output].to_numpy()
    return feature_matrix, target_values

In [224]:
# Sanity check of get_numpy_data: one input feature ('sqft_living', passed as
# a one-element list) with 'price' as the target column.
example_features, example_output = get_numpy_data(sales, ['sqft_living'], 'price')
print(example_features[0, :])  # first row, all columns: [constant, sqft_living]
print(example_output[0])  # the target value of that same row

[  1.00000000e+00   1.18000000e+03]
221900.0


In [225]:
def predict_output(feature_matrix, weights):
    """
    Predict target values as the dot product of the features and the weights.

    Parameters
    ----------
    feature_matrix : A 2D numpy array with one row per data point and one
    column per feature (a single 1D row also works and yields one value)

    weights : A 1D numpy array holding one coefficient per feature column

    Return
    ------
    The predictions: one value per row of feature_matrix
    """
    predictions = np.dot(feature_matrix, weights)
    return predictions

In [226]:
# Predict by hand for a single data point to see what the dot product does.
my_weights = np.array([1., 1.])  # example weights: [intercept, sqft_living]
my_features = example_features[0]  # the first data point (row 0)
predicted_value = np.dot(my_features, my_weights)
print(predicted_value)

1181.0


In [227]:
# Check predict_output on the whole feature matrix with the example weights.
test_predictions = predict_output(example_features, my_weights)
print(test_predictions[0])  # expected: 1181.0
print(test_predictions[1])  # expected: 2571.0

1181.0
2571.0


In [228]:
 def feature_derivative(errors, feature):
     """
     Compute the derivative of the squared-error cost for one feature's weight.

     Parameters
     ----------
     errors : A 1D numpy array of prediction errors, one per data point
     (callers in this notebook pass predictions minus actual values)

     feature : A 1D numpy array with this feature's value for every data point

     Return
     ------
     Twice the dot product of errors and feature
     """
     return 2 * np.dot(errors, feature)

In [230]:
# Check feature_derivative on a case whose answer is known in advance.
example_features, example_output = get_numpy_data(sales, ['sqft_living'], 'price')
my_weights = np.array([0., 0.])  # zero weights, so every prediction is 0
test_predictions = predict_output(example_features, my_weights)
# numpy arrays subtract element-wise; with all-zero predictions the errors
# are simply -example_output
errors = test_predictions - example_output
feature = example_features[:, 0]  # column 0 is 'constant'; ':' selects all rows
derivative = feature_derivative(errors, feature)
print(derivative)
print(-np.sum(example_output) * 2)  # expected to equal derivative

-23345850022.0
-23345850022.0


## Gradient descent

### Make use of all above helper methods

In [231]:
from math import sqrt

In [287]:
def gradient_descent_update_individual(feature_matrix, output, initial_weights, step_size, tolerance):
    """
    Fit linear-regression weights by gradient descent, one feature at a time.

    Every pass computes the prediction errors once from the current weights,
    then, for each feature, takes its derivative with ``feature_derivative``
    and moves that feature's weight by ``step_size`` times the derivative.
    The pass that brings the gradient magnitude below ``tolerance`` still
    applies its update before the weights are returned.

    Parameters
    ----------
    feature_matrix : A 2D numpy array, one row per data point and one column
    per feature

    output : A 1D array containing the target value of every data point

    initial_weights : A 1D array-like with the starting weight of every
    feature; it is copied, never modified

    step_size : The step size for gradient descent

    tolerance : The iteration stops once the magnitude of the gradient is
    below this threshold

    Return
    ------
    A 1D float numpy array containing the converged weights for all features

    Raises
    ------
    ValueError : If the weights become infinite or NaN, which means the
    descent has diverged and could never meet the tolerance
    """
    # Work on a float copy: the caller's array stays untouched, and integer
    # initial weights no longer truncate each in-place update.
    weights = np.array(initial_weights, dtype=float)

    while True:
        # Errors are computed once per pass, so every derivative below is
        # taken at the same weights even though weights change in the loop.
        errors = np.dot(feature_matrix, weights) - output
        gradient_sum_squares = 0

        for i in range(len(weights)):
            derivative = feature_derivative(errors, feature_matrix[:, i])
            weights[i] -= step_size * derivative
            gradient_sum_squares += derivative * derivative

        if sqrt(gradient_sum_squares) < tolerance:
            return weights

        # Once a weight is inf/NaN every later prediction is too, so the
        # tolerance test could never pass; fail instead of looping forever.
        if not np.all(np.isfinite(weights)):
            raise ValueError("gradient descent diverged (weights are no longer "
                             "finite); try a smaller step_size")
    

## TODO!! Implement gradient descent without explicit loop to increase computing speed

In [288]:
def gradient_descent_update_once(feature_matrix, output, initial_weights, step_size, tolerance):
    """
    Fit linear-regression weights by gradient descent, updating all weights at once.

    The gradient of the squared-error cost for every weight is obtained with a
    single matrix-vector product, so there is no explicit loop over features.
    The pass that brings the gradient magnitude below ``tolerance`` still
    applies its update before the weights are returned.

    Parameters
    ----------
    feature_matrix : A 2D array, one row per data point and one column per
    feature

    output : A 1D array containing the target value of every data point

    initial_weights : A 1D array-like with the starting weight of every
    feature; it is copied, never modified

    step_size : The step size for gradient descent

    tolerance : The iteration stops once the magnitude of the gradient is
    below this threshold

    Return
    ------
    A 1D float numpy array containing the converged weights for all features

    Raises
    ------
    ValueError : If the weights become infinite or NaN, which means the
    descent has diverged and could never meet the tolerance
    """
    features = np.asarray(feature_matrix, dtype=float)
    targets = np.asarray(output, dtype=float)
    # Float copy: keeps the caller's array intact and avoids integer truncation.
    weights = np.array(initial_weights, dtype=float)

    while True:
        errors = np.dot(features, weights) - targets
        # All partial derivatives in one product: 2 * X^T . errors
        gradient = 2 * np.dot(features.T, errors)
        weights -= step_size * gradient

        if np.sqrt(np.dot(gradient, gradient)) < tolerance:
            return weights

        # Non-finite weights can never recover; fail instead of looping forever.
        if not np.all(np.isfinite(weights)):
            raise ValueError("gradient descent diverged (weights are no longer "
                             "finite); try a smaller step_size")

## Start to test the whole build

In [289]:
# Split the rows at random: a 0.8 fraction goes to train_data and the rest to
# test_data; the fixed seed (0) keeps the split reproducible.
train_data, test_data = sales.random_split(0.8, seed=0)

In [290]:
# Simple model: predict 'price' from 'sqft_living' alone (plus the intercept).
simple_features = ['sqft_living']
my_output = 'price'
(simple_feature_matrix, output) = get_numpy_data(train_data, simple_features, my_output)
initial_weights = np.array([-47000., 1.])  # [intercept, sqft_living]
step_size = 7e-12
tolerance = 2.5e7
# Fit the simple model. The following cells read the result as
# `simple_weights`, which was otherwise never assigned.
simple_weights = gradient_descent_update_individual(simple_feature_matrix, output,
                                                    initial_weights, step_size,
                                                    tolerance)

### inspect the weights

In [292]:
# Echo the weights of the simple model.
# NOTE(review): presumably ordered [intercept, sqft_living] like the columns of
# simple_feature_matrix; no visible cell assigns simple_weights, so confirm.
simple_weights

array([-46999.88716555,    281.91211912])

### predict

In [317]:
# Feature matrix of the test set for the simple model; the target array is
# not needed here and is discarded.
# NOTE: the name is misspelled ('featuer') but is kept as-is because other
# cells refer to it.
featuer_house_test, _ = get_numpy_data(test_data, simple_features, my_output)

In [320]:
# Echo the test feature matrix: column 0 is the constant 1, column 1 is sqft_living.
featuer_house_test

array([[  1.00000000e+00,   1.43000000e+03],
       [  1.00000000e+00,   2.95000000e+03],
       [  1.00000000e+00,   1.71000000e+03],
       ..., 
       [  1.00000000e+00,   2.52000000e+03],
       [  1.00000000e+00,   2.31000000e+03],
       [  1.00000000e+00,   1.02000000e+03]])

In [319]:
# Predicted price of the first test house under the simple model.
predict_output(featuer_house_test, simple_weights)[0]

356134.44317092974

### predict using another model

In [325]:
# Second model: two input features.
# sqft_living15 is the average square footage of the nearest 15 neighbors.
model_features = ['sqft_living', 'sqft_living15']
my_output = 'price'
feature_matrix, output = get_numpy_data(train_data, model_features, my_output)
initial_weights = np.array([-100000., 1., 1.])  # [intercept, sqft_living, sqft_living15]
step_size = 4e-12
tolerance = 1e9

In [328]:
# Fit the two-feature model with the settings defined for it.
new_weights = gradient_descent_update_individual(
    feature_matrix, output, initial_weights, step_size, tolerance)

In [329]:
# Rebuild the test feature matrix with the two-feature model's columns; this
# replaces the earlier single-feature matrix stored under the same name.
featuer_house_test, _ = get_numpy_data(test_data, model_features, my_output)

In [331]:
# Predicted price of the first test house under the two-feature model.
predict_output(featuer_house_test[0], new_weights)

366651.41203655908

In [332]:
# Actual price of the first test house, for comparison with the predictions.
test_data[0]['price']

310000.0

## Compare models using RSS

In [346]:
def get_rss(weights, dataset, features, output):
    """
    Compute the residual sum of squares (RSS) of a linear model on a dataset.

    Parameters
    ----------
    weights : A 1D numpy array of model coefficients

    dataset : The dataset holding the target column; ``dataset[output]`` must
    provide ``to_numpy()``

    features : The feature matrix built for ``dataset`` (despite the name this
    is the matrix itself, not a list of column names)

    output : The name of the target column in ``dataset``

    Return
    ------
    The sum over all rows of (predicted value - actual value) squared
    """
    predictions = predict_output(features, weights)
    residuals = predictions - dataset[output].to_numpy()
    return np.sum(np.square(residuals))

In [347]:
# Test-set feature matrix for the simple (sqft_living only) model; the target
# array is discarded because get_rss reads it from test_data.
simple_feature_test, _ = get_numpy_data(test_data, simple_features, my_output)

In [348]:
# RSS of the simple model on the test data.
rss1 = get_rss(simple_weights, test_data, simple_feature_test, 'price')
rss1

275400047593155.94

In [354]:
# Echo the simple model's test feature matrix (constant column, then sqft_living).
simple_feature_test

array([[  1.00000000e+00,   1.43000000e+03],
       [  1.00000000e+00,   2.95000000e+03],
       [  1.00000000e+00,   1.71000000e+03],
       ..., 
       [  1.00000000e+00,   2.52000000e+03],
       [  1.00000000e+00,   2.31000000e+03],
       [  1.00000000e+00,   1.02000000e+03]])

In [356]:
# Test-set feature matrix for the two-feature model; the target array is
# discarded because get_rss reads it from test_data.
new_feature_test, _ = get_numpy_data(test_data, model_features, my_output)

In [357]:
# Echo the two-feature test matrix (constant, sqft_living, sqft_living15).
new_feature_test

array([[  1.00000000e+00,   1.43000000e+03,   1.78000000e+03],
       [  1.00000000e+00,   2.95000000e+03,   2.14000000e+03],
       [  1.00000000e+00,   1.71000000e+03,   1.03000000e+03],
       ..., 
       [  1.00000000e+00,   2.52000000e+03,   2.52000000e+03],
       [  1.00000000e+00,   2.31000000e+03,   1.83000000e+03],
       [  1.00000000e+00,   1.02000000e+03,   1.02000000e+03]])

In [358]:
# RSS of the two-feature model on the test data.
rss2 = get_rss(new_weights, test_data, new_feature_test, 'price')
rss2

270263446465244.06

In [359]:
# True when the two-feature model has the lower RSS on the test data.
rss2 < rss1

True