In [1]:
import numpy as np
import pandas as pd

##### Next, write a function that takes a data set, a list of features (e.g. `['sqft_living', 'bedrooms']`) to be used as inputs, and the name of the output (e.g. `'price'`). The function should return a `features_matrix` (a 2D array) consisting of a column of ones followed by columns containing the values of the input features, in the same order as the input list. It should also return an `output_array` holding the values of the output column (e.g. `'price'`); here it is a single-column 2D array. Using pandas and numpy, the function can be written as follows:

In [2]:
# Read the training and test splits from CSV files in the working directory.
train_data = pd.read_csv(filepath_or_buffer='kc_house_train_data.csv')
test_data = pd.read_csv(filepath_or_buffer='kc_house_test_data.csv')

In [3]:
def get_numpy_data(data_frame, features, output):
    """
    Build the regression feature matrix and output column from a DataFrame.

    data_frame: pd.DataFrame holding the feature columns and the output column
    features: a list of feature names (e.g. ['sqft_living', 'bedrooms'])
    output: the name of the output column (e.g. 'price')

    Returns a tuple (features_matrix, output_array):
      features_matrix: 2D array with one row per row of data_frame; the first
          column is all ones (the constant term), followed by the feature
          columns in the order given by `features`.
      output_array: 2D array of shape (n_rows, 1) with the output values.
    """
    # create a constant column with value one
    constant_column = np.ones((len(data_frame), 1))

    # DataFrame.as_matrix() no longer exists in current pandas; selecting the
    # columns and taking .values gives the same 2D array on old and new versions.
    feature_values = data_frame[list(features)].values

    # create the features matrix: constant column first, then the features
    features_matrix = np.hstack((constant_column, feature_values))

    # select with a one-element list so the result stays a (n_rows, 1) column
    output_array = data_frame[[output]].values

    return features_matrix, output_array

##### If the features matrix (including a column of 1s for the constant) is stored as a 2D array (or matrix) and the regression weights are stored as a 1D array, then the predicted output is just the dot product between the features matrix and the weights (with the weights on the right). Write a function `predict_outcome` which accepts a 2D array `feature_matrix` and a 1D array `weights` and returns a 1D array `predictions`, e.g. in Python:

In [4]:
def predict_outcome(feature_matrix, weights):
    """
    Return the predictions for every row of feature_matrix.

    feature_matrix: 2D array of features (one row per observation)
    weights: regression weights, one per column of feature_matrix

    The result is the dot product of feature_matrix with weights
    (weights on the right).
    """
    predictions = np.dot(feature_matrix, weights)
    return predictions

##### If we have the values of a single input feature in an array `feature` and the prediction `errors` (predictions - output), then the derivative of the regression cost function with respect to the weight of `feature` is just twice the dot product between `feature` and `errors`. Write a function that accepts an `errors` array and a `feature` array and returns the `derivative` (a single number), e.g. in Python:

In [5]:
def feature_derivative(errors, feature):
    """
    Derivative of the regression cost with respect to one feature's weight.

    errors: predictions - output
    feature: the values of a single feature column

    Returns twice the dot product between feature and errors.
    """
    feature_dot_errors = np.dot(feature, errors)
    return 2 * feature_dot_errors

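##### Now we will use our `predict_outcome` and `feature_derivative` functions to write a gradient descent function. Although we can compute the derivative for all the features simultaneously (the gradient), we will explicitly loop over the features individually for simplicity. The gradient descent function repeatedly computes the predictions and the errors, updates each weight using its derivative and the step size, and stops once the gradient magnitude falls below the tolerance: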

In [6]:
def regression_gradient_descent(feature_matrix, output, initial_weights, step_size, tolerance,
                                max_iterations=None):
    """
    Fit linear-regression weights by gradient descent.

    feature_matrix: 2D numpy array with one column per weight
    output: observed output values, either 1D or a single (n, 1) column
    initial_weights: starting weights, one per column of feature_matrix
    step_size: multiplier applied to each derivative when updating a weight
    tolerance: stop once the gradient magnitude is below this value
    max_iterations: optional cap on the number of passes; None (the default)
        keeps iterating until the tolerance is met

    Returns the weights as a float array of shape (n_weights, 1).

    Raises FloatingPointError if the gradient magnitude becomes inf or NaN,
    since the convergence test could then never succeed.
    """
    weights = np.array(initial_weights).reshape(-1, 1).astype(float)
    # Predictions come out as an (n, 1) column because weights is a column.
    # Force output to the same shape so that a 1D output does not broadcast
    # into an (n, n) error matrix.
    output = np.asarray(output).reshape(-1, 1)

    iteration = 0
    while max_iterations is None or iteration < max_iterations:
        # compute the predictions based on feature_matrix and weights,
        # then the errors as predictions - output:
        predictions = predict_outcome(feature_matrix, weights)
        errors = predictions - output

        gradient_sum_squares = 0  # initialize the gradient
        # update each weight individually:
        for i in range(len(weights)):
            # feature_matrix[:, i] is the feature column associated with weights[i]
            derivative = feature_derivative(errors, feature_matrix[:, i])
            # add the squared derivative to the gradient magnitude
            gradient_sum_squares += derivative ** 2
            # update the weight based on step size and derivative:
            weights[i] = weights[i] - step_size * derivative

        gradient_magnitude = np.sqrt(gradient_sum_squares)
        if not np.all(np.isfinite(gradient_magnitude)):
            # NaN/inf never compares below tolerance, so the loop would spin forever.
            raise FloatingPointError(
                "gradient descent diverged (non-finite gradient); try a smaller step_size"
            )
        if gradient_magnitude < tolerance:
            break
        iteration += 1
    return weights

In [7]:
# Gradient-descent settings for the single-feature model: starting weights
# (constant term first, then the feature), step size, and the gradient
# magnitude below which the descent stops.
initial_weights = np.array([-47000.0, 1.0])
step_size = 7e-12
tolerance = 2.5e7

# Training matrices for predicting price from sqft_living alone.
simple_features = ['sqft_living']
my_output = 'price'
simple_feature_matrix, output = get_numpy_data(train_data, simple_features, my_output)

In [8]:
# Fit the single-feature model on the training data.
simple_weights = regression_gradient_descent(
    feature_matrix=simple_feature_matrix,
    output=output,
    initial_weights=initial_weights,
    step_size=step_size,
    tolerance=tolerance,
)

In [9]:
simple_weights

array([[-46999.88716555],
       [   281.91211918]])

##### Now build a corresponding ‘test_simple_feature_matrix’ and ‘test_output’ using test_data. Using ‘test_simple_feature_matrix’ and ‘simple_weights’ compute the predicted house prices on all the test data.

In [10]:
# Build the test-set feature matrix and output for the single-feature model.
(test_simple_feature_matrix, test_output) = get_numpy_data(
    data_frame=test_data,
    features=simple_features,
    output=my_output,
)

In [11]:
# Predicted prices for every house in the test set (single-feature model).
predicted_house_prices = predict_outcome(
    feature_matrix=test_simple_feature_matrix,
    weights=simple_weights,
)

In [23]:
predicted_house_prices[0]

array([356134.443255])

In [12]:
# Residual sum of squares of the single-feature model on the test data.
RSS_model_1 = ((predicted_house_prices - test_output) ** 2).sum()

In [13]:
# Now compute RSS on all test data for this model. Record the value and store it for later
RSS_model_1

275400044902128.3

##### Now we will use the gradient descent to fit a model with more than 1 predictor variable (and an intercept). Use the following parameters:

In [14]:
# Gradient-descent settings for the two-feature model: starting weights
# (constant term first, then one per feature), step size, and the gradient
# magnitude below which the descent stops.
initial_weights = np.array([-100000.0, 1.0, 1.0])
step_size = 4e-12
tolerance = 1e9

# Training matrices for predicting price from sqft_living and sqft_living15.
model_features = ['sqft_living', 'sqft_living15']
my_output = 'price'
feature_matrix, output = get_numpy_data(train_data, model_features, my_output)

In [15]:
# Fit the two-feature model on the training data.
new_weights = regression_gradient_descent(
    feature_matrix=feature_matrix,
    output=output,
    initial_weights=initial_weights,
    step_size=step_size,
    tolerance=tolerance,
)

In [16]:
new_weights

array([[-9.99999688e+04],
       [ 2.45072603e+02],
       [ 6.52795267e+01]])

In [17]:
# Build the test-set matrices for the two-feature model, then predict a
# price for every test house with the fitted weights.
(new_test_simple_feature_matrix, new_test_output) = get_numpy_data(
    data_frame=test_data,
    features=model_features,
    output=my_output,
)
new_predicted_house_prices = predict_outcome(
    feature_matrix=new_test_simple_feature_matrix,
    weights=new_weights,
)

In [18]:
new_predicted_house_prices[0]

array([366651.41162949])

In [19]:
# Actual price of the first house in the test data. Positional .iloc avoids
# building a throwaway head() frame and does not depend on the index label 0.
test_data['price'].iloc[0]

310000.0

In [20]:
# Residual sum of squares of the two-feature model on the test data.
RSS_model_2 = ((new_predicted_house_prices - new_test_output) ** 2).sum()

In [22]:
RSS_model_2

270263443629803.56