In [4]:
import numpy as np
import pandas as pd
import warnings

In [5]:
# Column -> dtype map handed to read_csv so all three CSV files are parsed the same way.
dtype_dict = {
    'bathrooms': float,
    'waterfront': int,
    'sqft_above': int,
    'sqft_living15': float,
    'grade': int,
    'yr_renovated': int,
    'price': float,
    'bedrooms': float,
    'zipcode': str,
    'long': float,
    'sqft_lot15': float,
    'sqft_living': float,
    'floors': str,
    'condition': int,
    'lat': float,
    'date': str,
    'sqft_basement': int,
    'yr_built': int,
    'id': str,
    'sqft_lot': int,
    'view': int,
}

# The full data set, followed by the separate train and test CSV files.
sales = pd.read_csv('../data/kc_house_data.csv', dtype=dtype_dict)
train_data = pd.read_csv('../data/kc_house_train_data.csv', dtype=dtype_dict)
test_data = pd.read_csv('../data/kc_house_test_data.csv', dtype=dtype_dict)

In [6]:
def get_numpy_data(df, features, output):
    """Extract a feature matrix (with a leading constant column) and an output array.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the feature columns and the output column. It is left
        unmodified: the constant column is added to a copy, not to `df`.
    features : list of str
        Names of the columns to use as features, in the order they should
        appear in the matrix.
    output : str
        Name of the column to use as the regression target.

    Returns
    -------
    (feature_matrix, output_array) : tuple of numpy.ndarray
        `feature_matrix` has one row per row of `df`; column 0 is the constant
        1 and the remaining columns follow `features`. `output_array` holds
        the values of `df[output]`.
    """
    # Select the requested columns into a fresh frame so that adding the
    # constant column does not leak back into the caller's DataFrame.
    features_df = df[list(features)].copy()
    # Put the constant (intercept) column in front of the other features.
    features_df.insert(0, 'constant', 1)
    # .values is used instead of as_matrix(), which recent pandas releases
    # no longer provide.
    feature_matrix = features_df.values
    output_array = df[output].values
    return (feature_matrix, output_array)

In [7]:
# Build the matrices for a single feature; the [] around 'sqft_living' makes it a list.
example_features, example_output = get_numpy_data(sales, ['sqft_living'], 'price')
# .loc replaces the .ix indexer, which recent pandas releases no longer provide.
print(train_data[['sqft_living', 'price']].loc[0, :])
print(example_features[0, :])  # first row of the matrix; ':' means 'all columns'
print(example_output[0])  # and the corresponding output

sqft_living      1180.0
price          221900.0
Name: 0, dtype: float64
[  1.00000000e+00   1.18000000e+03]
221900.0


In [8]:
my_weights = np.array([1., 1.])  # the example weights
my_features = example_features[0, :]  # we'll use the first data point
# A single prediction is the dot product of one feature row with the weights.
predicted_value = np.dot(my_features, my_weights)
print(predicted_value)  # parenthesised form runs on both Python 2 and Python 3

1181.0


In [9]:
def predict_output(feature_matrix, weights):
    """Return the predictions np.dot(feature_matrix, weights).

    `feature_matrix` is expected to hold the features as columns and
    `weights` to be the matching numpy array of coefficients.
    """
    return np.dot(feature_matrix, weights)

In [10]:
# Predict every row at once with the example weights.
test_predictions = predict_output(example_features, my_weights)
print(test_predictions[0])  # should be 1181.0
print(test_predictions[1])  # should be 2571.0

1181.0
2571.0


In [11]:
def calculate_RSS(y_pred, y_true):
    """Return the residual sum of squares: the sum of (y_true - y_pred) squared."""
    diff = y_true - y_pred
    squared = diff * diff
    return squared.sum()

In [12]:
def feature_derivative(errors, feature):
    """Return twice the dot product of `errors` and `feature`.

    Both arguments are expected to be numpy arrays of the same length
    (one entry per data point).
    """
    return 2 * np.dot(errors, feature)

In [13]:
example_features, example_output = get_numpy_data(sales, ['sqft_living'], 'price')
my_weights = np.array([0., 0.])  # this makes all the predictions 0
test_predictions = predict_output(example_features, my_weights)
# two numpy arrays can be subtracted element-wise with '-':
errors = test_predictions - example_output  # with all-zero predictions this is just -example_output
feature = example_features[:, 0]  # derivative with respect to 'constant'; ':' means 'all rows'
derivative = feature_derivative(errors, feature)
print(derivative)
print(-np.sum(example_output) * 2)  # should be the same as derivative

-23345850016.0
-23345850016.0


In [14]:
from math import sqrt # recall that the magnitude/length of a vector [g[0], g[1], g[2]] is sqrt(g[0]^2 + g[1]^2 + g[2]^2)

In [15]:
def regression_gradient_descent(feature_matrix, output, initial_weights, step_size, tolerance, max_iterations=None):
    """Fit linear-regression weights by gradient descent on the residual sum of squares.

    Parameters
    ----------
    feature_matrix : 2-D numpy array
        One column per weight; column i is paired with weights[i].
    output : numpy array
        Observed target values, subtracted element-wise from the predictions.
    initial_weights : array-like
        Starting weights. They are copied, so the caller's object is not changed.
    step_size : float
        Multiplier applied to each derivative when updating its weight.
    tolerance : float
        The loop stops once the gradient magnitude drops below this value.
    max_iterations : int or None, optional
        Upper bound on the number of weight updates. The default, None, keeps
        iterating until the tolerance is met.

    Returns
    -------
    numpy.ndarray
        The weights after the last update.

    Raises
    ------
    ValueError
        If the gradient magnitude becomes inf or NaN, which means the descent
        has diverged and the tolerance test could never succeed.
    """
    # Copy as float: with an integer dtype the in-place updates below would be
    # truncated to whole numbers.
    weights = np.array(initial_weights, dtype=float)
    iterations = 0
    while max_iterations is None or iterations < max_iterations:
        # predictions for the current weights, and their errors (predictions - output)
        predictions = predict_output(feature_matrix, weights)
        errors = predictions - output

        gradient_sum_squares = 0  # accumulates the squared partial derivatives
        for i in range(len(weights)):  # loop over each weight
            # feature_matrix[:, i] is the feature column associated with weights[i]
            derivative_Wi = feature_derivative(errors, feature_matrix[:, i])
            # add the squared derivative to the running total (for assessing convergence)
            gradient_sum_squares += derivative_Wi * derivative_Wi
            # errors was computed before the loop, so every weight in this
            # pass is updated from the same set of predictions
            weights[i] = weights[i] - step_size * derivative_Wi
        # the square root of the sum of squares is the gradient magnitude
        gradient_magnitude = sqrt(gradient_sum_squares)
        if not np.isfinite(gradient_magnitude):
            # 'nan < tolerance' is always False, so without this check a
            # diverging run would never terminate.
            raise ValueError(
                "gradient descent diverged (gradient magnitude is %r); "
                "try a smaller step_size" % (gradient_magnitude,)
            )
        if gradient_magnitude < tolerance:
            break
        iterations += 1
    return weights

In [16]:
# Model 1: predict 'price' from 'sqft_living' alone, using the train_data frame.
simple_features = ['sqft_living']
my_output = 'price'
simple_feature_matrix, output = get_numpy_data(train_data, simple_features, my_output)

# Gradient-descent settings for this model.
initial_weights = np.array([-47000., 1.])
step_size = 7e-12
tolerance = 2.5e7

In [17]:
# Fit model 1 with the settings defined above.
simple_weights = regression_gradient_descent(simple_feature_matrix, output, initial_weights, step_size, tolerance)
# simple_weights[0] is the constant term, simple_weights[1] the sqft_living weight.
print("sqft_living weight: %.1f" % simple_weights[1])

sqft_living weight: 281.9


In [18]:
# Same feature/output extraction as for training, applied to the test data.
test_simple_feature_matrix, test_output = get_numpy_data(
    test_data, simple_features, my_output
)

In [19]:
# Model 1 predictions for every row of the test data.
test_predictions = predict_output(test_simple_feature_matrix, simple_weights)
print("Predicted Price for 1st house in Test data : %.f" % test_predictions[0])

Predicted Price for 1st house in Test data : 356134


In [20]:
# Now that you have the predictions on test data, compute the RSS on the test data set.
# Recall that RSS is the sum of the squared errors (difference between prediction and output).
RSS_model1 = calculate_RSS(test_predictions, test_output)
print("RSS model1 : %.2f" % RSS_model1)

RSS model1 : 275400044902128.31


In [21]:
# Model 2: two features. sqft_living15 is the average squarefeet for the nearest 15 neighbors.
model_features = ['sqft_living', 'sqft_living15']
my_output = 'price'
feature_matrix, output = get_numpy_data(train_data, model_features, my_output)

# Gradient-descent settings for this model (one initial weight per column, constant first).
initial_weights = np.array([-100000., 1., 1.])
step_size = 4e-12
tolerance = 1e9

In [22]:
# Fit model 2 with the settings defined above.
model2_weights = regression_gradient_descent(
    feature_matrix,
    output,
    initial_weights,
    step_size,
    tolerance,
)

In [23]:
# Extract the model 2 features and the output from the test data.
test_feature_matrix, test_output = get_numpy_data(
    test_data, model_features, my_output
)

In [24]:
# Model 2 predictions for every row of the test data.
test_predictions = predict_output(test_feature_matrix, model2_weights)
print("Predicted Price for 1st house in Test data : %.f" % test_predictions[0])

Predicted Price for 1st house in Test data : 366651


In [25]:
# Now that you have the predictions on test data, compute the RSS on the test data set.
# Recall that RSS is the sum of the squared errors (difference between prediction and output).
RSS_model2 = calculate_RSS(test_predictions, test_output)
print("RSS model2 : %.2f" % RSS_model2)

RSS model2 : 270263443629803.56


In [26]:
# Recompute each model's prediction for the first test house instead of
# pasting numbers from earlier output, so the deltas follow the fitted models.
# Rounding to a whole number mirrors the '%.f' formatting used when the
# predictions were printed.
model1_first_prediction = np.round(predict_output(test_simple_feature_matrix, simple_weights)[0])
model2_first_prediction = np.round(predict_output(test_feature_matrix, model2_weights)[0])
# One formatted string per line: a parenthesised two-item print would show a
# tuple on Python 2. The two spaces after ':' keep the layout of the original
# comma-separated print statement.
print("Delta with model1 :  %s" % (test_output[0] - model1_first_prediction))
print("Delta with model2 :  %s" % (test_output[0] - model2_first_prediction))

Delta with model1 :  -46134.0
Delta with model2 :  -56651.0


In [27]:
# True when model 2 has the lower RSS on the test data.
RSS_model2 < RSS_model1

True