In [542]:
import pandas as pd
import numpy as np
import time

In [543]:
dtype_dict = {'bathrooms':float, 'waterfront':int, 'sqft_above':int, 'sqft_living15':float, 'grade':int, 'yr_renovated':int, 'price':float, 'bedrooms':float, 'zipcode':str, 'long':float, 'sqft_lot15':float, 'sqft_living':float, 'floors':str, 'condition':int, 'lat':float, 'date':str, 'sqft_basement':int, 'yr_built':int, 'id':str, 'sqft_lot':int, 'view':int}

In [544]:
train_data = pd.read_csv('kc_house_train_data.csv', dtype=dtype_dict)
test_data = pd.read_csv('kc_house_test_data.csv', dtype=dtype_dict)

In [545]:
train_data['constant'] = 1
test_data['constant'] = 1

In [546]:
# define a function that returns numpy array
def get_numpy_data(data_frame, features, output):
    features = ['constant'] + features
    features_matrix = np.array(data_frame[features])
    output_array = np.array(data_frame[output]).reshape(-1, 1)
    return (features_matrix, output_array)

In [547]:
# prediction function
def predict_outcome(feature_matrix, weights):
    return np.dot(feature_matrix, weights)

In [548]:
# gradient
def feature_derivative(errors, feature):
    return 2*float(np.dot(errors.T, feature))

In [549]:
def regression_gradient_descent(feature_matrix, output, initial_weights, step_size, tolerance):
    converged = False
    weights = initial_weights
    while not converged:
        predictions = predict_outcome(feature_matrix, weights)
        errors = output - predictions
        gradient_sum_squares = 0
        for i in range(len(weights)):
            feature = feature_matrix[:, i]
            derivative = feature_derivative(errors, feature)
            gradient_sum_squares += derivative*derivative
        
            weights[i] += step_size*derivative
        gradient_magnitude = np.sqrt(gradient_sum_squares)
        if gradient_magnitude < tolerance:
            converged = True
    return weights

In [550]:
simple_features = ['sqft_living']
my_output = 'price'
simple_feature_matrix, output = get_numpy_data(train_data, simple_features, my_output)
initial_weights = np.matrix([-47000.0,1.]).T
step_size = 7e-12
tolerance = 2.5e7


In [551]:
# Quiz 1
simple_weights = regression_gradient_descent(simple_feature_matrix, output, initial_weights, step_size, tolerance)

In [552]:
simple_weights

matrix([[-46999.88716555],
        [   281.91211918]])

In [553]:
# Quiz 2
prediction_1 = np.dot(test_data.loc[0][['constant', 'sqft_living']], simple_weights)

In [554]:
prediction_1

matrix([[356134.44325500238]], dtype=object)

In [537]:
#RSS
errors = np.dot(test_data[['constant', 'sqft_living']], simple_weights) - np.array(test_data.price).reshape(-1,1)
RSS1 = np.mean(np.square(errors))

In [515]:
# new model
model_features = ['sqft_living', 'sqft_living15']
my_output = 'price'
(feature_matrix, output) = get_numpy_data(train_data, model_features, my_output)
initial_weights = np.array([-100000., 1., 1.]).reshape(-1,1)
step_size = 4e-12
tolerance = 1e9

In [516]:
model_weights = regression_gradient_descent(feature_matrix, output, initial_weights, step_size, tolerance)

In [530]:
predictions = np.dot(test_data[['constant', 'sqft_living', 'sqft_living15']], model_weights)

In [555]:
#Quiz 3
prediction_2 = predictions[0]
prediction_2

array([ 366651.41162949])

In [532]:
test_data.loc[0].price

310000.0

In [533]:
prediction_1

matrix([[356134.44325500238]], dtype=object)

In [534]:
prediction_2

array([ 366651.41162949])

In [538]:
# RSS
RSS2 = np.mean(np.square(predictions - np.array(test_data.price).reshape(-1,1)))

In [539]:
RSS1

65121788815.826035

In [540]:
RSS2

63907175131.190247