In [231]:
import pandas as pd
import numpy as np
import sklearn

In [232]:
import math

In [233]:
# Load the training split from a CSV file located in the working directory.
train_data = pd.read_csv('kc_house_train_data.csv')

In [234]:
# Number of rows in the training set.
len(train_data)

17384

In [235]:
# Load the held-out test split from a CSV file located in the working directory.
test_data = pd.read_csv('kc_house_test_data.csv')

In [236]:
# Scratch check: to_numpy() on a one-column DataFrame returns a 2-D array of shape (3, 1).
pd.DataFrame([1,2,3]).to_numpy()

array([[1],
       [2],
       [3]])

In [237]:
def get_numpy_data(data_frame, features, output):
    """Build a design matrix and target vector from a DataFrame.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Source table.
    features : list of str
        Column labels to use as model inputs, in the order they should
        appear in the matrix.
    output : str
        Column label of the target variable.

    Returns
    -------
    tuple
        ``(features_array, output_array)`` where ``features_array`` is a 2-D
        array whose first column is all ones (the intercept term) followed by
        the selected feature columns, and ``output_array`` holds the values
        of the ``output`` column.
    """
    chosen_columns = data_frame[features]
    targets = data_frame[output].to_numpy()

    feature_values = chosen_columns.to_numpy()
    n_rows = feature_values.shape[0]

    # Constant column so the first weight acts as the intercept.
    intercept_column = np.ones((n_rows, 1), dtype=int)
    design_matrix = np.concatenate((intercept_column, feature_values), axis=1)

    return design_matrix, targets

In [238]:
def predict_outcome(feature_matrix, weights):
    """Return the model predictions: the dot product of ``feature_matrix`` and ``weights``."""
    predictions = np.dot(feature_matrix, weights)
    return predictions

In [239]:
def feature_derivative(errors, feature):
    """Return twice the dot product of ``errors`` and ``feature``.

    With ``errors = predictions - output`` this is the partial derivative of
    the residual sum of squares with respect to the weight of ``feature``.
    """
    inner_product = np.dot(errors, feature)
    return 2 * inner_product

In [324]:
def regression_gradient_descent(feature_matrix, output, initial_weights,
                                step_size, tolerance, max_iterations=None,
                                verbose=True):
    """Fit linear-regression weights by batch gradient descent on the RSS.

    Each iteration computes the gradient of the residual sum of squares,
    ``2 * X^T (X w - y)``, moves the weights by ``step_size`` against it, and
    stops once the magnitude of that (pre-update) gradient drops below
    ``tolerance``.

    Parameters
    ----------
    feature_matrix : array-like, shape (n_samples, n_features)
        Design matrix (including any constant column).
    output : array-like, shape (n_samples,)
        Observed target values.
    initial_weights : array-like, shape (n_features,)
        Starting weights. Never modified; a float copy is made.
    step_size : float
        Learning rate applied to the gradient.
    tolerance : float
        Convergence threshold on the gradient's Euclidean norm.
    max_iterations : int or None, optional
        Upper bound on the number of updates. ``None`` (default) keeps the
        original behaviour of iterating until convergence.
    verbose : bool, optional
        When True (default) print the problem size and the gradient
        magnitude at every step, as the original implementation did.

    Returns
    -------
    numpy.ndarray
        The weight vector after the last update.

    Raises
    ------
    FloatingPointError
        If the gradient magnitude becomes inf/NaN (the iteration diverged,
        typically because ``step_size`` is too large). Without this check the
        loop would never terminate, since ``nan < tolerance`` is always False.
    """
    feature_matrix = np.asarray(feature_matrix)
    output = np.asarray(output)
    # Float copy: integer initial weights would otherwise make the in-place
    # update below fail with a casting error, and the caller's array must
    # not be mutated.
    weights = np.array(initial_weights, dtype=float)

    total_sample = len(output)
    total_feature = len(weights)
    if verbose:
        print(total_sample, total_feature)

    step = 0
    converged = False
    while not converged:
        if max_iterations is not None and step >= max_iterations:
            break
        step += 1

        # errors = predictions - output
        predictions = predict_outcome(feature_matrix, weights)
        errors = predictions - output

        # Gradient of the RSS with respect to every weight at once.
        # (Named `gradient` so it does not shadow the module-level
        # feature_derivative() helper.)
        gradient = 2 * np.dot(feature_matrix.transpose(), errors)
        gradient_magnitude = math.sqrt(np.sum(gradient * gradient))
        if not math.isfinite(gradient_magnitude):
            raise FloatingPointError(
                "gradient descent diverged at step %s "
                "(gradient magnitude = %s); try a smaller step_size"
                % (step, gradient_magnitude)
            )

        weights -= step_size * gradient
        if verbose:
            print("Step %s, gradient_magnitude = %s" % (step, gradient_magnitude))

        converged = gradient_magnitude < tolerance

    return weights

In [325]:
# Model 1: predict price from living-area square footage alone.
my_output = 'price'
simple_features = ['sqft_living']
simple_feature_matrix, output = get_numpy_data(train_data, simple_features, my_output)

# Gradient-descent settings for model 1: stopping threshold, learning rate, starting weights.
tolerance = 2.5e7
step_size = 7e-12
initial_weights = np.array([-47000., 1.])

In [326]:
# Fit model 1 on the training data.
simple_weights = regression_gradient_descent(
    feature_matrix=simple_feature_matrix,
    output=output,
    initial_weights=initial_weights,
    step_size=step_size,
    tolerance=tolerance,
)

17384 2
Step 1, gradient_magnitude = 50551530784973.43
Step 2, gradient_magnitude = 13127451026296.436
Step 3, gradient_magnitude = 3408996083241.0596
Step 4, gradient_magnitude = 885263580285.0411
Step 5, gradient_magnitude = 229889265767.8974
Step 6, gradient_magnitude = 59698688272.16019
Step 7, gradient_magnitude = 15502826425.342028
Step 8, gradient_magnitude = 4025844402.3433805
Step 9, gradient_magnitude = 1045449748.3826516
Step 10, gradient_magnitude = 271487891.9533199
Step 11, gradient_magnitude = 70504114.8442527
Step 12, gradient_magnitude = 18320017.267022587


In [327]:
# Learned model-1 weights, ordered [intercept, sqft_living].
simple_weights
# Value recorded from this run: array([-46999.88716555,    281.91211918])

array([-46999.88716555,    281.91211918])

In [328]:
# Build the model-1 design matrix and target vector for the test set.
test_output_col = 'price'
test_simple_features = ['sqft_living']
test_simple_feature_matrix, test_output = get_numpy_data(
    test_data, test_simple_features, test_output_col
)

In [329]:
# Predict test-set prices using the model-1 (single-feature) weights.
test_predicted_price = predict_outcome(test_simple_feature_matrix, simple_weights)

### Predicted price for the first house in the test set (model 1)

In [330]:
# Model-1 prediction for the first row of the test set.
test_predicted_price[0]

356134.4432550024

In [331]:
# Actual price of the first test row, for comparison with the model-1 prediction.
test_data['price'][0]

310000.0

In [332]:
# Model-1 residuals on the test set: actual price minus predicted price.
test_errors = test_data['price'] - test_predicted_price

In [333]:
# Residual sum of squares (RSS) of model 1 on the test set.
test_rss_simple = np.sum(test_errors * test_errors)
test_rss_simple

275400044902128.3

### More than one feature

In [334]:
model_features = ['sqft_living', 'sqft_living15']
output = 'price'
initial_weights = [-100000., 1., 1.]
step_size = 4e-12
tolerance = 1e9

In [335]:
(model_features_matrix, output_array) = get_numpy_data(
    train_data,
    model_features,
    output
)

In [336]:
# Fit model 2 on the training data.
model_weights = regression_gradient_descent(
    feature_matrix=model_features_matrix,
    output=output_array,
    initial_weights=initial_weights,
    step_size=step_size,
    tolerance=tolerance,
)

17384 3
Step 1, gradient_magnitude = 73072020556001.0
Step 2, gradient_magnitude = 22673220967534.74
Step 3, gradient_magnitude = 7060794583490.448
Step 4, gradient_magnitude = 2275682396587.6187
Step 5, gradient_magnitude = 928984110636.1444
Step 6, gradient_magnitude = 656307431862.7949
Step 7, gradient_magnitude = 610615358663.3833
Step 8, gradient_magnitude = 593078772020.3285
Step 9, gradient_magnitude = 578705926685.8691
Step 10, gradient_magnitude = 564945682565.4801
Step 11, gradient_magnitude = 551538687674.9812
Step 12, gradient_magnitude = 538452428981.17944
Step 13, gradient_magnitude = 525676918665.55664
Step 14, gradient_magnitude = 513204549505.9787
Step 15, gradient_magnitude = 501028105997.0814
Step 16, gradient_magnitude = 489140564644.1252
Step 17, gradient_magnitude = 477535070644.7635
Step 18, gradient_magnitude = 466204932037.81085
Step 19, gradient_magnitude = 455143615657.44507
Step 20, gradient_magnitude = 444344743347.63806
Step 21, gradient_magnitude = 433802

In [337]:
# Learned model-2 weights, ordered [intercept, sqft_living, sqft_living15].
model_weights

array([-9.99999688e+04,  2.45072603e+02,  6.52795267e+01])

### Predicted price for the first house in the test set (model 2)

In [338]:
# Build the model-2 design matrix and target vector for the test set.
test_features = ['sqft_living', 'sqft_living15']
test_feature_matrix, test_output = get_numpy_data(
    data_frame=test_data,
    features=test_features,
    output='price',
)

In [339]:
# Predict test-set prices using the model-2 (two-feature) weights.
predicted_test_houses = predict_outcome(test_feature_matrix, model_weights)

In [340]:
# Model-2 prediction for the first row of the test set.
predicted_test_houses[0]

366651.41162949393

In [341]:
# Actual price of the first test row, for comparison with the model-2 prediction.
test_data['price'][0]

310000.0

In [342]:
# Model-2 residuals on the test set: actual price minus predicted price.
model_2_errors = test_data['price'] - predicted_test_houses

In [343]:
# Residual sum of squares (RSS) of model 2 on the test set.
model_2_rss = np.sum(model_2_errors * model_2_errors)
model_2_rss

270263443629803.56

In [344]:
# Compare test-set RSS of the two models; True would mean model 1 has the lower RSS.
# Uses test_rss_simple (the model-1 RSS computed above) rather than an
# undefined `test_rss` left over from earlier kernel state.
test_rss_simple < model_2_rss

False