# W2_Multiple Linear Regression (II)

In [1]:
import numpy as np
import pandas as pd
from math import sqrt

In [2]:
# get numpy array we need from dataframe
def get_numpy_array(data, features, output):
    """Build the feature matrix and target vector for a regression fit.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding the feature and output columns.
    features : list of str
        Column names to use as features. A leading column of ones named
        'constant' is placed in front of them for the intercept term.
    output : str or list of str
        Name of the target column, either bare or wrapped in a
        one-element list.

    Returns
    -------
    tuple
        ``(features_matrix, output_array)`` where ``features_matrix`` is a
        2-D array whose first column is all ones and ``output_array`` is a
        1-D array of target values.
    """
    # Add the constant column to a copy of the selected columns so the
    # caller's DataFrame is not modified as a hidden side effect.
    selected = data[list(features)].copy()
    selected.insert(0, 'constant', 1)
    features_matrix = selected.values

    output_array = data[output].values
    # A list-like `output` selects a 2-D block; keep only its first column
    # so the result is always a pure 1-D array.
    if output_array.ndim > 1:
        output_array = output_array[:, 0]
    return (features_matrix, output_array)

In [3]:
# compute the estimated output (predictions)
def get_predictions(features_matrix, weights):
    """Return the dot product of the features with the weights.

    For a 2-D ``features_matrix`` this yields one predicted value per row;
    for a single 1-D feature row it yields a single value.
    """
    return np.dot(features_matrix, weights)

In [4]:
# get derivative given a feature
def get_feature_derivative(errors, feature):
    """Return twice the dot product of ``errors`` and ``feature``.

    With ``errors = predictions - output`` this is the partial derivative
    of the residual sum of squares with respect to that feature's weight.
    """
    errors_dot_feature = np.dot(errors, feature)
    return 2 * errors_dot_feature

In [5]:
# implement gradient descent to update linear regression model's weights
def reg_gradient_descent(features_matrix, output, initial_w, step_size, tolerance,
                         max_iterations=None):
    """Fit linear-regression weights by gradient descent on the RSS.

    Parameters
    ----------
    features_matrix : 2-D array
        One row per observation, one column per feature.
    output : 1-D array
        Observed target values, one per row of ``features_matrix``.
    initial_w : array-like
        Starting weights, one per feature column. It is copied, never
        modified.
    step_size : float
        Multiplier applied to the gradient at every update.
    tolerance : float
        Iteration stops once the gradient magnitude falls below this value.
    max_iterations : int or None, optional
        Upper bound on the number of updates. ``None`` (the default) keeps
        iterating until the tolerance is met.

    Returns
    -------
    numpy.ndarray
        The fitted weights as a float array.

    Raises
    ------
    FloatingPointError
        If the gradient becomes infinite or NaN, which means the iteration
        has diverged and could never satisfy the tolerance.
    """
    # Private float copy: the caller's array is left untouched, and integer
    # starting weights are not truncated by the in-place updates below.
    weights = np.array(initial_w, dtype=float)
    features_transposed = np.transpose(features_matrix)

    iterations = 0
    while max_iterations is None or iterations < max_iterations:
        # Predictions must come from the current weights, not the initial ones.
        errors = np.dot(features_matrix, weights) - output

        # All partial derivatives of the RSS at once: 2 * X^T (Xw - y).
        gradient = 2 * np.dot(features_transposed, errors)
        weights -= step_size * gradient
        iterations += 1

        gradient_magnitude = sqrt(np.dot(gradient, gradient))
        if not np.isfinite(gradient_magnitude):
            raise FloatingPointError(
                'gradient descent diverged (non-finite gradient); '
                'try a smaller step_size'
            )
        # The magnitude is that of the gradient used for the update just made.
        if gradient_magnitude < tolerance:
            break

    return weights

In [6]:
# load the training and test splits from their CSV files
train_data = pd.read_csv(filepath_or_buffer='kc_house_train_data.csv')
test_data = pd.read_csv(filepath_or_buffer='kc_house_test_data.csv')

In [7]:
# preview the first rows of the training data
train_data.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [8]:
# preview the first rows of the test data
test_data.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,114101516,20140528T000000,310000.0,3,1.0,1430,19901,1.5,0,0,...,7,1430,0,1927,0,98028,47.7558,-122.229,1780,12697
1,9297300055,20150124T000000,650000.0,4,3.0,2950,5000,2.0,0,3,...,9,1980,970,1979,0,98126,47.5714,-122.375,2140,4000
2,1202000200,20141103T000000,233000.0,3,2.0,1710,4697,1.5,0,0,...,6,1710,0,1941,0,98002,47.3048,-122.218,1030,4705
3,8562750320,20141110T000000,580500.0,3,2.5,2320,3980,2.0,0,0,...,8,2320,0,2003,0,98027,47.5391,-122.07,2580,3980
4,7589200193,20141110T000000,535000.0,3,1.0,1090,3000,1.5,0,0,...,8,1090,0,1929,0,98117,47.6889,-122.375,1570,5080


In [9]:
# model 1: predict price from a constant term and sqft_living
# gradient-descent settings
initial_w = np.array([-47000.0, 1.0])
step_size = 7e-12
tolerance = 2.5e7

# training features and target as numpy arrays
simple_feature = ['sqft_living']
output = ['price']
(simple_feature_matrix, output_array) = get_numpy_array(
    data=train_data,
    features=simple_feature,
    output=output,
)

simple_weights = reg_gradient_descent(
    features_matrix=simple_feature_matrix,
    output=output_array,
    initial_w=initial_w,
    step_size=step_size,
    tolerance=tolerance,
)

In [10]:
# show the fitted model 1 weights, ordered as [constant, sqft_living]
simple_weights

array([-46999.88716555,    281.91211918])

In [11]:
# make a prediction with model 1
# the input row is [constant, sqft_living]; the leading 1 multiplies the constant-term weight
X = np.array([1, 1430])
get_predictions(X, simple_weights)

356134.44325500238

In [12]:
# model 2: predict price from a constant term, sqft_living and sqft_living15
# gradient-descent settings
initial_w = np.array([-100000., 1., 1.])
step_size = 4e-12
tolerance = 1e9

# training features and target as numpy arrays
features = ['sqft_living', 'sqft_living15']
output = ['price']
(features_matrix, output_array) = get_numpy_array(
    data=train_data,
    features=features,
    output=output,
)

multiple_weights = reg_gradient_descent(
    features_matrix=features_matrix,
    output=output_array,
    initial_w=initial_w,
    step_size=step_size,
    tolerance=tolerance,
)

In [13]:
# make a prediction with model 2
# pull the two model-2 feature values out of the first row of the test data
X = test_data.iloc[0][['sqft_living','sqft_living15']].values
# prepend 1 for the constant term so X lines up with [constant, sqft_living, sqft_living15]
X = np.array([1, X[0], X[1]])
get_predictions(X, multiple_weights)

366651.41162949387

In [14]:
# get RSS value
def get_RSS(X, weights, y):
    """Return the residual sum of squares of a linear model.

    Parameters
    ----------
    X : array
        Feature matrix with one row per observation, or a single 1-D
        feature row.
    weights : 1-D array
        Model weights, one per feature column of ``X``.
    y : array or scalar
        Observed target value(s) matching the predictions from ``X``.

    Returns
    -------
    numpy.float64
        Sum of the squared differences between predictions and ``y``.
    """
    residuals = np.dot(X, weights) - y
    # np.sum reduces inside numpy instead of looping over the elements in
    # Python, and also accepts the scalar residual of a single-row input.
    return np.sum(residuals ** 2)

In [15]:
# training-set RSS of model 1 (one feature) and model 2 (two features)
RSS1 = get_RSS(X=simple_feature_matrix, weights=simple_weights, y=output_array)
RSS2 = get_RSS(X=features_matrix, weights=multiple_weights, y=output_array)

In [16]:
# check whether model 1 leaves a larger training RSS than model 2
RSS1 > RSS2

True