# W4_Ridge Regression (II)

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# import data
dtype_dict = {'bathrooms':float, 'waterfront':int, 'sqft_above':int, 'sqft_living15':float, 'grade':int, 'yr_renovated':int, 'price':float, 'bedrooms':float, 'zipcode':str, 'long':float, 'sqft_lot15':float, 'sqft_living':float, 'floors':str, 'condition':int, 'lat':float, 'date':str, 'sqft_basement':int, 'yr_built':int, 'id':str, 'sqft_lot':int, 'view':int}
data = pd.read_csv('kc_house_data.csv', dtype=dtype_dict)
train_data = pd.read_csv('kc_house_train_data.csv', dtype=dtype_dict)
test_data = pd.read_csv('kc_house_test_data.csv', dtype=dtype_dict)

In [3]:
# get numpy array we need from dataframe
def get_numpy_data(dataframe, features, output):
    dataframe['constant'] = 1
    features = ['constant'] + features
    features_matrix = dataframe[features].values
    output_array = dataframe[output].values 
    return (features_matrix, output_array)

In [4]:
# compute for estimated output
def get_predictions(features_matrix, weights):
    predictions = np.dot(features_matrix, weights)
    return predictions

In [5]:
# get a certrain feature's derivative
def get_feature_derivative(errors, feature, weights, l2_penalty, is_constant):
    if is_constant == False:
        term1 = 2 * np.dot(errors, feature)
        term2 = 2 * l2_penalty * weights # regularization term
        derivative = term1 + term2
    else:
        derivative = 2 * np.dot(errors, feature) # we don't want to do regularization on intercept
    return derivative

# test get_feature_derivative function
print('# Testing get_feature_derivative function:')

example_features, example_output = get_numpy_data(data, ['sqft_living'], 'price')
my_weights = np.array([1., 10.])
example_predictions = get_predictions(example_features, my_weights)
errors = example_predictions - example_output 

# next two lines should print the same values
print(get_feature_derivative(errors, example_features[:,1], my_weights[1], 1, False))
print(np.sum(errors*example_features[:,1])*2+20.)

# next two lines should print the same values
print(get_feature_derivative(errors, example_features[:,0], my_weights[0], 1, True))
print(np.sum(errors)*2.)

# Testing get_feature_derivative function:
-5.6554166816e+13
-5.6554166816e+13
-22446749330.0
-22446749330.0


In [6]:
# implement gradient descent of Ridge regression
def gradient_descent(features_matrix, output, ini_weights, step_size, l2_penalty, iteration):
    weights = ini_weights
    num = 0
    while num <= iteration:
        
        predictions = get_predictions(features_matrix, weights)
        errors = predictions - output
        
        # loop over every weight
        for i in range(len(weights)):
            if i == 0:
                is_constant = True
            else:
                is_constant = False
            
            derivative = get_feature_derivative(errors, features_matrix[:,i], weights[i], l2_penalty, is_constant)
            weights[i] = weights[i] - step_size * derivative
        
        num += 1
        
    return weights

In [7]:
# test gradient_descent function using different penalty
# simple feature
simple_feature = ['sqft_living']
output = 'price'
train_feature_1, train_output_1 = get_numpy_data(train_data, simple_feature, output)
test_feature_1, test_output_1 = get_numpy_data(test_data, simple_feature, output)
ini_weights_1 = np.array([0., 0.])
step_size = 1e-12
iteration = 1000

In [8]:
simple_weights_penalty_0 = gradient_descent(train_feature_1, train_output_1, np.array([0.,0.]), step_size, 0, iteration)
simple_weights_penalty_0

array([ -1.63382329e-01,   2.63024369e+02])

In [9]:
simple_weights_penalty_high = gradient_descent(train_feature_1, train_output_1, np.array([0.,0.]), step_size, 1e11, iteration)
simple_weights_penalty_high

array([   9.77704731,  124.57217382])

In [10]:
print(ini_weights_1)
print(simple_weights_penalty_0)
print(simple_weights_penalty_high)

[ 0.  0.]
[ -1.63382329e-01   2.63024369e+02]
[   9.77704731  124.57217382]


In [11]:
# compute RSS
def get_RSS(X, weights, y):
    predictions = get_predictions(X, weights)
    return sum((predictions-y) ** 2)

# compare RSS (simple feature)
RSS1 = get_RSS(test_feature_1, ini_weights_1, test_output_1)
RSS2 = get_RSS(test_feature_1, simple_weights_penalty_0, test_output_1)
RSS3 = get_RSS(test_feature_1, simple_weights_penalty_high, test_output_1)
print(RSS1, RSS2, RSS3)

1.78427328614e+15 2.75723632143e+14 6.9464208928e+14


In [12]:
# test gradient_descent function using different penalty
# multiple features
features = ['sqft_living', 'sqft_living15']
output = 'price'
train_feature_2, train_output_2 = get_numpy_data(train_data, features, output)
test_feature_2, test_output_2 = get_numpy_data(test_data, features, output)

ini_weights_2 = np.array([0., 0., 0.])

multiple_weights_penalty_0 = gradient_descent(train_feature_2, train_output_2, np.array([0.,0.,0.]), step_size, 0, iteration)
multiple_weights_penalty_0

array([  -0.35780714,  243.05572642,   22.41312485])

In [13]:
multiple_weights_penalty_high = gradient_descent(train_feature_2, train_output_2, np.array([0.,0.,0.]), step_size, 1e11, iteration)
multiple_weights_penalty_high

array([  6.74968592,  91.48927275,  78.43658676])

In [14]:
RSS_1 = get_RSS(test_feature_2, ini_weights_2, test_output_2)
RSS_2 = get_RSS(test_feature_2, multiple_weights_penalty_0, test_output_2)
RSS_3 = get_RSS(test_feature_2, multiple_weights_penalty_high, test_output_2)
print(RSS1, RSS2, RSS3)

1.78427328614e+15 2.75723632143e+14 6.9464208928e+14


In [15]:
# use first row in test data set to compare which model has lower RSS
test_data.head(1)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,constant
0,114101516,20140528T000000,310000.0,3.0,1.0,1430.0,19901,1.5,0,0,...,1430,0,1927,0,98028,47.7558,-122.229,1780.0,12697.0,1


In [16]:
X = np.array([1, 1430.0, 1780.0])
prediction1 = get_predictions(X, multiple_weights_penalty_0)
prediction2 = get_predictions(X, multiple_weights_penalty_high)

In [17]:
prediction1

387464.69321329583

In [18]:
prediction2

270453.53414681269