In [15]:
import pandas as pd
import numpy as np
import scipy.linalg as la
from sklearn.linear_model import Ridge

### Load the Data

In [16]:
# Read the raw house data from disk (the path is relative to the notebook's working directory).
houses = pd.read_csv("kc_house_data.csv")

# Preview the final five rows to confirm the load and to see which columns are available.
houses.tail(5)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
21608,263000018,20140521T000000,360000.0,3,2.5,1530,1131,3.0,0,0,...,8,1530,0,2009,0,98103,47.6993,-122.346,1530,1509
21609,6600060120,20150223T000000,400000.0,4,2.5,2310,5813,2.0,0,0,...,8,2310,0,2014,0,98146,47.5107,-122.362,1830,7200
21610,1523300141,20140623T000000,402101.0,2,0.75,1020,1350,2.0,0,0,...,7,1020,0,2009,0,98144,47.5944,-122.299,1020,2007
21611,291310100,20150116T000000,400000.0,3,2.5,1600,2388,2.0,0,0,...,8,1600,0,2004,0,98027,47.5345,-122.069,1410,1287
21612,1523300157,20141015T000000,325000.0,2,0.75,1020,1076,2.0,0,0,...,7,1020,0,2008,0,98144,47.5941,-122.299,1020,1357


### Clean the Data

In [30]:
# Restrict the table to the target column plus the four predictors used below.
houses = houses.loc[:, ["price", "bedrooms", "bathrooms", "sqft_living", "sqft_lot"]]

# Split into the target vector and the feature matrix.
house_prices = houses["price"]
house_features = houses.drop("price", axis=1)

### My Ordinary Least Squares
This version differs from the one in my previous homework assignment: the features are centered about their column means first, so the coefficients and the intercept are computed separately.

In [57]:
def my_linreg(A,y):
    """
    Ordinary Least Squares algorithm.

    Fits y ~ A.dot(w) + w0. The features are centered about their column
    means so that the coefficients and the intercept can be solved for
    separately.

    Inputs:
        A (array-like) - 2D array of training features.
        y (array-like) - 1D array of target values.
    Outputs:
        w (array-like) - array of coefficients.
        w0 (float) - intercept.
    Raises:
        LinAlgError - if the triangular factor of the centered features
            is exactly singular (e.g. perfectly collinear columns).
    """
    # Accept anything array-like (lists, DataFrames, ...), as documented.
    A = np.asarray(A, dtype=float)
    y = np.asarray(y, dtype=float)

    # Center about mean.
    feature_means = A.mean(axis=0)
    X = A - feature_means

    # Solve for coefficients. With X = QR the normal equations reduce to
    # R w = Q^T y. Solving that triangular system directly avoids forming
    # R^T R, which would square the condition number of the problem.
    Q,R = np.linalg.qr(X)
    w = la.solve_triangular(R, Q.T.dot(y))

    # Solve for intercept: the fitted plane passes through the means.
    w0 = y.mean() - w.dot(feature_means)

    return w, w0

### My Ridge Regression

In [32]:
def my_ridge(A, y, lmbda):
    """
    Ridge Regression (Penalized Least Squares).

    Minimizes ||X w - y||^2 + lmbda * ||w||^2, where X is A with its column
    means removed. The intercept is not penalized; it is recovered from the
    means after the coefficients are found.

    Inputs:
        A (array-like) - 2D array of training features.
        y (array_like) - 1D array of target values.
        lmbda (float) - non-negative scalar used to weight l2 regularization.
    Outputs:
        w (array-like) - array of coefficients.
        w0 (float) - intercept.
    Raises:
        ValueError - if lmbda is negative (its square root is undefined, and
            the coefficients would silently come out as NaN).
    """
    if lmbda < 0:
        raise ValueError("lmbda must be non-negative, got %r" % (lmbda,))

    # Accept anything array-like (lists, DataFrames, ...), as documented.
    A = np.asarray(A, dtype=float)
    y = np.asarray(y, dtype=float)
    n_features = A.shape[1]

    # Center about mean.
    feature_means = A.mean(axis=0)
    X = A - feature_means

    # Augment data with terms used for l2 regularization: stacking
    # sqrt(lmbda) * I under X and zeros under y turns the penalized problem
    # into an ordinary least-squares problem on the augmented system.
    X = np.vstack((X, np.sqrt(lmbda)*np.eye(n_features)))
    Y = np.hstack((y, np.zeros(n_features)))

    # Solve for coefficients. With X = QR the normal equations reduce to
    # R w = Q^T Y. Solving that triangular system directly avoids forming
    # R^T R, which would square the condition number of the problem.
    Q,R = np.linalg.qr(X)
    w = la.solve_triangular(R, Q.T.dot(Y))

    # Solve for intercept: the fitted plane passes through the means.
    w0 = y.mean() - w.dot(feature_means)

    return w, w0

### Comparison (Answers)

In [73]:
# Convert once; both hand-written solvers are called on plain arrays.
feature_matrix = house_features.values
price_vector = house_prices.values

lin_w, lin_w0 = my_linreg(feature_matrix, price_vector)
# Single-argument print(...) calls emit the same text under Python 2 and 3.
print("OLS coefficients: \t\t\t%s \n" % (lin_w,))

# Sweep lambda over 1e-5 ... 1e5. The exponent comes from a plain Python
# range and the base is a float, because raising an integer to a negative
# NumPy integer power is rejected by current NumPy releases.
for exponent in range(-5, 6):
    lmbda = 10.0 ** exponent
    ridge_w, ridge_w0 = my_ridge(feature_matrix, price_vector, lmbda)
    skridge_w = Ridge(alpha=lmbda).fit(house_features, house_prices).coef_

    # %g renders 1e-05, 0.0001, ..., 1, 10, ... without a trailing ".0".
    print("(lambda=%g):" % lmbda)
    print("\tMy Ridge coefficients\t\t%s" % (ridge_w,))
    print("\tScikit Ridge coefficients\t%s" % (skridge_w,))
    print("")

OLS coefficients: 			[ -5.94068124e+04   6.26866040e+03   3.14291721e+02  -3.77652579e-01] 

(lambda=1e-05):
	My Ridge coefficients		[ -5.94068124e+04   6.26866038e+03   3.14291721e+02  -3.77652579e-01]
	Scikit Ridge coefficients	[ -5.94068124e+04   6.26866038e+03   3.14291721e+02  -3.77652579e-01]

(lambda=0.0001):
	My Ridge coefficients		[ -5.94068119e+04   6.26866018e+03   3.14291721e+02  -3.77652578e-01]
	Scikit Ridge coefficients	[ -5.94068119e+04   6.26866018e+03   3.14291721e+02  -3.77652578e-01]

(lambda=0.001):
	My Ridge coefficients		[ -5.94068074e+04   6.26865816e+03   3.14291719e+02  -3.77652573e-01]
	Scikit Ridge coefficients	[ -5.94068074e+04   6.26865816e+03   3.14291719e+02  -3.77652573e-01]

(lambda=0.01):
	My Ridge coefficients		[ -5.94067623e+04   6.26863802e+03   3.14291705e+02  -3.77652518e-01]
	Scikit Ridge coefficients	[ -5.94067623e+04   6.26863802e+03   3.14291705e+02  -3.77652518e-01]

(lambda=0.1):
	My Ridge coefficients		[ -5.94063112e+04   6.26843658e+03   

### Comparison (Timing)

In [47]:
# Time the OLS solver on the full feature matrix and price vector; the `.values` conversions are part of the timed statement.
%timeit my_linreg(house_features.values, house_prices.values)

1000 loops, best of 3: 1 ms per loop


In [48]:
# Time the hand-written ridge solver with lambda = 0.1; the `.values` conversions are part of the timed statement.
%timeit my_ridge(house_features.values, house_prices.values, .1)

100 loops, best of 3: 1.56 ms per loop


In [72]:
# Time scikit-learn's Ridge with alpha = 0.1 on the same inputs; constructing the estimator and the `.values` conversions are part of the timed statement.
%timeit Ridge(alpha=.1).fit(house_features.values, house_prices.values)

1000 loops, best of 3: 795 µs per loop
