In [1]:
import numpy as np
import pandas as pd
import scipy.linalg as la
from sklearn.linear_model import LinearRegression

### Load the Data

In [2]:
houses = pd.read_csv("kc_house_data.csv")
houses.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


### Clean the Data

In [3]:
houses = houses[["price", "bedrooms", "bathrooms", "sqft_living", "sqft_lot", "floors"]]
houses["intercept"] = 1

house_features = houses.drop("price", axis=1)
house_prices = houses["price"]

### My Ordinary Least Squares

In [4]:
def my_linreg(A,y):
    Q,R = np.linalg.qr(A)
    return la.solve(R.T.dot(R), A.T.dot(y))

### Scikit-learn Ordinary Least Squares

In [5]:
reg = LinearRegression().fit(house_features, house_prices)

### Evaluation of Results
#### Error

In [6]:
results = my_linreg(house_features, house_prices)
coeffs = results[:5]
intercept = results[5]

inf_norm_coeffs = np.max(np.abs(coeffs - reg.coef_[:5]))
inf_norm_intercept = np.abs(intercept - reg.intercept_)

print "Coefficient max error:", inf_norm_coeffs
print "Intercept error:", inf_norm_intercept

Coefficient max error: 3.74097908207e-09
Intercept error: 5.08298398927e-08


#### Timing

In [7]:
%timeit LinearRegression().fit(house_features, house_prices)

100 loops, best of 3: 3.84 ms per loop


In [8]:
%timeit my_linreg(house_features, house_prices)

100 loops, best of 3: 3.09 ms per loop
