In [1]:
import numpy as np
import pandas as pd

In [2]:
train=pd.read_csv('kc_house_train_data.csv')
train['sqft_living']=train['sqft_living'].astype(float)
test=pd.read_csv('kc_house_test_data.csv')
test['sqft_living']=test['sqft_living'].astype(float)
train['sqft_living15']=train['sqft_living15'].astype(float)
test['sqft_living15']=test['sqft_living15'].astype(float)

In [3]:
test.columns

Index([u'id', u'date', u'price', u'bedrooms', u'bathrooms', u'sqft_living',
       u'sqft_lot', u'floors', u'waterfront', u'view', u'condition', u'grade',
       u'sqft_above', u'sqft_basement', u'yr_built', u'yr_renovated',
       u'zipcode', u'lat', u'long', u'sqft_living15', u'sqft_lot15'],
      dtype='object')

In [4]:
def get_numpy_data(data, features, output):      # get features matrix from data frame
    data['constant'] = 1.0
    features = ['constant'] + features
    features_matrix = data[features].values
    output_array = data[output].values
    return(features_matrix, output_array)

In [5]:
def predict_outcome(feature_matrix, weights):               # predict outcomes
    predictions=np.dot(feature_matrix, weights)
    return(predictions)

In [6]:
def feature_derivative(errors, feature):                  # negative derivative
    derivative=2.0*np.dot(feature,errors)
    return(derivative)

In [7]:
def regression_gradient_descent(feature_matrix, output, initial_weights, step_size, tolerance):    # final program!
    converged = False
    weights = np.array(initial_weights)
    while not converged:
        gradient_sum_squares = 0 
        yhead=predict_outcome(feature_matrix,weights)
        errors=output-yhead
        for i in range(len(weights)):
            derivative=feature_derivative(errors,feature_matrix[:,i])
            gradient_sum_squares=gradient_sum_squares+derivative**2
            weights[i]=weights[i]+step_size*derivative            
        gradient_magnitude = np.sqrt(gradient_sum_squares)
        if gradient_magnitude < tolerance:
            converged = True
    return(weights)

In [8]:
simple_features = ['sqft_living']
my_output= 'price'
(simple_feature_matrix,output) = get_numpy_data(train, simple_features, my_output)
initial_weights = np.array([-47000., 1.])
step_size = 7e-12
tolerance = 2.5e7
simple_weights = regression_gradient_descent(simple_feature_matrix,output,initial_weights,step_size,tolerance)
simple_weights
(feature_matrix, output) = get_numpy_data(test, simple_features ,my_output)
rss1=np.sum((predict_outcome(feature_matrix, simple_weights)-output)**2)
predict1=predict_outcome(feature_matrix, simple_weights)[0]

In [9]:
output

array([ 310000.,  650000.,  233000., ...,  610685.,  400000.,  402101.])

In [10]:
model_features = ['sqft_living', 'sqft_living15']
my_output = 'price'
(feature_matrix, output) = get_numpy_data(train, model_features,my_output)
initial_weights = np.array([-100000., 1., 1.])
step_size = 4e-12
tolerance = 1e9
simple_weights2= regression_gradient_descent(feature_matrix,output,initial_weights,step_size,tolerance)
simple_weights2
(feature_matrix, output) = get_numpy_data(test, model_features ,my_output)
rss2=np.sum((predict_outcome(feature_matrix, simple_weights2)-output)**2)
predict2=predict_outcome(feature_matrix, simple_weights2)[0]

In [11]:
predict2

366651.41162949387

In [12]:
feature_matrix

array([[  1.00000000e+00,   1.43000000e+03,   1.78000000e+03],
       [  1.00000000e+00,   2.95000000e+03,   2.14000000e+03],
       [  1.00000000e+00,   1.71000000e+03,   1.03000000e+03],
       ..., 
       [  1.00000000e+00,   2.52000000e+03,   2.52000000e+03],
       [  1.00000000e+00,   2.31000000e+03,   1.83000000e+03],
       [  1.00000000e+00,   1.02000000e+03,   1.02000000e+03]])

In [13]:
test.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,constant
0,114101516,20140528T000000,310000,3,1.0,1430,19901,1.5,0,0,...,1430,0,1927,0,98028,47.7558,-122.229,1780,12697,1
1,9297300055,20150124T000000,650000,4,3.0,2950,5000,2.0,0,3,...,1980,970,1979,0,98126,47.5714,-122.375,2140,4000,1
2,1202000200,20141103T000000,233000,3,2.0,1710,4697,1.5,0,0,...,1710,0,1941,0,98002,47.3048,-122.218,1030,4705,1
3,8562750320,20141110T000000,580500,3,2.5,2320,3980,2.0,0,0,...,2320,0,2003,0,98027,47.5391,-122.07,2580,3980,1
4,7589200193,20141110T000000,535000,3,1.0,1090,3000,1.5,0,0,...,1090,0,1929,0,98117,47.6889,-122.375,1570,5080,1


In [14]:
simple_weights2

array([ -9.99999688e+04,   2.45072603e+02,   6.52795267e+01])

In [15]:
-9.99999688e+04+2.45072603e+02*1430+6.52795277e+01*1780

366651.412796

In [16]:
rss1,rss2

(275400044902128.31, 270263443629803.56)

In [17]:
from sklearn.linear_model import LinearRegression
lm2=LinearRegression()
lm1=LinearRegression()
column1=['sqft_living']
column2=['sqft_living', 'sqft_living15']
x1=train[column1]
x2=train[column2]
y=train['price']
x3=test[column2]
lm1.fit(x2,y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [18]:
lm1.coef_
lm1.intercept_

-100262.1751585335

In [19]:
lm1.coef_

array([ 245.18871442,   65.27158522])

In [20]:
lm1.predict(x3)

array([ 366541.10816718,  762725.72477249,  386240.25928765, ...,
        682097.7799554 ,  585570.75612155,  216407.33048282])