In [1]:
from sklearn import datasets
from sklearn import model_selection
from sklearn import preprocessing
from sklearn import linear_model
import numpy as np

In [2]:
boston = datasets.load_boston()

In [3]:
X = boston.data

In [4]:
Y = boston.target

In [5]:
X.shape

(506, 13)

In [6]:
X_scaled = preprocessing.scale(X)

In [7]:
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(X_scaled, Y, test_size=.3)

In [8]:
lgr = linear_model.LinearRegression()

In [9]:
lgr.fit(X_train, Y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [10]:
lgr.intercept_

22.364704850120297

In [11]:
lgr.coef_

array([-0.68323851,  1.01815476,  0.47181209,  0.34363802, -1.94909919,
        3.20498713, -0.145166  , -2.92886123,  2.30503435, -2.12285947,
       -2.05384684,  1.12036685, -3.417728  ])

In [12]:
Y_test_pred = lgr.predict(X_test)

In [13]:
lgr.score(X_test, Y_test_pred)

1.0

In [14]:
def cost_function(X, Y, theta):
    return (1/(2*X.shape[0]))*np.sum((np.dot(X, theta) - Y)**2, axis=0)

In [15]:
def batch_gradient_descent(X, Y, alpha=.001, error=1e-9):
    (m,n) = X.shape
    X = np.hstack([np.ones((m,1)), X])
    theta = np.zeros((n+1,1))
    no_of_iterations = 0
    
    while True:
        cost_prev = cost_function(X, Y, theta)
        theta -= (alpha/m)*np.dot(X.T, np.dot(X, theta)-Y)
        cost_curr = cost_function(X, Y, theta)
        no_of_iterations += 1
        if abs(cost_curr - cost_prev)<error:
            break

    return (theta, no_of_iterations)
    

In [16]:
Y_train = np.reshape(Y_train, (Y_train.shape[0],1))
Y_test = np.reshape(Y_test, (Y_test.shape[0],1))

In [17]:
%%time
(theta, no_of_iterations) = batch_gradient_descent(X_train, Y_train)

CPU times: user 2.3 s, sys: 4.17 ms, total: 2.31 s
Wall time: 2.31 s


In [18]:
theta

array([[22.36458873],
       [-0.68220283],
       [ 1.01680972],
       [ 0.46853043],
       [ 0.34404329],
       [-1.94768456],
       [ 3.20619066],
       [-0.14594213],
       [-2.92821709],
       [ 2.29512444],
       [-2.11244433],
       [-2.05336665],
       [ 1.12038542],
       [-3.41690566]])

In [35]:
def stochastic_gradient(X, Y, alpha=0.01, error=1e-9):
    (m,n) = X.shape
    X = np.hstack([np.ones((m,1)), X])
    theta = np.zeros((n+1,1))
    no_of_iterations = 0
    
    np.random.shuffle(X)
    
    while True:
        cost_prev = cost_function(X, Y, theta)
        theta -= alpha*np.dot(X[:n+1, :].T, np.dot(X[:n+1, :], theta) - Y[:n+1, :])
        cost_curr = cost_function(X, Y, theta)
        no_of_iterations += 1
        if abs(cost_prev-cost_curr)<error:
            break
        
    return (theta, no_of_iterations)

In [38]:
%%time
stochastic_gradient(X_train, Y_train)

CPU times: user 4.62 s, sys: 5.99 ms, total: 4.63 s
Wall time: 4.63 s


(array([[ 22.83959457],
        [ 26.73896819],
        [-20.3219519 ],
        [-58.81003056],
        [ -6.22604075],
        [  3.24237867],
        [ 21.53286477],
        [ -7.73742086],
        [ 38.3716848 ],
        [-62.89731722],
        [ 85.02558252],
        [-20.98501644],
        [ -7.67892779],
        [ 44.83890689]]), 173412)