# Linear Regression using Stochastic Gradient Descent(SGD) on Boston House Prices

In [160]:
from sklearn.datasets import load_boston
import numpy as np
import random
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.utils import shuffle
from sklearn.metrics import r2_score
from datetime import datetime

In [2]:
boston_data = load_boston()

In [3]:
X = boston_data.data

In [173]:
X.shape

(506, 13)

Observation: There are 506 Data points with 13 Features

In [5]:
y = boston_data.target
len(y)

506

In [104]:
from sklearn.cross_validation import train_test_split
x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.33)

### Standardization

In [105]:
std = StandardScaler()
x_train = std.fit_transform(x_train)
x_test = std.fit_transform(x_test)

## Custom Linear Regression using Gradient Descent 

In [166]:
def linearRegressionGD(X,y,w = np.zeros(13),b = 0,iterations=5000,learning_rate=0.1,epsilon_val=0.004):
    """
        Linear Regression Using Gradient Descent Optimization

        Parameters
        ----------
        X : numpy array or sparse matrix of shape [n_samples,n_features]
            Training data

        y : numpy array of shape [n_samples, n_targets]
            Target values

        w:  Weight Vector first initializad with 0's
        
        
        b:  y-intercept
        
        
        iterations: no.of iterations
        
        epsilon_val: Termination Criteria

        Returns
        -------
        w,b,cost or MSE
        """
    start = datetime.now()
    n = float(len(y))
    costs = [];
    decay_rate = 0.0001
    eta=learning_rate
    for i in range(iterations):
        
        #learning rate 
        eta = eta * (1.0/(1+decay_rate*i))
        error = y -  predict(X,w,b) #y_act - y_actual
        
        #partial differentiation w.r.t w
        w_grad = (-2.0/n)*(X.T.dot(error))
        
        #partial differentiation w.r.t b
        b_grad = (-2.0/n)*(np.sum(error))
        
        #MSE
        cost = (1.0/n)*np.sum(np.power(error,2))
        
        #updating weight vector
        w = w - (eta*w_grad)
        
        #updating y-intercept
        b = b - (eta*b_grad)
        
        costs.append(cost)
        
        #Stopping Criteria
        if i==0:
            w_prev = w;
        else:
            dist = (np.linalg.norm(w-w_prev))
            w_prev = w
            
            #Terminates the loop when difference between w and w_prev is very less 
            if(round(dist,6)<epsilon_val):
                print("no.of iterations: ",i,"Cost: ",min(costs))
                break;
        
    print("Time: Taken: ",datetime.now() - start)
    return w,b,cost;



def predict(x,m,c):
    """
        It predicts the target value based on the input parameter values
        
        Parameters
        ----------
        x : numpy array or sparse matrix of shape [n_samples,n_features]
            Training data
        
        
    """
    y = np.dot(x,m) + c
    return y


def score(X,y,m,c):
    
    """
    It gives the Score or accuracy
    
    Parameters
        ----------
        X : numpy array or sparse matrix of shape [n_samples,n_features]
            Training data
            
        y : numpy array of shape [n_samples, n_targets]
            Target values
            
        m : slope
        
        c : y-intercept  
    """
    
    from sklearn.metrics import r2_score
    return r2_score(y,predict(X,m,c).flatten())

In [167]:
w1,b1,cost1 = linearRegressionGD(x_train,y_train)

no.of iterations:  152 Cost:  23.1395328035
Time: Taken:  0:00:00.011968


In [169]:
score(x_test,y_test,w1,b1)

0.72196287871389897

## Custom Linear Regression using SGD

In [164]:
def linearRegressionSGD(X,y,w = np.zeros(13),b = 0,iterations=1000,case_size=100,learning_rate=0.1,epsilon_val=0.004):
    """
        Linear Regression Using Gradient Descent Optimization

        Parameters
        ----------
        X : numpy array or sparse matrix of shape [n_samples,n_features]
            Training data

        y : numpy array of shape [n_samples, n_targets]
            Target values

        w:  Weight Vector first initializad with 0's
        
        
        b:  y-intercept
        
        
        iterations: no.of iterations
        
        
        case_size : bunch of points to be taken in each iteration 
        
        
        epsilon_val: Termination Criteria


        Returns
        -------
        w,b,cost or MSE
        """
    start = datetime.now()
    n = float(len(y))
    costs = [];
    decay_rate = 0.0001
    eta=learning_rate
    
    
    for iters in range(iterations):
        eta = eta * (1.0/(1+decay_rate*iters))
        
        X,y =  shuffle(X,y,random_state=0,replace=False)
    
    
        for i in range(case_size):
            x_mini = X[i:i+case_size]
            y_mini = y[i:i+case_size]
            #learning rate 
            #eta = eta * (1.0/(1+decay_rate*i))
            error = y_mini -  predict(x_mini,w,b) #y_act - y_predicted

            #partial differentiation w.r.t w
            w_grad = (-2.0/n)*(x_mini.T.dot(error))

            #partial differentiation w.r.t b
            b_grad = (-2.0/n)*(np.sum(error))

            #MSE
            cost = (1.0/n)*np.sum(np.power(error,2))

            #updating weight vector
            w = w - (eta*w_grad)

            #updating y-intercept
            b = b - (eta*b_grad)

            costs.append(cost)

            #Stopping Criteria
        #Stopping Criteria
        if iters==0:
            w_prev = w;
        else:
            dist = (np.linalg.norm(w-w_prev))
            w_prev = w
            
            #Terminates the loop when difference between w and w_prev is very less 
            if(round(dist,6)<epsilon_val):
                print("no.of iterations: ",iters,"Cost: ",min(costs))
                break;
        
    print("Time Taken: ",datetime.now() - start)
    return w,b,cost;

In [165]:
w,b,cost = linearRegressionSGD(x_train,y_train)

no.of iterations:  375 Cost:  2.63148090471
Time Taken:  0:00:00.915583


In [155]:
# final Weight Vector
w

array([-0.58435358,  1.1170941 ,  0.5553632 ,  0.86510863, -2.0640718 ,
        2.65955573,  0.41057385, -3.07033171,  2.77224903, -2.12395843,
       -2.001628  ,  1.0886039 , -4.51357934])

In [156]:
# Final y-intercept
b

22.782099005287087

In [151]:
cost

8.1187871776980671

In [152]:
score(x_test,y_test,w,b)

0.72068643671585575

## SKlearn Linear Regression

In [142]:
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt

#n_job=no.cores processor, Parallelization
lr = LinearRegression(n_jobs=4)

lr.fit(x_train,y_train)

y_pred = lr.predict(x_test)


In [146]:
r2_score(y_test,y_pred)

0.72237302553200955

## Scores Custom vs Sklearn Implementation

In [172]:
print("Custom GD Linear Regression Score: ",score(x_test,y_test,w1,b1))
print("Custom SGD Linear Regression Score: ",score(x_test,y_test,w,b))
print("SkLearn Linear Regression Score: ",r2_score(y_test,y_pred))

Custom GD Linear Regression Score:  0.721962878714
Custom SGD Linear Regression Score:  0.720686436716
SkLearn Linear Regression Score:  0.722373025532
