# Stochastic Gradient Descent

In [1]:
import numpy as np

In [2]:
# Number of synthetic samples.
m = 100000

# One standard-normal feature; X holds the same values as an (m, 1) column matrix.
x = np.random.normal(size=m)
X = x.reshape(-1, 1)
# Targets follow y = 4x + 3 plus Gaussian noise with standard deviation 3.
# NOTE(review): no random seed is set, so the recorded outputs will not reproduce exactly.
y = 4. * x + 3. + np.random.normal(0, 3, size=m)

## Batch Gradient Descent

In [3]:
def J(theta, X_b, y):
    """Mean squared error of the linear model ``X_b.dot(theta)`` against ``y``.

    Parameters
    ----------
    theta : parameter vector multiplied against the columns of ``X_b``.
    X_b : design matrix; ``len(X_b)`` is used as the number of samples.
    y : target values, one per row of ``X_b``.

    Returns
    -------
    The sum of squared residuals divided by ``len(X_b)``, or ``float('inf')``
    if evaluating the expression raises an ordinary exception (for example
    mismatched shapes).
    """
    try:
        return np.sum((y - X_b.dot(theta)) ** 2) / len(X_b)
    except Exception:
        # Keep the best-effort "infinite cost" fallback, but let
        # KeyboardInterrupt / SystemExit propagate so a long run can be stopped.
        return float('inf')
    
def dJ(theta, X_b, y):
    """Gradient of the mean squared error with respect to ``theta``.

    Computes ``X_b.T @ (X_b @ theta - y) * 2 / len(X_b)`` over the whole
    data set passed in.
    """
    residual = X_b.dot(theta) - y
    projected = X_b.T.dot(residual)
    return projected * 2 / len(X_b)


def gradient_descent(X_b, y, initial_theta, eta, n_iters=1e4, epsilon=1e-8):
    """Minimise ``J`` by full-batch gradient descent.

    Parameters
    ----------
    X_b, y : data forwarded unchanged to ``J`` and ``dJ``.
    initial_theta : starting parameter vector (not modified in place).
    eta : step size applied to each gradient.
    n_iters : upper bound on the number of update steps (may be a float).
    epsilon : stop as soon as the cost changes by less than this between
        two consecutive steps.

    Returns
    -------
    The parameter vector after the last update performed.
    """
    theta = initial_theta
    if not 0 < n_iters:
        # No update step is allowed, so there is nothing to evaluate.
        return theta

    # Cost of the current theta. It is carried across iterations so each step
    # evaluates J once instead of re-evaluating it for the previous theta.
    cost = J(theta, X_b, y)
    i_ters = 0

    while i_ters < n_iters:
        gradient = dJ(theta, X_b, y)
        theta = theta - eta * gradient

        last_cost, cost = cost, J(theta, X_b, y)
        if abs(cost - last_cost) < epsilon:
            break

        i_ters += 1

    return theta

In [4]:
%%time
# Prepend a column of ones to X so the first entry of theta acts as the intercept.
X_b = np.hstack([np.ones((len(X), 1)), X])
# Start from the all-zeros parameter vector (one entry per column of X_b).
initial_theta = np.zeros(X_b.shape[1])
# Step size passed to gradient_descent.
eta = 0.01

theta = gradient_descent(X_b, y, initial_theta, eta)

CPU times: user 1.08 s, sys: 79.2 ms, total: 1.16 s
Wall time: 610 ms


In [5]:
theta

array([3.00548183, 3.99957439])

## Stochastic Gradient Descent

In [6]:
def dJ_sgd(theta, X_b_i, y_i):
    """Squared-error gradient evaluated on the single sample ``(X_b_i, y_i)``.

    Same expression as the batch gradient but without the division by the
    number of samples.
    """
    error = X_b_i.dot(theta) - y_i
    return X_b_i.T.dot(error) * 2.

In [7]:
def sgd(X_b, y, initial_theta, n_iters=1e4, t0=5, t1=50):
    """Fit ``theta`` by stochastic gradient descent with a decaying step size.

    Each iteration draws one row index uniformly at random (with replacement)
    and steps against the gradient returned by ``dJ_sgd`` for that sample.

    Parameters
    ----------
    X_b, y : data; ``X_b[i]`` and ``y[i]`` form one sample.
    initial_theta : starting parameter vector (not modified in place).
    n_iters : number of single-sample updates. Floats such as the default
        ``1e4`` are accepted and truncated to an integer.
    t0, t1 : learning-rate schedule constants; the step size at iteration
        ``t`` is ``t0 / (t + t1)``.

    Returns
    -------
    The parameter vector after the last update.
    """
    def learning_rate(t):
        return t0 / (t + t1)

    theta = initial_theta
    # range() rejects floats, so the float default would raise TypeError
    # without this conversion.
    for cur_iter in range(int(n_iters)):
        rand_i = np.random.randint(len(X_b))
        gradient = dJ_sgd(theta, X_b[rand_i], y[rand_i])
        theta = theta - learning_rate(cur_iter) * gradient

    return theta

In [8]:
%%time
# Rebuild the design matrix (ones column + X) and the zero starting point for the SGD run.
X_b = np.hstack([np.ones((len(X), 1)), X])
initial_theta = np.zeros(X_b.shape[1])

# Take len(X_b)//3 single-sample steps, i.e. about one third as many updates as there are rows.
theta = sgd(X_b, y, initial_theta, n_iters=len(X_b)//3)

CPU times: user 259 ms, sys: 5.19 ms, total: 264 ms
Wall time: 272 ms


In [9]:
theta

array([3.01051997, 4.00042883])

## Implement Stochastic Gradient Descent in our Linear Regression

In [10]:
from LinearRegression import LinearRegression

In [11]:
lin_reg = LinearRegression()

In [12]:
lin_reg.fit_sgd(X, y, n_iters=2)

LinearRegression()

In [13]:
lin_reg.coef_

array([4.00250208])

In [14]:
lin_reg.interception_

2.99549432743106

In [15]:
from sklearn import datasets

In [16]:
# NOTE(review): `datasets.load_boston` was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell needs an older scikit-learn — confirm the pinned version.
boston = datasets.load_boston()
# X and y are rebound here, replacing the synthetic data used earlier in the notebook.
X = boston.data
y = boston.target

# Keep only samples whose target is below 50. X must be filtered first, because the
# mask is computed from y and the following line overwrites y.
X = X[y < 50]
y = y[y < 50]

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=666)

In [19]:
from sklearn.preprocessing import StandardScaler

In [20]:
# Fit the scaler on the training split only, then apply that same fitted
# transform to both the training and the test split.
standardScaler = StandardScaler()
standardScaler.fit(X_train)
X_train_standard = standardScaler.transform(X_train)
X_test_standard = standardScaler.transform(X_test)

In [21]:
lin_reg1 = LinearRegression()

In [22]:
%time lin_reg1.fit_sgd(X_train_standard, y_train, n_iters=2)

CPU times: user 7.28 ms, sys: 3.36 ms, total: 10.6 ms
Wall time: 8.46 ms


LinearRegression()

In [23]:
lin_reg1.score(X_test_standard, y_test)

0.7023540267002972

In [24]:
lin_reg2 = LinearRegression()
%time lin_reg2.fit_sgd(X_train_standard, y_train, n_iters=50)
lin_reg2.score(X_test_standard, y_test)

CPU times: user 109 ms, sys: 3.27 ms, total: 112 ms
Wall time: 116 ms


0.8010607203772715

In [25]:
lin_reg3 = LinearRegression()
%time lin_reg3.fit_sgd(X_train_standard, y_train, n_iters=200)
lin_reg3.score(X_test_standard, y_test)

CPU times: user 347 ms, sys: 3.66 ms, total: 351 ms
Wall time: 355 ms


0.8013637263683608

# SGD in scikit-learn

In [26]:
from sklearn.linear_model import SGDRegressor

In [29]:
sgd_reg = SGDRegressor()
%time sgd_reg.fit(X_train_standard, y_train)
sgd_reg.score(X_test_standard, y_test)

CPU times: user 1.41 ms, sys: 669 µs, total: 2.08 ms
Wall time: 1.29 ms




0.781791984344825

In [30]:
# `n_iter` was deprecated in scikit-learn 0.19 and removed in 0.21. Its replacement is
# `max_iter`; `tol=None` disables early stopping so all 100 epochs are run.
sgd_reg = SGDRegressor(max_iter=100, tol=None)
%time sgd_reg.fit(X_train_standard, y_train)
sgd_reg.score(X_test_standard, y_test)

CPU times: user 6.94 ms, sys: 1.69 ms, total: 8.63 ms
Wall time: 6.65 ms




0.800368893275816