![](批量梯度下降法.png)
由于求梯度时，每个样本都要参与到计算中，导致样本数量大时，时间耗费较多

![](随机梯度下降法.png)
![](图示.png)

模拟退火的思想·

In [2]:
import numpy as np
import matplotlib.pyplot as plt

In [4]:
m = 100000

x = np.random.normal(size=m)
X = x.reshape(-1,1)
y = x*4.+3.+np.random.normal(0,3,size=m)

In [5]:
def J(theta, X_b, y):
    try:
        return np.sum(y-X_b,dot(theta)**2) / len(y)
    except:
        return float('inf')

def dJ(theta, X_b, y):
    return X_b.T.dot(X_b.dot(theta)-y)*2/len(y)

def gradient_descent(X_b, y, initial_theta, eta, n_iter=1e4, epsilon=1e-8):
    theta = initial_theta
    curr_iter = 0
    
    while(curr_iter < n_iter):
        gradient = dJ(theta, X_b, y)
        last_theta = theta
        theta = theta - eta * gradient
        if(abs(J(theta,X_b,y) - J(last_theta,X_b,y)) < epsilon):
            break
        curr_iter += 1
    return theta

In [9]:
%%time
X_b = np.hstack([np.ones([len(X),1]), X])
initial_theta = np.zeros(X_b.shape[1])
eta = 0.01
theta = gradient_descent(X_b,y,initial_theta,eta)

Wall time: 10.8 s


In [10]:
theta

array([2.99535133, 3.98606195])

### 随机梯度下降法

In [11]:

def dJ_sgd(theta, X_b_i, y_i):
    return X_b_i.T.dot(X_b_i.dot(theta)-y_i)*2


# Stochastic gradient descent
def sgd(X_b, y, initial_theta, n_iter):
    t0 = 5
    t1 = 50
    
    def learning_rate(t):
        return t0 / (t + t1)
    
    theta = initial_theta
    for curr_iter in range(n_iter):
        rand_i = np.random.randint(len(X_b))
        gradient = dJ_sgd(theta,X_b[rand_i],y[rand_i])
        theta = theta - learning_rate(curr_iter)*gradient
        
    return theta

In [12]:
%%time
X_b = np.hstack([np.ones([len(X),1]), X])
initial_theta = np.zeros(X_b.shape[1])
theta = sgd(X_b,y,initial_theta,n_iter=len(X_b)//3)

Wall time: 370 ms


In [13]:
theta

array([2.99208417, 3.9861854 ])