# 随机梯度下降法

In [1]:
import numpy as np
import matplotlib.pyplot as plt

In [2]:
m = 100000

x = np.random.normal(size=m)
X = x.reshape(-1, 1)
y = 4.*x + 3. + np.random.normal(0, 3, size=m)

In [3]:
def J(theta, X_b, y):
    try:
        return np.sum((y - X_b.dot(theta))**2) / len(y)
    except:
        return float('inf')

def dJ(theta, X_b, y):
    return X_b.T.dot(X_b.dot(theta) - y) * 2. / len(y)

def gradient_descent(X_b, y, initial_theta, eta, n_iters = 1e4, epsilon=1e-8):
    theta = initial_theta
    cur_iter=0
    
    while cur_iter < n_iters:
        gradient = dJ(theta, X_b, y)
        last_theta = theta
        theta = theta - (eta*gradient)
        if (abs(J(theta, X_b, y) - J(last_theta, X_b, y)) < epsilon):
            break;
        cur_iter += 1
        
    return theta

In [4]:
%%time
X_b = np.hstack([np.ones((len(X), 1)), X])
initial_theta = np.zeros(X_b.shape[1])
eta = 0.01
theta = gradient_descent(X_b, y, initial_theta, eta)

CPU times: user 917 ms, sys: 1.64 ms, total: 918 ms
Wall time: 918 ms


In [5]:
theta

array([3.0013392 , 4.00078112])

---

![image.png](attachment:image.png)

## cost的随机

In [6]:
def dJ_sgd(thetae, X_b_i, y_i):
    return X_b_i * (X_b_i.dot(theta) - y_i) * 2.

随机梯度下降法

In [7]:
# 随机梯度下降法
def sgd(X_b, y, initial_theta, n_iters):
    
    t0 = 5
    t1 = 50
    
    def learing_rate(t):
        return t0 / (t + t1)
    
    theta = initial_theta
    for cur_iter in range(n_iters):
        rand_i = np.random.randint(len(X_b))
        gradient = dJ_sgd(theta, X_b[rand_i], y[rand_i])
        theta = theta - learing_rate(cur_iter) * gradient
        
    return theta

In [8]:
%%time
X_b = np.hstack([np.ones((len(X), 1)), X])
initial_theta = np.zeros(X_b.shape[1])
theta = sgd(X_b, y, initial_theta, len(X_b))

CPU times: user 1.11 s, sys: 7.05 ms, total: 1.11 s
Wall time: 1.1 s


In [9]:
theta

array([ 2.08041735, -3.60286488])