# 06-线性回归之梯度下降-python实现

## 创建数据

In [12]:
import numpy as np
# Synthetic regression data: y = 2*x0 + 3*x1 + 5 + standard-normal noise.
np.random.seed(42)                               # fix the global NumPy seed so a fresh-kernel run is reproducible
n_samples = 10000                                # single source of truth for the number of samples
X = 2 * np.random.random(size=(n_samples, 2))    # two features, each uniform on [0, 2)
y = X[:, 0] * 2. + X[:, 1] * 3. + 5. + np.random.normal(size=len(X))
temp = np.ones((len(y), 1))
X_b = np.hstack((X, temp))                       # append a column of ones so the intercept is just another weight
theta = np.zeros(X_b.shape[1])                   # parameters; gradient descent repeatedly updates theta to minimise the loss
eta = 0.01                                       # learning rate
episilon = 1e-8                                  # convergence tolerance on the change of the loss
print(X_b.shape)
print(y.shape)
print(theta.shape)

(10000, 3)
(10000,)
(3,)


## 批量梯度下降法

In [13]:
def J(theta, X_b, y):
    '''
    Loss function: mean squared error of the linear prediction.

    theta -- parameter vector, multiplied against the columns of X_b
    X_b   -- sample matrix (in this notebook: features plus a trailing column of ones)
    y     -- target values, one per row of X_b

    Returns the sum of squared residuals divided by len(y).
    '''
    residual = y - X_b.dot(theta)
    return np.sum(np.square(residual)) / len(y)

def dJ(theta, X_b, y):
    '''
    Gradient of the loss function with respect to theta.

    theta -- parameter vector
    X_b   -- sample matrix, one row per sample
    y     -- target values, one per row of X_b

    Returns X_b^T (X_b theta - y) * 2 / len(y), an array with one entry
    per parameter.
    '''
    residual = X_b.dot(theta) - y
    return X_b.T.dot(residual) * 2. / len(y)

In [10]:
def gradient_decent(theta, X_b, y, eta=0.01, epsilon=1e-8, n_iters=None):
    '''
    Batch gradient descent: repeatedly step against the full-data gradient
    until the loss stops changing.

    theta   -- initial parameter vector (not modified; a new array is returned)
    X_b     -- sample matrix passed through to J and dJ
    y       -- target values passed through to J and dJ
    eta     -- learning rate (step size); passed explicitly instead of being
               read from a notebook-level global
    epsilon -- stop once |J(new theta) - J(previous theta)| <= epsilon
    n_iters -- optional cap on the number of update steps; None (default)
               means iterate until the convergence test is met

    Returns the final parameter vector.

    Raises FloatingPointError if the loss becomes inf/NaN. Without this check
    a diverging run (eta too large) would loop forever, because a NaN
    difference never satisfies the convergence test.
    '''
    cost = J(theta, X_b, y)
    cur_iter = 0
    while n_iters is None or cur_iter < n_iters:
        theta = theta - eta * dJ(theta, X_b, y)
        # Reuse the previous iteration's loss instead of recomputing it.
        last_cost, cost = cost, J(theta, X_b, y)
        if not np.isfinite(cost):
            raise FloatingPointError(
                "gradient descent diverged (loss is not finite); try a smaller eta")
        if abs(cost - last_cost) <= epsilon:
            break
        cur_iter += 1
    return theta

In [14]:
# Fit with batch gradient descent starting from the zero-initialised theta,
# then show the learned parameters.
rst = gradient_decent(theta=theta, X_b=X_b, y=y)
print(rst)

[2.00405871 2.98056298 5.00839877]


## 随机梯度下降法

In [18]:
def dJ_sgd(theta, X_b_i, y_i):
    '''
    Gradient of the squared error of a single sample with respect to theta.

    theta -- parameter vector
    X_b_i -- one sample (one row of the sample matrix)
    y_i   -- the target value for that sample

    Returns X_b_i^T (X_b_i . theta - y_i) * 2.
    '''
    error = X_b_i.dot(theta) - y_i
    return X_b_i.T.dot(error) * 2

def sgd(X_b_i, y, theta, n_iters, t0=5, t1=50):
    '''
    Stochastic gradient descent: each step uses the gradient of one randomly
    chosen sample.

    X_b_i   -- sample matrix, one row per sample (the name is kept for
               backward compatibility; it is the whole matrix, not one row)
    y       -- target values, one per row of X_b_i
    theta   -- initial parameter vector (not modified; a new array is returned)
    n_iters -- number of single-sample update steps to perform
    t0, t1  -- learning-rate schedule constants: step t uses t0 / (t + t1),
               so the step size decays as iterations proceed

    Returns the parameter vector after n_iters updates.

    Sampling uses the global NumPy random state (np.random.randint).
    '''
    def learn_rate(t):
        return t0 / (t + t1)

    for cur_iter in range(n_iters):
        # Draw from the matrix that was passed in, not from a notebook global.
        rand_i = np.random.randint(len(X_b_i))
        gradient = dJ_sgd(theta, X_b_i[rand_i], y[rand_i])
        theta = theta - learn_rate(cur_iter) * gradient

    return theta
# One update per iteration, for a third as many iterations as there are samples.
sgd_n_iters = len(X_b) // 3
print(sgd(X_b, y, theta, n_iters=sgd_n_iters))

[2.00024478 3.01304123 4.99249275]


## 小批量梯度下降法


In [19]:
def dJ_mbgd(theta, X_b_n, y_n, num):
    '''
    Gradient of the squared error of a mini-batch with respect to theta.

    theta -- parameter vector
    X_b_n -- the rows of the sample matrix that make up the batch
    y_n   -- the target values for those rows
    num   -- divisor applied to the summed gradient (the caller passes the batch size)

    Returns X_b_n^T (X_b_n theta - y_n) * 2 / num.
    '''
    residual = X_b_n.dot(theta) - y_n
    return X_b_n.T.dot(residual) * 2 / num

def mbgd(theta, X_b, y, num, n_iters, t0=5, t1=50):
    '''
    Mini-batch gradient descent: each step uses the averaged gradient of
    `num` randomly chosen samples.

    theta   -- initial parameter vector (not modified; a new array is returned)
    X_b     -- sample matrix, one row per sample
    y       -- target values, one per row of X_b
    num     -- batch size; row indices are drawn with np.random.randint, so a
               batch may contain the same row more than once
    n_iters -- number of mini-batch update steps to perform
    t0, t1  -- learning-rate schedule constants: step t uses t0 / (t + t1),
               so the step size decays as iterations proceed

    Returns the parameter vector after n_iters updates.
    '''
    def learn_rate(t):
        return t0 / (t + t1)

    for cur_iter in range(n_iters):
        x_index = np.random.randint(0, len(y), num)
        gradient = dJ_mbgd(theta, X_b[x_index], y[x_index], num)
        theta = theta - learn_rate(cur_iter) * gradient

    return theta
# Batches of 20 samples, for a third as many iterations as there are samples.
mbgd_n_iters = len(X_b) // 3
print(mbgd(theta, X_b, y, num=20, n_iters=mbgd_n_iters))

[2.00226962 2.98841039 5.02916189]
