## 梯度下降、加速梯度下降、随机梯度下降、随机坐标下降

In [1]:
import scipy.sparse as sp
import numpy as np

定义一个凸目标函数：
$$
\min_{x} \frac{1}{2} ||Ax-b||^{2}
$$

定义一个读取数据的函数`Fun()`, 读取参数$A$和$b$, 以及计算目标函数值的`min_f`:

In [2]:
class Fun(object):
    def __init__(self, path_a, path_b):
        self.file_a_path = path_a
        self.file_b_path = path_b
        self.A, self.b = self._get_parameter()  # 获取到参数A和b的值
        self.A_T = self.A.transpose()
        self.x_init = sp.eye(self.A.shape[-1], self.b.shape[-1]).tocsr()  # 设置初始解

    def min_f(self, x):
        y = self.A * x - self.b  # 得到目标函数的值
        return np.linalg.norm(y.toarray(), ord=2)  # 得到目标函数的二范数

    def _get_parameter(self):
        self.A = sp.load_npz(self.file_a_path).tocsr()  # shape = (15935, 62061)
        self.b = sp.load_npz(self.file_b_path).tocsr()  # shape = (15935, 1)
        return self.A, self.b

## 梯度下降

梯度下降的迭代公式：

$$
x_{k+1} = x_{k} - \eta A^{T}(Ax_{k}-b)
$$

因为$A$的维度为`(15935, 62061)`，$b$的维度为`(15935, 1)`。所以$x$的维度为`(62061, 1)`，$x_{k}$的维度为`(62061, 1)`。$Ax_{k}-b$的纬度为`(15935, 1)`。$A^{T}$的维度为`(62061, 15935)`, $A^{T}(Ax_{k}-b)$的维度`(62061, 1)`能够与$x_{k}$对齐。



In [3]:
class GradientDescent(Fun):
    def __init__(self, path_a, path_b):
        super(GradientDescent, self).__init__(path_a, path_b)

    def gradient_decs(self, eta=0.002, iter_times=1, x_input=None):
        x_output = None
        y_input = self.min_f(x_input)
        for i in range(iter_times):
            x_output = x_input - eta * self.A_T * (self.A * x_input - self.b)
            x_input = x_output  # 更新 x 的值
            y_output = self.min_f(x_output)
            print("pre_y is {}  and y is {}".format(y_input, y_output))
            y_input = y_output
        return x_output

In [5]:
GD = GradientDescent(path_a='./news20_A.npz', path_b='./news20_b.npz')

GD.gradient_decs(eta=0.002, iter_times=20, x_input=GD.x_init)

print('')

pre_y is 15.722300259397239  and y is 13.255437472728753
pre_y is 13.255437472728753  and y is 12.830725844332473
pre_y is 12.830725844332473  and y is 12.51692906968907
pre_y is 12.51692906968907  and y is 12.27197839958532
pre_y is 12.27197839958532  and y is 12.071656835698864
pre_y is 12.071656835698864  and y is 11.900847835979816
pre_y is 11.900847835979816  and y is 11.750127725254952
pre_y is 11.750127725254952  and y is 11.613593698606474
pre_y is 11.613593698606474  and y is 11.487508355089918
pre_y is 11.487508355089918  and y is 11.369465105891495
pre_y is 11.369465105891495  and y is 11.25787798929786
pre_y is 11.25787798929786  and y is 11.151670710977404
pre_y is 11.151670710977404  and y is 11.05008706842521
pre_y is 11.05008706842521  and y is 10.9525750410614
pre_y is 10.9525750410614  and y is 10.858715532640206
pre_y is 10.858715532640206  and y is 10.768178203651498
pre_y is 10.768178203651498  and y is 10.680693783528262
pre_y is 10.680693783528262  and y is 10.59

## 加速梯度下降

加速梯度下降的迭代公式：

$$
y_{k+1} = x_{x}+\frac{k-1}{k+2}(x_{k}-x_{k-1}) \\
x_{k+1} = y_{k+1} - \eta A^{T}(Ay_{k+1}-b)
$$

因为$A$的维度为`(15935, 62061)`，$b$的维度为`(15935, 1)`。所以$x$的维度为`(62061, 1)`，$y_{k+1}$的维度为`(62061, 1)`。$Ay_{k+1}-b$的纬度为`(15935, 1)`。$A^{T}$的维度为`(62061, 15935)`, $A^{T}(Ay_{k+1}-b)$的维度`(62061, 1)`能够与$x_{k}$对齐。

In [6]:
class AccelerateGradientDescent(Fun):
    def __init__(self, path_a, path_b):
        super(AccelerateGradientDescent, self).__init__(path_a, path_b)
        self.x_pre = sp.eye(self.A.shape[-1], self.b.shape[-1]).tocsr()  # 设置x_{k-1}
        self.k = None

    def acc_gradient_decs(self, eta=0.002, iter_times=1, x_input=None):
        self.k = 1
        x_output = None
        y_input = self.min_f(x_input)
        for i in range(iter_times):
            y_k_1 = x_input + (self.k - 1)/(self.k + 2) * (x_input - self.x_pre)
            x_output = y_k_1 - eta * self.A_T * (self.A * y_k_1 - self.b)
            self.x_pre = x_input
            x_input = x_output  # 更新 x 的值
            y_output = self.min_f(x_output)
            print("pre_y is {}  and y is {}".format(y_input, y_output))
            y_input = y_output
            self.k += 1
        return x_output

In [7]:
AccGD = AccelerateGradientDescent(path_a='./news20_A.npz', path_b='./news20_b.npz')

AccGD.acc_gradient_decs(eta=0.002, iter_times=20, x_input=AccGD.x_init)

print('')

pre_y is 15.722300259397239  and y is 13.255437472728753
pre_y is 13.255437472728753  and y is 12.735302978524988
pre_y is 12.735302978524988  and y is 12.307633153911993
pre_y is 12.307633153911993  and y is 11.955413213701904
pre_y is 11.955413213701904  and y is 11.650605664213087
pre_y is 11.650605664213087  and y is 11.36805046757616
pre_y is 11.36805046757616  and y is 11.092384589510115
pre_y is 11.092384589510115  and y is 10.81782545703838
pre_y is 10.81782545703838  and y is 10.544467243141199
pre_y is 10.544467243141199  and y is 10.274496509332158
pre_y is 10.274496509332158  and y is 10.009923575915268
pre_y is 10.009923575915268  and y is 9.751882549700937
pre_y is 9.751882549700937  and y is 9.500825818496821
pre_y is 9.500825818496821  and y is 9.256927535312608
pre_y is 9.256927535312608  and y is 9.020336311075038
pre_y is 9.020336311075038  and y is 8.791231346844746
pre_y is 8.791231346844746  and y is 8.569781302129703
pre_y is 8.569781302129703  and y is 8.3561012

## 随机梯度下降

随机梯度下降的迭代公式：

$$
x_{k+1} = x_{k} - \eta ((a_{i} x_{k} - b_{i})*a_{i}))^{T}
$$

其中$a_{i}$为$A$的第$i$行。

因为$A$的维度为`(15935, 62061)`，$b$的维度为`(15935, 1)`。所以$x$的维度为`(62061, 1)`，$x_{k}$的维度为`(62061, 1)`。$a_{i}$的维度为`(1, 62061)`，$a_{i}^{T}x_{k} - b_{i}$的纬度为`(1, 1)`。$(a_{i} x_{k} - b_{i})*a_{i})$的维度为`(1, 62061)`, 其转置之后，能够与$x_{k}$对齐。

In [8]:
class StochasticGradientDescent(Fun):
    def __init__(self, path_a, path_b):
        super(StochasticGradientDescent, self).__init__(path_a, path_b)

    def stochastic_gradient_decs(self, eta=0.002, iter_times=1, x_input=None):

        x_output = None
        y_input = self.min_f(x_input)
        for i in range(iter_times):
            row = np.random.randint(0, self.A.shape[0])
            a_i = self.A[row]
            b_i = self.b[row]
            x_output = x_input - eta * ((a_i * x_input - b_i) * a_i).transpose()
            x_input = x_output  # 更新 x 的值
            y_output = self.min_f(x_output)
            print("pre_y is {}  and y is {}".format(y_input, y_output))
            y_input = y_output

        return x_output

In [9]:
StochasticGD = StochasticGradientDescent(path_a='./news20_A.npz', path_b='./news20_b.npz')

StochasticGD.stochastic_gradient_decs(eta=0.002, iter_times=20, x_input=StochasticGD.x_init)

print('')

pre_y is 15.722300259397239  and y is 15.722393042412504
pre_y is 15.722393042412504  and y is 15.722393042412504
pre_y is 15.722393042412504  and y is 15.722393042412504
pre_y is 15.722393042412504  and y is 15.721923041832127
pre_y is 15.721923041832127  and y is 15.721923041832127
pre_y is 15.721923041832127  and y is 15.72185547987259
pre_y is 15.72185547987259  and y is 15.72185547987259
pre_y is 15.72185547987259  and y is 15.721855483052215
pre_y is 15.721855483052215  and y is 15.722209969603565
pre_y is 15.722209969603565  and y is 15.721357615798093
pre_y is 15.721357615798093  and y is 15.720466562889355
pre_y is 15.720466562889355  and y is 15.719969667546502
pre_y is 15.719969667546502  and y is 15.719059594032018
pre_y is 15.719059594032018  and y is 15.718871137684532
pre_y is 15.718871137684532  and y is 15.718719651464122
pre_y is 15.718719651464122  and y is 15.717248134828798
pre_y is 15.717248134828798  and y is 15.71635016671461
pre_y is 15.71635016671461  and y is

## 随机坐标下降

随机坐标下降的迭代公式：

$$
x_{k+1}^{i} = x_{k}^{i} - \eta (a^{i})^{T}(Ax_{k}-b)
$$

因为$A$的维度为`(15935, 62061)`，$b$的维度为`(15935, 1)`。所以$x$的维度为`(62061, 1)`，$x_{k}$的维度为`(62061, 1)`。$Ax_{k}-b$的纬度为`(15935, 1)`。$a^{i}$的维度为`(15935, 1)`, $(a^{i})^{T}(Ax_{k}-b)$的维度`(1, 1)`能够与$x_{k}^{i}$对齐。

In [10]:
class StochasticCoordinateDescent(Fun):
    def __init__(self, path_a, path_b):
        super(StochasticCoordinateDescent, self).__init__(path_a, path_b)

    def stochastic_coordinate_decs(self, eta=0.002, iter_times=1, x_input=None):

        x_output = None
        y_input = self.min_f(x_input)
        for i in range(iter_times):
            index = np.random.randint(0, self.A.shape[1])
            a_i = self.A[:, index]
            x_input[index] = x_input[index] - eta * (a_i.transpose() * (self.A * x_input - self.b)) # 更新 x 的值
            x_output = x_input
            y_output = self.min_f(x_output)
            print("pre_y is {}  and y is {} pre-y {}".format(y_input, y_output, (y_input-y_output)))
            y_input = y_output

        return x_output

In [11]:
StochasticCD = StochasticCoordinateDescent(path_a='./news20_A.npz', path_b='./news20_b.npz')

StochasticCD.stochastic_coordinate_decs(eta=0.002, iter_times=20, x_input=StochasticCD.x_init)

print('')

pre_y is 15.722300259397239  and y is 15.722300259397239 pre-y 0.0
pre_y is 15.722300259397239  and y is 15.722300243321966 pre-y 1.6075272668558682e-08
pre_y is 15.722300243321966  and y is 15.722298722562046 pre-y 1.5207599197708532e-06
pre_y is 15.722298722562046  and y is 15.722298719107194 pre-y 3.4548524041611017e-09
pre_y is 15.722298719107194  and y is 15.722298603821228 pre-y 1.1528596566279248e-07
pre_y is 15.722298603821228  and y is 15.722298374216027 pre-y 2.296052006300897e-07
pre_y is 15.722298374216027  and y is 15.72229655247028 pre-y 1.8217457480318444e-06
pre_y is 15.72229655247028  and y is 15.722296506176557 pre-y 4.629372263309506e-08
pre_y is 15.722296506176557  and y is 15.722296502944264 pre-y 3.2322926557526444e-09
pre_y is 15.722296502944264  and y is 15.722296049495716 pre-y 4.534485480434114e-07
pre_y is 15.722296049495716  and y is 15.722296049431991 pre-y 6.372502525664459e-11
pre_y is 15.722296049431991  and y is 15.722296049431991 pre-y 0.0
pre_y is 15.

  self._set_arrayXarray_sparse(i, j, x)


pre_y is 15.722295907668123  and y is 15.722295907668123 pre-y 0.0
pre_y is 15.722295907668123  and y is 15.722294841321999 pre-y 1.0663461242899075e-06
pre_y is 15.722294841321999  and y is 15.722294809273171 pre-y 3.204882759177963e-08
pre_y is 15.722294809273171  and y is 15.722292870270227 pre-y 1.9390029439847467e-06
pre_y is 15.722292870270227  and y is 15.722278391814386 pre-y 1.4478455840460924e-05
pre_y is 15.722278391814386  and y is 15.722272061814433 pre-y 6.329999953536003e-06

