In [2]:
import numpy as np


def numerical_gradient(f, x):
    """Estimate the gradient of ``f`` at ``x`` with central differences.

    Each element of ``x`` is perturbed in turn by ``+h`` and ``-h``
    (``h = 1e-4``) and the partial derivative is approximated as
    ``(f(x + h) - f(x - h)) / (2 * h)``.

    Parameters
    ----------
    f : callable
        Called as ``f(x)``; its result is used as a scalar.
    x : numpy.ndarray
        Point at which to evaluate the gradient. Any shape is accepted.
        The array is perturbed *in place* while ``f`` is evaluated and every
        element is restored afterwards, even if ``f`` raises.

    Returns
    -------
    numpy.ndarray
        Array with the same shape and dtype as ``x`` holding the estimated
        partial derivatives.

    Raises
    ------
    TypeError
        If ``x`` has an integer or boolean dtype: the ``+/- h`` perturbation
        would be truncated when written back into ``x``, giving a
        meaningless result.
    """
    if x.dtype.kind in "biu":
        raise TypeError(
            "numerical_gradient requires a floating-point array, got dtype "
            f"{x.dtype}"
        )

    h = 1e-4
    grad = np.zeros_like(x)

    # Flat indexing visits every element regardless of the number of
    # dimensions, whereas x[idx] would select whole rows of a 2-D array.
    for idx in range(x.size):
        tmp_val = x.flat[idx]
        try:
            # Compute f(x + h)
            x.flat[idx] = tmp_val + h
            fxh1 = f(x)

            # Compute f(x - h)
            x.flat[idx] = tmp_val - h
            fxh2 = f(x)
        finally:
            # Always put the original value back so x is left untouched.
            x.flat[idx] = tmp_val

        grad.flat[idx] = (fxh1 - fxh2) / (2 * h)

    return grad

In [3]:
def function_2(x):
    """Return the sum of the squares of all elements of ``x``."""
    squared = x ** 2
    return np.sum(squared)

In [4]:
# Gradient of function_2 at (3, 4); the analytic gradient 2*x is (6, 8).
numerical_gradient(function_2, np.array([3.0, 4.0]))

array([6., 8.])

In [5]:
# Gradient of function_2 at (0, 2); the analytic gradient 2*x is (0, 4).
numerical_gradient(function_2, np.array([0.0, 2.0]))

array([0., 4.])

In [6]:
# Gradient of function_2 at (3, 0); the analytic gradient 2*x is (6, 0).
numerical_gradient(function_2, np.array([3.0, 0.0]))

array([6., 0.])

In [7]:
# Same check with three variables: gradient at (3, 4, 5) is (6, 8, 10).
numerical_gradient(function_2, np.array([3.0, 4.0, 5.0]))

array([ 6.,  8., 10.])

In [8]:
def gradient_descent(f, init_x, lr=0.01, step_num=100):
    """Minimise ``f`` with fixed-step gradient descent.

    Repeats ``x <- x - lr * grad f(x)`` for ``step_num`` iterations, where
    the gradient is estimated with ``numerical_gradient``.

    Parameters
    ----------
    f : callable
        Function to optimise; called as ``f(x)``.
    init_x : array_like
        Starting point. It is copied, so the caller's array is not modified.
    lr : float, optional
        Learning rate (step size multiplier applied to the gradient).
    step_num : int, optional
        Number of gradient-descent iterations to perform.

    Returns
    -------
    numpy.ndarray
        The point reached after ``step_num`` updates (a new array).
    """
    # Work on a private copy: the update below is in place, and aliasing
    # init_x would silently overwrite the caller's starting point.
    x = np.array(init_x)
    if x.dtype.kind in "biu":
        # Integer/boolean start points cannot hold fractional updates.
        x = x.astype(float)

    for _ in range(step_num):
        grad = numerical_gradient(f, x)
        x -= lr * grad

    return x

In [9]:
# Minimise function_2 starting from (-3, 4) with lr=0.1 for 100 steps;
# the result ends up very close to the minimum at the origin.
init_x = np.array([-3.0, 4.0])
gradient_descent(function_2, init_x=init_x, lr=0.1, step_num=100)

array([-6.11110793e-10,  8.14814391e-10])

In [10]:
init_x = np.array([-3.0, 4.0])
gradient_descent(function_2, init_x=init_x, lr=10.0, step_num=100) # learning rate too large: the iterates diverge

array([-2.58983747e+13, -1.29524862e+12])

In [11]:
init_x = np.array([-3.0, 4.0])
gradient_descent(function_2, init_x=init_x, lr=1e-10, step_num=100) # learning rate too small: x is barely updated

array([-2.99999994,  3.99999992])