In [1]:
import torch
import math
# is_available() reports whether the MPS backend can be used right now
# (per the original note, this needs macOS 12.3+).
print(torch.backends.mps.is_available())
# is_built() reports whether the current PyTorch installation was built with MPS activated.
print(torch.backends.mps.is_built())

dtype = torch.float
# Fall back to the CPU when MPS is unavailable so the cell runs on any machine
# instead of failing at the first tensor allocation.
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

# Create the input grid over [-pi, pi] and the target values sin(x)
x = torch.linspace(-math.pi, math.pi, 2000, device=device, dtype=dtype)
y = torch.sin(x)

# The powers of x never change between iterations, so compute them once.
x_squared = x ** 2
x_cubed = x ** 3

# Randomly initialize the four polynomial coefficients (0-dim tensors)
a = torch.randn((), device=device, dtype=dtype)
b = torch.randn((), device=device, dtype=dtype)
c = torch.randn((), device=device, dtype=dtype)
d = torch.randn((), device=device, dtype=dtype)

learning_rate = 1e-6
for t in range(2000):
    # Forward pass: compute predicted y
    y_pred = a + b * x + c * x_squared + d * x_cubed

    # Compute and print the loss every 100 steps. `.item()` copies the value
    # back to the host, so it is only evaluated when it is actually printed.
    if t % 100 == 99:
        loss = (y_pred - y).pow(2).sum().item()
        print(t, loss)

    # Backprop by hand: gradients of the squared-error loss w.r.t. a, b, c, d
    grad_y_pred = 2.0 * (y_pred - y)
    grad_a = grad_y_pred.sum()
    grad_b = (grad_y_pred * x).sum()
    grad_c = (grad_y_pred * x_squared).sum()
    grad_d = (grad_y_pred * x_cubed).sum()

    # Update weights using gradient descent
    a -= learning_rate * grad_a
    b -= learning_rate * grad_b
    c -= learning_rate * grad_c
    d -= learning_rate * grad_d


print(f'Result: y = {a.item()} + {b.item()} x + {c.item()} x^2 + {d.item()} x^3')

True
True
99 810.5762939453125
199 544.6072387695312
299 367.09014892578125
399 248.54281616210938
499 169.32968139648438
599 116.36717987060547
699 80.93305206298828
799 57.210445404052734
899 41.317291259765625
999 30.661800384521484
1099 23.512327194213867
1199 18.7115478515625
1299 15.485221862792969
1399 13.315179824829102
1499 11.854280471801758
1599 10.869909286499023
1699 10.205991744995117
1799 9.757780075073242
1899 9.454900741577148
1999 9.250006675720215
Result: y = 0.013798343949019909 + 0.8409788608551025 x + -0.0023804439697414637 x^2 + -0.09108839929103851 x^3


In [2]:
# Minimise y = x^2 + 2x + 1 with plain gradient descent, starting from x = 0.
def y(x):
    """Objective function whose minimum is being searched for."""
    return x**2+x*2+1


lr = 0.1
epochs = 20
x = 0
for epoch in range(epochs):
    grad = 2*x + 2  # analytic derivative dy/dx at the current x
    x -= lr * grad
    print('x: ', x, 'y: ', y(x))

x:  -0.2 y:  0.64
x:  -0.36000000000000004 y:  0.40959999999999996
x:  -0.488 y:  0.26214400000000004
x:  -0.5904 y:  0.16777215999999995
x:  -0.67232 y:  0.10737418239999996
x:  -0.7378560000000001 y:  0.06871947673599998
x:  -0.7902848 y:  0.043980465111040035
x:  -0.83222784 y:  0.028147497671065613
x:  -0.865782272 y:  0.018014398509481944
x:  -0.8926258176 y:  0.011529215046068408
x:  -0.9141006540800001 y:  0.0073786976294838436
x:  -0.931280523264 y:  0.004722366482869611
x:  -0.9450244186112 y:  0.0030223145490365644
x:  -0.95601953488896 y:  0.0019342813113834012
x:  -0.9648156279111679 y:  0.0012379400392853457
x:  -0.9718525023289344 y:  0.0007922816251426656
x:  -0.9774820018631475 y:  0.0005070602400912838
x:  -0.981985601490518 y:  0.0003245185536584483
x:  -0.9855884811924144 y:  0.00020769187434144243
x:  -0.9884707849539315 y:  0.00013292279957843878


In [3]:
import torch
from torch.autograd import Variable
# Gradient descent on y = x^2 + 2x + 1, with autograd computing dy/dx.
# A tensor created with requires_grad=True is tracked by autograd directly;
# the legacy Variable wrapper is deprecated and not needed.
x = torch.tensor([0.0], requires_grad=True)
# x.grad holds the accumulated gradient (None until the first backward());
# x.detach() is the raw value without autograd tracking.
print('grad', x.grad, 'data', x.detach())

lr = 0.1
epochs = 20

for epoch in range(epochs):
    # Build the computation graph: y as a function of x
    y = x ** 2 + 2 * x + 1
    # Back-propagate from y. y has a single element, so no explicit
    # gradient argument is required.
    y.backward()
    print('grad of epoch' + str(epoch) + ':', x.grad)

    # The parameter update itself must not be recorded in the graph.
    with torch.no_grad():
        x -= lr * x.grad
    # Gradients accumulate across backward() calls, so reset them each step.
    # The trailing underscore marks an in-place operation.
    x.grad.zero_()
print(x.detach())

grad None data tensor([0.])
grad of epoch0: tensor([2.])
grad of epoch1: tensor([1.6000])
grad of epoch2: tensor([1.2800])
grad of epoch3: tensor([1.0240])
grad of epoch4: tensor([0.8192])
grad of epoch5: tensor([0.6554])
grad of epoch6: tensor([0.5243])
grad of epoch7: tensor([0.4194])
grad of epoch8: tensor([0.3355])
grad of epoch9: tensor([0.2684])
grad of epoch10: tensor([0.2147])
grad of epoch11: tensor([0.1718])
grad of epoch12: tensor([0.1374])
grad of epoch13: tensor([0.1100])
grad of epoch14: tensor([0.0880])
grad of epoch15: tensor([0.0704])
grad of epoch16: tensor([0.0563])
grad of epoch17: tensor([0.0450])
grad of epoch18: tensor([0.0360])
grad of epoch19: tensor([0.0288])
tensor([-0.9885])


In [6]:
#numpy实现线性回归
import numpy as np
# Toy data set that follows y = 2x exactly.
x_data = np.array([1, 2, 3])
y_data = np.array([2, 4, 6])

epochs = 10
lr = 0.01
w = 0
cost = []

n_samples = x_data.shape[0]
for epoch in range(epochs):
    # Model: predict y as w * x
    yhat = w * x_data
    residual = yhat - y_data
    # Mean squared error of the current prediction
    loss = np.mean(residual ** 2)
    cost.append(loss)
    # Gradient of the loss w.r.t. w: (2 / n) * sum(residual * x)
    dw = np.dot(2 * residual, x_data) / n_samples
    # Parameter update
    w -= lr * dw
print(w)

1.2492307286934012


In [5]:
##PyTorch实现线性回归
import torch
from torch.autograd import Variable

# Training data as (3, 1) column vectors; the targets follow y = 2x.
# Plain tensors are used instead of the deprecated Variable wrapper.
x_data = torch.tensor([[1.0], [2.0], [3.0]])
y_data = torch.tensor([[2.0], [4.0], [6.0]])

epochs = 20
lr = 0.1
# The weight is the only tensor that needs gradients; do not forget requires_grad.
w = torch.zeros(1, requires_grad=True)
cost = []
for epoch in range(epochs):
    yhat = x_data * w
    loss = torch.mean((yhat - y_data) ** 2)
    cost.append(loss.detach().numpy())  # record the loss as an ndarray
    loss.backward()  # compute d(loss)/dw
    # Update w outside of autograd tracking
    with torch.no_grad():
        w -= lr * w.grad
    w.grad.zero_()  # reset the accumulated gradient before the next epoch
print(w.detach())

tensor([2.])


In [11]:

import torch
class Model(torch.nn.Module):
    """Single-weight linear model ``y = w * x`` (no bias term).

    Like every PyTorch network, this is a subclass of ``torch.nn.Module``;
    the parent class is initialised in ``__init__`` before any layer is
    attached, and ``forward`` defines the forward pass.
    """

    def __init__(self):
        super().__init__()
        # One input feature mapped to one output feature; bias=False drops
        # the offset term.
        self.linear = torch.nn.Linear(in_features=1, out_features=1, bias=False)

    def forward(self, x):
        """Apply the linear layer to ``x`` and return the prediction."""
        return self.linear(x)
from torch.autograd import Variable  # legacy wrapper, unused below; import kept so the name stays available

# Seed BEFORE the model is built: the layer's initial weight is drawn from the
# global RNG, so seeding afterwards would not make the run reproducible.
torch.manual_seed(2)
model = Model()
criterion = torch.nn.MSELoss(reduction='mean')
# SGD (stochastic gradient descent) is one of the optimizers in torch.optim;
# PyTorch optimizers derive from the Optimizer base class.
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

# Training data as (3, 1) column vectors; the targets follow y = 2x.
x_data = torch.tensor([[1.0], [2.0], [3.0]])
y_data = torch.tensor([[2.0], [4.0], [6.0]])

epochs = 20
cost = []
for epoch in range(epochs):
    # Forward pass builds the computation graph
    yhat = model(x_data)
    loss = criterion(yhat, y_data)
    # Store the loss value without its autograd history
    cost.append(loss.detach())
    optimizer.zero_grad()
    loss.backward()
    # Parameter update
    optimizer.step()
print(cost)
print(list(model.parameters()))


[tensor(14.6303), tensor(12.0267), tensor(9.8865), tensor(8.1272), tensor(6.6809), tensor(5.4920), tensor(4.5147), tensor(3.7112), tensor(3.0508), tensor(2.5079), tensor(2.0616), tensor(1.6947), tensor(1.3931), tensor(1.1452), tensor(0.9414), tensor(0.7739), tensor(0.6362), tensor(0.5230), tensor(0.4299), tensor(0.3534)]
[Parameter containing:
tensor([[1.7505]], requires_grad=True)]


In [None]:
批量梯度下降法(Batch Gradient Descent, BGD)：是梯度下降法的最原始形式，每迭代一步或更新每一参数时，都要用到训练集中的所有样本数据，当样本数目很多时，训练过程会很慢。

随机梯度下降法(Stochastic Gradient Descent, SGD)：由于批量梯度下降法在更新每一个参数时，都需要所有的训练样本，所以训练过程会随着样本数量的加大而变得异常的缓慢。随机梯度下降法正是为了解决批量梯度下降法这一弊端而提出的。随机梯度下降是通过每个样本来迭代更新一次。SGD伴随的一个问题是噪音较BGD要多，使得SGD并不是每次迭代都向着最优化方向进行。

小批量梯度下降法(Mini-Batch Gradient Descent, MBGD)：在每次更新参数时只使用 m' 个样本，其中 m' 可能远小于训练集的样本总数 m。