In [1]:
import torch
from torch.utils import data
from torchvision import transforms
import torchvision
from torch import nn

## 数据加载

In [2]:
# 封装
def load_data_fashion_mnist(batch_size):
    trans = transforms.ToTensor()
    # 下载数据
    mnist_train = torchvision.datasets.FashionMNIST(
        root="../../../data/", train=True, download=True, transform=trans
    )
    mnist_test = torchvision.datasets.FashionMNIST(
        root="../../../data/", train=False, download=True, transform=trans
    )
    # 加载数据
    mnist_train_data_loader = data.DataLoader(
        mnist_train, batch_size=batch_size, shuffle=True)
    mnist_test_data_loader = data.DataLoader(
        mnist_test, batch_size=batch_size, shuffle=True)
    return mnist_train_data_loader, mnist_test_data_loader


In [3]:
batch_size = 256

In [4]:
train_data_loader,test_data_loader = load_data_fashion_mnist(batch_size)

In [5]:
tx,ty = next(iter(train_data_loader))

In [6]:
tx.shape, ty.shape

(torch.Size([256, 1, 28, 28]), torch.Size([256]))

## 模型参数初始化

原始输入是一张28\*28的图像, 将其展成一维向量, 长度是28\*28=784

所以最终的输入是长度为784的向量

考虑回归方程:
$$

\begin{align} 
	y_0 = w_{0,0}*x_0 + w_{0,1}*x_1 + ... + w_{0,783}*x_{783} + b_0 \\

    y_1 = w_{1,0}*x_0 + w_{1,1}*x_1 + ... + w_{0,783}*x_{783} + b_1 \\

    ... \\

    y_9 = w_{9,0}*x_0 + w_{9,1}*x_1 + ... + w_{0,783}*x_{783} + b_9 \\

    Y = XW + b \\
    
    X是1*784，W是784*10，b是1*10
\end{align}

$$

所以最终的输出是长度为10的向量1*10

但这个输出不是概率，也不符合概率的性质（非负，和为1），所以需要进行归一化

采用的方法是 output(i) = e^yi/∑e^yi，保证输出符合概率的性质，且不影响yi的大小关系

参数w是784\*10的矩阵, b是1\*10的矩阵

**但实际训练或测试时,一次可能不止输入一张图片,所以x其实是n*784的矩阵**

In [7]:
input_size = 784
output_size = 10

# w = torch.normal(0,1,(input_size,output_size),requires_grad=True)
w = torch.normal(0,0.01,(input_size,output_size),requires_grad=True) # sigma应该比较小
b = torch.zeros((output_size,),requires_grad=True)

In [8]:
w.shape,b.shape

(torch.Size([784, 10]), torch.Size([10]))

In [9]:
w,b

(tensor([[-0.0126,  0.0057, -0.0181,  ...,  0.0172,  0.0156, -0.0069],
         [ 0.0016,  0.0056,  0.0043,  ...,  0.0257, -0.0030, -0.0062],
         [ 0.0008,  0.0052, -0.0082,  ..., -0.0049, -0.0005, -0.0019],
         ...,
         [ 0.0141,  0.0009,  0.0011,  ...,  0.0008, -0.0067, -0.0145],
         [ 0.0021,  0.0102,  0.0037,  ..., -0.0054, -0.0077, -0.0085],
         [ 0.0029, -0.0085,  0.0089,  ..., -0.0124,  0.0083,  0.0063]],
        requires_grad=True),
 tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], requires_grad=True))

## 定义模型

### softmax函数

$$
    output(y_i) = \frac{e^{y_i}}{∑e^{y_k}}
$$

这个函数理论上没有问题, 可以将$y_i$转化为非负数, 小于等于1, 且和为1

但是计算上, 因为计算机的精度有限, 所以可能上溢或下溢, 影响反向传播

In [10]:
tmp = torch.arange(0,2,step=0.1).reshape((2,10))
tmp

tensor([[0.0000, 0.1000, 0.2000, 0.3000, 0.4000, 0.5000, 0.6000, 0.7000, 0.8000,
         0.9000],
        [1.0000, 1.1000, 1.2000, 1.3000, 1.4000, 1.5000, 1.6000, 1.7000, 1.8000,
         1.9000]])

In [11]:
tmp_e = torch.exp(tmp)
tmp_e

tensor([[1.0000, 1.1052, 1.2214, 1.3499, 1.4918, 1.6487, 1.8221, 2.0138, 2.2255,
         2.4596],
        [2.7183, 3.0042, 3.3201, 3.6693, 4.0552, 4.4817, 4.9530, 5.4739, 6.0496,
         6.6859]])

In [12]:
tmp_e.sum(dim=1,keepdim=True)

tensor([[16.3380],
        [44.4113]])

In [13]:
d = tmp_e / tmp_e.sum(dim=1,keepdim=True)
d

tensor([[0.0612, 0.0676, 0.0748, 0.0826, 0.0913, 0.1009, 0.1115, 0.1233, 0.1362,
         0.1505],
        [0.0612, 0.0676, 0.0748, 0.0826, 0.0913, 0.1009, 0.1115, 0.1233, 0.1362,
         0.1505]])

In [14]:
d[0].sum()

tensor(1.0000)

In [15]:
d.sum(dim=1,keepdim=True)

tensor([[1.0000],
        [1.0000]])

In [16]:
def softmax(x):
    return torch.exp(x) / torch.exp(x).sum(dim=1,keepdim=True)

In [17]:
softmax(torch.arange(0,2,0.1).reshape((2,10)))

tensor([[0.0612, 0.0676, 0.0748, 0.0826, 0.0913, 0.1009, 0.1115, 0.1233, 0.1362,
         0.1505],
        [0.0612, 0.0676, 0.0748, 0.0826, 0.0913, 0.1009, 0.1115, 0.1233, 0.1362,
         0.1505]])

In [18]:
t = softmax(torch.normal(0,1,(4,5)))

In [19]:
t

tensor([[0.1861, 0.0533, 0.0542, 0.1478, 0.5586],
        [0.2171, 0.4910, 0.0676, 0.0303, 0.1941],
        [0.1673, 0.1502, 0.1865, 0.4454, 0.0505],
        [0.1698, 0.0164, 0.0631, 0.3146, 0.4360]])

In [20]:
t.sum(dim=1,keepdim=True)

tensor([[1.0000],
        [1.0000],
        [1.0000],
        [1.0000]])

### 模型函数

~~yi = xWi + bi~~

~~output(i) = e^yi/∑e^yi~~

经过对softmax函数计算不可行的分析,上面的计算模型也不可行

In [None]:
# def softmax_regression(x):
#     # x 可能是1*784的向量，也可能是n*784的向量，具体取决于batch_size
#     # 所以不能写成 y = torch.matmul(x.reshape(1,-1),w) + b
#     y = torch.matmul(x.reshape(-1,w.shape[0]),w) + b
#     return softmax(y)

In [21]:
def softmax_regression(x):
    # x 可能是1*784的向量，也可能是n*784的向量，具体取决于batch_size
    # 所以不能写成 y = torch.matmul(x.reshape(1,-1),w) + b
    y = torch.matmul(x.reshape(-1,w.shape[0]),w) + b
    return y

## 损失函数

y_hat是通过softmax函数计算出的概率矩阵
    
y是标签（类别）

y表示正确的类别，所以先选取对应的概率

因为目标是使概率p最大、损失函数f最小，所以应该加上负数，变成-p

又因为-p是负数，不方便用梯度；且 0=<p<=1，所以取 -logp

目标是使-logp接近于0

In [22]:
y_hat = torch.randint(0,10,(4,5))/10
y = torch.tensor([0,3])
y_hat,y

(tensor([[0.5000, 0.8000, 0.5000, 0.4000, 0.9000],
         [0.6000, 0.5000, 0.1000, 0.9000, 0.9000],
         [0.4000, 0.9000, 0.8000, 0.7000, 0.2000],
         [0.7000, 0.6000, 0.5000, 0.4000, 0.3000]]),
 tensor([0, 3]))

In [23]:
y_hat[0:len(y_hat),y]

tensor([[0.5000, 0.4000],
        [0.6000, 0.9000],
        [0.4000, 0.7000],
        [0.7000, 0.4000]])

In [24]:
- torch.log(y_hat[0:len(y_hat),y])

tensor([[0.6931, 0.9163],
        [0.5108, 0.1054],
        [0.9163, 0.3567],
        [0.3567, 0.9163]])

In [25]:
def cross_entropy_loss_function(y_hat,y):
    return - torch.log(y_hat[0:len(y_hat),y])

## 精度评价算法

y_hat的列表示预测某一张图片属于第i个列的概率大小

从所有列中选出最大值，获得i，与标签y中的值进行比较

如果相同，表示预测正确；否则预测失败

精度accuracy = 预测正确次数/总的预测次数

In [26]:
y_hat = torch.randint(0,10,(4,5)) / 10
y_hat

tensor([[0.6000, 0.1000, 0.8000, 0.8000, 0.7000],
        [0.7000, 0.8000, 0.0000, 0.1000, 0.9000],
        [0.0000, 0.1000, 0.5000, 0.1000, 0.8000],
        [0.1000, 0.5000, 0.4000, 0.1000, 0.9000]])

In [27]:
y = torch.tensor([2,2,0,3]).reshape(1,-1)
y,y.shape

(tensor([[2, 2, 0, 3]]), torch.Size([1, 4]))

In [28]:
t = y_hat.max(dim=1,keepdim=True)
t.indices, t.indices.shape

(tensor([[2],
         [4],
         [4],
         [4]]),
 torch.Size([4, 1]))

In [29]:
t.indices.T

tensor([[2, 4, 4, 4]])

In [30]:
(t.indices.T == y).sum()

tensor(1)

### 精度计算过程封装

In [31]:
def accuracy(y_hat,y):
    """
        返回正确的预测数与总的预测数
    """
    y_hat_max = y_hat.max(dim=1,keepdim=True)
    index = y_hat_max.indices.T
    index.type(y.dtype)
    # print(int((index == y).sum()),y.shape[0])
    return int((index == y).sum()),y.shape[0]

In [32]:
accuracy(y_hat,y)

(1, 1)

### 记录正确的预测数与总的预测次数等信息

In [33]:
class Accumulator:
    def __init__(self,n) -> None:
        self.data = [0.0]*n
    
    def add(self,*args):
        for i,arg in enumerate(args):
            self.data[i] += arg
            
    def reset(self):
        self.data = [0.0]*len(self.data)
    
    def __getitem__(self,idx):
        return self.data[idx]

### 封装

In [34]:
def accuracy_evaluate(net,data):
    evaluator = Accumulator(2) # 正确与总数
    with torch.no_grad(): # 不计算梯度，更快
        for x,y in data:
            acc = accuracy(net(x),y)
            evaluator.add(acc[0],acc[1]) # accuracy的计算返回元组
    
    return 1.0 * evaluator[0] / evaluator[1]
    

In [35]:
# 测试
accuracy_evaluate(softmax_regression,train_data_loader)

0.13291666666666666

结果应该接近于1/10, 因为没有对模型参数进行修正, 相当于随机分类

## 优化算法

还是用随机梯度下降来进行模型参数优化

In [36]:
lr = 0.1

In [37]:
def sgd(params, lr, batch_size):
    with torch.no_grad():
        for param in params:
            param -= lr * param.grad / batch_size
            param.grad.zero_()

In [38]:
def updater(params,lr,batch_size):
    sgd([w,b],lr,batch_size)

In [39]:
# updater = torch.optim.SGD([w,b],lr=lr)

## 迭代

单次迭代函数

In [40]:
def train_epoch(net,data,loss_function,updater):
    # 记录loss，正确预测数，总的预测数
    evaluator = Accumulator(3)
    for x,y in data:
        # 预测值
        y_hat = net(x)
        l = loss_function(y_hat,y)
        # 更新模型参数
        if isinstance(updater,torch.optim.Optimizer):
            # pytorch内置优化器
            updater.zero_grad()
            l.sum().backward()
            updater.step()
        else:
            # print(b)
            l.sum().backward()
            updater([w,b],lr,batch_size)
            # print(b)
            
        acc = accuracy(y_hat,y)
        evaluator.add(l.sum(),acc[0],acc[1])
    # loss，acc
    # print(evaluator[0],evaluator[1],evaluator[2])
    return float(evaluator[0] / evaluator[2]), evaluator[1] / evaluator[2]
    

### 总的迭代函数

In [41]:
def train(net,train_data,test_data,loss_function,updater,epochs):
    print("epoch\tloss\ttrain_acc\ttest_acc")
    for epoch in range(epochs):
        loss ,train_acc = train_epoch(net,train_data,loss_function,updater)
        test_acc = accuracy_evaluate(net,test_data)
        print(f"{epoch}\t{round(loss,5)}\t{round(train_acc,5)}\t\t{round(test_acc,5)}")

#### softmax函数的选择问题

In [43]:
train(softmax_regression,train_data_loader,test_data_loader,nn.CrossEntropyLoss(reduction='none'),updater,10)


epoch	loss	train_acc	test_acc
0	0.7868	0.74838		0.7909
1	0.57116	0.8129		0.8119
2	0.52586	0.82522		0.8169
3	0.50055	0.8332		0.8236
4	0.48613	0.83532		0.8264
5	0.47354	0.84005		0.8262
6	0.46509	0.84288		0.8303
7	0.45892	0.84422		0.8329
8	0.45178	0.84697		0.8298
9	0.4478	0.84808		0.8325


In [45]:
train(softmax_regression,train_data_loader,test_data_loader,nn.CrossEntropyLoss(reduction='none'),updater,10)

epoch	loss	train_acc	test_acc
0	1.9808	0.56837		0.6676
1	1.82432	0.68242		0.686
2	1.78427	0.72687		0.738
3	1.75579	0.75528		0.7537
4	1.73989	0.76528		0.7593
5	1.72952	0.77112		0.7623
6	1.72198	0.7747		0.7688
7	1.71608	0.77833		0.7727
8	1.71128	0.78088		0.7743
9	1.70733	0.7834		0.7766


In [46]:
# # 按照定义实现的softmax函数，存在计算问题
# train(softmax_regression,train_data_loader,test_data_loader,cross_entropy_loss_function,updater,3)

## 预测

### 标签到文字的映射

In [47]:
def labels_int_to_text(labels_int):
    labels_text = ['t-shirt', 'trouser', 'pullover', 'dress',
                   'coat', 'sandal', 'shirt', 'sneaker', 'bag', 'ankle boot']
    return [labels_text[label_int] for label_int in labels_int]


In [None]:
for x,y in test_data_loader:
    labels = labels_int_to_text(y[:18])
    y_hat = softmax_regression(x[:18])
    print(y_hat)
    print(softmax_regression(x[:18]).max(dim=1).indices.T)
    # predictions = labels_int_to_text(softmax_regression(x[:18]))
    break

tensor([[2.1678e-05, 9.9884e-01, 5.2085e-06, 1.0889e-03, 4.3732e-05, 4.0397e-09,
         4.3896e-07, 1.6934e-07, 2.3326e-07, 3.6351e-07],
        [9.9993e-01, 7.2849e-07, 6.9031e-06, 5.8541e-05, 5.7826e-06, 1.2851e-10,
         1.0127e-08, 2.3651e-10, 2.1574e-07, 2.3925e-07],
        [3.0867e-05, 6.0178e-06, 6.5565e-02, 1.7331e-05, 9.3414e-01, 6.8961e-07,
         5.0407e-07, 1.5158e-08, 2.3395e-04, 1.7595e-06],
        [4.2225e-05, 1.4457e-07, 4.7238e-06, 3.3397e-06, 4.9963e-05, 1.5427e-03,
         4.4332e-07, 4.2196e-06, 5.9217e-05, 9.9829e-01],
        [5.2439e-03, 7.7306e-03, 2.8576e-03, 9.7778e-01, 3.2134e-03, 2.0351e-04,
         7.6654e-04, 5.1421e-04, 7.7272e-04, 9.1442e-04],
        [1.0731e-08, 7.1433e-10, 2.6312e-08, 3.3784e-09, 1.2203e-07, 1.0518e-03,
         3.7678e-09, 8.6618e-05, 5.5761e-07, 9.9886e-01],
        [3.8950e-06, 9.9183e-07, 1.2397e-05, 4.3025e-06, 1.2606e-04, 6.1044e-02,
         6.3827e-06, 8.1774e-01, 7.0572e-04, 1.2036e-01],
        [9.9968e-01, 1.7804

  print(softmax_regression(x[:18]).max(dim=1).indices.T)


In [2]:
print(111)

111
