In [35]:
import torch
import numpy as np
from torch import nn
from torchvision import transforms,datasets
from torch.utils.data import DataLoader

In [36]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [37]:
#通过PyTorch下载数据集
train_dataset = datasets.MNIST(root='',train=True,download=True,transform=transforms.Compose([transforms.Resize(size=(32,32)),transforms.ToTensor()]))
test_dataset = datasets.MNIST(root='',train=False,download=True,transform=transforms.Compose([transforms.Resize(size=(32,32)),transforms.ToTensor()]))

In [38]:
train_dataset_size = len(train_dataset)
test_dataset_size = len(test_dataset)
print(train_dataset_size)
print(test_dataset_size)

batch_size = 64
#shuffle=True可以打乱数据集，batch_size=64将会让这个数据生成器每次给我们64个数据,drop_last=True会把不够64一组的舍去（影响不大）。
train_loader = DataLoader(dataset=train_dataset,batch_size=batch_size,shuffle=True,drop_last=True)
test_loader = DataLoader(dataset=test_dataset,batch_size=batch_size,shuffle=True,drop_last=True)

60000
10000


In [39]:
'''这里先单独获取一次DataLoader的数据，用来观察数据结构'''
#enumerate将可迭代对象组合为索引序列，同时列出数据和数据下标。
for index,data in enumerate(train_loader):
    inputs, labels = data
    print(inputs.shape)
    print(labels.shape)
    print(labels)
    break

torch.Size([64, 1, 32, 32])
torch.Size([64])
tensor([6, 4, 0, 1, 4, 4, 8, 3, 3, 4, 6, 8, 5, 3, 8, 0, 8, 0, 0, 0, 7, 6, 3, 4,
        6, 0, 2, 8, 6, 0, 5, 2, 8, 8, 1, 7, 2, 1, 4, 9, 2, 2, 7, 4, 5, 6, 1, 1,
        2, 5, 1, 4, 0, 9, 7, 3, 8, 3, 9, 7, 7, 1, 5, 3])


In [40]:
class Net(nn.Module):
    def __init__(self):
        super(Net,self).__init__()
        #在卷积层和全连接层之间添加ReLU激活函数可以让线性层拥有非线性表达能力
        '''
        在PyTorch中使用nn.Conv2d时不需要手动设定卷积核的具体数值，这是深度学习与传统图像处理的关键区别之一。
        卷积核的本质是"可学习参数"，在深度学习中，卷积核的数值不是预先设定的固定值，而是神经网络需要学习的参数，
        比如在 nn.Conv2d(in_channels=1,out_channels=6,kernel_size=5,stride=1,padding=0) 中
        我们创建的卷积核对应的shape是[6,1,5,5]，这里是创建了六个大小为5x5的卷积核，每个核对应一个输入通道。
        并且PyTorch会使用智能初始化策略，自动设置初始值。
        
        后续的训练中，将会有这样的代码：
        optim.zero_grad()
        loss.backward()   # 计算梯度
        optim.step()      # 更新所有参数（包括卷积核权重）
        
        每次迭代后，卷积核的数值都会根据损失函数的梯度方向进行微调。    
        '''
        self.model = nn.Sequential(
            nn.Conv2d(in_channels=1,out_channels=6,kernel_size=5,stride=1,padding=0),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=(2,2)),
            nn.Conv2d(in_channels=6,out_channels=28,kernel_size=5,stride=1,padding=0),
            nn.ReLU(), 
            nn.MaxPool2d(kernel_size=(2,2)),
            nn.Flatten(),
            nn.Linear(in_features=28*5*5,out_features=1*120),
            nn.ReLU(),
            nn.Linear(in_features=1*120,out_features=84),
            nn.ReLU(),
            nn.Linear(in_features=84,out_features=10)
        )

    def forward(self,x):
        x = self.model(x)
        return x
        

In [41]:
LR = 1e-2
#神经网络模型对象创建
net = Net()
net = net.to(device)
print(net)
loss_fn = nn.CrossEntropyLoss()
loss_fn = loss_fn.to(device)
optim = torch.optim.SGD(net.parameters(),LR)

Net(
  (model): Sequential(
    (0): Conv2d(1, 6, kernel_size=(5, 5), stride=(1, 1))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(6, 28, kernel_size=(5, 5), stride=(1, 1))
    (4): ReLU()
    (5): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=1, ceil_mode=False)
    (6): Flatten(start_dim=1, end_dim=-1)
    (7): Linear(in_features=700, out_features=120, bias=True)
    (8): ReLU()
    (9): Linear(in_features=120, out_features=84, bias=True)
    (10): ReLU()
    (11): Linear(in_features=84, out_features=10, bias=True)
  )
)


In [42]:
def train():
    for index,data in enumerate(train_loader):
        inputs, labels = data
        inputs = inputs.to(device)
        labels = labels.to(device)
        outputs = net(inputs)
        loss = loss_fn(outputs,labels)
        optim.zero_grad()
        loss.backward()
        optim.step()

def test():
    times = 0
    for index,data in enumerate(test_loader):
        times += 1
        inputs, labels = data
        inputs = inputs.to(device)
        labels = labels.to(device)
        outputs = net(inputs)
        accuracy = (outputs.argmax(axis=1) == labels).sum()
        if times % 30 == 0:
            print("Test accuracy:{0}".format(accuracy/len(labels)))

In [43]:
for epoch in range(15):
    train()
    if epoch % 5 == 0:
        print("epoch {0}".format(epoch))
        test()
print("Final accuracy")
test()

epoch 0
Test accuracy:0.875
Test accuracy:0.921875
Test accuracy:0.859375
Test accuracy:0.921875
Test accuracy:0.96875
epoch 5
Test accuracy:0.984375
Test accuracy:0.96875
Test accuracy:0.953125
Test accuracy:0.984375
Test accuracy:0.96875
epoch 10
Test accuracy:0.953125
Test accuracy:0.984375
Test accuracy:1.0
Test accuracy:0.984375
Test accuracy:0.953125
Final accuracy
Test accuracy:0.953125
Test accuracy:0.984375
Test accuracy:1.0
Test accuracy:0.96875
Test accuracy:0.953125


In [44]:
from PIL import ImageFilter
#手写数字检验模型效果
from PIL import Image, ImageDraw, ImageOps, ImageGrab
import tkinter as tk
from tkinter import ttk
import io

# 图像预处理转换
transform = transforms.Compose([
    transforms.Resize((32, 32)),
    transforms.ToTensor()
])


class DigitRecognizerApp:
    def __init__(self, root):
        self.root = root
        root.title("手写数字识别器")
        root.geometry("500x600")
        root.resizable(False, False)
        
        # 创建顶部标题
        title_frame = ttk.Frame(root)
        title_frame.pack(pady=10)
        ttk.Label(title_frame, text="MNIST手写数字识别", font=("Arial", 16, "bold")).pack()
        
        # 创建画布区域
        canvas_frame = ttk.LabelFrame(root, text="手写区域")
        canvas_frame.pack(pady=10, padx=20, fill="both", expand=True)
        
        self.canvas = tk.Canvas(canvas_frame, width=280, height=280, bg="white", cursor="pencil")
        self.canvas.pack(pady=10)
        
        # 绑定鼠标事件
        self.canvas.bind("<B1-Motion>", self.paint)
        self.canvas.bind("<ButtonRelease-1>", self.predict)
        
        # 创建按钮区域
        button_frame = ttk.Frame(root)
        button_frame.pack(pady=10)
        
        self.clear_btn = ttk.Button(button_frame, text="清除画板", command=self.clear_canvas)
        self.clear_btn.pack(side="left", padx=10)
        
        # 创建预测结果显示区域
        result_frame = ttk.LabelFrame(root, text="识别结果")
        result_frame.pack(pady=10, padx=20, fill="both", expand=True)
        
        self.result_label = ttk.Label(result_frame, text="请手写一个数字...", font=("Arial", 24))
        self.result_label.pack(pady=20)
        
        self.confidence_label = ttk.Label(result_frame, text="", font=("Arial", 14))
        self.confidence_label.pack(pady=5)
        
        # 初始化变量
        self.last_x = None
        self.last_y = None
        self.line_width = 15
        
        # 存储画布位置信息
        self.canvas.update_idletasks()
        self.canvas_x = self.root.winfo_rootx() + self.canvas.winfo_x()
        self.canvas_y = self.root.winfo_rooty() + self.canvas.winfo_y()
        self.canvas_width = self.canvas.winfo_width()
        self.canvas_height = self.canvas.winfo_height()
    
    def paint(self, event):
        x, y = event.x, event.y
        if self.last_x and self.last_y:
            self.canvas.create_line(self.last_x, self.last_y, x, y, 
                                    width=self.line_width, fill="black", 
                                    capstyle=tk.ROUND, smooth=True)
        self.last_x = x
        self.last_y = y
    
    def clear_canvas(self):
        self.canvas.delete("all")
        self.result_label.config(text="请手写一个数字...")
        self.confidence_label.config(text="")
        self.last_x = None
        self.last_y = None
    
    def predict(self, event):
        self.canvas.update_idletasks()  # 确保UI更新
        self.canvas_x = self.root.winfo_rootx() + self.canvas.winfo_x()
        self.canvas_y = self.root.winfo_rooty() + self.canvas.winfo_y()
        # 重置最后位置
        self.last_x = None
        self.last_y = None
        
        # 获取画布在屏幕上的位置
        self.canvas.update_idletasks()
        self.canvas_x = self.root.winfo_rootx() + self.canvas.winfo_x()
        self.canvas_y = self.root.winfo_rooty() + self.canvas.winfo_y()
        
        # 使用ImageGrab直接截取画布区域
        bbox = (
            self.canvas_x, 
            self.canvas_y, 
            self.canvas_x + self.canvas.winfo_width(), 
            self.canvas_y + self.canvas.winfo_height()
        )
        
        img = ImageGrab.grab(bbox=bbox)
        
        # 转换为模型需要的格式
        img = img.convert('L')  # 转换为灰度图
        img = img.resize((32, 32))
        
        # 转换为张量并进行预测
        img_tensor = transform(img).unsqueeze(0).to(device)
        print(img_tensor)
        
        with torch.no_grad():
            output = net(img_tensor)
            probabilities = torch.nn.functional.softmax(output[0], dim=0)
            pred = output.argmax(dim=1, keepdim=True).item()
            confidence = probabilities[pred].item()
        
        # 显示结果
        self.result_label.config(text=f"识别结果: {pred}")
        self.confidence_label.config(text=f"置信度: {confidence:.2%}")
        
        # 可选：显示处理后的图像用于调试
        #img.show(title="处理后的图像")

        
root = tk.Tk()
app = DigitRecognizerApp(root)
root.mainloop()


tensor([[[[0.9412, 0.9412, 0.9412,  ..., 0.8824, 0.9490, 0.9412],
          [0.9412, 0.9412, 0.9412,  ..., 0.9451, 0.9412, 0.9412],
          [0.9412, 0.9412, 0.9412,  ..., 0.9412, 0.9412, 0.9412],
          ...,
          [0.9412, 0.9373, 0.9725,  ..., 1.0000, 1.0000, 1.0000],
          [0.9412, 0.9373, 0.9725,  ..., 1.0000, 1.0000, 1.0000],
          [0.9412, 0.9373, 0.9725,  ..., 1.0000, 1.0000, 1.0000]]]],
       device='cuda:0')
tensor([[[[0.9412, 0.9412, 0.9412,  ..., 0.8824, 0.9490, 0.9412],
          [0.9412, 0.9412, 0.9412,  ..., 0.9451, 0.9412, 0.9412],
          [0.9412, 0.9412, 0.9412,  ..., 0.9412, 0.9412, 0.9412],
          ...,
          [0.9412, 0.9373, 0.9725,  ..., 1.0000, 1.0000, 1.0000],
          [0.9412, 0.9373, 0.9725,  ..., 1.0000, 1.0000, 1.0000],
          [0.9412, 0.9373, 0.9725,  ..., 1.0000, 1.0000, 1.0000]]]],
       device='cuda:0')
tensor([[[[0.9412, 0.9412, 0.9412,  ..., 0.8824, 0.9490, 0.9412],
          [0.9412, 0.9412, 0.9412,  ..., 0.9451, 0.9412, 0