In [1]:
import platform
import torch

def showinfo(tip, info):
    """Print one labelled value as ``<tip>:<info>`` followed by a newline.

    Args:
        tip: Label written before the colon.
        info: Value written after the colon, rendered with default formatting.
    """
    print(f"{tip}:{info}")

# Report the runtime environment, one labelled line per entry.
# Each entry pairs a label with a zero-argument callable so the value is
# fetched immediately before its line is printed, in the order listed here.
for _tip, _probe in (
    ("操作系统及版本信息", platform.platform),
    ('系统位数', platform.architecture),
    ('pytorch版本', lambda: torch.__version__),
    ('cuda版本', lambda: torch.version.cuda),
    ('cudnn版本', torch.backends.cudnn.version),
):
    showinfo(_tip, _probe())

操作系统及版本信息:Linux-5.15.153.1-microsoft-standard-WSL2-x86_64-with-glibc2.35
系统位数:('64bit', 'ELF')
pytorch版本:2.3.0
cuda版本:12.1
cudnn版本:8902


In [2]:
from torch import nn
import torch.nn.functional as F


class TransformerEncoderLayer(nn.Module):
    """Pre-norm Transformer encoder layer: self-attention, then feed-forward.

    Each sub-layer normalises its input with ``LayerNorm``, transforms it,
    applies dropout, and adds the result back onto the un-normalised input
    (residual connection).

    ``nn.MultiheadAttention`` is created with its default
    ``batch_first=False``, so ``forward`` expects tensors laid out as
    ``[seq_len, batch, d_model]``.

    Args:
        d_model: Size of the feature dimension; must be divisible by
            ``num_heads``.
        num_heads: Number of attention heads.
        dim_feedforward: Hidden width of the feed-forward network.
        dropout: Dropout probability used inside the attention module and by
            every ``nn.Dropout`` of this layer.
    """

    def __init__(self, d_model, num_heads, dim_feedforward=2048, dropout=0.1):
        super(TransformerEncoderLayer, self).__init__()
        # Construction order is kept stable so parameter initialisation under
        # a fixed seed and the state_dict layout do not change.
        self.self_attn = nn.MultiheadAttention(d_model, num_heads, dropout=dropout)
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)    # inside the feed-forward network
        self.linear2 = nn.Linear(dim_feedforward, d_model)

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)   # on the attention residual branch
        self.dropout2 = nn.Dropout(dropout)   # on the feed-forward residual branch

    def forward(self, src):
        """Run the layer on ``src`` and return a tensor of the same shape.

        Args:
            src: Input of shape ``[seq_len, batch, d_model]``.

        Returns:
            Tensor with the same shape as ``src``.
        """
        # --- Self-attention sub-layer (pre-norm) ---
        src2 = self.norm1(src)
        # Query, key and value are all the normalised input; the returned
        # attention weights are not needed.
        attn_output, _ = self.self_attn(src2, src2, src2)
        # Previously the attention branch was added without dropout while
        # ``self.dropout`` sat unused; ``dropout1`` belongs here.
        src = src + self.dropout1(attn_output)

        # --- Feed-forward sub-layer (pre-norm) ---
        src2 = self.norm2(src)
        # [seq_len, batch, d_model] -> [seq_len, batch, dim_feedforward]
        src2 = self.dropout(F.relu(self.linear1(src2)))
        # [seq_len, batch, dim_feedforward] -> [seq_len, batch, d_model]
        src2 = self.linear2(src2)

        # Residual connection around the feed-forward network.
        src = src + self.dropout2(src2)
        return src

In [3]:
# Convolutional network with a Transformer encoder layer placed between the
# convolutional feature extractor and the fully connected classifier.
class TransformerCNN(nn.Module):
    """CNN feature extractor -> Transformer encoder layer -> two linear layers.

    The first convolution takes 1 input channel and ``fc1`` expects
    ``7 * 7 * 64`` features, which matches 28x28 single-channel images after
    the two 2x2 max-pooling steps. The final layer produces 10 outputs.
    """

    def __init__(self):
        super().__init__()

        def conv_stage(in_channels, out_channels):
            # 5x5 convolution with padding 2 (spatial size kept), ReLU, then a
            # 2x2 max-pool with stride 2 (spatial size halved).
            return nn.Sequential(
                nn.Conv2d(in_channels, out_channels, kernel_size=5, stride=1, padding=2),
                nn.ReLU(),
                nn.MaxPool2d(kernel_size=2, stride=2),
            )

        self.layer1 = conv_stage(1, 32)
        self.layer2 = conv_stage(32, 64)

        # Every spatial position becomes one token with 64 features.
        self.transformer_encoder = TransformerEncoderLayer(d_model=64, num_heads=8)

        # NOTE(review): there is no activation between fc1 and fc2.
        self.fc1 = nn.Linear(7 * 7 * 64, 1000)
        self.fc2 = nn.Linear(1000, 10)

    def forward(self, x):
        """Return the 10 class scores for a batch of images ``x``."""
        feature_map = self.layer2(self.layer1(x))

        # Flatten the spatial grid into a sequence:
        # [batch, channels, h, w] -> [seq_len = h * w, batch, channels].
        batch, channels, height, width = feature_map.size()
        tokens = feature_map.view(batch, channels, height * width).permute(2, 0, 1)

        encoded = self.transformer_encoder(tokens)

        # Back to [batch, channels, seq_len], then one flat vector per sample.
        flat = encoded.permute(1, 2, 0).contiguous().view(batch, -1)

        hidden = self.fc1(flat)
        return self.fc2(hidden)

In [4]:
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from torchvision.datasets import MNIST

# Preprocessing applied to every image: convert to a tensor, then normalise
# the single channel with mean 0.5 and standard deviation 0.5.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.5,), std=(0.5,)),
])

# MNIST training and test splits, stored under ./data (downloaded if needed).
train_set, test_set = (
    MNIST(root='./data', train=is_train, download=True, transform=transform)
    for is_train in (True, False)
)

# Mini-batches of 64 samples; only the training loader shuffles.
train_loader = DataLoader(dataset=train_set, batch_size=64, shuffle=True)
test_loader = DataLoader(dataset=test_set, batch_size=64, shuffle=False)

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
import torch.optim as optim


# Training configuration.
lr = 0.001
epochs = 1
seed = 42

# Seed PyTorch's global RNG before anything random happens, so weight
# initialisation, dropout masks and the loader's shuffle order are repeatable
# when the notebook is restarted and run again on the same setup.
torch.manual_seed(seed)

model = TransformerCNN()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)

# Be explicit about train mode (dropout active) rather than relying on the
# default state of a freshly constructed module.
model.train()

for epoch in range(epochs):
    for i, (images, labels) in enumerate(train_loader):
        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, labels)

        # Backward pass
        optimizer.zero_grad()  # clear gradients left over from the previous step
        loss.backward()
        optimizer.step()

        # Log the current mini-batch loss every 100 steps.
        if (i+1) % 100 == 0:
            print(f'Epoch [{epoch+1}/{epochs}], Step [{i+1}/{len(train_loader)}], Loss: {loss.item():.4f}')

# Persist only the learned parameters (state_dict), not the whole module.
torch.save(model.state_dict(), 'cnn_mnist_transformer.pth')

Epoch [1/1], Step [100/938], Loss: 0.2461
Epoch [1/1], Step [200/938], Loss: 0.2178
Epoch [1/1], Step [300/938], Loss: 0.1823
Epoch [1/1], Step [400/938], Loss: 0.0396
Epoch [1/1], Step [500/938], Loss: 0.1195
Epoch [1/1], Step [600/938], Loss: 0.0499
Epoch [1/1], Step [700/938], Loss: 0.1414
Epoch [1/1], Step [800/938], Loss: 0.0938
Epoch [1/1], Step [900/938], Loss: 0.1635


In [6]:
# Rebuild the architecture and restore the trained parameters from disk.
model = TransformerCNN()
# weights_only=True limits unpickling to tensors and plain containers, so a
# tampered checkpoint file cannot execute arbitrary code while being loaded.
model.load_state_dict(torch.load('cnn_mnist_transformer.pth', weights_only=True))
model.eval()  # disable dropout for evaluation

correct = 0
total = 0
with torch.no_grad():  # no gradients are needed for inference
    for images, labels in test_loader:
        outputs = model(images)
        # Take the single most likely class for each sample.
        predicted = outputs.argmax(dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
print(f'Accuracy on the test set: {100 * correct / total}%')

Accuracy on the test set: 98.04%
