In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
import torch.nn.functional as F

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# 设置设备为GPU（如果可用）
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 定义超参数
num_epochs = 25
batch_size = 128
learning_rate = 0.001

In [4]:
# 加载和预处理CIFAR-10数据集
transform = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
])

train_dataset = torchvision.datasets.CIFAR10(root='../data', train=True, download=True, transform=transform)
test_dataset = torchvision.datasets.CIFAR10(root='../data', train=False, download=True, transform=transform)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


Files already downloaded and verified
Files already downloaded and verified


In [5]:
# 定义Vision Transformer模型
class VisionTransformer(nn.Module):
    def __init__(self, num_classes=10, image_size=32, patch_size=4, hidden_dim=128, num_heads=8, num_layers=2):
        super(VisionTransformer, self).__init__()
        
        self.embedding = nn.Conv2d(3, hidden_dim, kernel_size=patch_size, stride=patch_size)
        self.transformer = nn.TransformerEncoder(nn.TransformerEncoderLayer(d_model=hidden_dim, nhead=num_heads), num_layers)
        self.fc = nn.Linear(hidden_dim * (image_size // patch_size) ** 2, num_classes)

    def forward(self, x):
        x = self.embedding(x)
        x = x.flatten(2).permute(2, 0, 1)
        x = self.transformer(x)
        x = x.permute(1, 0, 2).flatten(1)
        x = self.fc(x)
        return x

In [6]:
# 定义Vision Transformer模型
class VisionTransformer1(nn.Module):
    def __init__(self, num_classes=10, image_size=32, patch_size=4, hidden_dim=128, num_heads=8, num_layers=1):
        super(VisionTransformer1, self).__init__()
        self.conv1 = nn.Conv2d(3, 6, 5, padding=2) # 28x28
        self.pool1 = nn.MaxPool2d(2, 2) # 14x14
        self.conv2 = nn.Conv2d(6, 16, 5, padding=2) # 10x10
        self.pool2 = nn.MaxPool2d(2, 2) # 5x5
        self.conv3 = nn.Conv2d(16, 128, 5)

        # self.embedding = nn.Conv2d(3, hidden_dim, kernel_size=patch_size, stride=patch_size)
        self.transformer = nn.TransformerEncoder(nn.TransformerEncoderLayer(d_model=hidden_dim, nhead=num_heads), num_layers)
        self.fc = nn.Linear(2048, 1024)
        self.fc1 = nn.Linear(1024,128)
        self.fc2 = nn.Linear(128, 10)
    
    def forward(self, x):
        out = self.conv1(x) #24
        out = F.relu(out)
        out = self.pool1(out)  #12
        out = self.conv2(out) #10
        out = F.relu(out)
        out = self.pool2(out)
        out = self.conv3(out)
        
        # out = F.interpolate(out, size=(32, 32), mode='bilinear', align_corners=False)
        # out = self.embedding(x)
        out = out.flatten(2).permute(2, 0, 1)
        out = self.transformer(out)
        out = out.permute(1, 0, 2).flatten(1)
        
        out = self.fc(out)
        out = self.fc1(out)
        out = self.fc2(out)
        return out

In [7]:
# 定义ResNet18模型
class ResNet18(nn.Module):
    def __init__(self, num_classes=10):
        super(ResNet18, self).__init__()
        
        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        
        self.layer1 = nn.Sequential(
            nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),
            nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(64)
        )
        
        self.layer2 = nn.Sequential(
            nn.Conv2d(64, 128, kernel_size=3, stride=2, padding=1, bias=False),
            nn.BatchNorm2d(128),
            nn.ReLU(inplace=True),
            nn.Conv2d(128, 128, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(128)
        )
        
        self.layer3 = nn.Sequential(
            nn.Conv2d(128, 256, kernel_size=3, stride=2, padding=1, bias=False),
            nn.BatchNorm2d(256),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(256)
        )
        
        self.layer4 = nn.Sequential(
            nn.Conv2d(256, 512, kernel_size=3, stride=2, padding=1, bias=False),
            nn.BatchNorm2d(512),
            nn.ReLU(inplace=True),
            nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(512)
        )
        
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512, num_classes)
        
        self.identity_conv1 = nn.Conv2d(64, 128, kernel_size=1, stride=2, padding=0, bias=False)
        self.identity_conv2 = nn.Conv2d(128, 256, kernel_size=1, stride=2, padding=0, bias=False)
        self.identity_conv3 = nn.Conv2d(256 , 512, kernel_size=1, stride=2, padding=0, bias=False)

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        
        x1 = self.layer1(x)
        x = x + x1
        
        x2 = self.layer2(x1)
        x1_identity = self.identity_conv1(x)
        x = x1_identity + x2
        
        x3 = self.layer3(x)
        x2_identity = self.identity_conv2(x)
        x = x2_identity + x3
        
        x4 = self.layer4(x)
        x3_identity = self.identity_conv3(x)
        x = x3_identity + x4
        
        x = self.avgpool(x)
        x = x.view(x.size(0), -1)
        x = self.fc(x)
        return x

In [8]:
# 初始化VIT模型
model1 = VisionTransformer().to(device)
model1.load_state_dict(torch.load('5ViT2.pth'))

model2 = ResNet18().to(device)
model2.load_state_dict(torch.load('ResNet18.pth'))

# 初始化VIT模型
model3 = VisionTransformer1().to(device)
model3.load_state_dict(torch.load('2LeNet_ViT.pth'))

<All keys matched successfully>

In [13]:
from collections import Counter

# 初始化计数器
correct = 0
total = 0
correct1 = 0
correct2 = 0
correct3 = 0

# 确保模型处于评估模式
model1.eval()
model2.eval()
model3.eval()

with torch.no_grad():
    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)

        # 分别获取每个模型的输出
        outputs1 = model1(images)
        outputs2 = model2(images)
        outputs3 = model3(images)

        # 将输出转换为预测类别
        _, predicted1 = torch.max(outputs1.data, 1)
        _, predicted2 = torch.max(outputs2.data, 1)
        _, predicted3 = torch.max(outputs3.data, 1)

        total += labels.size(0)

        # 计算集成模型的准确率
        for i in range(labels.size(0)):
            predictions = [predicted2[i], predicted3[i],predicted1[i]]
            counter = Counter(predictions)
            most_common = counter.most_common(1)[0][0]
            correct += (most_common == labels[i]).item()

            # 单独计算每个模型的准确率
            correct1 += (predicted1[i] == labels[i]).item()
            correct2 += (predicted2[i] == labels[i]).item()
            correct3 += (predicted3[i] == labels[i]).item()

# 计算并打印准确率
accuracy = 100 * correct / total
accuracy1 = 100 * correct1 / total
accuracy2 = 100 * correct2 / total
accuracy3 = 100 * correct3 / total

print(f"Ensemble Test Accuracy: {accuracy:.2f}%")
print(f"Model 1 Test Accuracy: {accuracy1:.2f}%")
print(f"Model 2 Test Accuracy: {accuracy2:.2f}%")
print(f"Model 3 Test Accuracy: {accuracy3:.2f}%")


Ensemble Test Accuracy: 79.89%
Model 1 Test Accuracy: 78.50%
Model 2 Test Accuracy: 79.89%
Model 3 Test Accuracy: 77.31%
