In [39]:
import base64
import copy
import io
import os
import random
import time
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import timm
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
from datasets import load_dataset
from openai import OpenAI
from timm.data.auto_augment import rand_augment_transform
from torch.optim.lr_scheduler import _LRScheduler, CosineAnnealingLR, OneCycleLR, ReduceLROnPlateau, StepLR
from torch.utils.data import DataLoader, random_split, RandomSampler, Subset
from torchvision.models import ResNet50_Weights
from tqdm import tqdm
from transformers import AutoModelForImageClassification

In [None]:
# Both values must be set before the remote-prediction cells are run.
# They are read from the environment so that secrets are not pasted into (and saved
# with) the notebook; when a variable is unset the value falls back to "" exactly as
# before, and can still be overwritten by hand in this cell.
# NOTE: the name `openai_api_rul` (sic, "url") is kept because get_remote_predict reads it.
openai_api_rul = os.environ.get("OPENAI_BASE_URL", "")
openai_api_key = os.environ.get("OPENAI_API_KEY", "")

In [2]:
# One training seed per entry of `pretrain_model_list` (indexed as seeds[i] in the
# loader/training cells); `eval_seed` is applied before the test loader is built.
seeds = [0,0,0,0,0]
# seeds = [0,0,0,0,0]
eval_seed = 0

# One augmentation pipeline per model: train_augmentations[i] is paired with
# pretrain_model_list[i] when the aug matrix is disabled.
# Every pipeline takes and returns a PIL image; ToTensor/Normalize are appended
# later in get_dataloaders.
train_augmentations = [
    # timm RandAugment built from the config string below.
    torchvision.transforms.Compose([
        rand_augment_transform(
            config_str='rand-m7-mstd0.5',
            hparams=dict()
        ) 
    ]),
    
    # RandomErasing is wrapped in a ToTensor -> ToPILImage round trip so that this
    # pipeline still hands back a PIL image like the others.
    torchvision.transforms.Compose([
        torchvision.transforms.ToTensor(), 
        torchvision.transforms.RandomErasing(p=0.25),
        torchvision.transforms.ToPILImage()
    ]),
    
    torchvision.transforms.Compose([
        torchvision.transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
    ]),
    
    torchvision.transforms.Compose([
        torchvision.transforms.Pad(4),                  # Padding
        torchvision.transforms.RandomResizedCrop(32),  # crop, then resize back to 32x32
    ]),
    
    torchvision.transforms.Compose([
        torchvision.transforms.RandomHorizontalFlip(p=0.5),          # random horizontal flip
    ]),
]

# Per-model batch-level augmentation options (keys "mixup_alpha" / "cutmix_alpha"),
# passed to train() as `train_aug_config`. An empty dict means no Mixup/CutMix.
# train_aug_config = [{},{},{},{"mixup_alpha":0.1},{"cutmix_alpha":1.0}]
train_aug_config = [{},{},{},{},{}]

# Optional "augmentation matrix": when enabled, every ordered pair (i, j) of pipelines
# is composed (a single pipeline on the diagonal i == j) and the matching configs are
# merged with dict union, so `train_aug_config` is replaced by a list with
# len(train_augmentations) ** 2 entries.
if_aug_matrix = False
aug_matrix_train_aug_combines = []
aug_matrix_val_results = []
if if_aug_matrix:
    temp_aug_config = []
    for i in range(len(train_augmentations)):
        for j in range(len(train_augmentations)):
            if i == j:
                aug_matrix_train_aug_combines.append(torchvision.transforms.Compose([train_augmentations[i]]))
            else:
                aug_matrix_train_aug_combines.append(torchvision.transforms.Compose([train_augmentations[i],train_augmentations[j]]))
            # On key collisions the j-th config wins (right operand of `|`).
            temp_aug_config.append(train_aug_config[i] | train_aug_config[j])
    train_aug_config = temp_aug_config
# Hyperparameters
# is_eval: load fine-tuned checkpoints from `model_list` instead of training.
# use_one_model: replace every ensemble member with `model1` (same object repeated).
is_eval = True
use_one_model = False

num_epochs = 90
learning_rate = 0.001
batch_size = 128
weight_decay = 0
grad_clip  = 0  # 0 disables gradient-norm clipping in train()
save_bin_name = "202501101029"
datasets = "uoft-cs/cifar10"  # Hugging Face dataset id passed to load_dataset()
datasets_image_column_name = "img"
number_classes = 10
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# pretrain_model_list[i] is the i-th ensemble member; model_list[i] is the checkpoint
# file loaded into it when `is_eval` is True.
pretrain_model_list = []
model_list = []

model_name1 = "microsoft/resnet-50"
model1 = AutoModelForImageClassification.from_pretrained(model_name1, trust_remote_code=True)
pretrain_model_list.append(model1)
model_list.append("acc_best_model_202412300900.bin")

model_name2 = "leftthomas/resnet50"
model2 = AutoModelForImageClassification.from_pretrained(model_name2, trust_remote_code=True)
pretrain_model_list.append(model2)
model_list.append("acc_best_model_202412301709.bin")

model3 = torch.hub.load('pytorch/vision:v0.10.0', 'resnet50', weights=ResNet50_Weights.IMAGENET1K_V1)
pretrain_model_list.append(model3)
model_list.append("acc_best_model_202501052203-0.bin")

model4 = torch.hub.load('pytorch/vision:v0.10.0', 'resnet50', weights=ResNet50_Weights.IMAGENET1K_V2)
pretrain_model_list.append(model4)
model_list.append("acc_best_model_202501052203-1.bin")

model5 = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_resnet50', pretrained=True)
pretrain_model_list.append(model5)
model_list.append("acc_best_model_202412302006.bin")

if use_one_model:
    # Note: the list then holds the *same* model1 object number_models times.
    number_models = len(pretrain_model_list)
    pretrain_model_list = []
    for i in range(number_models):
        pretrain_model_list.append(model1)

for i,model in enumerate(pretrain_model_list):

    # Replace the classification head with a `number_classes`-way linear layer.
    # Three layouts are handled: a `classifier` Sequential (its first element is kept),
    # a wrapped `model.model.fc`, and a plain `fc` attribute.
    if not hasattr(model,"fc") and hasattr(model, "classifier"):
        model.classifier = nn.Sequential(
            model.classifier[0],
            nn.Linear(in_features=2048, out_features=number_classes, bias=True)
        )
    elif hasattr(model, "model") and hasattr(model.model, "fc"):
        model.model.fc = nn.Linear(in_features=2048, out_features=number_classes, bias=True)
    elif hasattr(model,"fc"):
        model.fc = nn.Linear(in_features=2048, out_features=number_classes, bias=True)
    
    model.to(device)
    if is_eval:
        # map_location makes checkpoints saved on a GPU loadable on a CPU-only machine
        # (and avoids staging tensors on a different device than `device`).
        model.load_state_dict(torch.load(model_list[i], map_location=device))
        print(f"{i}th Model loaded from {model_list[i]}")

# Optimizer / LR-scheduler switches. The code that consumes these flags is not part
# of this cell — presumably the training cell picks the optimizer and scheduler from
# them; confirm there which flag wins when several are True.
use_adam = True

use_sgd = False
sgd_momentum = 0.9
sgd_nesterov = False

# Settings for CustomMultiStepLR: custom_scheduler_gammas[k] is the LR factor applied
# once the epoch reaches custom_scheduler_milestones[k] (lists must have equal length).
use_custom_scheduler = False
custom_scheduler_gammas = [0.1, 0.01, 0.001, 0.0005]
custom_scheduler_milestones = [80, 120,160,180]

use_cosine_annealing = False

use_onecyclelr = False

use_reducelronplateau = False
reducelronplateau_mode = "max"
reducelronplateau_factor = 0.1
reducelronplateau_patience = 3
reducelronplateau_threshold = 0.001

use_steplr = True
steplr_step_size = 30
steplr_gamma = 0.1


# Transform applied to PIL images at validation/test time, before ToTensor/Normalize.
# It is currently an empty Compose (identity); the commented lines are earlier
# test-time augmentation experiments kept for reference.
test_augmentation_transforms = torchvision.transforms.Compose([
    # torchvision.transforms.Pad(4),                                # Padding
    # torchvision.transforms.RandomCrop(32, padding=4),
    # torchvision.transforms.RandomCrop(32, padding=4, padding_mode='reflect'),
    # torchvision.transforms.Resize((160,160)),
    # torchvision.transforms.RandomCrop((128,128)),
    # torchvision.transforms.RandomCrop(32),                       # random crop
    # torchvision.transforms.RandomHorizontalFlip(p=0.5),          # random horizontal flip
    # torchvision.transforms.RandomResizedCrop(32),  # crop, then resize
    # torchvision.transforms.ColorJitter(brightness=0.1, contrast=0.1, saturation=0.1, hue=0.05),
    # torchvision.transforms.Resize((int(32/0.95),int(32/0.95))),
    # torchvision.transforms.CenterCrop((32, 32)),
    # torchvision.transforms.RandomAdjustSharpness(2, p=0.5)
    # torchvision.transforms.RandomCrop((32,32)),
    # torchvision.transforms.RandomHorizontalFlip(p=0.5),
    # torchvision.transforms.RandomCrop(32, padding=4),  # random crop
])

# Optional precomputed per-channel normalisation statistics. When both lists are
# non-empty the loader builders use them; when empty (as here) the statistics are
# computed from the training split via calculate_mean_std.
# pre_get_mean = [0.4214581847190857, 0.3764420747756958, 0.28500789403915405] 
# pre_get_std = [0.293678343296051, 0.24473334848880768, 0.27143558859825134]
pre_get_mean = []
pre_get_std = []

criterion = nn.CrossEntropyLoss()

Using cache found in C:\Users\admin/.cache\torch\hub\pytorch_vision_v0.10.0
Using cache found in C:\Users\admin/.cache\torch\hub\pytorch_vision_v0.10.0
Using cache found in C:\Users\admin/.cache\torch\hub\NVIDIA_DeepLearningExamples_torchhub
  model.load_state_dict(torch.load(model_list[i]))


0th Model loaded from acc_best_model_202412300900.bin
1th Model loaded from acc_best_model_202412301709.bin
2th Model loaded from acc_best_model_202501052203-0.bin
3th Model loaded from acc_best_model_202501052203-1.bin
4th Model loaded from acc_best_model_202412302006.bin


In [3]:
def set_random_seed(seed):
    """
    Seed every random number generator used in this notebook so runs are repeatable.

    Seeds Python's ``random``, NumPy, and PyTorch (CPU, the current CUDA device and
    all CUDA devices) with the same value, then puts cuDNN into deterministic mode
    with autotuning (benchmark) switched off.
    """
    seeders = (
        random.seed,                  # Python built-in generator
        np.random.seed,               # NumPy global generator
        torch.manual_seed,            # PyTorch
        torch.cuda.manual_seed,       # current GPU
        torch.cuda.manual_seed_all,   # every GPU, for multi-GPU setups
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # Trade speed for repeatability in cuDNN kernels.
    cudnn = torch.backends.cudnn
    cudnn.benchmark = False
    cudnn.deterministic = True


In [4]:
def calculate_mean_std(dataset, batch_size=1):
    """
    Compute the per-channel mean and standard deviation of an image dataset.

    The statistics are accumulated in a single streaming pass (sum and sum of
    squares per channel), so the dataset is never held in one tensor.

    Args:
        dataset (torch.utils.data.Dataset): yields ``(image, label)`` pairs; batches
            are indexed as (batch, channels, height, width).
        batch_size (int): batch size used by the internal DataLoader (default 1).
    Returns:
        mean (list): per-channel mean.
        std (list): per-channel (population) standard deviation.
    Raises:
        ValueError: if the dataset yields no images.
    """
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=4)
    # Running per-channel sums; they become tensors on the first `+=`.
    total_sum = 0.0
    total_squared_sum = 0.0
    total_pixels = 0
    for images, _ in loader:
        # pixels contributed per channel by this batch = batch * height * width
        total_pixels += images.size(0) * images.size(2) * images.size(3)
        total_sum += images.sum(dim=[0, 2, 3])
        total_squared_sum += (images ** 2).sum(dim=[0, 2, 3])

    if total_pixels == 0:
        raise ValueError("Cannot compute mean/std of an empty dataset")

    mean = total_sum / total_pixels
    # E[x^2] - E[x]^2 can come out slightly negative through float rounding (e.g. for
    # a constant channel); clamp so sqrt() returns 0 instead of NaN.
    variance = ((total_squared_sum / total_pixels) - (mean ** 2)).clamp(min=0.0)
    std = torch.sqrt(variance)
    return mean.tolist(), std.tolist()


In [5]:
# Training function
def train(model, train_loader, optimizer, criterion, device, train_aug_config):
    """
    Run one training epoch and return the mean batch loss.

    Args:
        model: network to train; its output may be a plain tensor, an object with a
            ``logits`` attribute, or a dict with a ``"logits"`` key.
        train_loader: iterable of ``(images, labels)`` batches.
        optimizer: optimizer stepping ``model``'s parameters.
        criterion: loss called as ``criterion(outputs, labels)``.
        device: device the batches are moved to.
        train_aug_config (dict): optional batch-level augmentation settings. If it
            contains ``"mixup_alpha"`` and/or ``"cutmix_alpha"``, timm's Mixup is
            applied to each batch (a missing key counts as 0).

    Returns:
        float: sum of batch losses divided by ``len(train_loader)``.

    Reads the module-level ``number_classes`` and ``grad_clip`` (0 disables clipping).
    """
    model.train()
    running_loss = 0.0

    # `train_aug_config` is a dict, so key presence has to be tested with `in`
    # (hasattr() on a dict key is always False, which silently disabled Mixup/CutMix).
    # The Mixup helper is built once per epoch instead of once per batch.
    mixup_fn = None
    if "mixup_alpha" in train_aug_config or "cutmix_alpha" in train_aug_config:
        mixup_fn = timm.data.mixup.Mixup(
            mixup_alpha=train_aug_config.get("mixup_alpha", 0),
            cutmix_alpha=train_aug_config.get("cutmix_alpha", 0),  # 0 turns CutMix off
            label_smoothing=0.0,
            num_classes=number_classes
        )

    for images, labels in tqdm(train_loader, desc="Training", leave=False):
        images, labels = images.to(device), labels.to(device)

        # timm's Mixup asserts an even batch size, so an odd-sized (typically last)
        # batch is trained without mixing rather than crashing the epoch.
        if mixup_fn is not None and images.size(0) % 2 == 0:
            images, labels = mixup_fn(images, labels)

        optimizer.zero_grad()

        # Single forward pass: calling the model repeatedly just to inspect the output
        # type tripled the cost and updated BatchNorm statistics several times per batch.
        outputs = model(images)
        if hasattr(outputs, "logits"):
            outputs = outputs.logits
        elif isinstance(outputs, dict):
            outputs = outputs["logits"]

        loss = criterion(outputs, labels)
        loss.backward()

        if grad_clip != 0:
            # Clip the global gradient norm.
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=grad_clip)

        optimizer.step()
        running_loss += loss.item()
    return running_loss / len(train_loader)

def evaluate_model(model, data_loader, criterion, device, need_record=False):
    """
    Evaluate a model and optionally record the top-1 confidence of every sample,
    split by whether the prediction was correct.

    Args:
        model: model to evaluate; its output may be a plain tensor, an object with a
            ``logits`` attribute, or a dict with a ``"logits"`` key.
        data_loader: loader for the validation or test set.
        criterion: loss function.
        device: device the batches are moved to (CPU/GPU).
        need_record (bool): when True, fill the two confidence lists; when False
            they are returned empty.

    Returns:
        avg_loss: sum of batch losses divided by ``len(data_loader)``.
        accuracy: top-1 accuracy in percent.
        correct_confidences: max softmax probability of correctly classified samples.
        wrong_confidences: max softmax probability of misclassified samples.
    """
    model.eval()
    total_loss = 0.0
    correct = 0
    total = 0
    correct_confidences = []
    wrong_confidences = []

    with torch.no_grad():
        for images, labels in tqdm(data_loader, desc="Evaluating", leave=False):
            images, labels = images.to(device), labels.to(device)

            # One forward pass per batch; the output type is inspected afterwards
            # instead of re-running the model for every check.
            outputs = model(images)
            if hasattr(outputs, "logits"):
                outputs = outputs.logits
            elif isinstance(outputs, dict):
                outputs = outputs["logits"]

            loss = criterion(outputs, labels)

            total_loss += loss.item()
            probabilities = torch.softmax(outputs, dim=1)
            max_confidences, predicted = probabilities.max(1)

            hits = predicted.eq(labels)
            total += labels.size(0)
            correct += hits.sum().item()

            if need_record:
                # Boolean-mask the batch instead of a per-sample .item() loop;
                # sample order within each list is unchanged.
                correct_confidences.extend(max_confidences[hits].tolist())
                wrong_confidences.extend(max_confidences[~hits].tolist())

    accuracy = 100. * correct / total

    return total_loss / len(data_loader), accuracy, correct_confidences, wrong_confidences


In [44]:
def get_remote_predict(image,test_mean,test_std):
    """
    Ask the remote "gpt-4o" chat model to classify one normalised image.

    The image is de-normalised, clamped to [0, 1], PNG-encoded and sent as a base64
    data URL together with a prompt that asks for a single class index (0-9).

    Args:
        image: one normalised image tensor (channels first); moved to CPU here.
        test_mean, test_std: per-channel statistics used to undo the normalisation.

    Returns:
        (result, elapsed_time): ``result`` is the raw reply text, or "" when the
        request failed with a non-Cloudflare error; ``elapsed_time`` is the wall-clock
        time in seconds, including any retries.

    Requests that fail with an error mentioning "Cloudflare" are retried every
    3 seconds with no upper bound. Reads the module-level ``openai_api_rul`` and
    ``openai_api_key``.
    """
    print("start request remote model.")
    start_time = datetime.now()

    image = image.cpu()
    # Undo the normalisation so the remote model sees ordinary pixel values.
    denormalized_image = denormalize(image.unsqueeze(0), test_mean, test_std).squeeze(0).clamp(0, 1)

    pil_image = torchvision.transforms.ToPILImage()(denormalized_image)

    # Encode as PNG in memory and base64 it for the data URL.
    image_bytes = io.BytesIO()
    pil_image.save(image_bytes, format='PNG')
    encoded_image = base64.b64encode(image_bytes.getvalue()).decode('utf-8')

    client = OpenAI(
            base_url= openai_api_rul,
            api_key= openai_api_key,
    )

    result = ""

    # Shared by the system and the user message (the user message only adds a question).
    instructions = "Please analyze the given image and determine which category it belongs to from the following list: Airplane, Automobile, Bird, Cat, Deer, Dog, Frog, Horse, Ship, Truck. Respond with only the index number corresponding to the category, where the indexes are as follows: 0: Airplane; 1: Automobile; 2: Bird; 3: Cat; 4: Deer; 5: Dog; 6: Frog; 7: Horse; 8: Ship; 9: Truck. Provide ONLY the index number as your answer. DO NOT add more text."

    messages = [
        {
            "role": "system",
            "content": instructions
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "text", "text": "What is the category of the image? " + instructions
                },
                {
                    "type": "image_url",
                    # The payload is PNG, so the MIME type must say image/png
                    # (it was previously mislabelled as image/jpg).
                    "image_url": {"url": f"data:image/png;base64,{encoded_image}"},
                },
            ]
        }
    ]

    while True:  # retry until the request succeeds or fails with a non-Cloudflare error
        try:
            chat_completion = client.chat.completions.create(
                messages=messages,
                model="gpt-4o",
            )
            result = chat_completion.choices[0].message.content
            print(result)
            break
        except Exception as e:
            error = str(e)
            if "Cloudflare" not in error:
                # Any other failure: report it and give up; `result` stays "".
                print(error)
                break
            else:
                # Cloudflare error: wait 3 seconds, then retry.
                print("遇到 Cloudflare 错误，等待 3 秒后重试...")
                time.sleep(3)

    end_time = datetime.now()
    elapsed_time = (end_time - start_time).total_seconds()
    print(f"请求远程模型花费时间: {elapsed_time} 秒")
    return result, elapsed_time

In [29]:
def ensemble_evaluate_model(ensemble_model_list, data_loader, criterion, device,need_record=False, remote_threshold=1.0,test_mean=None,test_std=None):
    """
    Evaluate an ensemble (mean of the members' logits) with an optional remote fallback.

    For every batch the logits of all models are averaged, softmaxed, and the top-1
    class is taken. If ``0.0 <= remote_threshold <= 1.0``, each sample whose top-1
    confidence is below the threshold is sent to ``get_remote_predict`` and the local
    prediction is overwritten by the remote answer when that answer parses as an
    integer; unparsable answers are counted as rejects and the local prediction is kept.

    Args:
        ensemble_model_list: models to average; each output may be a tensor, an object
            with ``logits``, or a dict with ``"logits"``.
        data_loader: loader of ``(images, labels)`` batches.
        criterion: loss evaluated on the averaged logits.
        device: device the batches are moved to.
        need_record (bool): when True, collect per-sample confidences/correctness.
        remote_threshold (float): confidence below which the remote model is asked.
            Note the default of 1.0 routes almost every sample remotely; pass a value
            outside [0, 1] to disable the fallback.
        test_mean, test_std: normalisation statistics forwarded to get_remote_predict.

    Returns:
        (avg_loss, accuracy, confidences, is_correct): mean batch loss, accuracy in
        percent (after remote overrides), and two NumPy arrays (empty unless
        ``need_record``). ``confidences`` are always the local ensemble confidences,
        while ``is_correct`` reflects the final (possibly remote) predictions.

    Timing and remote-usage statistics are printed before returning.
    """
    for model in ensemble_model_list:
        model.eval()

    total_loss = 0.0
    correct = 0
    total = 0
    confidences = []
    is_correct = []
    total_time = 0
    local_predict_time = 0
    remote_predict_time = 0
    remote_predict_count = 0   # remote answers that parsed as an integer
    remote_reject_count = 0    # remote answers that did not parse
    remote_correct_count = 0

    with torch.no_grad():
        for images, labels in tqdm(data_loader, desc="Evaluating", leave=False):
            images, labels = images.to(device), labels.to(device)
            output_list = []
            start_time = datetime.now()
            for model in ensemble_model_list:
                # One forward pass per model; the output type is inspected afterwards
                # (re-running the model for each check tripled the measured local time).
                outputs = model(images)
                if hasattr(outputs, "logits"):
                    outputs = outputs.logits
                elif isinstance(outputs, dict):
                    outputs = outputs["logits"]
                output_list.append(outputs)
            stacked_outputs = torch.stack(output_list)
            outputs = torch.mean(stacked_outputs, dim=0)

            loss = criterion(outputs, labels)

            total_loss += loss.item()
            probabilities = torch.softmax(outputs, dim=1)
            max_confidences, predicted = probabilities.max(1)

            end_time = datetime.now()
            elapsed_time = (end_time - start_time).total_seconds()
            total_time += elapsed_time
            local_predict_time += elapsed_time
            if 0.0 <= remote_threshold <= 1.0:
                masks = max_confidences < remote_threshold
                # Visit only the low-confidence positions, in batch order.
                for i in torch.nonzero(masks).flatten().tolist():
                    remote_predict,time_spend = get_remote_predict(images[i],test_mean,test_std)
                    remote_predict_time += time_spend
                    total_time += time_spend
                    remote_predict_count += 1
                    try:
                        predicted[i] = int(remote_predict)
                        if predicted[i] == labels[i]:
                            remote_correct_count += 1
                    except Exception:
                        # Unusable reply: keep the local prediction and move this
                        # request from the "predicted" to the "rejected" tally.
                        remote_reject_count += 1
                        remote_predict_count -= 1

            total += labels.size(0)
            correct += predicted.eq(labels).sum().item()

            if need_record:
                # Keep the local confidence and the final correctness of every sample.
                confidences.extend(max_confidences.cpu().numpy())
                is_correct.extend(predicted.eq(labels).cpu().numpy())

    print("total avg time:",total_time/len(data_loader.dataset))        
    print("total count:",len(data_loader.dataset))
    print("local avg time:",local_predict_time/len(data_loader.dataset))    
    if remote_predict_count+remote_reject_count > 0:
        print("remote avg time:",remote_predict_time/(remote_predict_count+remote_reject_count))
    print("remote_predict_count:",remote_predict_count)
    print("remote_reject_count:",remote_reject_count)
    print("remote_correct_count:",remote_correct_count)
    print("total correct:", correct)
    accuracy = 100. * correct / total

    return total_loss / len(data_loader), accuracy, np.array(confidences), np.array(is_correct)


In [8]:
def save_image(path,name,image,test_mean,test_std):
    """
    Undo the normalisation of a single image tensor and write it to ``path/name``.

    The tensor is moved to CPU, de-normalised with ``test_mean`` / ``test_std``,
    clamped to [0, 1], converted to a PIL image and saved; the file format is
    whatever PIL infers from ``name``.
    """
    batched = image.cpu().unsqueeze(0)  # denormalize is called on a batch of one
    restored = denormalize(batched, test_mean, test_std)
    restored = restored.squeeze(0).clamp(0, 1)

    to_pil = torchvision.transforms.ToPILImage()
    target = path+"/"+name
    to_pil(restored).save(target)

In [9]:
def get_dataloaders(augmentation_transforms,val_augmentation_transforms, batch_size=128):
    """
    Build the training and validation DataLoaders from the dataset's "train" split.

    The split is divided 80/20 with ``random_split``. Images are normalised with
    ``pre_get_mean`` / ``pre_get_std`` when both are non-empty; otherwise the
    statistics are computed from the training part only.

    Args:
        augmentation_transforms: PIL-to-PIL transform applied to training images.
        val_augmentation_transforms: PIL-to-PIL transform applied to validation images.
        batch_size (int): batch size of both loaders.

    Returns:
        (train_loader, val_loader)

    Reads the module-level ``datasets``, ``datasets_image_column_name``,
    ``pre_get_mean`` and ``pre_get_std``.
    """
    # Load dataset from Hugging Face
    dataset = load_dataset(datasets)

    # 80% of the "train" split for training, the rest for validation.
    train_size = int(0.8 * len(dataset['train']))
    val_size = len(dataset['train']) - train_size
    train_subset, val_subset = random_split(dataset['train'], [train_size, val_size])

    if len(pre_get_mean) > 0 and len(pre_get_std) > 0:
        mean = pre_get_mean
        std = pre_get_std
    else:
        # Statistics come from the training part only. The un-normalised tensor copy
        # is built here, inside the branch that needs it, rather than unconditionally.
        to_tensor = torchvision.transforms.ToTensor()
        unnormalized_train = [
            (to_tensor(item[datasets_image_column_name].convert("RGB")), item["label"])
            for item in train_subset
        ]
        mean, std = calculate_mean_std(unnormalized_train)
        print("Train dataset mean and std:", mean, std)

    # Training transform: augmentation, then tensor conversion and normalisation.
    train_transform = torchvision.transforms.Compose([
        augmentation_transforms,
        torchvision.transforms.ToTensor(),
        torchvision.transforms.Normalize(mean=mean, std=std)
    ])

    # Validation transform: same tail, with the evaluation-time transform in front.
    eval_transform = torchvision.transforms.Compose([
        val_augmentation_transforms,
        torchvision.transforms.ToTensor(),
        torchvision.transforms.Normalize(mean=mean, std=std)
    ])

    # NOTE(review): the transforms are applied eagerly, once, while these lists are
    # built, so every epoch sees the same augmented copy of each training image.
    # Per-epoch augmentation would need a lazily transforming Dataset instead.
    train_dataset = [(train_transform(item[datasets_image_column_name].convert("RGB")), item["label"]) for item in train_subset]
    val_dataset = [(eval_transform(item[datasets_image_column_name].convert("RGB")), item["label"]) for item in val_subset]

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=4)

    return train_loader, val_loader

In [10]:
def get_test_loader(test_augmentation_transforms, batch_size=128,size=10000):
    """
    Build a DataLoader over a random subset of the dataset's "test" split.

    Normalisation uses ``pre_get_mean`` / ``pre_get_std`` when both are non-empty;
    otherwise the statistics are computed from an 80% ``random_split`` of the
    "train" split, mirroring get_dataloaders.

    Args:
        test_augmentation_transforms: PIL-to-PIL transform applied before
            ToTensor/Normalize.
        batch_size (int): loader batch size.
        size (int): number of test samples to draw with ``random.sample``.

    Returns:
        (test_loader, mean, std)

    Raises:
        ValueError: if ``size`` exceeds the number of test samples.

    Reads the module-level ``datasets``, ``datasets_image_column_name``,
    ``pre_get_mean`` and ``pre_get_std``.
    """
    # Load dataset from Hugging Face
    dataset = load_dataset(datasets)

    # Validate up front, before any image is decoded or transformed.
    if size > len(dataset["test"]):
        raise ValueError("测试集子集大小不能超过测试集大小")

    # Same 80/20 split of "train" as get_dataloaders. It is performed even when the
    # statistics are precomputed so the torch RNG is advanced exactly as before.
    train_size = int(0.8 * len(dataset['train']))
    val_size = len(dataset['train']) - train_size
    train_subset, val_subset = random_split(dataset['train'], [train_size, val_size])

    if len(pre_get_mean) > 0 and len(pre_get_std) > 0:
        mean = pre_get_mean
        std = pre_get_std
    else:
        # Statistics come from the training part only. The un-normalised tensor copy
        # is built here, inside the branch that needs it, rather than unconditionally.
        to_tensor = torchvision.transforms.ToTensor()
        unnormalized_train = [
            (to_tensor(item[datasets_image_column_name].convert("RGB")), item["label"])
            for item in train_subset
        ]
        mean, std = calculate_mean_std(unnormalized_train)
        print("Train dataset mean and std:", mean, std)

    # Evaluation transform: optional test-time transform, then tensor + normalisation.
    eval_transform = torchvision.transforms.Compose([
        test_augmentation_transforms,
        torchvision.transforms.ToTensor(),
        torchvision.transforms.Normalize(mean=mean, std=std)
    ])

    test_dataset = [(eval_transform(item[datasets_image_column_name].convert("RGB")), item["label"]) for item in dataset["test"]]

    # Draw `size` distinct indices with Python's `random` (seeded by set_random_seed).
    subset_indices = random.sample(range(len(test_dataset)), size)

    test_subset = Subset(test_dataset, subset_indices)

    test_loader = DataLoader(test_subset, batch_size=batch_size, shuffle=False, num_workers=4)

    return test_loader, mean, std

In [11]:
def visualize_confidences(correct_confidences, wrong_confidences):
    """
    Plot overlaid histograms of the top-1 confidence for correct and wrong predictions.

    Args:
        correct_confidences: max confidence scores of correctly classified samples.
        wrong_confidences: max confidence scores of misclassified samples.
    """
    # Fixed 10x6 inch canvas, matching the other plots in this notebook. The width
    # used to be `number_classes`, which tied the figure size to the number of classes.
    plt.figure(figsize=(10, 6))
    plt.hist(correct_confidences, bins=50, alpha=0.7, label='Correctly Classified', color='green')
    plt.hist(wrong_confidences, bins=50, alpha=0.7, label='Misclassified', color='orange')
    plt.xlabel('Maximum Confidence Score')
    plt.ylabel('Frequency')
    plt.title('Confidence Distribution of Correct and Wrong Classifications')
    plt.legend()
    plt.grid(True)
    plt.show()

In [12]:
class CustomMultiStepLR(_LRScheduler):
    """
    Piecewise-constant LR schedule with an explicit factor per milestone.

    Once ``last_epoch`` reaches ``milestones[k]`` the learning rate becomes
    ``base_lr * gammas[k]``. The factors are absolute, not cumulative: when several
    milestones have been reached, the one that appears last in the list wins.
    Before the first milestone the base learning rate is used unchanged.
    """

    def __init__(self, optimizer, milestones, gammas, last_epoch=-1):
        assert len(milestones) == len(gammas), "Milestones and gammas must have the same length"
        # Must be set before the base initialiser runs, because it calls get_lr().
        self.milestones = milestones
        self.gammas = gammas
        super().__init__(optimizer, last_epoch)

    def get_lr(self):
        # Factors of all milestones already reached, in list order.
        reached = [
            gamma
            for milestone, gamma in zip(self.milestones, self.gammas)
            if self.last_epoch >= milestone
        ]
        scale = reached[-1] if reached else 1.0
        return [base_lr * scale for base_lr in self.base_lrs]

In [13]:
# De-normalisation helper
def denormalize(tensor, mean, std):
    """
    Undo a per-channel ``(x - mean) / std`` normalisation.

    :param tensor: normalised tensor with channels on dimension 1, e.g. (N, C, H, W)
    :param mean: per-channel means (sequence of C numbers)
    :param std: per-channel standard deviations (sequence of C numbers)
    :return: ``tensor * std + mean``, on the same device as ``tensor``
    """
    # Build the statistics on the input's device so GPU tensors work without the
    # caller having to move them to the CPU first. Reshape to (1, C, 1, 1) so they
    # broadcast over batch and spatial dimensions.
    mean = torch.tensor(mean, device=tensor.device).view(1, -1, 1, 1)
    std = torch.tensor(std, device=tensor.device).view(1, -1, 1, 1)
    return tensor * std + mean

In [14]:
def plot_confidence_accuracy_and_ece(confidences, is_correct, num_bins=10):
    """
    Plot accuracy and sample count against a confidence threshold, and compute the ECE.

    :param confidences: NumPy array with the top-1 confidence of every sample
    :param is_correct: NumPy array, 1/True where the prediction was right, else 0/False
    :param num_bins: number of equal-width confidence bins used for the ECE
    :return: (ece, bin_accuracies, bin_confidences, bin_sample_counts); empty bins
             contribute 0.0 / 0.0 / 0 to the three lists
    """
    # --- curves: keep only samples with confidence >= tau, for 50 values of tau ---
    thresholds = np.linspace(0.0, 1.0, 50)
    kept_accuracy = []
    kept_count = []
    for tau in thresholds:
        selected = confidences >= tau
        n_selected = selected.sum()
        # With nothing selected the accuracy is reported as 0.
        kept_accuracy.append(is_correct[selected].mean() if n_selected > 0 else 0.0)
        kept_count.append(n_selected)

    # --- ECE: size-weighted |accuracy - confidence| over (lower, upper] bins ---
    edges = np.linspace(0.0, 1.0, num_bins + 1)
    ece = 0.0
    bin_accuracies, bin_confidences, bin_sample_counts = [], [], []
    for lower, upper in zip(edges[:-1], edges[1:]):
        in_bin = (confidences > lower) & (confidences <= upper)
        count = in_bin.sum()

        if count == 0:
            bin_accuracies.append(0.0)
            bin_confidences.append(0.0)
            bin_sample_counts.append(0)
            continue

        accuracy_here = is_correct[in_bin].mean()
        confidence_here = confidences[in_bin].mean()
        ece += (count / len(confidences)) * abs(accuracy_here - confidence_here)

        bin_accuracies.append(accuracy_here)
        bin_confidences.append(confidence_here)
        bin_sample_counts.append(count)

    # --- two stacked panels sharing the same x axis meaning ---
    plt.figure(figsize=(10, 8))
    panels = (
        (1, kept_accuracy, "Accuracy", "Accuracy vs. Confidence Threshold", "b"),
        (2, kept_count, "Sample Count", "Sample Count vs. Confidence Threshold", "g"),
    )
    for position, series, y_label, heading, colour in panels:
        plt.subplot(2, 1, position)
        plt.plot(thresholds, series, label=heading, color=colour)
        plt.xlabel("Confidence Threshold (τ)")
        plt.ylabel(y_label)
        plt.title(heading)
        plt.grid(True)
        plt.legend()

    plt.tight_layout()
    plt.show()

    return ece, bin_accuracies, bin_confidences, bin_sample_counts


In [15]:
def evaluate_score_vs_threshold(confidences, is_correct, alpha=0.5, beta=0.5):
    """
    Compute accuracy, coverage and a combined score for 50 confidence thresholds.

    For each threshold tau in ``np.linspace(0, 1, 50)`` only samples with
    ``confidence >= tau`` are kept; accuracy is the fraction of kept samples that are
    correct and coverage is the fraction of samples kept (both 0 when nothing is kept).
    The score is ``alpha * accuracy + beta * coverage + gamma * accuracy * coverage``
    with ``gamma = 1 - alpha - beta``.

    :param confidences: top-1 confidence of every sample (array or list)
    :param is_correct: correctness of every sample, 1 = correct, 0 = wrong (array or list)
    :param alpha: weight of the accuracy
    :param beta: weight of the coverage
    :return: thresholds, accuracies, coverages, scores

    If ``alpha + beta > 1`` the weights are adjusted: beta becomes ``1 - alpha`` when
    that lies in [0, 1], otherwise both are reset to 0.5. The adjustment is reported
    on stdout, as is the threshold with the best score (first one on ties).
    """
    # Accept plain lists as well as arrays; the masking below needs NumPy semantics.
    confidences = np.asarray(confidences)
    is_correct = np.asarray(is_correct)

    # The weights do not depend on the threshold, so they are validated once here
    # rather than on every loop iteration.
    if alpha + beta > 1.0:
        beta = 1 - alpha
        if beta >= 0 and beta <= 1.0:
            print(f"beta has been justified to {beta}")
        else:
            alpha = 0.5
            beta = 0.5
            print(f"alpha has been justified to {alpha}")
            print(f"beta has been justified to {beta}")
    gamma = 1 - alpha - beta

    thresholds = np.linspace(0.0, 1.0, 50)
    accuracies = []
    coverages = []
    scores = []

    total_samples = len(confidences)

    for tau in thresholds:
        mask = confidences >= tau
        kept = mask.sum()

        if kept > 0:
            accuracy = is_correct[mask].mean()
            coverage = kept / total_samples
        else:
            accuracy = 0.0
            coverage = 0.0

        score = alpha * accuracy + beta * coverage + gamma * accuracy * coverage

        accuracies.append(accuracy)
        coverages.append(coverage)
        scores.append(score)

    best_score_index = scores.index(max(scores))
    print("Threshold of the best score:", thresholds[best_score_index])
    print("Accuracy of the best score:", accuracies[best_score_index])
    print("Coverage of the best score:", coverages[best_score_index])
    return thresholds, accuracies, coverages, scores

def plot_score_vs_threshold(thresholds, accuracies, coverages, scores):
    """
    Plot accuracy, coverage and score against the confidence threshold.

    All three curves share one 10x6 figure: accuracy in blue, coverage in green
    and the score as a dashed red line. The figure is displayed with ``plt.show()``.

    :param thresholds: x values (confidence thresholds)
    :param accuracies: accuracy at each threshold
    :param coverages: coverage at each threshold
    :param scores: combined score at each threshold
    """
    # One entry per curve: the y values and the keyword arguments for plt.plot.
    curve_specs = (
        (accuracies, {"label": "Accuracy", "color": "blue"}),
        (coverages, {"label": "Coverage", "color": "green"}),
        (scores, {"label": "Score", "color": "red", "linestyle": "--"}),
    )

    plt.figure(figsize=(10, 6))
    for y_values, plot_kwargs in curve_specs:
        plt.plot(thresholds, y_values, **plot_kwargs)

    plt.xlabel("Confidence Threshold (τ)")
    plt.ylabel("Value")
    plt.title("Accuracy, Coverage, and Score vs. Confidence Threshold")
    plt.legend()
    plt.grid(True)
    plt.show()

In [30]:
# Loaders are collected in flat lists, in the order they are created below:
# one pair per model, or one pair per (model, augmentation combination) when
# if_aug_matrix is enabled.
train_loader_list, val_loader_list = [], []

# The test loader is built under the evaluation seed, whether or not we train.
set_random_seed(eval_seed)
test_loader, test_mean, test_std = get_test_loader(
    test_augmentation_transforms, batch_size=batch_size, size=1000
)

if not is_eval:
    for i, model in enumerate(pretrain_model_list):
        set_random_seed(seeds[i])
        # Matrix mode: every augmentation combination for every model.
        # Otherwise: the single augmentation assigned to this model.
        train_transforms = aug_matrix_train_aug_combines if if_aug_matrix else [train_augmentations[i]]
        for train_transform in train_transforms:
            if if_aug_matrix:
                print(f"aug:{train_transform}")
            train_loader, val_loader = get_dataloaders(train_transform, test_augmentation_transforms, batch_size)
            train_loader_list.append(train_loader)
            val_loader_list.append(val_loader)

Train dataset mean and std: [0.4917724132537842, 0.4823954701423645, 0.44665318727493286] [0.24700681865215302, 0.24335941672325134, 0.2616914212703705]


In [17]:
if not is_eval:
    # Train every model. In augmentation-matrix mode each model is trained once per
    # augmentation combination and restored to its starting weights after each run.
    for i,model in enumerate(pretrain_model_list):
        original_state = copy.deepcopy(model.state_dict())
        aug_times = 1
        if if_aug_matrix and len(aug_matrix_train_aug_combines)>0:
            aug_times = len(aug_matrix_train_aug_combines)
        for j in range(aug_times):
            # Flat index of this (model, augmentation) run; the loader lists, the
            # augmentation configs and the checkpoint file names all use it.
            run_index = i*aug_times+j
            run_train_loader = train_loader_list[run_index]
            run_val_loader = val_loader_list[run_index]
            run_aug_config = train_aug_config[run_index]

            set_random_seed(seeds[i])
            best_val_loss = float('inf')
            loss_best_model = None
            best_val_acc = 0
            acc_best_model = None

            if use_adam:
                optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
            elif use_sgd:
                optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=sgd_momentum, weight_decay=weight_decay, nesterov=sgd_nesterov)
            else:
                optimizer = optim.SGD(model.parameters(), lr=learning_rate)

            if use_custom_scheduler:
                scheduler = CustomMultiStepLR(optimizer, custom_scheduler_milestones, custom_scheduler_gammas)
            elif use_cosine_annealing:
                scheduler = CosineAnnealingLR(optimizer, T_max=num_epochs)
            elif use_onecyclelr:
                # Size the schedule from the loader this run actually trains on
                # (previously indexed with the model index `i` only).
                # NOTE(review): the scheduler is stepped once per epoch below, while
                # steps_per_epoch sizes the cycle per batch -- confirm whether train()
                # steps it, otherwise only a small part of the cycle is ever reached.
                scheduler = OneCycleLR(optimizer, learning_rate, epochs=num_epochs, steps_per_epoch=len(run_train_loader))
            elif use_reducelronplateau:
                scheduler = ReduceLROnPlateau(optimizer, mode=reducelronplateau_mode, factor=reducelronplateau_factor, patience=reducelronplateau_patience, threshold=reducelronplateau_threshold)
            elif use_steplr:
                scheduler = StepLR(optimizer, step_size=steplr_step_size, gamma=steplr_gamma)
            else:
                scheduler = CosineAnnealingLR(optimizer, T_max=num_epochs)

            for epoch in range(num_epochs):
                # Read the LR from the optimizer: it works for every scheduler type,
                # including ReduceLROnPlateau on PyTorch versions without get_last_lr().
                current_lr = optimizer.param_groups[0]['lr']
                print(f"Epoch {epoch + 1}/{num_epochs}, LR: {current_lr}")
                train_loss = train(model, run_train_loader, optimizer, criterion, device, run_aug_config)
                val_loss, val_accuracy,_,_ = evaluate_model(model, run_val_loader, criterion, device, need_record=False)

                # Checkpoints are written to disk immediately; the in-memory
                # state_dict is not copied here.
                if(val_loss <= best_val_loss):
                    best_val_loss = val_loss
                    loss_best_model = model.state_dict()
                    torch.save(loss_best_model, "loss_best_model_"+save_bin_name+"-"+str(run_index)+".bin")
                    print(f"val loss best model saved, best_val_loss: {best_val_loss}")
                if(val_accuracy > best_val_acc):
                    best_val_acc = val_accuracy
                    acc_best_model = model.state_dict()
                    torch.save(acc_best_model, "acc_best_model_"+save_bin_name+"-"+str(run_index)+".bin")
                    print(f"val accuracy best model saved, best_val_acc: {best_val_acc}")
                print(f"Train Loss: {train_loss:.4f}, val Loss: {val_loss:.4f}, val Accuracy: {val_accuracy:.2f}%")
                # ReduceLROnPlateau needs the monitored metric; the others step blindly.
                if use_reducelronplateau:
                    scheduler.step(metrics=val_accuracy)
                else:
                    scheduler.step()
            if if_aug_matrix and len(aug_matrix_train_aug_combines)>0:
                aug_matrix_val_results.append(best_val_acc)
                print(aug_matrix_val_results)
                # Reset so the next augmentation combination starts from the same weights.
                model.load_state_dict(original_state)

In [45]:
if is_eval:
    set_random_seed(eval_seed)

    # Evaluate the model ensemble on the test set and collect per-sample records.
    test_loss, test_accuracy, confidences, is_correct = ensemble_evaluate_model(
        pretrain_model_list,
        test_loader,
        criterion,
        device,
        need_record=True,
        remote_threshold=0.9796,
        test_mean=test_mean,
        test_std=test_std,
    )
    print(f"test_loss: {test_loss}, test_accuracy: {test_accuracy}")

    # Plot the calibration curves and compute the ECE.
    calibration_result = plot_confidence_accuracy_and_ece(confidences, is_correct, num_bins=10)
    ece, bin_accuracies, bin_confidences, bin_sample_counts = calibration_result
    print(f"Expected Calibration Error (ECE): {ece:.4f}")

    # Per-bin details, numbered from 1.
    print("\nBin Information:")
    per_bin_stats = zip(bin_accuracies, bin_confidences, bin_sample_counts)
    for bin_number, (bin_acc, bin_conf, bin_count) in enumerate(per_bin_stats, start=1):
        print(f"Bin {bin_number}: Accuracy = {bin_acc:.4f}, Confidence = {bin_conf:.4f}, Samples = {bin_count}")

    # Accuracy, coverage and score at each confidence threshold.
    thresholds, accuracies, coverages, scores = evaluate_score_vs_threshold(
        confidences, is_correct, alpha=0.8, beta=0.1
    )

    # Plot the three curves.
    plot_score_vs_threshold(thresholds, accuracies, coverages, scores)

Evaluating:   0%|          | 0/8 [00:00<?, ?it/s]

start request remote model.
**Category:** Animal 

请求远程模型花费时间: 6.214716 秒
start request remote model.
I currently cannot view or analyze images directly. However, if you can describe the image to me, I'd be happy to help you determine which category it belongs to!
请求远程模型花费时间: 2.828109 秒
start request remote model.
**Image Category:**

* The image appears to be of a **skull**. 

请求远程模型花费时间: 2.141016 秒
start request remote model.


                                                 

KeyboardInterrupt: 

In [None]:
# set_random_seed(eval_seed)
# for i,model in enumerate(pretrain_model_list):
#     global_correct_confidences = []
#     global_wrong_confidences = []
#     test_loss, test_accuracy, correct_confidences, wrong_confidences = evaluate_model(model, test_loader, criterion, device, need_record=True)
#     global_correct_confidences.extend(correct_confidences)
#     global_wrong_confidences.extend(wrong_confidences)
#     print(f"test_loss: {test_loss}, test_accuracy: {test_accuracy}")
#     visualize_confidences(global_correct_confidences, global_wrong_confidences)

In [None]:
# set_random_seed(eval_seed)
# for i,model in enumerate(pretrain_model_list):
#     global_correct_confidences = []
#     global_wrong_confidences = []
#     model.load_state_dict(torch.load("loss_best_model_"+save_bin_name+"-"+str(i)+".bin"))
#     test_loss, test_accuracy, correct_confidences, wrong_confidences = evaluate_model(model, test_loader, criterion, device, need_record=True)
#     global_correct_confidences.extend(correct_confidences)
#     global_wrong_confidences.extend(wrong_confidences)
#     print(f"test_loss: {test_loss}, test_accuracy: {test_accuracy}")
#     visualize_confidences(global_correct_confidences, global_wrong_confidences)

In [None]:
# set_random_seed(eval_seed)
# for i,model in enumerate(pretrain_model_list):
#     global_correct_confidences = []
#     global_wrong_confidences = []
#     model.load_state_dict(torch.load("acc_best_model_"+save_bin_name+"-"+str(i)+".bin"))
#     test_loss, test_accuracy, correct_confidences, wrong_confidences = evaluate_model(model, test_loader, criterion, device, need_record=True)
#     global_correct_confidences.extend(correct_confidences)
#     global_wrong_confidences.extend(wrong_confidences)
#     print(f"test_loss: {test_loss}, test_accuracy: {test_accuracy}")
#     visualize_confidences(global_correct_confidences, global_wrong_confidences)