In [21]:
import numpy as np
import SearchSpace as ss
import ModelBuild as Builder
import TrainModel as Trainer
from google.colab import drive
import pandas as pd
import os
import torch
import random

import torch
from torch.utils.data import DataLoader, Subset
from torchvision import datasets, transforms

# Mount Google Drive at /content/drive; the result directories used later in
# this notebook live under /content/drive/MyDrive.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [22]:
# Sample hyperparameter combination(s) from the search-space module and print
# them as a quick sanity check of the sampler.
# NOTE(review): this cell requests strategy="genetic" and labels the output
# "random", while the rest of the notebook uses "successive_halving" —
# confirm whether this is intentional or a leftover from another experiment.
random_params = ss.create_param_combinations(strategy="genetic")
print("random Search Combinations:", random_params)

random Search Combinations: [{'num_layers': 2, 'units_per_layer': 111, 'activation': 'tanh', 'learning_rate': 0.0011443604255654146, 'batch_size': 256, 'dropout_rate': 0.17944005149147468, 'l2_reg_strength': 0.0025300435383487723}]


In [23]:
def _nan_safe_loss(loss):
    """Return ``loss`` as a float, mapping NaN to +inf.

    Used as the sort key when selecting survivors: every comparison against
    NaN is False, so sorting raw losses that contain NaN yields an undefined
    order. Mapping NaN to +inf makes diverged runs rank last.
    """
    import math

    value = float(loss)
    return math.inf if math.isnan(value) else value


def successive_halving(train_set, val_set, save_dir="/content/drive/MyDrive/DL_HPO/SHResult", stages=None):
    """Run a successive-halving hyperparameter search.

    Stage 1 samples a fresh population with ``initialize_population``. Every
    later stage keeps the ``num_models`` configurations with the lowest final
    validation loss from the previous stage. In each stage every surviving
    configuration is rebuilt with ``Builder.build_model`` and trained again
    for that stage's number of epochs.

    Side effects (all under ``save_dir``):
      * ``sh_result.csv``: one row appended per trained model (the file is
        appended to if it already exists, e.g. from an earlier run).
      * ``best_sh_result.csv`` / ``best_model_weights.pt``: overwritten each
        time a model reaches a new best validation accuracy.
      * ``sh_stage<k>_epoch<e>.csv``: the full results of stage ``k`` for
        stages trained with 10 or 27 epochs (stage 5 and stage 6 of the
        default schedule).

    Args:
        train_set: Dataset wrapped in a shuffling ``DataLoader`` for training.
        val_set: Dataset wrapped in a non-shuffling ``DataLoader`` for
            validation.
        save_dir: Directory that receives the CSV files and the weights file;
            created if missing.
        stages: Optional schedule, a list of dicts with keys ``"num_models"``
            and ``"epochs"``. Defaults to the six-stage schedule
            1500/750/375/187/20/2 models at 2/4/6/8/10/27 epochs.

    Returns:
        Tuple ``(all_results, best_result, best_model_wts)``: the list of
        per-model summary dicts, the summary dict of the model with the
        highest validation accuracy (``None`` if no model exceeded 0.0), and
        that model's ``state_dict`` (``None`` likewise).
    """
    if stages is None:
        stages = [
            {"num_models": 1500, "epochs": 2},
            {"num_models": 750, "epochs": 4},
            {"num_models": 375, "epochs": 6},
            {"num_models": 187, "epochs": 8},
            {"num_models": 20, "epochs": 10},
            {"num_models": 2, "epochs": 27},
        ]

    # Make sure the output directory exists before anything is written.
    os.makedirs(save_dir, exist_ok=True)
    summary_file_path = f"{save_dir}/sh_result.csv"

    all_results = []
    best_result = None
    best_val_acc = 0.0
    best_model_wts = None
    total_training_time = 0
    population = []
    fitness_scores = []

    for stage_idx, stage in enumerate(stages):
        num_models = stage["num_models"]
        epochs = stage["epochs"]
        print(f"Stage {stage_idx + 1} - Models: {num_models}, Epochs: {epochs}")

        if stage_idx == 0:
            # First stage: sample the initial population.
            population = initialize_population(num_models)
        else:
            # Later stages: keep the configurations with the lowest validation
            # loss from the previous stage. NaN losses are ranked last.
            ranked = sorted(
                range(len(fitness_scores)),
                key=lambda i: _nan_safe_loss(fitness_scores[i]),
            )
            population = [population[i] for i in ranked[:num_models]]

        # Evaluate every member of the current population.
        fitness_scores = []
        stage_results = []
        for param_index, params in enumerate(population):
            print(params)
            model = Builder.build_model(params)
            batch_size = int(params["batch_size"])
            train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
            val_loader = DataLoader(val_set, batch_size=batch_size, shuffle=False)

            result = Trainer.train_model(model, train_loader, val_loader, epochs=epochs)
            # The final validation loss is the fitness (lower is better).
            fitness_scores.append(result["val_losses"][-1])

            # Running total of training time across all models and stages.
            total_training_time += result["training_time"]

            result_summary = {
                "stage": stage_idx + 1,
                "param_index": param_index,
                "params": params,
                "strategy": "successive_halving",
                "train_loss": result["train_losses"][-1],
                "val_loss": result["val_losses"][-1],
                "train_accuracy": result["train_accuracies"][-1],
                "val_accuracy": result["val_accuracies"][-1],
                "training_time": result["training_time"],
                "total_training_time": total_training_time,
                "epochs": epochs
            }
            all_results.append(result_summary)
            stage_results.append(result_summary)

            # Append this row to the running summary right away so progress
            # survives an interrupted session; write the header only when the
            # file does not exist yet.
            pd.DataFrame([result_summary]).to_csv(
                summary_file_path,
                mode='a',
                header=not os.path.exists(summary_file_path),
                index=False,
            )

            # Track and persist the best model by validation accuracy.
            if result["val_accuracies"][-1] > best_val_acc:
                best_val_acc = result["val_accuracies"][-1]
                best_result = result_summary
                best_model_wts = model.state_dict()

                pd.DataFrame([best_result]).to_csv(f"{save_dir}/best_sh_result.csv", index=False)
                torch.save(best_model_wts, f"{save_dir}/best_model_weights.pt")

        # Additionally store the 10-epoch and 27-epoch stages on their own
        # (stage 5 and stage 6 in the default schedule). Only the rows
        # produced by this stage are written.
        if epochs in (10, 27):
            pd.DataFrame(stage_results).to_csv(
                f"{save_dir}/sh_stage{stage_idx + 1}_epoch{epochs}.csv", index=False
            )

    return all_results, best_result, best_model_wts

# Build the initial population of hyperparameter sets.
def initialize_population(size=1500):
    """Return a list of ``size`` sampled individuals.

    Each individual is the first element of the list returned by one call to
    ``ss.create_param_combinations(strategy="successive_halving")``.
    """
    return [
        ss.create_param_combinations(strategy="successive_halving")[0]
        for _ in range(size)
    ]

In [24]:
# Preprocessing for MNIST: convert images to tensors, then normalize the
# single channel with mean 0.5 and std 0.5.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))
])

# Download (if needed) and load the MNIST train and test splits.
train_dataset = datasets.MNIST(root='./data', train=True, download=True, transform=transform)
test_dataset = datasets.MNIST(root='./data', train=False, download=True, transform=transform)

# Keep only the first 3000 training samples, and use the first 500 samples of
# the MNIST *test* split as the validation set, to keep the search fast.
train_subset = Subset(train_dataset, range(3000))
val_subset = Subset(test_dataset, range(500))

In [None]:
# Run the full successive-halving search on the MNIST subsets.
# NOTE(review): save_dir ends in "SFResult" whereas the default in
# successive_halving is ".../SHResult" — confirm the directory name is intended.
all_results, best_result, best_model_wts = successive_halving(train_subset, val_subset,save_dir="/content/drive/MyDrive/DL_HPO/SFResult")

Stage 1 - Models: 1500, Epochs: 2
{'num_layers': 2, 'units_per_layer': 70, 'activation': 'tanh', 'learning_rate': 0.0009704174675462485, 'batch_size': 64, 'dropout_rate': 0.32654297061198934, 'l2_reg_strength': 4.7906175484690725e-06}
{'num_layers': 1, 'units_per_layer': 97, 'activation': 'tanh', 'learning_rate': 0.0024690173844576528, 'batch_size': 16, 'dropout_rate': 0.07583940866097771, 'l2_reg_strength': 1.2895482018754454e-05}
{'num_layers': 4, 'units_per_layer': 22, 'activation': 'sigmoid', 'learning_rate': 0.0007526872992185445, 'batch_size': 64, 'dropout_rate': 0.4557088393159684, 'l2_reg_strength': 2.823314399090332e-07}
{'num_layers': 2, 'units_per_layer': 6, 'activation': 'sigmoid', 'learning_rate': 0.00012399740803034807, 'batch_size': 128, 'dropout_rate': 0.23181433887334063, 'l2_reg_strength': 0.0001359723978681468}
{'num_layers': 4, 'units_per_layer': 81, 'activation': 'relu', 'learning_rate': 0.2523745664719103, 'batch_size': 256, 'dropout_rate': 0.2577936429707882, 'l2