In [1]:
import numpy as np
import SearchSpace as ss
import ModelBuild as Builder
import TrainModel as Trainer
from google.colab import drive
import pandas as pd
import os
import torch

import torch
from torch.utils.data import DataLoader, Subset
from torchvision import datasets, transforms

# Mount Google Drive at /content/drive (Colab only) so that paths under
# /content/drive/MyDrive are available for reading and writing results.
drive.mount('/content/drive')

Output hidden; open in https://colab.research.google.com to view.

In [2]:
# import torch

# # Check whether a GPU is available
# print("GPU available:", torch.cuda.is_available())
# print("GPU device name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU available")
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
# Build the hyperparameter combinations for the "grid" strategy and print
# only the first combination (index 0) as a quick sanity check.
grid_params = ss.create_param_combinations(strategy="grid")
print("Grid Search Combinations:", grid_params[0])

Grid Search Combinations: {'num_layers': 1, 'units_per_layer': 2, 'activation': 'relu', 'learning_rate': 0.0001, 'batch_size': 16, 'dropout_rate': 0.05, 'l2_reg_strength': 1e-07}


In [4]:
# model = Builder.build_model(grid_params[0])
# print(model)

In [5]:

# Define the hyperparameter search function
def grid_search(param_grid, train_set, val_set, epochs=10, save_dir="/content/drive/MyDrive/DL_HPO/GridResult", start_index=14256):
    """Train one model per hyperparameter combination and persist the results.

    For every combination from ``start_index`` to the end of ``param_grid``
    a model is built with ``Builder.build_model``, trained with
    ``Trainer.train_model``, and a one-row summary is appended to
    ``{save_dir}/grid_result.csv``. The best model seen so far is stored in
    ``{save_dir}/best_grid_result.csv`` and ``{save_dir}/best_model_weights.pt``.

    Args:
        param_grid: Sequence of hyperparameter dicts; each must contain
            ``"batch_size"`` and whatever ``Builder.build_model`` expects.
        train_set: Dataset wrapped in a shuffling ``DataLoader`` for training.
        val_set: Dataset wrapped in a non-shuffling ``DataLoader`` for validation.
        epochs: Number of epochs forwarded to ``Trainer.train_model``.
        save_dir: Directory (created if missing) that receives the CSV files
            and the best model weights.
        start_index: 1-based index of the first combination to evaluate.
            Defaults to 14256, the resume point that used to be hard-coded in
            the loop; pass 1 to sweep the whole grid.

    Returns:
        Tuple ``(all_results, best_result, best_model_wts)`` describing THIS
        call only: the list of per-combination summaries, the summary with the
        highest final validation accuracy (``None`` if nothing was evaluated or
        no accuracy exceeded 0.0), and that model's ``state_dict``.

    Raises:
        ValueError: If ``start_index`` is smaller than 1.
    """
    # Index 0 would silently wrap around to param_grid[-1]; reject it up front.
    if start_index < 1:
        raise ValueError(f"start_index must be >= 1, got {start_index}")

    all_results = []
    best_result = None
    best_val_acc = 0.0
    best_model_wts = None
    total_training_time = 0  # cumulative training time across combinations

    # Make sure the output directory exists.
    os.makedirs(save_dir, exist_ok=True)

    summary_file_path = f"{save_dir}/grid_result.csv"
    best_result_path = f"{save_dir}/best_grid_result.csv"
    best_weights_path = f"{save_dir}/best_model_weights.pt"

    # Resume support: continue the cumulative time from an existing summary.
    # An existing file without data rows would yield NaN from .max(), which
    # would poison every later total, so only accept a real number.
    if os.path.exists(summary_file_path):
        existing_results_df = pd.read_csv(summary_file_path)
        if not existing_results_df.empty:
            previous_total = existing_results_df["total_training_time"].max()
            if pd.notna(previous_total):
                total_training_time = float(previous_total)

    # Resume support: do not let a resumed run overwrite a better model that
    # an earlier run already stored. The stored accuracy is trusted only when
    # both the best-result CSV and the weights file are present.
    persisted_best_acc = 0.0
    if os.path.exists(best_result_path) and os.path.exists(best_weights_path):
        previous_best_df = pd.read_csv(best_result_path)
        if not previous_best_df.empty and "val_accuracy" in previous_best_df.columns:
            previous_best_acc = previous_best_df["val_accuracy"].max()
            if pd.notna(previous_best_acc):
                persisted_best_acc = float(previous_best_acc)

    # Walk the grid using 1-based indices, beginning at start_index.
    for param_index in range(start_index, len(param_grid) + 1):

        params = param_grid[param_index - 1]  # current hyperparameter combination
        print(params)
        # Build a fresh model for this combination.
        model = Builder.build_model(params)

        # The batch size is itself a hyperparameter, so loaders are rebuilt each time.
        batch_size = params["batch_size"]
        train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_set, batch_size=batch_size, shuffle=False)

        # Train the model.
        result = Trainer.train_model(model, train_loader, val_loader, epochs=epochs)

        # Add this model's training time to the running total.
        total_training_time += result["training_time"]
        final_val_acc = result["val_accuracies"][-1]

        # Combine the hyperparameters with the final-epoch metrics.
        result_summary = {
            "param_index": param_index,
            "params": params,
            # NOTE(review): placeholder value kept as-is so new rows stay
            # consistent with rows already written to the summary CSV.
            "strategy": "your_strategy_name",
            "train_loss": result["train_losses"][-1],
            "val_loss": result["val_losses"][-1],
            "train_accuracy": result["train_accuracies"][-1],
            "val_accuracy": final_val_acc,
            "training_time": result["training_time"],
            "total_training_time": total_training_time
        }

        all_results.append(result_summary)

        # Append this row to the summary file (write the header only on creation).
        result_df = pd.DataFrame([result_summary])
        if os.path.exists(summary_file_path):
            result_df.to_csv(summary_file_path, mode='a', header=False, index=False)
        else:
            result_df.to_csv(summary_file_path, mode='w', index=False)

        # Track the best model of this call (this is what gets returned).
        if final_val_acc > best_val_acc:
            best_val_acc = final_val_acc
            best_result = result_summary
            best_model_wts = model.state_dict()

        # Update the files on disk only when the stored best is actually beaten.
        if final_val_acc > persisted_best_acc:
            persisted_best_acc = final_val_acc
            pd.DataFrame([result_summary]).to_csv(best_result_path, index=False)
            torch.save(model.state_dict(), best_weights_path)

    return all_results, best_result, best_model_wts



In [6]:

# Download and preprocess MNIST: convert images to tensors, then normalize
# with mean 0.5 and standard deviation 0.5.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))
])

# Load the MNIST train and test splits (stored under ./data).
train_dataset = datasets.MNIST(root='./data', train=True, download=True, transform=transform)
test_dataset = datasets.MNIST(root='./data', train=False, download=True, transform=transform)

# Use only the first N samples of each split to keep the search affordable.
# Note that the validation subset is taken from the MNIST *test* split.
N_TRAIN_SAMPLES = 3000
N_VAL_SAMPLES = 500
train_subset = Subset(train_dataset, range(N_TRAIN_SAMPLES))
val_subset = Subset(test_dataset, range(N_VAL_SAMPLES))

# # Create DataLoaders
# train_loader = DataLoader(train_subset, batch_size=32, shuffle=True)
# val_loader = DataLoader(val_subset, batch_size=32, shuffle=False)

# # Check the dataset sizes
# print("Training set size:", len(train_loader.dataset))
# print("Validation set size:", len(val_loader.dataset))


Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Failed to download (trying next):
<urlopen error [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: certificate has expired (_ssl.c:1007)>

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz to ./data/MNIST/raw/train-images-idx3-ubyte.gz


100%|██████████| 9.91M/9.91M [00:00<00:00, 75.6MB/s]


Extracting ./data/MNIST/raw/train-images-idx3-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Failed to download (trying next):
<urlopen error [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: certificate has expired (_ssl.c:1007)>

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz to ./data/MNIST/raw/train-labels-idx1-ubyte.gz


100%|██████████| 28.9k/28.9k [00:00<00:00, 24.7MB/s]

Extracting ./data/MNIST/raw/train-labels-idx1-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz





Failed to download (trying next):
<urlopen error [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: certificate has expired (_ssl.c:1007)>

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw/t10k-images-idx3-ubyte.gz


100%|██████████| 1.65M/1.65M [00:00<00:00, 92.8MB/s]

Extracting ./data/MNIST/raw/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw






Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Failed to download (trying next):
<urlopen error [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: certificate has expired (_ssl.c:1007)>

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz


100%|██████████| 4.54k/4.54k [00:00<00:00, 8.74MB/s]

Extracting ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw






In [7]:
# Number of hyperparameter combinations contained in grid_params.
print(len(grid_params))

15360


In [None]:
# Run the search on the MNIST subsets. As written, grid_search starts from
# combination index 14256 (continuing an earlier run), not from the first one;
# per-combination results and the best model are saved under save_dir.
all_results, best_result, best_model_wts = grid_search(
    grid_params,
    train_subset,
    val_subset,
    epochs=10,
    save_dir="/content/drive/MyDrive/DL_HPO/GridResult",
)

{'num_layers': 4, 'units_per_layer': 86, 'activation': 'sigmoid', 'learning_rate': 0.02080083823051904, 'batch_size': 16, 'dropout_rate': 0.6, 'l2_reg_strength': 0.01}
{'num_layers': 4, 'units_per_layer': 86, 'activation': 'sigmoid', 'learning_rate': 0.02080083823051904, 'batch_size': 32, 'dropout_rate': 0.05, 'l2_reg_strength': 1e-07}
{'num_layers': 4, 'units_per_layer': 86, 'activation': 'sigmoid', 'learning_rate': 0.02080083823051904, 'batch_size': 32, 'dropout_rate': 0.05, 'l2_reg_strength': 4.641588833612782e-06}
{'num_layers': 4, 'units_per_layer': 86, 'activation': 'sigmoid', 'learning_rate': 0.02080083823051904, 'batch_size': 32, 'dropout_rate': 0.05, 'l2_reg_strength': 0.00021544346900318845}
{'num_layers': 4, 'units_per_layer': 86, 'activation': 'sigmoid', 'learning_rate': 0.02080083823051904, 'batch_size': 32, 'dropout_rate': 0.05, 'l2_reg_strength': 0.01}
{'num_layers': 4, 'units_per_layer': 86, 'activation': 'sigmoid', 'learning_rate': 0.02080083823051904, 'batch_size': 32