In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import emukit
import GPy
from emukit.core import ParameterSpace, ContinuousParameter, DiscreteParameter
from emukit.core.initial_designs.random_design import RandomDesign
from emukit.core.initial_designs.latin_design import LatinDesign
from GPy.models import GPRegression
from emukit.model_wrappers import GPyModelWrapper

from emukit.bayesian_optimization.loops import BayesianOptimizationLoop
from emukit.bayesian_optimization.acquisitions import ExpectedImprovement, \
                                                      NegativeLowerConfidenceBound, \
                                                      MaxValueEntropySearch, \
                                                      ProbabilityOfImprovement
from emukit.core.acquisition import IntegratedHyperParameterAcquisition



In [11]:
import os
import pickle

from sklearn.preprocessing import MinMaxScaler, StandardScaler
import tqdm

import logging
# Raise the threshold of the loggers named 'variance' and 'lengthscale' to
# ERROR so that their lower-severity messages are hidden (presumably these are
# the GPy parameter loggers that fire during fitting -- TODO confirm).
for _logger_name in ('variance', 'lengthscale'):
    logging.getLogger(_logger_name).setLevel(logging.ERROR)

# Single scaler instance shared by the rest of the notebook.
min_max_scaler = MinMaxScaler()


# Folder path of the processed source data.
folder_path = r'C:\Users\13282\Desktop\papercode\data\Aryl\original_processed'

# Create the folder (including any missing parents) when it does not exist.
# NOTE(review): if this folder also holds the input files, creating it empty
# only postpones the failure to the first read -- confirm this is intended.
folder_is_missing = not os.path.exists(folder_path)
if folder_is_missing:
    os.makedirs(folder_path)


# Folder (relative to the working directory) that receives one pickled model
# per source dataset. It is created up front so that a missing folder cannot
# make the first pickle write fail after a slow GP fit has already completed.
output_dir = 'pca'
os.makedirs(output_dir, exist_ok=True)

# Fit and save one GP model for each of the 15 Excel workbooks
# aryl-1.xlsx ... aryl-15.xlsx found in `folder_path`.
for i in tqdm.tqdm(range(1, 16), desc="Loop Progress"):
    source_name = f'aryl-{i}'
    file_path = os.path.join(folder_path, f'{source_name}.xlsx')
    df = pd.read_excel(file_path)

    # Every column except the last is an input; the last column is the target.
    X_all = df.iloc[:, :-1].values
    # The target is sign-flipped (presumably because the downstream optimiser
    # minimises -- TODO confirm).
    Y_all = - df.iloc[:, -1].values

    # Min-max scale the inputs. The scaler is refit on every dataset and sees
    # all rows, including those whose target is missing and is dropped below.
    # NOTE(review): confirm that fitting on the dropped rows is intended.
    x_all_normalized = min_max_scaler.fit_transform(X_all)

    # Discard the samples whose target is NaN.
    nan_indices = np.isnan(Y_all)
    # Row positions of the discarded samples; not used further in this cell.
    true_indices = np.where(nan_indices)[0]
    valid_rows = ~nan_indices
    x_all_normalized = x_all_normalized[valid_rows]
    Y_all = Y_all[valid_rows]

    X = x_all_normalized
    Y = Y_all.reshape(-1, 1)  # GPRegression is given the targets as a column

    # Build the model: Matern 5/2 kernel with ARD enabled.
    input_dim = len(X[0])
    ker = GPy.kern.Matern52(input_dim = input_dim, ARD = True)

    model_var = 0.1
    # The lengthscale controls how strongly nearby points influence each other.
    ker.lengthscale.constrain_bounded(0.001, 5)
    # Bounds on the kernel variance.
    ker.variance.constrain_bounded(1e-2, 1e4)

    model_gpy = GPRegression(X, Y, ker)

    # Pin the Gaussian noise variance to model_var**2 and exclude it from
    # hyperparameter optimisation.
    model_gpy.Gaussian_noise.variance = model_var**2
    model_gpy.Gaussian_noise.variance.fix()

    # Randomise the starting point, then optimise with 20 restarts.
    # NOTE(review): no RNG seed is set in this cell, so the fitted
    # hyperparameters may differ from run to run.
    model_gpy.randomize()
    model_gpy.optimize_restarts(num_restarts=20, verbose=False, messages=False)

    base_model_1 = GPyModelWrapper(model_gpy)

    # Save base_model_1 to <output_dir>/<source_name>.pkl.
    with open(os.path.join(output_dir, f'{source_name}.pkl'), 'wb') as f:
        pickle.dump(base_model_1, f)





Loop Progress: 100%|██████████| 15/15 [06:17<00:00, 25.14s/it]
