In [1]:
import optuna
import yaml
import time
import numpy as np
import pandas as pd
import torch
from multiprocessing import Pool

from ili.dataloaders import StaticNumpyLoader
from ili.inference import InferenceRunner
from ili.validation import ValidationRunner

from CASBI.utils.create_dataframe import rescale



N_subhalos = 2
data = pd.read_parquet('../../../../data/dataframe/dataframe.parquet')
data = rescale(data, mean_and_std_path='../../../../data/preprocess/mean_and_std.parquet', scale_observations=True, scale_parameter=True, inverse=True) 
data =  data.drop(['gas_log10mass', 'a','redshift', 'mean_metallicity', 'std_metallicity','mean_FeMassFrac', 'std_FeMassFrac', 'mean_OMassFrac', 'std_OMassFrac'], axis=1)
min_feh, max_feh = min(data['feh']), max(data['feh'])
min_ofe, max_ofe = min(data['ofe']), max(data['ofe'])
conditions = data[data.columns.difference(['feh', 'ofe', 'Galaxy_name'], sort=False)].drop_duplicates()

minimum_theta = [conditions[col].values.min() for col in conditions.columns]   
maximum_theta = [conditions[col].values.max() for col in conditions.columns]       
minimum_theta = np.array(minimum_theta)
maximum_theta = np.array(maximum_theta)
def repeat_array(arr, repetitions):
    return np.repeat(arr, repetitions)
repeat_minimum_theta = repeat_array(minimum_theta, N_subhalos)
repeat_maximum_theta = repeat_array(maximum_theta, N_subhalos) 

def write_to_yaml(repeat_minimum_theta, repeat_maximum_theta):
    # Load the existing data
    with open('./training.yaml', 'r') as file:
        data = yaml.safe_load(file)

    repeat_minimum_theta = repeat_minimum_theta.tolist()
    repeat_maximum_theta = repeat_maximum_theta.tolist()
    # Update the value
    data['prior']['args']['low'] = repeat_minimum_theta
    data['prior']['args']['high'] = repeat_maximum_theta

    # Write the data back to the file
    with open('./training.yaml', 'w') as file:
        yaml.safe_dump(data, file)
        
write_to_yaml(repeat_minimum_theta, repeat_maximum_theta)
print('write the right prior in the training.yaml file')

N_test = 1_000
def preprocess_testset(i):
    galaxies = set(data['Galaxy_name'].drop_duplicates().sample(N_subhalos, random_state=i))
    parameters =  data[data['Galaxy_name'].isin(galaxies)].drop(['feh', 'ofe', 'Galaxy_name'], axis=1).drop_duplicates().values.T.reshape(-1)
    galaxy_data = data[data['Galaxy_name'].isin(galaxies)].values
    histogram_galaxy, _, _ = np.histogram2d(galaxy_data[:, 0], galaxy_data[:, 1], bins=64, range=[[min_feh, max_feh], [min_ofe, max_ofe]])
    sim_data =  np.expand_dims(np.log10(histogram_galaxy + 1e-6 +1), axis=0)
    return parameters, sim_data, galaxies

# Create a pool of workers
with Pool() as pool:
    # Map the function to the data
    results = pool.map(preprocess_testset, range(N_test))
    
# Unpack the results
theta_test, x_test, galaxies_test = zip(*results)

#take the first test set element as x_0 and theta_0    
galaxies_0 = galaxies_test[0]
data_to_plot_halos = data[data['Galaxy_name'].isin(galaxies_0)].to_parquet('./halos_0.parquet')
theta_0 =  theta_test[0]
x_0 =  x_test[0]

N = 10_000
def process_sample(i):
    galaxies = data['Galaxy_name'].drop_duplicates().sample(N_subhalos, random_state=i+int(time.time()))
    while (any(set(galaxies) == galaxy_in_testset for galaxy_in_testset in galaxies_test)):
        print('matched galaxies, try again')
        print('galaxies', set(galaxies))
        print('test galaxies', galaxies_test)
        galaxies = data['Galaxy_name'].drop_duplicates().sample(N_subhalos, random_state=i)
    parameters =  data[data['Galaxy_name'].isin(galaxies)].drop(['feh', 'ofe', 'Galaxy_name'], axis=1).drop_duplicates().values.T.reshape(-1)
    galaxy_data = data[data['Galaxy_name'].isin(galaxies)].values
    histogram_galaxy, _, _ = np.histogram2d(galaxy_data[:, 0], galaxy_data[:, 1], bins=64, range=[[min_feh, max_feh], [min_ofe, max_ofe]])
    sim_data =  np.expand_dims(np.log10(histogram_galaxy + 1e-6 +1), axis=0)
    return parameters, sim_data

# Create a pool of workers
with Pool() as pool:
    # Map the function to the data
    results = pool.map(process_sample, range(N))

# Unpack the results
theta, x = zip(*results)

#save in .npy files, we remove the first element of the test set since it will be stored as x_0 and theta_0
np.save('./x_test.npy', x_test[1:])
np.save('./theta_test.npy', theta_test[1:])
np.save('./x_0.npy', x_0)
np.save('./theta_0.npy', theta_0)
np.save('./x.npy', x)
np.save('./theta.npy', theta)
print('finish prepare the data')


write the right prior in the training.yaml file
matched galaxies, try again
galaxies {'g4.86e10.00784', 'g7.66e11.00240'}
test galaxies ({'g2.37e10.00880', 'g6.96e10.00368'}, {'g3.23e11.00784', 'g2.39e11.00400'}, {'g7.05e09.00288', 'g1.59e11.00960'}, {'g6.91e10.00192', 'g3.54e09.00192'}, {'g3.67e10.00640', 'g1.47e10.00944'}, {'g1.89e10.00944', 'g8.28e11.00496'}, {'g2.64e10.00480', 'g8.06e11.00256'}, {'g6.31e09.00400', 'g4.94e10.00784'}, {'g3.59e11.00320', 'g1.23e10.01008'}, {'g6.12e10.00672', 'g1.47e10.00832'}, {'g4.94e10.00368', 'g6.31e09.00688'}, {'g3.19e10.01008', 'g3.59e11.00224'}, {'g3.54e09.00928', 'g2.19e11.00928'}, {'g2.54e11.00064', 'g1.59e11.00944'}, {'g6.96e10.00720', 'g4.48e10.00352'}, {'g5.05e10.00656', 'g2.39e10.00256'}, {'g1.08e11.00672', 'g5.46e11.00752'}, {'g1.18e10.01024', 'g2.42e11.00896'}, {'g2.39e10.00320', 'g6.31e09.00432'}, {'g4.48e09.00912', 'g4.94e10.00080'}, {'g1.89e10.00480', 'g1.88e10.00128'}, {'g2.37e10.00656', 'g3.06e11.00208'}, {'g6.96e10.00128', 'g4.90e1

In [7]:
def objective(trial):
    # Suggest values for the hyperparameters
    model = trial.suggest_categorical('model', ['maf', 'nsf'])
    hidden_features = trial.suggest_categorical('hidden_features', [10, 50, 70, 100])
    num_transforms = trial.suggest_categorical('num_transforms', [5, 10, 15, 20, 30])
    learning_rate = trial.suggest_categorical('learning_rate', [1e-5, 5e-5, 1e-4]) #suggest_loguniform('learning_rate', 1e-5, 1e-4)
    output_dim = trial.suggest_categorical('output_dim', [5, 10, 32, 64])

    # Load the existing hyperparameters from training.yaml
    with open('training.yaml', 'r') as f:
        hyperparameters = yaml.safe_load(f)

    # Update the hyperparameters with the suggested values
    for net in hyperparameters['model']['nets']:
        net['hidden_features'] = hidden_features
        net['num_transforms'] = num_transforms
        net['model'] = model
    hyperparameters['embedding_net']['args']['output_dim'] = output_dim
    hyperparameters['train_args']['learning_rate'] = learning_rate

    # Save the updated hyperparameters back to training.yaml
    with open('training.yaml', 'w') as f:
        yaml.dump(hyperparameters, f)
        
    # reload all simulator examples as a dataloader
    all_loader = StaticNumpyLoader.from_config("./data.yaml")

    # train a model to infer x -> theta. save it as toy/posterior.pkl
    # runner = InferenceRunner.from_config(f"./training.yaml")
    # _, summaries = runner(loader=all_loader)

    
    
    return summaries[0]['validation_log_probs'][-1]

In [8]:
study =  optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100, n_jobs=10)

[I 2024-05-29 14:05:59,148] A new study created in memory with name: no-name-e3d2b790-bb03-451f-a69a-a794856bb7f7
[W 2024-05-29 14:05:59,185] Trial 4 failed with parameters: {'model': 'nsf', 'hidden_features': 70, 'num_transforms': 15, 'learning_rate': 0.0001, 'output_dim': 10} because of the following error: TypeError("'NoneType' object is not subscriptable").
Traceback (most recent call last):
  File "/export/home/vgiusepp/miniconda3/envs/fff/lib/python3.10/site-packages/optuna/study/_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
  File "/tmp/ipykernel_3161542/222344881.py", line 14, in objective
    for net in hyperparameters['model']['nets']:
TypeError: 'NoneType' object is not subscriptable
[W 2024-05-29 14:05:59,186] Trial 2 failed with parameters: {'model': 'nsf', 'hidden_features': 100, 'num_transforms': 10, 'learning_rate': 5e-05, 'output_dim': 32} because of the following error: TypeError("'NoneType' object is not subscriptable").
Traceback (most rec

KeyboardInterrupt: 

In [None]:
# def objective(trial, device):
#     # Suggest values for the hyperparameters
#     model = trial.suggest_categorical('model', ['maf', 'nsf'])
#     hidden_features = trial.suggest_categorical('hidden_features', [10, 50, 70, 100])
#     num_transforms = trial.suggest_categorical('num_transforms', [5, 10, 15, 20, 30])
#     learning_rate = trial.suggest_categorical('learning_rate', [1e-5, 5e-5, 1e-4]) #suggest_loguniform('learning_rate', 1e-5, 1e-4)
#     output_dim = trial.suggest_categorical('output_dim', [5, 10, 32, 64])

#     # Load the existing hyperparameters from training.yaml
#     with open('training.yaml', 'r') as f:
#         hyperparameters = yaml.safe_load(f)

#     # Update the hyperparameters with the suggested values
#     for net in hyperparameters['model']['nets']:
#         net['hidden_features'] = hidden_features
#         net['num_transforms'] = num_transforms
#         net['model'] = model
#     hyperparameters['embedding_net']['args']['output_dim'] = output_dim
#     hyperparameters['train_args']['learning_rate'] = learning_rate
#     hyperparameters['device'] = device

#     # Save the updated hyperparameters back to training.yaml
#     with open('training.yaml', 'w') as f:
#         yaml.dump(hyperparameters, f)
        
#     # reload all simulator examples as a dataloader
#     all_loader = StaticNumpyLoader.from_config("./data.yaml")

#     # train a model to infer x -> theta. save it as toy/posterior.pkl
#     runner = InferenceRunner.from_config(f"./training.yaml")
#     _, summaries = runner(loader=all_loader)

#     return summaries[0]['validation_log_probs']

In [None]:
# study_name = 'example_study'  # Unique identifier of the study.
# storage_name = 'sqlite:///example.db'
# study = optuna.create_study(study_name=study_name, storage=storage_name, load_if_exists=True)

[I 2024-05-29 10:41:09,367] A new study created in RDB with name: example_study


In [None]:
# import torch
# import multiprocessing

# def optimize_on_device(device_id):
#     device = f'cuda:{device_id}'
#     study.optimize(lambda trial: objective(trial, device), n_trials=100)

# if __name__ == '__main__':
#     num_gpus = torch.cuda.device_count()
#     with multiprocessing.Pool(num_gpus) as p:
#         p.map(optimize_on_device, range(num_gpus))

INFO:root:MODEL INFERENCE CLASS: NPE
INFO:root:MODEL INFERENCE CLASS: NPE
INFO:root:MODEL INFERENCE CLASS: NPE
INFO:root:MODEL INFERENCE CLASS: NPE
INFO:root:Training model 1 / 1.
INFO:root:Training model 1 / 1.
INFO:root:Training model 1 / 1.
INFO:root:Training model 1 / 1.


 raining neural network. Epochs trained: 221raining neural network. Epochs trained: 1Training neural network. Epochs trained: 1Training neural network. Epochs trained: 1Training neural network. Epochs trained: 2Training neural network. Epochs trained: 2Training neural network. Epochs trained: 2Training neural network. Epochs trained: 2Training neural network. Epochs trained: 3Training neural network. Epochs trained: 3Training neural network. Epochs trained: 3Training neural network. Epochs trained: 3Training neural network. Epochs trained: 4Training neural network. Epochs trained: 4Training neural network. Epochs trained: 4Training neural network. Epochs trained: 4Training neural network. Epochs trained: 5  Training neural network. Epochs trained: 5Training neural network. Epochs trained: 5Training neural network. Epochs trained: 5Training neural network. Epochs trained: 6Training neural network. Epochs trained: 6Training neural network. Epochs trained: 6Training neural network. Epochs



 Training neural network. Epochs trained: 361

  self._weights = torch.tensor(weights) / sum(weights)
INFO:root:It took 1065.1026091575623 seconds to train models.
INFO:root:Saving model to .


 Training neural network. Epochs trained: 363

[W 2024-05-29 10:59:02,371] Trial 2 failed with parameters: {'model': 'maf', 'hidden_features': 72, 'num_transforms': 14, 'learning_rate': 5e-05, 'output_dim': 64} because of the following error: The number of the values 365 did not match the number of the objectives 1.
[W 2024-05-29 10:59:02,373] Trial 2 failed with value [-11.238193359375, -11.1837421875, -11.11859765625, -11.051810546875, -10.989634765625, -10.92475, -10.8642451171875, -10.793447265625, -10.71277734375, -10.6235322265625, -10.544611328125, -10.453900390625, -10.360392578125, -10.2717763671875, -10.1816015625, -10.0954716796875, -10.0034111328125, -9.92290625, -9.843298828125, -9.756330078125, -9.674427734375, -9.57765234375, -9.486142578125, -9.3875009765625, -9.291708984375, -9.1958408203125, -9.090654296875, -8.9892880859375, -8.88846875, -8.769263671875, -8.6483369140625, -8.534103515625, -8.40283203125, -8.303240234375, -8.17621875, -8.0787900390625, -7.96617822265625, -7.87684765625, -7.77866943359375, -7.68594

 Training neural network. Epochs trained: 360

INFO:root:MODEL INFERENCE CLASS: NPE
[W 2024-05-29 10:59:03,046] Trial 4 failed with parameters: {'model': 'maf', 'hidden_features': 61, 'num_transforms': 8, 'learning_rate': 5e-05, 'output_dim': 64} because of the following error: AssertionError('Unrecognized device cuda:1, should be one of [`cpu`, `cuda`, f`cuda:{index}`]').
Traceback (most recent call last):
  File "/export/home/vgiusepp/miniconda3/envs/fff/lib/python3.10/site-packages/optuna/study/_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
  File "/tmp/ipykernel_1459283/1212460247.py", line 6, in <lambda>
    study.optimize(lambda trial: objective(trial, device), n_trials=100)
  File "/tmp/ipykernel_1459283/1168232568.py", line 31, in objective
    _, summaries = runner(loader=all_loader)
  File "/export/home/vgiusepp/CASBI/src/ltu-ili/ili/inference/runner_sbi.py", line 289, in __call__
    models = [self._setup_engine(net) for net in self.nets]
  File "/export/home/vgiusepp/CASBI/src/ltu-ili/ili/infer

 Training neural network. Epochs trained: 362Training neural network. Epochs trained: 364Training neural network. Epochs trained: 361

[W 2024-05-29 10:59:05,758] Trial 6 failed with parameters: {'model': 'nsf', 'hidden_features': 40, 'num_transforms': 9, 'learning_rate': 0.0001, 'output_dim': 32} because of the following error: RuntimeError('CUDA error: out of memory\nCUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.\nFor debugging consider passing CUDA_LAUNCH_BLOCKING=1.\nCompile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.\n').
Traceback (most recent call last):
  File "/export/home/vgiusepp/miniconda3/envs/fff/lib/python3.10/site-packages/optuna/study/_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
  File "/tmp/ipykernel_1459283/1212460247.py", line 6, in <lambda>
    study.optimize(lambda trial: objective(trial, device), n_trials=100)
  File "/tmp/ipykernel_1459283/1168232568.py", line 30, in objective
    runner = InferenceRunner.from_config(f"./training.yaml")
  File "/export/home/vgiusepp/CASBI/src/ltu

 Training neural network. Epochs trained: 363Training neural network. Epochs trained: 365Neural network successfully converged after 365 epochs.Training neural network. Epochs trained: 362

  self._weights = torch.tensor(weights) / sum(weights)
INFO:root:It took 1070.6305184364319 seconds to train models.
INFO:root:Saving model to .
[W 2024-05-29 10:59:07,857] Trial 1 failed with parameters: {'model': 'nsf', 'hidden_features': 10, 'num_transforms': 15, 'learning_rate': 0.0001, 'output_dim': 64} because of the following error: The number of the values 365 did not match the number of the objectives 1.
[W 2024-05-29 10:59:07,859] Trial 1 failed with value [-11.238193359375, -11.1837421875, -11.11859765625, -11.051810546875, -10.989634765625, -10.92475, -10.8642451171875, -10.793447265625, -10.71277734375, -10.6235322265625, -10.544611328125, -10.453900390625, -10.360392578125, -10.2717763671875, -10.1816015625, -10.0954716796875, -10.0034111328125, -9.92290625, -9.843298828125, -9.756330078125, -9.674427734375, -9.57765234375, -9.486142578125, -9.3875009765625, -9.291708984375, -9.1958408203125, -9.090654296875, -8.9892880859375, -8.88846875, -8.769263671875, -8.648336914062

 Training neural network. Epochs trained: 364

INFO:root:MODEL INFERENCE CLASS: NPE
[W 2024-05-29 10:59:08,675] Trial 8 failed with parameters: {'model': 'maf', 'hidden_features': 96, 'num_transforms': 14, 'learning_rate': 5e-05, 'output_dim': 10} because of the following error: AssertionError('Unrecognized device cuda:2, should be one of [`cpu`, `cuda`, f`cuda:{index}`]').
Traceback (most recent call last):
  File "/export/home/vgiusepp/miniconda3/envs/fff/lib/python3.10/site-packages/optuna/study/_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
  File "/tmp/ipykernel_1459283/1212460247.py", line 6, in <lambda>
    study.optimize(lambda trial: objective(trial, device), n_trials=100)
  File "/tmp/ipykernel_1459283/1168232568.py", line 31, in objective
    _, summaries = runner(loader=all_loader)
  File "/export/home/vgiusepp/CASBI/src/ltu-ili/ili/inference/runner_sbi.py", line 289, in __call__
    models = [self._setup_engine(net) for net in self.nets]
  File "/export/home/vgiusepp/CASBI/src/ltu-ili/ili/infe

 Training neural network. Epochs trained: 363Training neural network. Epochs trained: 365Neural network successfully converged after 365 epochs.Training neural network. Epochs trained: 364

  self._weights = torch.tensor(weights) / sum(weights)
INFO:root:It took 1074.1652762889862 seconds to train models.
INFO:root:Saving model to .
[W 2024-05-29 10:59:11,006] Trial 3 failed with parameters: {'model': 'nsf', 'hidden_features': 22, 'num_transforms': 10, 'learning_rate': 5e-05, 'output_dim': 10} because of the following error: The number of the values 365 did not match the number of the objectives 1.
[W 2024-05-29 10:59:11,008] Trial 3 failed with value [-11.238193359375, -11.1837421875, -11.11859765625, -11.051810546875, -10.989634765625, -10.92475, -10.8642451171875, -10.793447265625, -10.71277734375, -10.6235322265625, -10.544611328125, -10.453900390625, -10.360392578125, -10.2717763671875, -10.1816015625, -10.0954716796875, -10.0034111328125, -9.92290625, -9.843298828125, -9.756330078125, -9.674427734375, -9.57765234375, -9.486142578125, -9.3875009765625, -9.291708984375, -9.1958408203125, -9.090654296875, -8.9892880859375, -8.88846875, -8.769263671875, -8.6483369140625

 Training neural network. Epochs trained: 365Neural network successfully converged after 365 epochs.

INFO:root:MODEL INFERENCE CLASS: NPE
INFO:root:Training model 1 / 1.
  self._weights = torch.tensor(weights) / sum(weights)
INFO:root:It took 1075.449700832367 seconds to train models.
INFO:root:Saving model to .
[W 2024-05-29 10:59:12,561] Trial 0 failed with parameters: {'model': 'maf', 'hidden_features': 24, 'num_transforms': 5, 'learning_rate': 0.0001, 'output_dim': 10} because of the following error: The number of the values 365 did not match the number of the objectives 1.
[W 2024-05-29 10:59:12,563] Trial 0 failed with value [-11.238193359375, -11.1837421875, -11.11859765625, -11.051810546875, -10.989634765625, -10.92475, -10.8642451171875, -10.793447265625, -10.71277734375, -10.6235322265625, -10.544611328125, -10.453900390625, -10.360392578125, -10.2717763671875, -10.1816015625, -10.0954716796875, -10.0034111328125, -9.92290625, -9.843298828125, -9.756330078125, -9.674427734375, -9.57765234375, -9.486142578125, -9.3875009765625, -9.291708984375, -9.1958408203125, -9.0906542968

 Training neural network. Epochs trained: 1Training neural network. Epochs trained: 2Training neural network. Epochs trained: 3Training neural network. Epochs trained: 4Training neural network. Epochs trained: 5Training neural network. Epochs trained: 6Training neural network. Epochs trained: 7Training neural network. Epochs trained: 8Training neural network. Epochs trained: 9Training neural network. Epochs trained: 10Training neural network. Epochs trained: 11Training neural network. Epochs trained: 12Training neural network. Epochs trained: 13Training neural network. Epochs trained: 14Training neural network. Epochs trained: 15Training neural network. Epochs trained: 16Training neural network. Epochs trained: 17Training neural network. Epochs trained: 18Training neural network. Epochs trained: 19Training neural network. Epochs trained: 20Training neural network. Epochs trained: 21Training neural network. Epochs trained: 22Training neural network. Epochs trained: 23Training neural net

KeyboardInterrupt: 