In [1]:
import os
import pickle
import torch
from parsing_functions import extract_specs, specs_match

# Merge every saved dataset whose filename matches the target simulation
# specs into a single pickle, optionally keeping only one half of the
# columns of 'x'.

# Set the parameters for filtering
target_Npts = 5e+04
target_dt = 1e-02
target_oversampling = 4
target_prerun = 1e+02

# Directory containing the .pickle files
directory_path = "./saved_datasets/"

# All .pickle files in the directory. Sorted because os.listdir returns
# entries in arbitrary order, which would make the row order of the merged
# dataset differ from run to run.
all_files = sorted(file for file in os.listdir(directory_path) if file.endswith(".pickle"))

# Keep only files that are in line with the target parameters and are not
# the output of a previous merge. The explicit prefix check guarantees that
# files written by this script ("merged_...") are never re-ingested,
# independently of how specs_match treats them.
dataset_files = [
    file
    for file in all_files
    if not file.startswith("merged_")
    and specs_match(file, target_Npts, target_dt, target_oversampling, target_prerun)
]

# Fail early with a clear message instead of an IndexError further down.
if not dataset_files:
    raise FileNotFoundError(
        f"No dataset files matching the target specs were found in {directory_path!r}"
    )

# Per-file tensors, concatenated once after the loop. Concatenating inside
# the loop would re-copy all previously loaded rows on every iteration.
theta_chunks = []
x_chunks = []

# Load data from each file
for filename in dataset_files:
    file_path = os.path.join(directory_path, filename)
    # NOTE(security): pickle.load can execute arbitrary code; only run this
    # on dataset files you produced yourself.
    with open(file_path, 'rb') as file:
        data = pickle.load(file)
    theta_chunks.append(data['theta'])
    x_chunks.append(data['x'])

# Combine along the first dimension (one row per simulation)
all_data_theta = torch.cat(theta_chunks)
all_data_x = torch.cat(x_chunks)

# Determine the size of each half of data_x (with an odd number of columns
# the second half is one column longer)
half_size = all_data_x.shape[1] // 2

# Option to filter out either the first or the second half of data_x
filter_option = 'svr'  # Set to 'corr' to keep only the first half, 'svr' to keep only the second half

if filter_option == 'corr':
    all_data_x = all_data_x[:, :half_size]
    prefix = 'merged_corr_'
elif filter_option == 'svr':
    all_data_x = all_data_x[:, half_size:]
    prefix = 'merged_svr_'
else:
    # Any other value keeps all columns of data_x
    prefix = 'merged_'

# Calculate the total number of simulations
total_num_simulations = all_data_theta.shape[0]

# Construct the new filename with updated specs
output_filename = f"{prefix}dataset_{total_num_simulations:.0f}sim_{target_Npts:.0e}np_{target_dt:.0e}dt_{target_oversampling}os_{target_prerun:.0e}pre.pickle"

save_path = os.path.join(directory_path, output_filename)

# Save the combined data into the new file
combined_data = {'theta': all_data_theta, 'x': all_data_x}
with open(save_path, 'wb') as file:
    pickle.dump(combined_data, file)

print("Dataset unificato salvato con successo nel file:", save_path)
print("Dimensioni del dataset unificato:", all_data_theta.shape, all_data_x.shape)


Dataset unificato salvato con successo nel file: ./saved_datasets/merged_svr_dataset_10000sim_5e+04np_1e-02dt_4os_1e+02pre.pickle
Dimensioni del dataset unificato: torch.Size([10000, 3]) torch.Size([10000, 1000])
