So my approach was completely wrong...

I'll try again. You'll see how differently I handle data preprocessing here.

In [1]:
import numpy as np
from sklearn import datasets
import torch
import torch.nn as nn
import torch.optim as optim
import torchbnn as bnn
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split

# Select the compute device once: the first CUDA GPU when PyTorch can see one,
# otherwise the CPU. Nothing is moved here; tensors and models still have to be
# sent to `device` explicitly.
is_available = torch.cuda.is_available()
device = torch.device("cuda:0" if is_available else "cpu")


In [2]:
def read_params(filename: str, filepath: str = '../../data/radiative_transfer/input/'):
    """
    Read a ``key = value`` parameter file and return its contents as a dictionary.

    Each line is expected to look like ``key = value  # optional comment``.
    Everything after the first '#' on a line is discarded. Only lines that
    contain exactly one '=' are kept; blank lines, comment-only lines and lines
    with zero or several '=' are skipped silently.

    Parameters:
    - filename (str): Name of the parameter file.
    - filepath (str, optional): Directory prefix. It is concatenated directly in
    front of `filename`, so it must end with a path separator.
    Default is '../../data/radiative_transfer/input/'.

    Returns:
    - dict: Maps each parameter name to its value. A comma-separated list of
    numbers becomes a NumPy float array. A single number becomes a NumPy float
    scalar, except for the 'theta' parameter, which is always returned as an
    array (of length one if needed).

    Raises:
    - OSError: If the file cannot be opened.
    - ValueError: If a value (or one item of a comma-separated list) cannot be
    converted to a float.

    Note:
    - If a parameter is defined more than once, the last definition wins.
    - List items may be separated by ',' with or without surrounding whitespace.
    """

    table = {}

    # The context manager closes the file even when parsing raises.
    with open(filepath + filename, 'r') as handle:
        for raw_line in handle:
            # Drop the trailing comment, then split into "key" and "value".
            content = raw_line.split('#')[0]
            parts = [part.strip() for part in content.split('=')]
            if len(parts) != 2:
                continue

            key, raw_value = parts

            # Split on bare commas and strip each item so that "1,2", "1, 2"
            # and "1 , 2" are all accepted.
            items = [item.strip() for item in raw_value.split(',')]
            value = np.array(items).astype(float)

            # Scalars are unwrapped, but 'theta' must stay an array because
            # callers index it per viewing angle.
            if len(value) == 1 and key != 'theta':
                value = value[0]

            table[key] = value

    return table


In [3]:
def read_h5_file(filename: str, thetas, log_mstar, log_mdust_over_mstar, filepath: str = '../../data/radiative_transfer/output/'):
    """
    Read every table stored in one HDF5 file and pair it with its input parameters.

    Parameters:
    - filename (str): The name of the HDF5 file to be read.
    - thetas (array-like): Viewing angles; `thetas[i]` is attached to the i-th key of the file. Extra trailing entries are ignored.
    - log_mstar (float): Logarithmic value of stellar mass.
    - log_mdust_over_mstar (float): Logarithmic value of the dust mass over stellar mass ratio.
    - filepath (str, optional): Directory prefix, concatenated directly in front of `filename`. Defaults to '../../data/radiative_transfer/output/'.

    Returns:
    - list: One `(input_vector, output_vector)` tuple per key in the file, in the order returned by `HDFStore.keys()`.
        - input_vector (numpy.ndarray): `[log_mstar, log_mdust_over_mstar, thetas[i]]`.
        - output_vector (numpy.ndarray): The 'flux', 'r' and 'n' columns of the table stacked row-wise, i.e. shape (3, number_of_table_rows).

    Raises:
    - IndexError: If `thetas` has fewer entries than the file has keys.

    The full path is printed before reading. The 'wvl' column of the tables is not part of the returned vectors.

    NOTE(review): the pairing assumes the key order reported by the store matches the order of `thetas` — confirm against how the files were written.
    """

    filepath += filename
    print(filepath)

    input_output_pairs = []

    # Open the store once and read every table through the same handle.
    with pd.HDFStore(filepath, 'r') as hdf:
        hdf_keys = list(hdf.keys())

        # Fail before reading anything instead of part-way through the loop.
        if len(thetas) < len(hdf_keys):
            raise IndexError(
                f"{filepath} contains {len(hdf_keys)} tables but only "
                f"{len(thetas)} viewing angles were supplied"
            )

        for i, key in enumerate(hdf_keys):
            input_vector = np.array([log_mstar, log_mdust_over_mstar, thetas[i]])

            table = hdf.get(key)
            flux = table['flux'].to_numpy() # flux [W/m^2]
            r = table['r'].to_numpy() # half-light radius [kpc]
            n = table['n'].to_numpy() # Sersic index

            output_vector = np.array([flux, r, n])

            input_output_pairs.append((input_vector, output_vector))

    return input_output_pairs


In [4]:
def read_parameter_files(filenames: list, filepath: str = "../../data/radiative_transfer/input/"):
    """
    Gather stellar mass, dust-to-stellar mass ratio and viewing angles from several parameter files.

    Parameters:
    - filenames (list): Names of the parameter files, read in the given order.
    - filepath (str, optional): Directory prefix handed to `read_params` for every file. Defaults to "../../data/radiative_transfer/input/".

    Returns:
    - tuple: `(list_log_mstar, list_log_mdust_over_mstar, list_theta)`.
        - list_log_mstar (numpy.ndarray): The 'logMstar' values of all files, flattened into one 1-D array.
        - list_log_mdust_over_mstar (numpy.ndarray): 'logMdust' minus 'logMstar', element by element.
        - list_theta (numpy.ndarray): The 'theta' values of all files concatenated into one flat array; the per-file grouping is not kept.
    """

    # Each list starts with an empty float array so that an empty `filenames`
    # still produces (empty) arrays.
    mstar_chunks = [np.array([])]
    mdust_chunks = [np.array([])]
    theta_chunks = [np.array([])]

    for name in filenames:
        params = read_params(name, filepath)
        mstar_chunks.append(np.ravel(params['logMstar']))
        mdust_chunks.append(np.ravel(params['logMdust']))
        theta_chunks.append(np.ravel(params['theta']))

    list_log_mstar = np.concatenate(mstar_chunks)
    list_log_mdust = np.concatenate(mdust_chunks)
    list_theta = np.concatenate(theta_chunks)

    return list_log_mstar, list_log_mdust - list_log_mstar, list_theta


In [5]:
def generate_dataset(params, files):
    """
    Build one DataFrame of (input, output) pairs from matching parameter and HDF5 files.

    `params[i]` is the parameter file that describes `files[i]`. For every pair
    the parameter file is read with `read_params`, and its own 'theta' array is
    handed to `read_h5_file`, so each HDF5 table is labelled with the viewing
    angles of its own run rather than with those of the first parameter file.

    Parameters:
    - params (list): Parameter file names, one per HDF5 file. Extra trailing entries are ignored.
    - files (list): HDF5 file names.

    Returns:
    - pd.DataFrame: Two object columns, 'input' and 'output', with one row per
    table found in the HDF5 files. Each cell holds the NumPy array produced by
    `read_h5_file` for that table.

    Raises:
    - IndexError: If there are fewer parameter files than HDF5 files.
    """
    if len(params) < len(files):
        raise IndexError(
            f"{len(files)} HDF5 files were given but only {len(params)} parameter files"
        )

    input_output_pairs = []
    for param_file, h5_file in zip(params, files):
        table = read_params(param_file)
        log_mstar = table['logMstar']
        log_mdust_over_mstar = table['logMdust'] - table['logMstar']
        # Keep the angles of this run only; atleast_1d guards against a scalar.
        thetas = np.atleast_1d(table['theta'])

        input_output_pairs.extend(
            read_h5_file(h5_file, thetas, log_mstar, log_mdust_over_mstar)
        )

    return pd.DataFrame(input_output_pairs, columns=['input', 'output'])


In [6]:
# Names of the parameter files and of the HDF5 result files for runs 1 through 6.
# Entry i of one list belongs with entry i of the other.
run_numbers = range(1, 7)
parameter_files = [f"parameters{i}.txt" for i in run_numbers]
h5_files = [f"data{i}.h5" for i in run_numbers]


In [9]:
# Build the table of (input, output) pairs from the parameter files and HDF5
# files listed in `parameter_files` and `h5_files`.
dataset = generate_dataset(parameter_files, h5_files)

# Bare expression: the notebook renders the DataFrame as this cell's output.
dataset 



../../data/radiative_transfer/output/data1.h5
../../data/radiative_transfer/output/data2.h5
../../data/radiative_transfer/output/data3.h5
../../data/radiative_transfer/output/data4.h5
../../data/radiative_transfer/output/data5.h5
../../data/radiative_transfer/output/data6.h5


Unnamed: 0,input,output
0,"[11.32, -2.74, 0.0]","[[6.501817484772457e-12, 2.803091827610573e-11..."
1,"[11.32, -2.74, 18.67]","[[6.4466429084170395e-12, 2.7748459723856662e-..."
2,"[11.32, -2.74, 26.53]","[[5.515552922411008e-12, 2.3455318268139408e-1..."
3,"[11.32, -2.74, 32.64]","[[5.246167290273239e-12, 2.24901439817031e-11,..."
4,"[11.32, -2.74, 37.86]","[[4.986853732036401e-12, 2.1278108468096852e-1..."
...,...,...
115,"[11.0, -3.25, 77.85]","[[5.307616937707205e-12, 2.4068326185422138e-1..."
116,"[11.0, -3.25, 80.92]","[[5.241944199752592e-12, 2.375376125980818e-11..."
117,"[11.0, -3.25, 83.96]","[[5.089481414692804e-12, 2.311341074149899e-11..."
118,"[11.0, -3.25, 86.98]","[[5.017208456263936e-12, 2.26820185382521e-11,..."


In [27]:
# Take the first row of the second column ('output') for inspection.
# NOTE: both indexers are slices, so `.iloc` returns a 1x1 DataFrame here, not
# the NumPy array stored in the cell; `dataset.iloc[0, 1]` would return the
# array itself.
array = dataset.iloc[0:1,1:2]

print(array)

# Number of rows in the slice (1), not the length of the stored array.
print(len(array))


                                              output
0  [[6.501817484772457e-12, 2.803091827610573e-11...
1
