In [None]:
import datasetmaker
import tifffile
import numpy as np
from tqdm import tqdm
from pathlib import Path
import os
import pandas as pd
import sys
# Get the parent directory
parent_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))

# Add the parent directory to sys.path
sys.path.append(parent_dir)
from Onlypores import onlypores as op
from UTXCTregister import UTXCTregister as reg

from Database import queries as qrs
import psycopg2

# Aux functions

In [None]:
def read_sequence(folder_path):
    """
    Read a sequence of TIFF files in a folder as a 3D volume.
    
    Args:
    folder_path (str): Path to the folder containing TIFF files.

    Returns:
    numpy.ndarray: A 3D array where each slice corresponds to a TIFF file.
    """

    # List and sort the TIFF files
    tiff_files = sorted([os.path.join(folder_path, f) for f in os.listdir(folder_path) if (f.endswith('.tiff') or f.endswith('.tif'))])

    tiff_sequence = tifffile.TiffSequence(tiff_files)
    
    # Get the total number of TIFF files
    total_files = len(tiff_files)
    
    # Read each TIFF file and update progress
    volume = []
    with tqdm(total=total_files, desc="Progress") as pbar:
        for i, file_path in enumerate(tiff_files):
            slice_data = tifffile.imread(file_path)
            volume.append(slice_data)
            
            # Update progress
            pbar.update(1)
    
    return np.array(volume)

def write_sequence(folder_path, name, volume):
    """
    Save a 3D volume as a sequence of TIFF files in a folder.
    
    Args:
    folder_path (str): Path to the folder where TIFF files will be saved.
    name (str): Name of the TIFF files.
    volume (numpy.ndarray): A 3D array where each slice corresponds to an image.
    """

    folder_path = folder_path / name

    # Create the folder if it doesn't exist
    Path(folder_path).mkdir(parents=True, exist_ok=True)

    # Save each slice as a TIFF file with progress bar
    with tqdm(total=volume.shape[0], desc="Saving") as pbar:
        for i in range(volume.shape[0]):
            tifffile.imwrite(f"{folder_path}/{name}_{i:04d}.tif", volume[i])
            pbar.update(1)
    
    print("Saving complete.")

def to_matrix(string):

    matrix1 = float(string[2:17])

    matrix2 = float(string[17:33])

    matrix3 = float(string[33:49])

    matrix4 = float(string[53:68])

    matrix5 = float(string[68:84])

    matrix6 = float(string[84:100])

    matrix7 = float(string[105:121])

    matrix8 = float(string[121:137])

    matrix9 = float(string[137:151])

    matrix = np.array([[matrix1,matrix2,matrix3],[matrix4,matrix5,matrix6],[matrix7,matrix8,matrix9]])

    return matrix

def load_volume(path):
    """
    Load a 3D volume from a TIFF file.

    Args:
    path (str): Path to the TIFF file.

    Returns:
    numpy.ndarray: A 3D array representing the volume.
    """
    
    # Check if the path is a directory or a file
    if os.path.isdir(path):
        # If it's a directory, read all TIFF files in the directory
        volume = read_sequence(path)
    elif os.path.isfile(path):
        # If it's a file, read the single TIFF file
        volume = tifffile.imread(path)
    else:
        raise ValueError("Invalid path: must be a directory or a TIFF file.")
    
    return volume

# Database conection

In [None]:
# Database connection parameters
host = "airbus-pc"  # IP address of the PostgreSQL server
database = "UTvsXCT"
user = "alberto.vicente"
password = "&y8imPdS&7MqkL4a"

try:
    conn = qrs.connect(host, database, user, password)
    print("Connected to the database")

except (Exception, psycopg2.DatabaseError) as error:
    print(error)

# Data retrieval

We have to load the data from the database to get:

1. UT paths

2. XCT paths

3. Registration parameters

Then we have to load the registration matrix into the database.

We have to decide the 2 types of measurements we want to create datasets from, and then all the measurements of those two types registered will be used to create the datasets

## Measurement types selecting

We are going to select the reference type measurement and the registered type measurement.

In [None]:
measurementtypes_data = qrs.get_data_metadata('measurementtypes', host, database, user, password)

measurementtypes_data

In [None]:
reference_measurementtype_id = 4

registered_measurementtype_id = 2

## Measurements file paths

We are going to load the measurements data and filter it to get the reference measurements and the registered measurements

In [None]:
measurements_data = qrs.relation_metadata('measurements','samples','sample_measurements', host, database, user, password)

#get only the measurements of the registered and reference measurement types
measurements_data = measurements_data[measurements_data['measurementtype_id_measurement'].isin([float(registered_measurementtype_id), float(reference_measurementtype_id)])]
#get only the measurements where the sample has keyholes
measurements_data = measurements_data[measurements_data['keyhole_sample']== 'True Boolean']

#gte only the measurements that contain only one sample
#group by 'id_measurement' and count the number of unique 'id_sample' values
measurements_data = measurements_data.groupby('id_measurement').filter(lambda x: x['id_sample'].nunique() == 1)

measurements_data

In [None]:
reference_measurements = measurements_data[measurements_data['measurementtype_id_measurement'] == float(reference_measurementtype_id)]
registered_measurements = measurements_data[measurements_data['measurementtype_id_measurement'] == float(registered_measurementtype_id)]

#for the measurements where measurementtype_id_measurement is the registered measurement type, we need to get the ones where 'aligned_measurement' is not 'True Boolean'
registered_measurements = registered_measurements[registered_measurements['aligned_measurement'] == 'True boolean']

#keep only the measurements whose samples are present in both datasets
reference_samples = reference_measurements['id_sample'].values.tolist()
registered_samples = registered_measurements['id_sample'].values.tolist()
combined_samples = list(set(reference_samples) & set(registered_samples))
print('Number of samples in both datasets:', len(combined_samples))

#keep only the measurements whose samples are present in both datasets
reference_measurements = reference_measurements[reference_measurements['id_sample'].isin(combined_samples)]
registered_measurements = registered_measurements[registered_measurements['id_sample'].isin(combined_samples)]

#we only need the id_measurement, file_path_measurement, id_sample and name_sample columns
reference_measurements = reference_measurements[['id_measurement', 'file_path_measurement', 'id_sample','name_sample']]
registered_measurements = registered_measurements[['id_measurement', 'file_path_measurement', 'id_sample','name_sample']]

#we only need the 'id_measurement', 'file_path_measurement' and 'id_sample' columns
print('Number of reference measurements:', len(reference_measurements))
print('Number of registered measurements:', len(registered_measurements))

registered_measurements

## Registration parameters

We have to get the registration parameters, if a pair of measurements has no registration parameters we dont create a dastaset from it

In [None]:
registration_data = qrs.get_data('measurement_registrations', host, database, user, password)

registration_data

## Check if already created the dataset

If the dataset of a pair of measurements is already done, we dont compute it again

In [None]:
datasets_data = qrs.relation_metadata('datasets','measurements','dataset_measurements', host, database, user, password)

#I want to group by id_dataset because there is a column 'id_measurement' that has a 1 to N relationship with id_dataset
#I just want to have a dataset that has the 'id_dataset' columnd and a column with the list of measurements linked to id

#group by 'id_dataset' and get the list of measurements linked to it
datasets_data = datasets_data.groupby('id_dataset')['id_measurement'].apply(list).reset_index()
#rename the columns
datasets_data.columns = ['id_dataset', 'id_measurements']

datasets_data

In [34]:
DatasetsFolder = Path(r'\\192.168.10.106\imdea\DataDriven_UT_AlbertoVicente\04_ML_data\Juan Ignacio\10mm range')

description = 'Monoelement RF with 10 mm range vs volfrac and areafrac. First functional group of datasets from 2025'

for sample_id in registered_measurements['id_sample'].values.tolist():

    sample_name = registered_measurements[registered_measurements['id_sample']==sample_id]['name_sample'].values[0]

    print('Creating datasets for sample:', sample_name)

    #select the rows for sample_id
    reference_row = reference_measurements[reference_measurements['id_sample']==sample_id]
    registered_row = registered_measurements[registered_measurements['id_sample']==sample_id]

    #get the registration matrix

    registration_row = registration_data[registration_data['reference_measurement_id_measurement_registration']==reference_row['id_measurement'].values[0]]
    registration_row = registration_row[registration_row['registered_measurement_id_measurement_registration']==registered_row['id_measurement'].values[0]]
    
    if len(registration_row) == 0:
        print('No registration parameters found for sample:', sample_id)
        continue
    
    registration_parameters = registration_row['registration_matrix_measurement_registration'].values[0]

    parameters = to_matrix(registration_parameters)

    #check if the dataset exists
    reference_measurement_id = reference_row['id_measurement'].values[0]
    aux_table = datasets_data[datasets_data['id_measurements'].apply(lambda x: reference_measurement_id in x)]
    registration_measurement_id = registered_row['id_measurement'].values[0]
    aux_table = aux_table[aux_table['id_measurements'].apply(lambda x: registration_measurement_id in x)]

    if len(aux_table) > 0:

        print('Already created dataset.')

        continue

    #load the files
    reference_file = Path(reference_row['file_path_measurement'].values[0])
    registered_file = Path(registered_row['file_path_measurement'].values[0])

    
    
    rf = load_volume(reference_file)
    xct = load_volume(registered_file)

    rf = np.swapaxes(rf, 0, 1)
    rf = np.swapaxes(rf, 1, 2)
    xct = np.swapaxes(xct, 0, 1)
    xct = np.swapaxes(xct, 1, 2)

    print('Aplying registration')
    xct = reg.apply_registration(rf,xct,parameters)
    rf = np.swapaxes(rf, 1, 2)
    rf = np.swapaxes(rf, 0, 1)
    xct = np.swapaxes(xct, 1, 2)
    xct = np.swapaxes(xct, 0, 1)
    print('Generating onlypores')
    onlypores, mask, _ = op.onlypores_parallel(xct)
    print('Onlypores generated')

    output_folder = DatasetsFolder / str(sample_name)

    if output_folder.exists() == False:
        output_folder.mkdir()
    
    output_folder = output_folder / 'MonoElement'

    if output_folder.exists() == False:
        output_folder.mkdir()

    #save the information of the datasets to write it in the database
    dataset_rows = []
    patch_sizes = []
    dataset_paths = []
    shapes = []

    print('Processing 3x3')
    shape,rows,save_path = datasetmaker.main(onlypores,mask,rf,output_folder,ut_patch_size=3)
    patch_sizes.append(3)
    dataset_paths.append(save_path)
    dataset_rows.append(rows)
    shapes.append(shape)

    print('Processing 5x5')
    shape,rows,save_path = datasetmaker.main(onlypores,mask,rf,output_folder,ut_patch_size=5)
    patch_sizes.append(5)
    dataset_paths.append(save_path)
    dataset_rows.append(rows)
    shapes.append(shape)

    print('Processing 7x7')
    shape,rows,save_path = datasetmaker.main(onlypores,mask,rf,output_folder,ut_patch_size=7)
    patch_sizes.append(7)
    dataset_paths.append(save_path)
    dataset_rows.append(rows)
    shapes.append(shape)

    print('Processing 9x9')
    shape,rows,save_path = datasetmaker.main(onlypores,mask,rf,output_folder,ut_patch_size=9)
    patch_sizes.append(9)
    dataset_paths.append(save_path)
    dataset_rows.append(rows)
    shapes.append(shape)

    #save into the database
    for df_path,df_row,patch_size,shape in zip(dataset_paths,dataset_rows,patch_sizes,shapes):
        
        main_parameters = {'file_path':df_path, 'description': description}

        table_name = 'datasets'

        # Extract column names and values from the attributes dictionary
        columns = ', '.join(main_parameters.keys())
        values = ', '.join([f"'{v}'" for v in main_parameters.values()])

        # Construct the SQL INSERT statement
        sql = f"INSERT INTO {table_name} ({columns}) VALUES ({values})"

        # Create a cursor object using the cursor() method
        cursor = conn.cursor()

        # Execute the SQL statement
        cursor.execute(sql)

        cursor.execute('COMMIT')

        cursor.close()

        data = qrs.get_data_metadata(table_name,host,database,user,password)

        row_id = data['id_dataset'].values[-1]

        metadata_parameters = []

        #rows
        metadata_parameters.append({'key':'rows', 'value':rows, 'type':'cardinal'})

        #shape
        metadata_parameters.append({'key':'shape', 'value':shape, 'type':'cardinal'})

        #patch_size
        metadata_parameters.append({'key':'patch_size', 'value':patch_size, 'type':'pixels'})

        #target
        metadata_parameters.append({'key':'target', 'value':'volfrac', 'type':'nominal'})

        #target
        metadata_parameters.append({'key':'target2', 'value':'areafrac', 'type':'nominal'})

        metadata_table_name =  table_name[:-1] + '_metadata'

        for attributes in metadata_parameters:

            attributes[table_name[:-1] + '_id'] = row_id

            # Extract column names and values from the attributes dictionary
            columns = ', '.join(attributes.keys())
            values = ', '.join([f"'{v}'" for v in attributes.values()])

            # Construct the SQL INSERT statement
            sql = f"INSERT INTO {metadata_table_name} ({columns}) VALUES ({values})"

            # Create a cursor object using the cursor() method
            cursor = conn.cursor()

            # Execute the SQL statement
            cursor.execute(sql)

            cursor.execute('COMMIT')

            cursor.close()
        
        measurement_ids = [reference_measurement_id, registration_measurement_id]

        relational_table_name = 'dataset_measurements'

        for measurement_id in measurement_ids:

            relational_parameters = {'dataset_id': row_id, 'measurement_id': measurement_id}

            # Extract column names and values from the attributes dictionary
            columns = ', '.join(relational_parameters.keys())
            values = ', '.join([f"'{v}'" for v in relational_parameters.values()])

            # Construct the SQL INSERT statement
            sql = f"INSERT INTO {relational_table_name} ({columns}) VALUES ({values})"

            # Create a cursor object using the cursor() method
            cursor = conn.cursor()

            # Execute the SQL statement
            cursor.execute(sql)

            cursor.execute('COMMIT')

            cursor.close()

Creating datasets for sample: JI_5
Aplying registration
Applying transformation
Aplying registration
Applying transformation
Transformation applied
Generating onlypores
masking
Transformation applied
Generating onlypores
masking
computing otsu
computing otsu
thresholding with value:  122
thresholding with value:  122
Onlypores generated
Processing 3x3
Preprocessing and patching the images...
Onlypores generated
Processing 3x3
Preprocessing and patching the images...
Layer cleaning
Patching the images...
Layer cleaning
Patching the images...
Cleaning the pores...
Creating the datasets...
Cleaning the pores...
Creating the datasets...
Processing 5x5
Preprocessing and patching the images...
Processing 5x5
Preprocessing and patching the images...
Layer cleaning
Patching the images...
Layer cleaning
Patching the images...
Cleaning the pores...
Creating the datasets...
Cleaning the pores...
Creating the datasets...
Processing 7x7
Preprocessing and patching the images...
Processing 7x7
Prepro

# check the database

In [35]:
data = qrs.relation_metadata('datasets','measurements','dataset_measurements',host, database, user, password)

data

Unnamed: 0,id_dataset,file_path_dataset,description_dataset,patch_size_dataset,target_dataset,target2_dataset,rows_dataset,shape_dataset,id_measurement,file_path_measurement,parent_measurement_id_measurement,measurementtype_id_measurement,height_measurement,width_measurement,depth_measurement,dtype_measurement,file_type_measurement,signal_type_measurement,aligned_measurement
0,1,\\192.168.10.106\imdea\DataDriven_UT_AlbertoVi...,Monoelement RF vs volfrac and areafrac. First ...,3 pixels,volfrac nominal,areafrac nominal,2686 cardinal,,3,\\192.168.10.106\imdea\DataDriven_UT_AlbertoVi...,1.0,3,505 cardinal,120 cardinal,45 cardinal,uint8 nominal,tif nominal,RF nominal,
1,8,\\192.168.10.106\imdea\DataDriven_UT_AlbertoVi...,Monoelement RF vs volfrac and areafrac. First ...,5 pixels,volfrac nominal,areafrac nominal,2356 cardinal,,3,\\192.168.10.106\imdea\DataDriven_UT_AlbertoVi...,1.0,3,505 cardinal,120 cardinal,45 cardinal,uint8 nominal,tif nominal,RF nominal,
2,15,\\192.168.10.106\imdea\DataDriven_UT_AlbertoVi...,Monoelement RF vs volfrac and areafrac. First ...,7 pixels,volfrac nominal,areafrac nominal,2059 cardinal,,3,\\192.168.10.106\imdea\DataDriven_UT_AlbertoVi...,1.0,3,505 cardinal,120 cardinal,45 cardinal,uint8 nominal,tif nominal,RF nominal,
3,1,\\192.168.10.106\imdea\DataDriven_UT_AlbertoVi...,Monoelement RF vs volfrac and areafrac. First ...,3 pixels,volfrac nominal,areafrac nominal,2686 cardinal,,41,\\192.168.10.106\imdea\DataDriven_UT_AlbertoVi...,26.0,2,225 cardinal,3279 cardinal,1542 cardinal,uint8 nominal,folder nominal,,True boolean
4,8,\\192.168.10.106\imdea\DataDriven_UT_AlbertoVi...,Monoelement RF vs volfrac and areafrac. First ...,5 pixels,volfrac nominal,areafrac nominal,2356 cardinal,,41,\\192.168.10.106\imdea\DataDriven_UT_AlbertoVi...,26.0,2,225 cardinal,3279 cardinal,1542 cardinal,uint8 nominal,folder nominal,,True boolean
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93,53,\\192.168.10.106\imdea\DataDriven_UT_AlbertoVi...,Monoelement RF with 10 mm range vs volfrac and...,9 pixels,volfrac nominal,areafrac nominal,2044 cardinal,"(448, 81, 36) cardinal",49,\\192.168.10.106\imdea\DataDriven_UT_AlbertoVi...,43.0,4,448 cardinal,125 cardinal,50 cardinal,uint8 nominal,tif nominal,RF nominal,
94,54,\\192.168.10.106\imdea\DataDriven_UT_AlbertoVi...,Monoelement RF with 10 mm range vs volfrac and...,3 pixels,volfrac nominal,areafrac nominal,1792 cardinal,"(448, 78, 39) cardinal",50,\\192.168.10.106\imdea\DataDriven_UT_AlbertoVi...,43.0,4,448 cardinal,121 cardinal,49 cardinal,uint8 nominal,tif nominal,RF nominal,
95,55,\\192.168.10.106\imdea\DataDriven_UT_AlbertoVi...,Monoelement RF with 10 mm range vs volfrac and...,5 pixels,volfrac nominal,areafrac nominal,1792 cardinal,"(448, 80, 40) cardinal",50,\\192.168.10.106\imdea\DataDriven_UT_AlbertoVi...,43.0,4,448 cardinal,121 cardinal,49 cardinal,uint8 nominal,tif nominal,RF nominal,
96,56,\\192.168.10.106\imdea\DataDriven_UT_AlbertoVi...,Monoelement RF with 10 mm range vs volfrac and...,7 pixels,volfrac nominal,areafrac nominal,1792 cardinal,"(448, 77, 35) cardinal",50,\\192.168.10.106\imdea\DataDriven_UT_AlbertoVi...,43.0,4,448 cardinal,121 cardinal,49 cardinal,uint8 nominal,tif nominal,RF nominal,
