# Retrieve MetaData from .lif file Header for SP8 Leica microscopy files

This notebook need to read the leica microscopy files stored on the Seagate hard disk drive.  
The path to save the parquets and pickles needs to be absolut path. Does not work with relative  path: 'parquets_and_pickles/images_metadata.parquet'.

## Import the libraries

In [2]:
%matplotlib inline

# To import the files
import glob
from pathlib import Path

# To read the metadata of the lif file
import read_lif

# To format the lif metadata as xml and put them into a dictionnary
from display_xml import XML
import xmltodict

# To read metadata in Dataframe
import pandas as pd
pd.set_option('max_colwidth', 500)
pd.set_option('display.max_columns', None) 
import numpy as np
from itertools import repeat

# To plot some of the data
import seaborn as sns

## .lif Metadata to Dataframe

#### Get the .lif files

In [None]:
files = glob.glob('/Volumes/Seagate/eth/0_Leica_SP8/*/*.lif')

In [None]:
len(files)

In [None]:
#files

In [None]:
# File example for debugging

#files[0]
#files[39]
#files[6]

#### Retrieve the metadata information from the .lif files

In [None]:
# Example files:

#xml_0 = read_lif.get_xml(files[0])
#xml_39 = read_lif.get_xml(files[39])

In [None]:
# Print long xml formatted
#XML(xml_39, style='colorful')

#### Metadata information to dataframe

In [None]:
# To explore the doc 
#doc = xmltodict.parse(xml_39)

In [None]:
# To explore the structure of the dictionnary
#list(doc.values())[0].keys()         # --> level1
#list(doc.values())[0]['Element']     # --> level2
#...

In [None]:
xmls = []
xmls_attachments = []
file_counter = 0

for file in files:
    
    # Retrieve the metadata information as string from .lif files
    xml_file = read_lif.get_xml(file)
    
    # Metadata to ordered dictionnary
    doc = xmltodict.parse(xml_file)
    
    try:
        df = pd.json_normalize(list(doc.values())[0]['Element']['Children']['Element'])
        df['FileName'] = (f'{Path(file).parent.name}_{Path(file).name}')
        
        
        try:
            # Get FileName, Image names and unique Ids
            df_name = df[['FileName', '@Name', '@UniqueID']].rename(columns={'@Name' : 'Image.@Name', '@UniqueID': 'Image.@UniqueID'})

            # To get the attachement part of each files
            attachment = pd.concat(df['Data.Image.Attachment'].explode().apply(pd.json_normalize).values)
            attachment.index = list(df['Data.Image.Attachment'].explode().apply(pd.json_normalize).index)

            attachment = df_name.join(attachment, how='outer')
            del(df_name)
            
            # Remove the original columns relative to attachment
            df = df.drop(['Data.Image.Attachment'], axis=1)
            
            # To get channel and dimension information on different rows
            df = df.explode('Data.Image.ImageDescription.Channels.ChannelDescription').explode('Data.Image.ImageDescription.Dimensions.DimensionDescription')
            df = df.reset_index(drop=True)
            
            # New dataframe containing the information relative to the channnels on different columns
            channels = pd.concat(df['Data.Image.ImageDescription.Channels.ChannelDescription'].apply(pd.json_normalize).values, ignore_index=True)
            cols_channel = list(map(lambda x: f'Channel.{x}', channels.columns))
            channels.columns = cols_channel

            # New dataframe containing the information relative to the dimensions on different columns
            dimensions = pd.concat(df['Data.Image.ImageDescription.Dimensions.DimensionDescription'].apply(pd.json_normalize).values, ignore_index=True)
            cols_dimension = list(map(lambda x: f'Dimension.{x}', dimensions.columns))
            dimensions.columns = cols_dimension

            # Remove the original columns relative to channel and dimension information
            df_images = df.drop(['Data.Image.ImageDescription.Channels.ChannelDescription', 'Data.Image.ImageDescription.Dimensions.DimensionDescription'], axis=1).rename(columns={'@Name' : 'Image.@Name', '@UniqueID': 'Image.@UniqueID'})
            cols_images = list(df_images.columns)
            cols_images.remove('FileName')
            cols_images.remove('Image.@Name')
            cols_images.remove('Image.@UniqueID')

            df_temp = pd.concat([channels, dimensions], axis=1, sort=False)
            cols_temp = cols_channel + cols_dimension
            
            df_final = df_images.join(df_temp)
            
            # Re-arrange the columns order
            cols_final = ['FileName'] + ['Image.@Name'] + ['Image.@UniqueID'] +cols_temp + cols_images
            df_final = df_final.reindex(columns=cols_final)
            del(df, df_images, df_temp, channels, dimensions)
        
        
        except KeyError:
            
            if 'Children.Element' in df.columns:
                print(f'The file {Path(file).name} contains collections and images.')
                df = df.rename(columns={'@Name' : 'Collection.@Name', '@Visibility' : 'Collection.@Visibility', '@CopyOption' : 'Collection.@CopyOption',
                                                                                        '@UniqueID' : 'Collection.@UniqueID', 'Children' : 'Collection.Children', 'Memory.@Size' : 'Collection.Memory.@Size',
                                                                                        'Memory.@MemoryBlockID' : 'Collection.Memory.@MemoryBlockID'})
                df_collec = df.explode('Children.Element').dropna(subset=['Children.Element'])
                df_collec = df_collec.reset_index(drop=False)

                images = pd.concat(df_collec['Children.Element'].apply(pd.json_normalize).values, ignore_index=True)
                df_collec = df_collec.drop(['Children.Element'], axis=1)
                df_collec = df_collec.join(images)
                del(images)
                df_collec['FileName'] = (f'{Path(file).parent.name}_{Path(file).name}')

                # Get FileName, Image names and unique Ids
                df_name = df_collec[['FileName', '@Name', '@UniqueID', 'Collection.@Name', 'Collection.@UniqueID']].rename(columns={'@Name' : 'Image.@Name', '@UniqueID': 'Image.@UniqueID'})

                # To get the attachement part of each files
                attachment = pd.concat(df_collec['Data.Image.Attachment'].explode().apply(pd.json_normalize).values)
                attachment.index = list(df_collec['Data.Image.Attachment'].explode().apply(pd.json_normalize).index)

                attachment = df_name.join(attachment, how='outer')
                del(df_name)

                # Remove the original columns relative to attachment
                df_collec = df_collec.drop(['Data.Image.Attachment'], axis=1)

                # To get channel and dimension information on different rows
                df_collec = df_collec.explode('Data.Image.ImageDescription.Channels.ChannelDescription').explode('Data.Image.ImageDescription.Dimensions.DimensionDescription')
                df_collec = df_collec.reset_index(drop=True)

                # New dataframe containing the information relative to the channnels on different columns
                channels = pd.concat(df_collec['Data.Image.ImageDescription.Channels.ChannelDescription'].apply(pd.json_normalize).values, ignore_index=True)
                cols_channel = list(map(lambda x: f'Channel.{x}', channels.columns))
                channels.columns = cols_channel

                # New dataframe containing the information relative to the dimensions on different columns
                dimensions = pd.concat(df_collec['Data.Image.ImageDescription.Dimensions.DimensionDescription'].apply(pd.json_normalize).values, ignore_index=True)
                cols_dimension = list(map(lambda x: f'Dimension.{x}', dimensions.columns))
                dimensions.columns = cols_dimension

                # Remove the original columns relative to channel and dimension information
                df_images = df_collec.drop(['Data.Image.ImageDescription.Channels.ChannelDescription', 'Data.Image.ImageDescription.Dimensions.DimensionDescription'], axis=1).rename(columns={'@Name' : 'Image.@Name', '@UniqueID': 'Image.@UniqueID'})
                cols_images = list(df_images.columns)
                cols_images.remove('FileName')
                cols_images.remove('Image.@Name')
                cols_images.remove('Image.@UniqueID')

                df_collec_temp = pd.concat([channels, dimensions], axis=1, sort=False)
                cols_collec_temp = cols_channel + cols_dimension

                df_collec_final = df_images.join(df_collec_temp)
                del(df_collec_temp, channels, dimensions)

                # Re-arrange the columns order
                cols_collec_final = ['FileName'] + ['Image.@Name'] + ['Image.@UniqueID'] + cols_collec_temp + cols_images
                df_collec_final = df_collec_final.reindex(columns=cols_collec_final)


                df = df.drop(['Children.Element'], axis=1)
                cols_df = list(df.columns)
                cols_df.remove('FileName')
                cols_df.remove('Collection.@Name')
                cols_df.remove('Collection.@UniqueID')


                df_final = df.join(df_collec_final.set_index('index'), how='outer', rsuffix='_right')
                df_final = df_final.drop(df_final.filter(list(filter(lambda x: [x for y in x if x.endswith('_right')], df_final.columns))), axis=1)
                cols_df_final = list(df_final.columns)
                cols_df_final.remove('FileName')
                cols_df_final.remove('Image.@Name')
                cols_df_final.remove('Image.@UniqueID')
                cols_df_final.remove('Collection.@Name')
                cols_df_final.remove('Collection.@UniqueID')
                for c in cols_df:
                    cols_df_final.remove(c)

                cols_final = ['FileName'] + ['Image.@Name'] + ['Image.@UniqueID'] + ['Collection.@Name'] + ['Collection.@UniqueID'] + cols_df_final + cols_df
                df_final = df_final.reindex(columns=cols_final)
                del(df, df_collec_final)
                
            else:
                print(f'The file {Path(file).name} contains collections, but collections are empty.')
                df_final = df.rename(columns={'@Name' : 'Collection.@Name', '@Visibility' : 'Collection.@Visibility', '@CopyOption' : 'Collection.@CopyOption',
                                                                                        '@UniqueID' : 'Collection.@UniqueID', 'Children' : 'Collection.Children', 'Memory.@Size' : 'Collection.Memory.@Size',
                                                                                        'Memory.@MemoryBlockID' : 'Collection.Memory.@MemoryBlockID'})
                attachment = df_final[['FileName', 'Collection.@Name', 'Collection.@UniqueID']]
                del(df)
        
    except TypeError:
        print(f'The file {Path(file).name} does not contain any serie.')
        df_final = pd.json_normalize(list(doc.values())[0]['Element'])
        df_final.insert(0, column='FileName', value=(f'{Path(file).parent.name}_{Path(file).name}'))
        attachment = df_final[['FileName', '@Name', '@UniqueID']].rename(columns={'@Name': 'Serie@Name', '@UniqueID': 'Serie@UniqueID'})
    
        
    finally:
        xmls.append(df_final)
        xmls_attachments.append(attachment)
        del(df_final)
        del(attachment)
        file_counter = file_counter + 1
    
        #df_final.to_parquet(f'parquet/{file}.parquet')
        
print(file_counter)

In [None]:
# Store the metadata relative to the images
if len(xmls) == len(files):
    pd.concat(xmls, ignore_index=True).to_parquet('/Users/virginie/bioformats/notebooks/metadata_leica_files/parquets_and_pickles/images_metadata.parquet')
else:
    print('Error: Missing image metadata')

In [None]:
dff_images = pd.read_parquet('/Users/virginie/bioformats/notebooks/metadata_leica_files/parquets_and_pickles/images_metadata.parquet')

In [None]:
# Store the metadata relative to the images'attachments
if len(xmls_attachments) == len(files):
    pd.concat(xmls_attachments, ignore_index=True).to_pickle('/Users/virginie/bioformats/notebooks/metadata_leica_files/parquets_and_pickles/attachments_metadata.pickle')
else:
    print('Error: Missing attachment metadata')

In [6]:
dff_attachments = pd.read_pickle('attachments_metadata.pickle')

FileNotFoundError: [Errno 2] No such file or directory: 'attachments_metadata.pickle'

In [None]:
# Select and keep only the rows containing information : seem to correspond only at 1 row out of 4 named @Name = HardwareSetting
lst_index = list(dff_attachments[[col for col in dff_attachments.columns if "Element.@UniqueID" in col]].dropna(axis=0, how='all').index)

new_attachment = dff_attachments[dff_attachments.index.isin(lst_index)].reset_index(drop=True)
new_attachment_cols = new_attachment.columns


# New dataframe containing the information relative to the image_attachement on different columns
image_attachments = new_attachment.filter(list(filter(lambda x: [x for y in x if x == 'FileName' or x == 'Image.@Name' or x == 'Image.@UniqueID' or x.startswith('@')], new_attachment.columns)))
cols_image_attachments = list(map(lambda x: f'Image.Attachment.{x}', image_attachments.columns))
image_attachments.columns = cols_image_attachments


# New dataframe containing the information relative to ATLConfocalSettingDefinition
ATLConfocalSettingDefinition = new_attachment.filter(list(filter(lambda x: [x for y in x if x == 'FileName' or x == 'Image.@Name' or x == 'Image.@UniqueID' or x.startswith('ATLConfocalSettingDefinition')], 
                                                                 new_attachment.columns)))


# New dataframe containing the information relative to LDM_Block_Sequential.LDM_Block_Sequential_Master
LDM_Block_Sequential = new_attachment.filter(list(filter(lambda x: [x for y in x if x == 'FileName' or x == 'Image.@Name' or x == 'Image.@UniqueID' or x.startswith('LDM_Block_Sequential')], new_attachment.columns)))


new_attachment = new_attachment.drop(columns=list(filter(lambda x: [x for y in x if x.startswith('@') or x.startswith('ATLConfocalSettingDefinition') or x.startswith('LDM_Block_Sequential')], new_attachment.columns)))


# Get multiple DataFrames for columns with multiple attributes
    
lst_ATLConfocalSettingDefinition_element = ['ATLConfocalSettingDefinition.AdditionalZPositionList.AdditionalZPosition',
                                            'ATLConfocalSettingDefinition.ShutterList.Shutter',
                                            'ATLConfocalSettingDefinition.FilterWheel.Wheel',
                                            'ATLConfocalSettingDefinition.Spectro.MultiBand',
                                            'ATLConfocalSettingDefinition.AotfList.Aotf',
                                            'ATLConfocalSettingDefinition.LUT_List.LUT',
                                            'ATLConfocalSettingDefinition.DetectorList.Detector',
                                            'ATLConfocalSettingDefinition.LaserArray.Laser']
lst_LDM_Block_Sequential_element = [
                'LDM_Block_Sequential.LDM_Block_Sequential_Master.ATLConfocalSettingDefinition.AdditionalZPositionList.AdditionalZPosition',
                'LDM_Block_Sequential.LDM_Block_Sequential_Master.ATLConfocalSettingDefinition.ShutterList.Shutter',
                'LDM_Block_Sequential.LDM_Block_Sequential_Master.ATLConfocalSettingDefinition.FilterWheel.Wheel',
                'LDM_Block_Sequential.LDM_Block_Sequential_Master.ATLConfocalSettingDefinition.Spectro.MultiBand',
                'LDM_Block_Sequential.LDM_Block_Sequential_Master.ATLConfocalSettingDefinition.AotfList.Aotf',
                'LDM_Block_Sequential.LDM_Block_Sequential_Master.ATLConfocalSettingDefinition.LUT_List.LUT',
                'LDM_Block_Sequential.LDM_Block_Sequential_Master.ATLConfocalSettingDefinition.DetectorList.Detector',
                'LDM_Block_Sequential.LDM_Block_Sequential_Master.ATLConfocalSettingDefinition.LaserArray.Laser',
                'LDM_Block_Sequential.LDM_Block_Sequential_List.ATLConfocalSettingDefinition']


lst_sub_xml_ATLConfocalSettingDefinition = []
sub_xml_ATLConfocalSettingDefinition_dict = dict() 

ATL_element_counter = 0

for idx_ATL_elem, ATL_elem in enumerate(lst_ATLConfocalSettingDefinition_element):

    sub_xml_ATLConfocalSettingDefinition_dict.update({idx_ATL_elem:ATL_elem})

    df = pd.concat(ATLConfocalSettingDefinition[ATL_elem].explode().apply(pd.json_normalize).values)
    cols_df = list(map(lambda x: f'{ATL_elem}.{x}', df.columns))
    df.columns = cols_df
    df.index = list(ATLConfocalSettingDefinition[ATL_elem].explode().apply(pd.json_normalize).index)
    
    df1 = ATLConfocalSettingDefinition.filter(list(filter(lambda x: [x for y in x if x == 'FileName' or x == 'Image.@Name' or x == 'Image.@UniqueID'], ATLConfocalSettingDefinition.columns)))
    
    sub_xml_ATLConfocalSettingDefinition = df1.join(df, how='outer')
    del(df)
    del(df1)
    
    lst_sub_xml_ATLConfocalSettingDefinition.append(sub_xml_ATLConfocalSettingDefinition)
    ATLConfocalSettingDefinition = ATLConfocalSettingDefinition.drop([ATL_elem], axis=1)
    
    ATL_element_counter = ATL_element_counter + 1


if (len(lst_ATLConfocalSettingDefinition_element) == ATL_element_counter) and (len(lst_ATLConfocalSettingDefinition_element) == len(lst_sub_xml_ATLConfocalSettingDefinition)):
    print('All the ATL elements of the list have been treated')
else:
    print('An error occurred with the ATL elements!')
    
    


lst_sub_xml_LDM_Block_Sequential = []
sub_xml_LDM_Block_Sequential_dict = dict() 

LDM_element_counter = 0

for idx_LDM_elem, LDM_elem in enumerate(lst_LDM_Block_Sequential_element):
    
    sub_xml_LDM_Block_Sequential_dict.update({idx_LDM_elem:LDM_elem})
    
    df = pd.concat(LDM_Block_Sequential[LDM_elem].explode().apply(pd.json_normalize).values)
    cols_df = list(map(lambda x: f'{LDM_elem}.{x}', df.columns))
    df.columns = cols_df
    df.index = list(LDM_Block_Sequential[LDM_elem].explode().apply(pd.json_normalize).index)
    
    df1 = LDM_Block_Sequential.filter(list(filter(lambda x: [x for y in x if x == 'FileName' or x == 'Image.@Name' or x == 'Image.@UniqueID'], LDM_Block_Sequential.columns)))
    
    sub_xml_LDM_Block_Sequential = df1.join(df, how='outer')
    del(df)
    del(df1)
    
    lst_sub_xml_LDM_Block_Sequential.append(sub_xml_LDM_Block_Sequential)
    LDM_Block_Sequential = LDM_Block_Sequential.drop([LDM_elem], axis=1)
    
    LDM_element_counter = LDM_element_counter + 1
    

if (len(lst_LDM_Block_Sequential_element) == LDM_element_counter) and (len(lst_LDM_Block_Sequential_element) == len(lst_sub_xml_LDM_Block_Sequential)):
    print('All the LDM elements of the list have been treated')
else:
    print('An error occurred with the LDM elements!')
    
    
print(f'ATL_element_counter:{ATL_element_counter}, LDM_element_counter:{LDM_element_counter}')

In [None]:
# Output Dataframes:

#list(image_attachments.columns)
#list(new_attachment.columns) # --> Does not contain any information
#list(ATLConfocalSettingDefinition.columns)
#list(LDM_Block_Sequential.columns)

In [None]:
# Store the metadata relative to the microscope and software
image_attachments.to_parquet('/Users/virginie/bioformats/notebooks/metadata_leica_files/parquets_and_pickles/image_attachments_metadata.parquet')

In [None]:
dff_image_attachments = pd.read_parquet('/Users/virginie/bioformats/notebooks/metadata_leica_files/parquets_and_pickles/image_attachments_metadata.parquet')

In [None]:
# Store the metadata left after sorting information of interest
new_attachment.to_parquet('/Users/virginie/bioformats/notebooks/metadata_leica_files/parquets_and_pickles/new_attachment_metadata.parquet')

In [None]:
dff_new_attachment = pd.read_parquet('/Users/virginie/bioformats/notebooks/metadata_leica_files/parquets_and_pickles/new_attachment_metadata.parquet')

In [None]:
# Store the metadata relative to ATLConfocalSettingDefinition
# Zoom, Pinhole, ...
ATLConfocalSettingDefinition.to_parquet('/Users/virginie/bioformats/notebooks/metadata_leica_files/parquets_and_pickles/ATLConfocalSettingDefinition_metadata.parquet')

In [None]:
dff_ATLConfocalSettingDefinition = pd.read_parquet('/Users/virginie/bioformats/notebooks/metadata_leica_files/parquets_and_pickles/ATLConfocalSettingDefinition_metadata.parquet')

In [None]:
# Store the metadata relative to LDM_Block_Sequential
# Zoom, Pinhole, ...
LDM_Block_Sequential.to_parquet('/Users/virginie/bioformats/notebooks/metadata_leica_files/parquets_and_pickles/LDM_Block_Sequential_metadata.parquet')

In [None]:
dff_Block_Sequential = pd.read_parquet('/Users/virginie/bioformats/notebooks/metadata_leica_files/parquets_and_pickles/LDM_Block_Sequential_metadata.parquet')

In [None]:
# Output dictionnaries with respective DataFrame lists:

# sub_xml_ATLConfocalSettingDefinition_dict
# len(lst_sub_xml_ATLConfocalSettingDefinition)

# sub_xml_LDM_Block_Sequential_dict
# len(lst_sub_xml_LDM_Block_Sequential)

In [None]:
# Build detector info dataframe for each files / image ID / channel color

for key, value in sub_xml_ATLConfocalSettingDefinition_dict.items(): 
    if 'Detector' in value:
        df = lst_sub_xml_ATLConfocalSettingDefinition[key]
        df = df.reset_index(drop=False)
        
        # Select the rows where Gain > 100
        _Gain = df.filter(list(filter(lambda x: [x for y in x if x.endswith('Gain')], df.columns))).astype('float') > 100
        df.drop(_Gain[~_Gain['ATLConfocalSettingDefinition.DetectorList.Detector.@Gain']].index, axis=0, inplace=True)
        
        detector_information = df.set_index('index')
        del(df)
        
        
        
for key, value in sub_xml_ATLConfocalSettingDefinition_dict.items():   
    if 'MultiBand' in value:
        df = lst_sub_xml_ATLConfocalSettingDefinition[key]
        df = df.reset_index(drop=False)
        
        # Select the rows where @ChannelName correspond at @ChannelName in the 'Detector' dataframe
        _ChannelName = list(filter(lambda x: [x for y in x if x.endswith('@ChannelName')], df.columns))

        dict_multiband = dict()

        for i, (uniq_id, channel) in enumerate(zip(list(detector_information['Image.@UniqueID'].values), detector_information[list(filter(lambda x: [x for y in x if x.endswith('@ChannelName')], detector_information.columns))].values.tolist())):
            dict_multiband.update({i:df[(df['Image.@UniqueID'] == uniq_id) & (df[_ChannelName[0]].isin(channel))]})

        multiband = pd.concat(dict_multiband)
        multiband = multiband.set_index('index')
        del(df)
        
        
        
for key, value in sub_xml_ATLConfocalSettingDefinition_dict.items():   
    if 'LUT' in value:
        df = lst_sub_xml_ATLConfocalSettingDefinition[key]
        df = df.reset_index(drop=False)
        
        # Select the rows where @Channel correspond at @Channel in the 'Detector' dataframe
        _Channel = list(filter(lambda x: [x for y in x if x.endswith('@Channel')], df.columns))

        dict_LUT = dict()

        for i, (uniq_id, channel) in enumerate(zip(list(detector_information['Image.@UniqueID'].values), detector_information[list(filter(lambda x: [x for y in x if x.endswith('@Channel')], detector_information.columns))].values.tolist())):
            dict_LUT.update({i:df[(df['Image.@UniqueID'] == uniq_id) & (df[_Channel[0]].isin(channel))]})

        lut = pd.concat(dict_LUT)
        lut = lut.set_index('index')
        del(df)



# Merge both Dataframe on the Name of the files, the Image Names, the Image Unique IDs and the Channel column:
detector_information = detector_information.merge(multiband, how='outer', left_on=['FileName', 'Image.@Name', 'Image.@UniqueID', 'ATLConfocalSettingDefinition.DetectorList.Detector.@Channel'],
                 right_on=['FileName', 'Image.@Name', 'Image.@UniqueID', 'ATLConfocalSettingDefinition.Spectro.MultiBand.@Channel'], sort=False, suffixes=('', '@Repeat'),
                            indicator=True)

# Check that all the rows from multiband have a counterpart in the detector_information:
if detector_information['_merge'].unique().tolist() == ['both', 'left_only']:
    # Check which are the images that got 'left_only':
    if detector_information[detector_information['_merge'] == 'left_only']['ATLConfocalSettingDefinition.DetectorList.Detector.@Name'].unique().tolist() == ['PMT Trans']:
        print('"left_only" corresponds to the PMT Trans channel')
        # Drop the '_merge' column:
        detector_information.drop(['_merge'], axis=1, inplace=True)
    else:
        print('Error while merging the dataframes.')
elif detector_information['_merge'].unique().tolist() == ['both']:
    print('No transmission channel')
    # Drop the '_merge' column:
    detector_information.drop(['_merge'], axis=1, inplace=True)
else:
    print('Error while merging the dataframes.')


# Merge both Dataframe on the Name of the files, the Image Names, the Image Unique IDs and the Channel column:
detector_information = detector_information.merge(lut, how='outer', left_on=['FileName', 'Image.@Name', 'Image.@UniqueID', 'ATLConfocalSettingDefinition.DetectorList.Detector.@Channel'],
                 right_on=['FileName', 'Image.@Name', 'Image.@UniqueID', 'ATLConfocalSettingDefinition.LUT_List.LUT.@Channel'], sort=False, suffixes=('', '@Repeat'),
                            indicator=True)

# Check that all the rows from lut have a counterpart in the detector_information:
if detector_information['_merge'].unique().tolist() == ['both']:
    print('"lut" and "detector_information" dataframes fully merged.')
    # Drop the '_merge' column:
    detector_information.drop(['_merge'], axis=1, inplace=True)
else:
    print('Error while merging the dataframes.')

In [None]:
# Store the metadata relative to detector_information
# Zoom, Pinhole, ...
detector_information.to_parquet('/Users/virginie/bioformats/notebooks/metadata_leica_files/parquets_and_pickles/detector_information_metadata.parquet')

In [None]:
dff_detector_information = pd.read_parquet('/Users/virginie/bioformats/notebooks/metadata_leica_files/parquets_and_pickles/detector_information_metadata.parquet')

In [None]:
# Build laser info dataframe for each files / image ID

for key, value in sub_xml_ATLConfocalSettingDefinition_dict.items():
    if 'Laser' in value:
        df = lst_sub_xml_ATLConfocalSettingDefinition[key]
        df = df.reset_index(drop=False)

        # Select the rows where PowerState is On
        _PowerState = df.filter(list(filter(lambda x: [x for y in x if x.endswith('PowerState')], df.columns))) == 'On'
        df.drop(_PowerState[~_PowerState['ATLConfocalSettingDefinition.LaserArray.Laser.@PowerState']].index, inplace=True)

        laser_information = df.set_index('index')
        del(df)


for key, value in sub_xml_ATLConfocalSettingDefinition_dict.items():
    if 'Shutter' in value:
        df = lst_sub_xml_ATLConfocalSettingDefinition[key]
        df = df.reset_index(drop=False)

        # Select the rows where @LightSourceName correspond at @LightSourceName in the 'Laser' dataframe
        _LightSourceName = list(filter(lambda x: [x for y in x if x.endswith('@LightSourceName')], laser_information.columns))
        
        dict_shutter = dict()
        
        for j, (uniq_id, light_source) in enumerate(zip(list(laser_information['Image.@UniqueID'].values), list(laser_information[_LightSourceName].values.tolist()))):

            df1 = df[(df['Image.@UniqueID'] == uniq_id) & (df['ATLConfocalSettingDefinition.ShutterList.Shutter.@LightSourceName'].isin(light_source))]

            _Active_Shutter = df1.filter(list(filter(lambda x: [x for y in x if x.endswith('@IsActive')], df1.columns))) == '1'
            df1.drop(_Active_Shutter[~_Active_Shutter['ATLConfocalSettingDefinition.ShutterList.Shutter.@IsActive']].index, inplace=True)

            dict_shutter.update({j:df1})
            del(df1)

        shutter = pd.concat(dict_shutter)
        shutter = shutter.set_index('index')
        del(df)
    
    
for key, value in sub_xml_ATLConfocalSettingDefinition_dict.items():
        
    if 'Aotf' in value:
        df = lst_sub_xml_ATLConfocalSettingDefinition[key]
        df = df.reset_index(drop=False)

        # Select the rows where @LightSourceName correspond at @LightSourceName in the 'Laser' dataframe
        dict_Aotf = dict()

        for k, (uniq_id, light_source) in enumerate(zip(list(laser_information['Image.@UniqueID'].values), laser_information[_LightSourceName].values.tolist())):

            df1 = df[(df['Image.@UniqueID'] == uniq_id) & (df['ATLConfocalSettingDefinition.AotfList.Aotf.@LightSourceName'].isin(light_source))]

            dict_Aotf.update({k:df1})
            del(df1)

        aotf = pd.concat(dict_Aotf)
        aotf = aotf.set_index('index')
        del(df)

    
    
# Merge both Dataframe on the Name of the files, the Image Names, the Image Unique IDs and the LightSourceType column:
laser_information = laser_information.merge(shutter, how='inner', left_on=['FileName', 'Image.@Name', 'Image.@UniqueID', 'ATLConfocalSettingDefinition.LaserArray.Laser.@LightSourceType'],
                 right_on=['FileName', 'Image.@Name', 'Image.@UniqueID', 'ATLConfocalSettingDefinition.ShutterList.Shutter.@LightSourceType'], sort=False, suffixes=('', '@Repeat'),
                            indicator=True)

# Check that all the rows from shutter have a counterpart in the laser_information:
if laser_information['_merge'].unique().tolist() == ['both']:
    print('"shutter" and "laser_information" dataframes fully merged.')
    # Drop the '_merge' column:
    laser_information.drop(['_merge'], axis=1, inplace=True)
else:
    print('Error while merging the dataframes.')
    


# Merge both Dataframe on the Name of the files, the Image Names, the Image Unique IDs and the LightSourceType column:
laser_information = laser_information.merge(aotf, how='left', left_on=['FileName', 'Image.@Name', 'Image.@UniqueID', 'ATLConfocalSettingDefinition.LaserArray.Laser.@LightSourceType'],
                 right_on=['FileName', 'Image.@Name', 'Image.@UniqueID', 'ATLConfocalSettingDefinition.AotfList.Aotf.@LightSourceType'], sort=False, suffixes=('', '@Repeat'),
                            indicator=True)

# Check that all the rows from aotf have a counterpart in the laser_information:
if laser_information['_merge'].unique().tolist() == ['both']:
    print('"aotf" and "laser_information" dataframes fully merged.')
    # Drop the '_merge' column:
    laser_information.drop(['_merge'], axis=1, inplace=True)
else:
    print('Error while merging the dataframes.')

In [None]:
# Store the metadata relative to laser_information
# Zoom, Pinhole, ...
laser_information.to_pickle('/Users/virginie/bioformats/notebooks/metadata_leica_files/parquets_and_pickles/laser_information_metadata.pickle')

In [None]:
dff_laser_information = pd.read_pickle('/Users/virginie/bioformats/notebooks/metadata_leica_files/parquets_and_pickles/laser_information_metadata.pickle')