# Metadata analysis

List of the parquet and pickle files:
          
        - images_metadata.parquet                             --> Contains the metadata describing the channels and the dimensions of each image.
        - attachments_metadata.pickle                         --> Used only to retrieve the metadata relative to additional information on images, grouped in the DataFrames listed below.
        - image_attachments_metadata.parquet                  --> Contains the metadata relative to the microscope model and software used.
        - new_attachment_metadata.parquet                     --> Metadata left after sorting the metadata of interest: contains only 2 empty columns outside of the names and IDs columns.
        - ATLConfocalSettingDefinition_metadata.parquet       --> Store the metadata relative to ATLConfocalSettingDefinition: Zoom, Pinhole, ...
        - LDM_Block_Sequential_metadata.parquet               --> Store the metadata relative to LDM_Block_Sequential: Zoom, Pinhole, ...
        - detector_information_metadata.parquet               --> Store information from ATLConfocalSettingDefinition referenced as Detector, Multiband and LUT.
        - laser_information_metadata.pickle                   --> Store information from ATLConfocalSettingDefinition referenced as Laser, Shutter and Aotf.

## Import the libraries

In [None]:
# To read the dataframe
import pandas as pd
pd.set_option('display.max_columns', None) 
import numpy as np

### Keyword lists to clean the different dataframes

In [None]:
# Substring keywords used to classify column names by target dtype.
# Each `not_*` list holds the exclusion terms paired with the category next to it.

# Terms marking a column as non-numeric:
not_num_keywords = [
    'Name', 'TimeStamp', 'Model', 'ScanMode',
    'Unit', 'Type', 'State', 'BeamPosition',
]

# Integer columns:
integer_keywords = [
    'Resolution', 'DimID', 'NumberOfElements', 'Size', 'Tag', 'Version',
    'Bit', 'Mode', 'Section', 'Magnification', 'Pos', 'Number',
    'Speed', 'Direction', 'Wavelength', 'Average', 'Accumulation', 'Trigger',
    'Relative', 'Detectors', 'Channels', 'Flip', 'Swap', 'Phase',
]
not_integer = ['Name', 'Position', 'Model', 'ScanMode', 'Unit', 'Can', 'Is']
# Commented-out candidates: 'Type', 'Dimension'

# Float columns:
float_keywords = [
    'Min', 'Max', 'Bytes', 'Origin', 'Length', 'Position', 'Aperture',
    'Index', 'Dim', 'Zoom', 'Pinhole', 'Time', 'Value', 'Range',
    'CommonFactor', 'Gain', 'Offset', 'World', 'Power', 'Intensity',
]
not_float = ['Dimension', 'ID', 'Name', 'Position', 'Model', 'ActiveCS']

# Boolean columns:
bool_keywords = [
    'CanDo', 'Is', 'Use', 'Valid', 'Enable', 'InUse', 'CopyOption', 'AutoSelection',
    'Keep', 'Normalize', 'Freq', 'Flag', 'OutChecked', 'OpenVirtual', 'ModeActive', 'TwoLaser',
]
not_bool = ['Name', 'UseMode']
# Commented-out candidate: 'Visibility'

# Categorical columns:
categorical_keywords = [
    'LUTName', 'LutName', 'DyeName',
    'Detector.@Name', 'Detector.@Type', 'Detector.@ScanType',
    'LaserName', 'LightSourceName', 'LightSourceType', '@Channel',
]
not_cat = ['Tag']

## Metadata

In [None]:
# Load the DataFrame describing the channels and the dimensions of each image
dff_images = pd.read_parquet('/Users/virginie/bioformats/notebooks/metadata_leica_files/parquets_and_pickles/images_metadata.parquet')

In [None]:
# Number of distinct 'FileName' values in the images metadata
dff_images['FileName'].nunique()

In [None]:
# Remove, in place, the columns in which every value is missing
dff_images.dropna(axis=1, how='all', inplace=True)

In [None]:
# Fill the remaining missing values with pd.NA
dff_images = dff_images.fillna(pd.NA)

In [None]:
# Filter per dtype: a column is selected when its name contains at least one keyword
# of the category and none of the excluded terms.

# Numeric candidates (float, integer and boolean keywords together):
cols_num = [
    col for col in dff_images.columns.values
    if any(key in col for key in float_keywords + integer_keywords + bool_keywords)
    and not any(term in col for term in not_num_keywords)
]

# Integer candidates:
cols_integer = [
    col for col in dff_images.columns.values
    if any(key in col for key in integer_keywords)
    and not any(term in col for term in not_integer)
]

# Categorical candidates:
cols_categ = [
    col for col in dff_images.columns.values
    if any(key in col for key in categorical_keywords)
    and not any(term in col for term in not_cat)
]

# Boolean candidates; for this frame 'Freq' is not used as a boolean keyword:
cols_bool = [
    col for col in dff_images.columns.values
    if any(key in col for key in bool_keywords if key != 'Freq')
    and not any(term in col for term in not_bool)
]

#cols_timeStamp = list(filter(lambda x: [x for y in ['TimeStamp'] if y in x and 'NumberOf' not in x], dff_images.columns.values))

In [None]:
# Numeric columns (float or int): parse every selected column; entries that cannot be parsed become NaN.
dff_images[cols_num] = dff_images[cols_num].apply(pd.to_numeric, errors='coerce')

# Boolean columns: True where the value equals 1, False everywhere else (missing values included).
for flag_col in cols_bool:
    dff_images[flag_col] = np.where(dff_images[flag_col].eq(1), True, False)

# Categorical columns:
dff_images[cols_categ] = dff_images[cols_categ].astype('category')

In [None]:
# Special rules for columns the keyword filters cannot classify.

# 'Visibility' cannot be used as a keyword for booleans, so the column is converted by hand.
# The values are parsed as numbers first so that a 1 stored as text ('1') is recognised too;
# missing or unparseable values end up False.
dff_images['@Visibility'] = np.where(pd.to_numeric(dff_images['@Visibility'], errors='coerce') == 1, True, False)

# 'TimeStamp' is an excluded term for numerical values, but this column is a count:
dff_images['Data.Image.TimeStampList.@NumberOfTimeStamps'] = pd.to_numeric(dff_images['Data.Image.TimeStampList.@NumberOfTimeStamps'], errors='coerce')

# The nullable 'Int64' dtype is used because the integer columns contain missing values:
dff_images[cols_integer] = dff_images[cols_integer].astype('Int64')

In [None]:
# Summary of the columns, non-null counts and dtypes after the conversions
dff_images.info()

In [None]:
# Distinct values of the 'Dimension.@DimID' column
dff_images['Dimension.@DimID'].unique()

In [None]:
# Distinct values of the 'Dimension.@NumberOfElements' column
dff_images['Dimension.@NumberOfElements'].unique()

## Information relative to the microscope and the software

In [None]:
# Load the metadata relative to the microscope and the software
dff_image_attachments = pd.read_parquet('/Users/virginie/bioformats/notebooks/metadata_leica_files/parquets_and_pickles/image_attachments_metadata.parquet')

In [None]:
# Remove, in place, the columns in which every value is missing
dff_image_attachments.dropna(axis=1, how='all', inplace=True)

In [None]:
# Special rule: these integer columns contain the otherwise excluded keyword 'Type'.
cols_integer_2 = [
    col for col in dff_image_attachments.columns.values
    if 'Type' in col and 'Name' not in col
]

# Parse them as numbers; entries that cannot be parsed become NaN.
dff_image_attachments[cols_integer_2] = dff_image_attachments[cols_integer_2].apply(pd.to_numeric, errors='coerce')

In [None]:
# Summary of the columns, non-null counts and dtypes
dff_image_attachments.info()

In [None]:
# Flatten every value from the fourth column onward into one array and list the distinct values
all_values = dff_image_attachments.iloc[:,3:].values.ravel()
pd.unique(all_values)

All the files have been produced with the same microscope, the TCS SP8 confocal microscope.  
Application:'LAS AF'  
Software: 'LAS X 3.5.6.21594'  
SystemTypeName: 'TCS SP8'  
DataSourceTypeName: 'Confocal'  

## Parameters

In [None]:
# Load the confocal settings: Zoom, Pinhole, ...
dff_ATLConfocalSettingDefinition = pd.read_parquet('/Users/virginie/bioformats/notebooks/metadata_leica_files/parquets_and_pickles/ATLConfocalSettingDefinition_metadata.parquet')

In [None]:
# Remove, in place, the columns in which every value is missing
dff_ATLConfocalSettingDefinition.dropna(axis=1, how='all', inplace=True)

In [None]:
# Fill the remaining missing values with pd.NA
dff_ATLConfocalSettingDefinition = dff_ATLConfocalSettingDefinition.fillna(pd.NA)

In [None]:
# Column selection per category: a column is kept when its name contains at least one keyword
# of the category and none of the excluded terms.
# Integers and floats are coupled together into the numeric category.

# Numeric candidates; this frame also uses 'Dimension' and 'Visibility' as numeric keywords:
cols_num_3 = [
    col for col in dff_ATLConfocalSettingDefinition.columns.values
    if any(key in col for key in float_keywords + integer_keywords + bool_keywords + ['Dimension', 'Visibility'])
    and not any(term in col for term in not_num_keywords)
]

# Integer candidates; 'Size' is not used as an integer keyword for this frame:
cols_integer_3 = [
    col for col in dff_ATLConfocalSettingDefinition.columns.values
    if any(key in col for key in integer_keywords if key != 'Size')
    and not any(term in col for term in not_integer)
]

# Boolean candidates:
cols_bool_3 = [
    col for col in dff_ATLConfocalSettingDefinition.columns.values
    if any(key in col for key in bool_keywords)
    and not any(term in col for term in not_bool)
]

In [None]:
# Numeric columns (floats and integers): parse every selected column; unparseable entries become NaN.
dff_ATLConfocalSettingDefinition[cols_num_3] = dff_ATLConfocalSettingDefinition[cols_num_3].apply(pd.to_numeric, errors='coerce')

# Boolean columns: True where the value equals 1, False everywhere else (missing values included).
for flag_col in cols_bool_3:
    dff_ATLConfocalSettingDefinition[flag_col] = np.where(dff_ATLConfocalSettingDefinition[flag_col].eq(1), True, False)

In [None]:
# Specific rules for columns that the keyword filters leave out.

# The column name contains the excluded keyword 'Name': parse it as a number, then flag the entries equal to 1.
dff_ATLConfocalSettingDefinition['ATLConfocalSettingDefinition.@IsUserSettingNameSet'] = np.where(
    pd.to_numeric(dff_ATLConfocalSettingDefinition['ATLConfocalSettingDefinition.@IsUserSettingNameSet'], errors='coerce') == 1,
    True,
    False,
)

# The column name contains the excluded keyword 'Unit', but the column is numeric:
dff_ATLConfocalSettingDefinition['ATLConfocalSettingDefinition.ClimateControl.@NumberOfUnits'] = pd.to_numeric(
    dff_ATLConfocalSettingDefinition['ATLConfocalSettingDefinition.ClimateControl.@NumberOfUnits'],
    errors='coerce',
)

In [None]:
# Integer columns that contain missing values need a nullable integer dtype.
int_with_null_objects_3 = [
    col for col in dff_ATLConfocalSettingDefinition.columns.values
    if any(key in col for key in ('ActiveCS_SubModeForRLD', 'Flip', 'Swap'))
    and not any(term in col for term in not_integer)
]

dff_ATLConfocalSettingDefinition[int_with_null_objects_3] = dff_ATLConfocalSettingDefinition[int_with_null_objects_3].astype('Int8')

In [None]:
# Full per-column summary (non-null counts and dtypes).
# The first positional argument of DataFrame.info is `verbose`, not a dtype filter,
# so the verbose listing is requested explicitly.
dff_ATLConfocalSettingDefinition.info(verbose=True)

## Detector

In [None]:
# Load the detector information
dff_detector_information = pd.read_parquet('/Users/virginie/bioformats/notebooks/metadata_leica_files/parquets_and_pickles/detector_information_metadata.parquet')

In [None]:
# Fill the missing values with pd.NA
dff_detector_information = dff_detector_information.fillna(pd.NA)

In [None]:
# Column selection per category: a column is kept when its name contains at least one keyword
# of the category and none of the excluded terms.
# Integers and floats are coupled together into the numeric category.

# Numeric candidates (float, integer and boolean keywords together):
cols_num_4 = [
    col for col in dff_detector_information.columns.values
    if any(key in col for key in float_keywords + integer_keywords + bool_keywords)
    and not any(term in col for term in not_num_keywords)
]

# Integer candidates:
cols_integer_4 = [
    col for col in dff_detector_information.columns.values
    if any(key in col for key in integer_keywords)
    and not any(term in col for term in not_integer)
]

# Boolean candidates:
cols_bool_4 = [
    col for col in dff_detector_information.columns.values
    if any(key in col for key in bool_keywords)
    and not any(term in col for term in not_bool)
]

# Categorical candidates:
cols_categ_4 = [
    col for col in dff_detector_information.columns.values
    if any(key in col for key in categorical_keywords)
    and not any(term in col for term in not_cat)
]

In [None]:
# Numeric columns (floats and integers): parse every selected column; unparseable entries become NaN.
dff_detector_information[cols_num_4] = dff_detector_information[cols_num_4].apply(pd.to_numeric, errors='coerce')

# Boolean columns: True where the value equals 1, False everywhere else (missing values included).
for flag_col in cols_bool_4:
    dff_detector_information[flag_col] = np.where(dff_detector_information[flag_col].eq(1), True, False)

# Categorical columns:
dff_detector_information[cols_categ_4] = dff_detector_information[cols_categ_4].astype('category')

In [None]:
# Summary of the columns, non-null counts and dtypes after the conversions
dff_detector_information.info()

## Laser 

In [None]:
# Load the laser information.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files from a trusted source.
dff_laser_information = pd.read_pickle('/Users/virginie/bioformats/notebooks/metadata_leica_files/parquets_and_pickles/laser_information_metadata.pickle')

In [None]:
# Remove, in place, the columns in which every value is missing
dff_laser_information.dropna(axis=1, how='all', inplace=True)

In [None]:
# Fill the remaining missing values with pd.NA
dff_laser_information = dff_laser_information.fillna(pd.NA)

In [None]:
# Column selection per category: a column is kept when its name contains at least one keyword
# of the category and none of the excluded terms.
# Integers and floats are coupled together into the numeric category.

# Numeric candidates (float, integer and boolean keywords together):
cols_num_5 = [
    col for col in dff_laser_information.columns.values
    if any(key in col for key in float_keywords + integer_keywords + bool_keywords)
    and not any(term in col for term in not_num_keywords)
]

# Integer candidates:
cols_integer_5 = [
    col for col in dff_laser_information.columns.values
    if any(key in col for key in integer_keywords)
    and not any(term in col for term in not_integer)
]

# Boolean candidates:
cols_bool_5 = [
    col for col in dff_laser_information.columns.values
    if any(key in col for key in bool_keywords)
    and not any(term in col for term in not_bool)
]

# Categorical candidates:
cols_categ_5 = [
    col for col in dff_laser_information.columns.values
    if any(key in col for key in categorical_keywords)
    and not any(term in col for term in not_cat)
]

In [None]:
# Numeric columns (floats and integers): parse every selected column; unparseable entries become NaN.
dff_laser_information[cols_num_5] = dff_laser_information[cols_num_5].apply(pd.to_numeric, errors='coerce')

# Boolean columns: True where the value equals 1, False everywhere else (missing values included).
for flag_col in cols_bool_5:
    dff_laser_information[flag_col] = np.where(dff_laser_information[flag_col].eq(1), True, False)

# Categorical columns:
dff_laser_information[cols_categ_5] = dff_laser_information[cols_categ_5].astype('category')

In [None]:
# Summary of the columns, non-null counts and dtypes after the conversions
dff_laser_information.info()

## Data reconciliation

In [None]:
# Count the unique image IDs in dff_images (one ID per image, across all the files).
dff_images['Image.@UniqueID'].nunique()

In [None]:
# Shape (rows, columns) of dff_image_attachments; the row count can be compared with the unique-ID counts.
dff_image_attachments.shape

In [None]:
# Count the unique image IDs in dff_image_attachments
dff_image_attachments['Image.Attachment.Image.@UniqueID'].nunique()

The 2 DataFrames `dff_images` and `dff_image_attachments` have the same number of unique `Image.@UniqueID` values.

In [None]:
# Merge both DataFrames on the file name, the image name and the image unique ID.
# The key columns are named differently on each side, hence left_on / right_on.
# how='outer' keeps the rows of both frames; indicator=True adds a '_merge' column
# ('both', 'left_only' or 'right_only') telling where each row comes from.
# The suffixes are only appended to non-key column names present in both frames.
dff_final = dff_images.merge(dff_image_attachments, how='outer', left_on=['FileName', 'Image.@Name', 'Image.@UniqueID'],
                 right_on=['Image.Attachment.FileName', 'Image.Attachment.Image.@Name', 'Image.Attachment.Image.@UniqueID'],
                 sort=False, suffixes=('Image.', 'Image.Attachment.'), indicator=True)

In [None]:
# List the merge indicators present, to check that every row of dff_image_attachments
# has a counterpart in dff_images (no 'right_only' expected):
dff_final['_merge'].unique()

In [None]:
# Show the rows flagged 'left_only', i.e. present in dff_images without a counterpart in dff_image_attachments:
dff_final[dff_final['_merge'] == 'left_only']

`left_only` occurs only when **no image** is present, which corresponds to `200316_Sample_7.lif` and `200304_200304_Rev10a2_GFP_Rods.lif`. This matches the previous observation made when retrieving the XML data from the lif files.

In [None]:
# The merge indicator is no longer needed: remove the '_merge' column in place.
dff_final.drop(columns=['_merge'], inplace=True)

In [None]:
# List the rows where at least one of the three merge keys differs between the two sides.
# The comparisons are combined with OR: with AND, a row would only be listed when all three
# keys differ at the same time, and a mismatch on a single key would go unnoticed.
# Rows whose attachment keys are missing compare as different and are therefore listed too.
dff_final[
    (dff_final['FileName'] != dff_final['Image.Attachment.FileName'])
    | (dff_final['Image.@Name'] != dff_final['Image.Attachment.Image.@Name'])
    | (dff_final['Image.@UniqueID'] != dff_final['Image.Attachment.Image.@UniqueID'])
]

The merge seems to have been done correctly: in each row, the FileName, ImageName and ImageUniqueID match between the two merged DataFrames. Therefore, we can drop the duplicate columns ('Image.Attachment.FileName', 'Image.Attachment.Image.@Name', 'Image.Attachment.Image.@UniqueID').

In [None]:
# The attachment-side key columns repeat the image-side keys: remove them in place.
dff_final.drop(
    columns=['Image.Attachment.FileName', 'Image.Attachment.Image.@Name', 'Image.Attachment.Image.@UniqueID'],
    inplace=True,
)

In [None]:
# Preview the first 10 rows of the merged frame
dff_final.head(10)

In [None]:
# Shape (rows, columns) of dff_ATLConfocalSettingDefinition; the row count can be compared with the unique-ID count.
dff_ATLConfocalSettingDefinition.shape

In [None]:
# Count the unique image IDs in dff_ATLConfocalSettingDefinition
dff_ATLConfocalSettingDefinition['Image.@UniqueID'].nunique()

The 2 DataFrames `dff_images` and `dff_ATLConfocalSettingDefinition` have the same number of unique `Image.@UniqueID` values.

In [None]:
# Add the confocal settings to the combined frame. Both frames name their keys identically
# (file name, image name, image unique ID), so the keys are given once through `on`.
dff_final = dff_final.merge(
    dff_ATLConfocalSettingDefinition,
    how='outer',
    on=['FileName', 'Image.@Name', 'Image.@UniqueID'],
    sort=False,
    suffixes=('Image.', 'ATLConfocalSettingDefinition.'),
    indicator=True,
)

In [None]:
# List the merge indicators present, to check that every row of dff_ATLConfocalSettingDefinition
# has a counterpart in the combined frame (no 'right_only' expected):
dff_final['_merge'].unique()

In [None]:
# Show the rows flagged 'left_only', i.e. without a counterpart in dff_ATLConfocalSettingDefinition:
dff_final[dff_final['_merge'] == 'left_only']

`left_only` occurs only when **no image** is present, which corresponds to `200316_Sample_7.lif` and `200304_200304_Rev10a2_GFP_Rods.lif`. This matches the previous observation made when retrieving the XML data from the lif files.

In [None]:
# The merge indicator is no longer needed: remove the '_merge' column in place.
dff_final.drop(columns=['_merge'], inplace=True)

In [None]:
# Preview the first 20 rows of the merged frame
dff_final.head(20)

In [None]:
# Shape (rows, columns) of dff_detector_information; the row count can be compared with the unique-ID count.
dff_detector_information.shape

In [None]:
# Count the unique image IDs in dff_detector_information
dff_detector_information['Image.@UniqueID'].nunique()

The 2 DataFrames `dff_images` and `dff_detector_information` have the same number of unique `Image.@UniqueID` values. Several rows of information are available for each image.

In [None]:
# Keep the rows that are exact duplicates of an earlier row (all columns equal)
duplicateRow_dff_detector_information = dff_detector_information[dff_detector_information.duplicated()]

In [None]:
# Index labels of the duplicated rows (empty when there is none)
duplicateRow_dff_detector_information.index

No duplicated row.

In [None]:
# Average number of detector rows per unique image ID
dff_detector_information.shape[0]/dff_detector_information['Image.@UniqueID'].nunique()

In [None]:
# Merge both DataFrames on the file name, the image name, the image unique ID and the LUT name.
# The LUT name column is named differently on each side: 'Channel.@LUTName' on the left,
# 'ATLConfocalSettingDefinition.LUT_List.LUT.@LutName' on the right.
# indicator=True adds a '_merge' column telling where each row comes from;
# the suffixes are only appended to non-key column names present in both frames.
dff_final = dff_final.merge(dff_detector_information, how='outer', left_on=['FileName', 'Image.@Name', 'Image.@UniqueID', 'Channel.@LUTName'],
                 right_on=['FileName', 'Image.@Name', 'Image.@UniqueID', 'ATLConfocalSettingDefinition.LUT_List.LUT.@LutName'], sort=False, suffixes=('_Image', '_Detector'),
                            indicator=True)

In [None]:
# List the merge indicators present, to check that every row of dff_detector_information
# has a counterpart in the combined frame (no 'right_only' expected):
dff_final['_merge'].unique()

In [None]:
# Show the rows flagged 'left_only', i.e. without a counterpart in dff_detector_information:
dff_final[dff_final['_merge'] == 'left_only']

`200316_Sample_7.lif` and `200304_200304_Rev10a2_GFP_Rods.lif` do not contain any images.  

**Missing data** for:  
    - `200304_200304_Rev10a2_GFP_Lforms.lif`  / All images  
    - `200304_200304_Test_Leica_SP8_2.lif`/ Image006 / Green channel

In [None]:
# The merge indicator is no longer needed: remove the '_merge' column in place.
dff_final.drop(columns=['_merge'], inplace=True)

In [None]:
# Preview the first 10 rows of the merged frame
dff_final.head(10)

In [None]:
# Shape (rows, columns) of dff_laser_information
dff_laser_information.shape

In [None]:
# Count the unique image IDs in dff_laser_information
dff_laser_information['Image.@UniqueID'].nunique()

In [None]:
# Average number of laser rows per unique image ID
dff_laser_information.shape[0]/dff_laser_information['Image.@UniqueID'].nunique()

In [None]:
#duplicateRow_4 = dff_laser_information[dff_laser_information.duplicated()]
#duplicateRow_4

In [None]:
# Bin the start of each target wavelength range into one of three labels (405, 458, 561);
# the labels are later matched against the laser wavelengths when merging the laser information.
# pd.cut uses right-closed intervals by default: (405, 458] -> 405, (458, 561] -> 458, (561, 800] -> 561.
# Values outside (405, 800], including exactly 405, are left missing.
bins = [405, 458, 561, 800]
dff_final['WaveLengths'] = pd.cut(dff_final['ATLConfocalSettingDefinition.Spectro.MultiBand.@TargetWaveLengthBegin'], bins=bins, labels=[405, 458, 561])

In [None]:
# Convert the categorical bin labels to nullable integers, the dtype also given to the laser wavelength column
dff_final['WaveLengths'] = dff_final['WaveLengths'].astype('Int64')

In [None]:
# Display the resulting wavelength labels
dff_final['WaveLengths']

In [None]:
# Number of rows per laser wavelength
dff_laser_information['ATLConfocalSettingDefinition.LaserArray.Laser.@Wavelength'].value_counts()

In [None]:
# Cast the laser wavelengths to nullable integers so that they share the dtype of 'WaveLengths', the matching merge key
dff_laser_information['ATLConfocalSettingDefinition.LaserArray.Laser.@Wavelength'] = dff_laser_information['ATLConfocalSettingDefinition.LaserArray.Laser.@Wavelength'].astype('Int64')

In [None]:
# Merge both DataFrames on the file name, the image name, the image unique ID and the wavelength:
# 'WaveLengths' on the left is matched with the laser '@Wavelength' column on the right.
# how='left' keeps every row of the combined frame and only the matching laser rows;
# indicator=True adds a '_merge' column telling whether a laser row was found.
dff_final = dff_final.merge(dff_laser_information, how='left', left_on=['FileName', 'Image.@Name', 'Image.@UniqueID', 'WaveLengths'],
                 right_on=['FileName', 'Image.@Name', 'Image.@UniqueID', 'ATLConfocalSettingDefinition.LaserArray.Laser.@Wavelength'], sort=False, suffixes=('_Image', '_Laser'),
                            indicator=True)

In [None]:
# List the merge indicators present, to see whether every row of the combined frame found a laser counterpart:
dff_final['_merge'].unique()

In [None]:
# Count, per wavelength label, the rows flagged 'left_only' (no laser counterpart found):
dff_final[dff_final['_merge'] == 'left_only']['WaveLengths'].value_counts()

In [None]:
# Show the unmatched rows ('left_only') whose wavelength label is 405
dff_final[(dff_final['_merge'] == 'left_only') & (dff_final['WaveLengths'] == 405)]

Again, problem for data of the files:  
    - `200304_200304_Rev10a2_GFP_Lforms.lif`  
    - `200304_200304_Test_Leica_SP8_2.lif`

In [None]:
# Preview up to 50 rows where both LUT name columns are 'Blue' and the matched laser wavelength is 405
dff_final[(dff_final['Channel.@LUTName'] == 'Blue') & (dff_final['ATLConfocalSettingDefinition.LUT_List.LUT.@LutName'] == 'Blue') &(dff_final['ATLConfocalSettingDefinition.LaserArray.Laser.@Wavelength'] == 405)].head(50)

In [None]:
# Preview the first 10 rows of the final frame
dff_final.head(10)

In [None]:
# Save the combined metadata as a pickle file.
# NOTE(review): unlike after the earlier merges, the '_merge' indicator column added by the laser merge
# has not been dropped at this point, so it is saved with the data — confirm this is intended.
dff_final.to_pickle('/Users/virginie/bioformats/notebooks/metadata_leica_files/parquets_and_pickles/combined_metadata.pickle')

In [None]:
# Read the saved pickle back to check that it loads
metadata_dataframe = pd.read_pickle('/Users/virginie/bioformats/notebooks/metadata_leica_files/parquets_and_pickles/combined_metadata.pickle')

In [None]:
# Preview the first 20 rows of the reloaded frame
metadata_dataframe.head(20)