# ***EDA On The imzML-DESI Dataset***

This notebook shows EDA on the imzML-DESI Dataset.



### ***Import Packages:***

In [1]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from pyimzml.ImzMLParser import ImzMLParser, getionimage
from tqdm import tqdm
from typing import Tuple

# use seaborn's plain "white" style for every figure in the notebook
sns.set_style("white")
# render matplotlib figures inline in the notebook output
%matplotlib inline

### ***Define Global Variables***

In [3]:
# folder that contains the imzML files
imzml_folder = "imzml-DESI"
# extension appended to every file name when building its path
suffix = ".imzML"
# base names (without extension) of all imzML files in the dataset
files = [
    "HG 11-11-12-s",
    "HG 12-11-r",
    "HG 14-13-r",
    "HG 14-13-s",
    "HG 16-15-r",
    "HG 16-15-s",
    "HG 18-19-18-r",
    "HG 19-18-s",
    "HG 29-25-23-21-20-r",
    "HG 29-25-23-21-20-s",
    "HG 6-6-7-r",
    "HG 6-7-s",
    "HG 8-12-5-4-3-2-s",
    "HG 8-5-4-3-2-r",
    "HG 9-10-r",
    "HG 9-10-s",
]

### ***EDA***

***Let's get the number of pixels in the whole dataset:***

In [4]:
# per-file pixel counts, in the same order as `files`
pixel_count = []

# parse every imzML file and record how many pixel coordinates it holds
for file in tqdm(files):
    imzml_path = os.path.join(imzml_folder, file + suffix)
    p = ImzMLParser(imzml_path)
    pixel_count += [len(p.coordinates)]

# one row per file, indexed by file name (not sorted yet)
pixel_count_df = pd.DataFrame(pixel_count, index=files,
                              columns=['pixel_count'])

# report the overall number of pixels in the dataset
print(f"Total number of pixels in the dataset {pixel_count_df['pixel_count'].sum()}")

# display the per-file counts, smallest first
pixel_count_df.sort_values('pixel_count')

100%|██████████| 16/16 [00:47<00:00,  2.96s/it]

Total number of pixels in the dataset 240162





Unnamed: 0,pixel_count
HG 6-7-s,5265
HG 9-10-s,6000
HG 19-18-s,6095
HG 12-11-r,7482
HG 14-13-s,7938
HG 16-15-s,10220
HG 6-6-7-r,10570
HG 16-15-r,12920
HG 11-11-12-s,14022
HG 14-13-r,14196


***Let's get the amount of pixels in the section samples:***

In [5]:
# filter the section samples: their file names end with the '-s' marker.
# An explicit suffix test is used instead of a substring/regex match so that
# an 's' appearing anywhere else in a file name cannot select the wrong file.
pixel_count_df_s = pixel_count_df.loc[pixel_count_df.index.str.endswith('-s')]

# print number of total pixels in section files
print("Total number of pixels in section samples "
      f"{pixel_count_df_s['pixel_count'].sum()}")

# display the per-file pixel counts of the section samples
pixel_count_df_s

Total number of pixels in section samples 99465


Unnamed: 0,pixel_count
HG 11-11-12-s,14022
HG 14-13-s,7938
HG 16-15-s,10220
HG 19-18-s,6095
HG 29-25-23-21-20-s,23718
HG 6-7-s,5265
HG 8-12-5-4-3-2-s,26207
HG 9-10-s,6000


***Let's get the amount of pixels in the replica samples:***

In [6]:
# filter the replica samples: their file names end with the '-r' marker.
# An explicit suffix test is used instead of a substring/regex match so that
# an 'r' appearing anywhere else in a file name cannot select the wrong file.
# The result gets its own name so the section dataframe (pixel_count_df_s)
# is not silently overwritten with replica data.
pixel_count_df_r = pixel_count_df.loc[pixel_count_df.index.str.endswith('-r')]

# print number of total pixels in replica files
print("Total number of pixels in replica samples "
      f"{pixel_count_df_r['pixel_count'].sum()}")

# display the per-file pixel counts of the replica samples
pixel_count_df_r

Total number of pixels in replica samples 140697


Unnamed: 0,pixel_count
HG 12-11-r,7482
HG 14-13-r,14196
HG 16-15-r,12920
HG 18-19-18-r,24024
HG 29-25-23-21-20-r,27876
HG 6-6-7-r,10570
HG 8-5-4-3-2-r,28584
HG 9-10-r,15045


***Let's define a function to get info from a single imzML file:***

In [7]:
def imzml_info(file: str) -> Tuple[pd.DataFrame, np.ndarray]:
    """
    Function to get information about a single imzML file.

    The info is a dataframe containing for each pixel the following:
        [x coordinate, y coordinate, mz max value, mz min value,
        mz number of values, number of non zero intensity values]
    And an array containing the unique values from all pixels mz arrays.

    Args:
        file (str): file name located in imzml_folder without suffix

    Returns:
        Tuple[pd.DataFrame, np.ndarray]: the per-pixel info dataframe
            (columns: x, y, mz_max, mz_min, mz_values_number,
            mz_non_zero_values_number) and the sorted unique mz values
            found in any pixel of the file.
    """
    # read the imzML file
    # NOTE(review): the parser is never explicitly closed here - confirm
    # whether the installed pyimzml version supports `with ImzMLParser(...)`.
    p = ImzMLParser(os.path.join(imzml_folder, file + suffix))
    # list to store pixels x-value
    xs = []
    # list to store pixels y-value
    ys = []
    # list to store pixels max mz value
    mzs_max = []
    # list to store pixels min mz value
    mzs_min = []
    # list to store pixels mz number of values
    mzs_len = []
    # list to store pixels number of non zero intensity values
    non_zero_mzs = []
    # array to store unique values from all pixels mz array
    mzs_unique_values = np.asarray([])

    # loop over each coordinate in image
    for idx, (x, y, _) in enumerate(p.coordinates):
        # get pixel mz and intensity
        mzs, intensities = p.getspectrum(idx)
        # number of mz values stored for this pixel
        n_values = mzs.shape[0]
        xs.append(x)
        ys.append(y)
        # max()/min() raise ValueError on an empty array, so a pixel with an
        # empty spectrum gets NaN instead of aborting the whole file
        mzs_max.append(mzs.max() if n_values else np.nan)
        mzs_min.append(mzs.min() if n_values else np.nan)
        mzs_len.append(n_values)
        # count the entries whose intensity is not zero
        non_zero_mzs.append(np.count_nonzero(intensities))
        # merge this pixel's mz values into the running sorted unique set;
        # merging per pixel keeps memory bounded by the number of unique
        # values instead of holding every spectrum at once
        mzs_unique_values = np.union1d(mzs_unique_values, mzs)

    # create dataframe
    df = pd.DataFrame({'x': xs, 'y': ys, 'mz_max': mzs_max,
                       'mz_min': mzs_min, 'mz_values_number': mzs_len,
                       'mz_non_zero_values_number': non_zero_mzs,
                       })
    return (df, mzs_unique_values)


***Let's get info from every imzML file:***

In [8]:
# list to store each imzML file info dataframe
dataframes = []
# list to store each imzML file unique mz values
mz_values = []

# loop over each imzML file
for file in tqdm(files):
    # get the per-pixel info dataframe and the unique mz values of the file
    file_df, file_mz_values = imzml_info(file)
    # tag every pixel row with the file it came from
    file_df['file'] = file
    # add the imzML file info dataframe
    dataframes.append(file_df)
    # add the imzML file unique mz values
    mz_values.append(file_mz_values)


# create a combined imzml info dataframe
combined_df = pd.concat(dataframes, ignore_index=True)

# get the unique mz values from all files.
# np.concatenate joins the per-file 1-D arrays even when their lengths
# differ; np.asarray(...).flatten() only works when every file yields the
# same number of unique values (otherwise the list is ragged).
unique_mz_values = np.unique(np.concatenate(mz_values))

100%|██████████| 16/16 [1:02:38<00:00, 234.90s/it]


***Let's see if all pixels have the same mz range (min and max mz value):***

In [9]:
# show the distinct per-pixel maximum and minimum mz values across all files
print(combined_df['mz_max'].unique())
print(combined_df['mz_min'].unique())

[1200.12402344]
[49.94447708]


***Let's see if all pixels have the same number of mz values:***

In [10]:
# show the distinct per-pixel mz array lengths across all files
print(combined_df['mz_values_number'].unique())

[205411 197502 208580 ... 241798 244807 246555]


***Let's see the range of the number of mz values per pixel (max and min number of mz values):***

In [11]:
# largest, then smallest, number of mz values found in a single pixel
print(combined_df['mz_values_number'].max())
print(combined_df['mz_values_number'].min())

262484
2


***Let's see if all pixels have the same number of non zero mz values:***

In [12]:
# show the distinct per-pixel counts of non zero intensity values
print(combined_df['mz_non_zero_values_number'].unique())

[151048 140775 153589 ... 205916 207759 205398]


***Let's check how many unique mz values we have:***

In [13]:
# number of distinct mz values collected from all files
unique_mz_values.shape

(269261,)