In [1]:
import numpy as np
import os
from pyteomics import mzml
from pyms.GCMS.Class import GCMS_data
from pyms.Spectrum import Scan
from pyms import IntensityMatrix
from MSDataset import MSDataset

In [2]:
def to_seconds(rt):
    """Convert a unit-annotated retention time to seconds.

    Parameters
    ----------
    rt : float-like with a ``unit_info`` attribute
        The unit name is read from ``rt.unit_info``. ``read_mzml`` passes the
        ``"scan start time"`` values of each scan entry here.

    Returns
    -------
    The time expressed in seconds. A value already in seconds is returned
    unchanged (the very same object); other units are scaled.

    Raises
    ------
    ValueError
        If ``rt`` has no ``unit_info`` or the unit is not recognised.
        Previously such values fell through and ``None`` was returned,
        which silently put ``None`` into the list of scan times.
    """
    unit = getattr(rt, "unit_info", None)
    if unit == "second":
        return rt
    if unit == "minute":
        return rt * 60
    if unit == "hour":
        return rt * 3600
    if unit == "millisecond":
        return rt / 1000
    # Fail loudly instead of returning None for an unknown or missing unit.
    raise ValueError(
        "cannot convert retention time %r to seconds: unsupported unit %r" % (rt, unit)
    )

def read_mzml(path):
    """Load an mzML file into a ``GCMS_data`` object.

    Every spectrum yielded by ``mzml.read(path)`` is visited; for each entry
    in its ``["scanList"]["scan"]`` list one retention time (converted with
    ``to_seconds``) and one ``Scan`` built from the spectrum's ``"m/z array"``
    and ``"intensity array"`` are recorded.

    Returns ``GCMS_data(times, scans)``, or ``None`` when nothing was read.
    """
    retention_times = []
    spectra = []
    with mzml.read(path) as spectrum_reader:
        for spectrum in spectrum_reader:
            # A spectrum with several scanList entries contributes one
            # (time, Scan) pair per entry, all sharing the same arrays.
            for entry in spectrum["scanList"]["scan"]:
                retention_times.append(to_seconds(entry["scan start time"]))
                spectra.append(
                    Scan(spectrum["m/z array"], spectrum["intensity array"])
                )
    if not retention_times:
        return None
    return GCMS_data(retention_times, spectra)

def read_intensity_matrix(path):
    """Read an mzML file and build an intensity matrix from its scans.

    Parameters
    ----------
    path
        Location of the mzML file; forwarded unchanged to ``read_mzml``.

    Returns
    -------
    The result of ``IntensityMatrix.build_intensity_matrix`` applied to the
    data loaded by ``read_mzml``, or ``None`` when ``read_mzml`` returned
    ``None`` (no scans were read).
    """
    # Named gcms_data rather than `mzml` so the imported pyteomics `mzml`
    # module is not shadowed inside this function.
    gcms_data = read_mzml(path)
    # Identity check: `!= None` would dispatch to the object's own
    # comparison methods instead of testing for the None sentinel.
    if gcms_data is None:
        return None
    return IntensityMatrix.build_intensity_matrix(gcms_data)
    

In [3]:
# Build the dataset over the relative "data" directory, handing MSDataset two
# callables: read_mzml and IntensityMatrix.build_intensity_matrix.
# NOTE(review): presumably these act as per-file loader and transform, but
# MSDataset is defined outside this notebook — confirm. read_mzml can return
# None for a file without scans; verify MSDataset tolerates that.
dataset = MSDataset("data", read_mzml, IntensityMatrix.build_intensity_matrix)

In [4]:
# First element of the first dataset item. The repr of the same expression
# in a later cell shows a pyms IntensityMatrix object.
mat = dataset[0][0]

In [5]:
# Display the sample list: the output shows (file path, integer) tuples.
# NOTE(review): the integer is presumably a class label — confirm in MSDataset.
dataset.samples

[('data\\baseline\\20-0002_RI-Standard_2023-09-01_63.mzML', 0),
 ('data\\baseline\\20-0003_RI-Standard_2023-08-08_06.mzML', 0),
 ('data\\baseline\\20-0004_RI-Standard_2023-08-08_21.mzML', 0),
 ('data\\baseline\\20-0005_RI-Standard_2023-08-14_06.mzML', 0)]

In [6]:
# Inspect the array behind the intensity matrix; the displayed repr reports
# shape (12308, 368).
mat.intensity_matrix

array([[     0.        ,   4214.34814453,      0.        , ...,
             0.        ,      0.        ,      0.        ],
       [     0.        ,   2033.01757812,      0.        , ...,
             0.        ,      0.        ,      0.        ],
       [     0.        ,   5472.63891602,   1898.40454102, ...,
             0.        ,      0.        ,      0.        ],
       ...,
       [286724.28515625,  16967.83007812,      0.        , ...,
          6507.88134766, 150989.515625  ,      0.        ],
       [418983.3203125 ,   9471.52929688,      0.        , ...,
             0.        , 267811.28808594,      0.        ],
       [620970.31640625,  20413.640625  ,      0.        , ...,
             0.        , 424469.32714844,      0.        ]],
      shape=(12308, 368))

In [7]:
# Same index expression that was assigned to `mat` earlier; displayed here to
# show the object's type (a pyms IntensityMatrix per the repr below).
dataset[0][0]

<pyms.IntensityMatrix.IntensityMatrix at 0x209afa37d90>