In [3]:
import os
from glob import glob
from os import path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.signal import find_peaks

Define folders containing the data

In [21]:
#Absolute path of the current working directory (all other paths are built from it)
HERE = path.abspath('.')

#Path to the folder containing the .ms1 files
ms1_folder = path.join(HERE, 'ms1')

#Path to the folder containing the chromatograms as .csv files
chromatogram_folder = path.join(HERE, 'chromatograms')

#glob() returns paths in arbitrary, OS-dependent order; sort both lists so that
#the i-th .ms1 file and the i-th chromatogram are paired deterministically
ms1_files = sorted(glob(path.join(ms1_folder, '*.ms1')))

chromatograms = sorted(glob(path.join(chromatogram_folder, '*.csv')))

#File containing the data for the possible products (MA, m/z peaks)
MA_file = path.join(HERE, 'products_MA.csv')

Define the functions to process MS spectra and chromatograms (the functions are taken from the previous notebooks)

In [24]:
def filter_match(ms1file, MAfile, threshold=0.05):

    '''Extract the base peaks of a .ms1 file and match them with the predicted products.

    The .ms1 file is read one scan at a time. For each scan the scan number, the
    retention time, the base peak intensity (BPI) and the m/z of the base peak are
    collected. Local maxima of the BPI trace are then selected with
    scipy.signal.find_peaks (height >= max/20, prominence >= max/50) and compared
    with the m/z values listed in MAfile.

    Parameters
    ----------
    ms1file : str
        Path to the .ms1 file.
    MAfile : str
        Path to a .csv file describing the possible products. The 4th column is
        used as the predicted m/z.
    threshold : float, optional
        Maximum allowed absolute difference (exclusive) between a peak m/z and a
        predicted m/z. Defaults to 0.05.

    Returns
    -------
    pandas.DataFrame
        The rows of MAfile that matched a peak, with the extra columns 'Match',
        'm_z', 'Retention times' and 'Base Peaks', sorted by 'Retention times'.
    '''

    with open(ms1file,'r') as f:
        text = f.read()

    #Read the file as one scan at a time: splitting on 'S' gives one chunk per scan.
    #The first three pieces are discarded (presumably the file header - TODO confirm)
    chunks = text.split('S')[3:]

    #for each scan, extract the BPI and its corresponding m/z and retention time
    scan_number = []
    ret_times = []
    bpis = []
    m_z = []

    for chunk in chunks:

        data = chunk.split('\n')

        #the scan number is built from every digit found on the second line of the chunk
        scan_number.append(int(''.join(el for el in data[1] if el.isdigit())))

        #fixed-width prefixes are stripped from the retention time and BPI lines
        ret_times.append(float(data[2][8:]))

        bpi = int(data[3][6:])
        bpis.append(bpi)

        #Store exactly ONE m/z per scan so that m_z stays aligned with bpis:
        #the first pair whose intensity equals the BPI, or NaN if there is none.
        #(Appending every tie, or nothing at all, would shift all later entries.)
        base_mz = np.nan

        for count in data[5:-1]:

            pairs = count.split()

            if len(pairs) >= 2 and int(pairs[1]) == bpi:
                base_mz = float(pairs[0])
                break

        m_z.append(base_mz)

    scan_number = np.array(scan_number)
    ret_times = np.array(ret_times)
    bpis = np.array(bpis)
    m_z = np.array(m_z)

    #Find maxima and filter those who are less than 1/20 of the max peak.
    #A file without scans simply yields no peaks instead of failing on max()
    if bpis.size:
        highest = bpis.max()
        maxs = find_peaks(bpis, height=highest/20, prominence=highest/50)[0]
    else:
        maxs = np.array([], dtype=int)

    #create a df containing the data for each selected peak
    data = pd.DataFrame({'mz': m_z[maxs],
                         'BPI': bpis[maxs],
                         'Rtime': ret_times[maxs],
                         'N scan': scan_number[maxs]})

    #create a df containing the data for the predicted products
    predicted = pd.read_csv(MAfile, encoding='utf-8')

    predicted['Match'] = None

    predicted['m_z'] = np.nan

    predicted['Retention times'] = np.nan

    predicted['Base Peaks'] = np.nan

    #predicted m/z of every possible product (4th column of the table)
    predicted_mz = predicted.iloc[:, 3]

    #Now check if for each row in data (filtered bpi) there is a match with any of
    #the predicted masses for the possible compounds
    for row in data.itertuples():

        #Set min and max threshold for the monoisotopic mass
        min_lim = row.mz - threshold
        max_lim = row.mz + threshold

        diff = 10000

        #iterate through each possible product, and if there is a match and the mass difference
        #is lower than the previous one, update the difference and overwrite peak data
        #NOTE(review): a product matched earlier in this loop stays matched even when a
        #closer one is found later, so the result can depend on the row order of MAfile;
        #a later peak also overwrites the data of an earlier peak - confirm this is intended
        for index, mass in predicted_mz.items():

            if min_lim < mass < max_lim and abs(mass - row.mz) < diff:

                predicted.at[index,'Match'] = True
                predicted.at[index,'m_z'] = row.mz
                predicted.at[index,'Retention times'] = row.Rtime
                predicted.at[index,'Base Peaks'] = row.BPI

                diff = abs(mass - row.mz)

    #Select only the matches
    predicted = predicted[predicted.Match == True]

    predicted = predicted.infer_objects().sort_values(by='Retention times')

    return predicted


        
def match_chromatogram(chromatogram, filtered):

    '''Merge the matched MS peaks with the nearest chromatogram peaks.

    Parameters
    ----------
    chromatogram : str
        Path to the chromatogram .csv file. It must contain the columns
        'RT [min]' and 'Area'.
    filtered : pandas.DataFrame
        Matched products with a 'Retention times' column.

    Returns
    -------
    pandas.DataFrame
        `filtered` joined with the chromatogram row whose 'RT [min]' is nearest to
        each 'Retention times' value, plus an 'Abundance' column holding the
        relative abundance in percent (calculated considering only the matched peaks).
    '''

    chrom = pd.read_csv(chromatogram, encoding = 'utf-8')

    #merge_asof needs both frames sorted on their keys, keys without missing values
    #and keys of the same dtype; the chromatogram file is not guaranteed to satisfy that
    chrom = (chrom.dropna(subset=['RT [min]'])
                  .astype({'RT [min]': 'float64'})
                  .sort_values('RT [min]', kind='stable'))

    #stable sort: a frame that is already ordered keeps its row order
    filtered = filtered.sort_values('Retention times', kind='stable')

    merged = pd.merge_asof(filtered, chrom, left_on='Retention times', right_on='RT [min]', direction='nearest')

    #skipna=False: a missing area makes the total (and every abundance) NaN
    #instead of being silently left out of the total
    total_area = merged['Area'].sum(skipna=False)

    merged['Abundance'] = (merged['Area']/total_area)*100

    return merged
    


Process all .ms1 files and chromatograms and save the results

In [25]:
#glob() gives no ordering guarantee, so sort both lists before pairing them by position.
#NOTE(review): this assumes that a .ms1 file and its chromatogram sort into the same
#position (e.g. they share a name stem) - verify against the actual file names
ms1_sorted = sorted(ms1_files)
chromatograms_sorted = sorted(chromatograms)

#a different number of files means the positional pairing cannot be trusted
if len(ms1_sorted) != len(chromatograms_sorted):
    raise ValueError('Found {} .ms1 files but {} chromatograms: cannot pair them'.format(
        len(ms1_sorted), len(chromatograms_sorted)))

#make sure the output folder exists before writing into it
results_folder = path.join(HERE, 'Results')
os.makedirs(results_folder, exist_ok=True)

for ms1, chromatogram in zip(ms1_sorted, chromatograms_sorted):

    filtered = filter_match(ms1, MA_file)

    matched = match_chromatogram(chromatogram, filtered)

    #the output file is named after the three characters preceding the '.ms1' extension
    matched.to_csv(path.join(results_folder, ms1[-7:-4] + '.csv'), encoding='utf-8')
    