## Notebook 3 - Enzyme-free controls

This notebook removes removes any substrates found to react in the enzyme-free (lysate-containing) control experiments.

In [None]:
%run ../common.py

In [2]:
from pyopenms import *

Determination of memory status is not supported on this 
 platform, measuring for memoryleaks will never fail


In [3]:
# We already prepared the reference database

df_database = pd.read_pickle(filepath_results + 'MS2_database_shifts_162_320_VB.pkl')

In [None]:
def process_enzymefree(df_database, mixes):

    df = pd.DataFrame(columns = ['Mix', 'Name', 'InchiKey', 'ik_MoNA', 'CSMILES', 'PrecursorMZ',
                                 'RT', 'CosineScore','MS1_AUC'])

    for mix_no in mixes:
        
        print(f'\nProcessing mix {mix_no}')

        ## Prepare reference
        df_mix = df_database[df_database['Mix'] == mix_no]

        ref_name = list(df_mix['Name'])
        ref_smiles = list(df_mix['CSMILES'])
        ref_ikmona = list(df_mix['ik_MoNA'])
        ref_inchikey = list(df_mix['InchiKey'])
        colnames = [a for a in df_mix.columns if 'M+' in a]
        ref_precursors_mz = np.array(df_mix[colnames])
        ref_fragments_mz = list(df_mix['mz'])
        ref_fragments_i = list(df_mix['relint'])

        filename = glob.glob(f"../data/screening_data/enzymefree_controls_data/*_Mix_{mix_no}.mzML")[0]

        exp = MSExperiment()
        MzMLFile().load(filename, exp)

        ## SUBSET EXPERIMENTAL MS2 SPECTRA

        list_hit_name = []
        list_hit_mzprecursor = []
        list_hit_rt = []
        list_hit_score = []
        list_hit_smiles = []
        list_hit_ikmona = []
        list_hit_inchikey = []

        for spec in exp:
            if spec.getMSLevel() == 2:

                mz_precursor = spec.getPrecursors()[0].getMZ()

                test = ((np.abs(mz_precursor - ref_precursors_mz)/mz_precursor)*1e6) < ppm # boolean 2D matrix
                score_best = score_threshold
                for i in np.where(test)[0]:
                        query_mz, query_intensities = spec.get_peaks()
                        score = cosine_greedy_score(query_mz, query_intensities, ref_fragments_mz[i], np.array(ref_fragments_i[i]))
                        if score > score_best:
                            score_best = score
                            I = i

                if score_best > score_threshold:
                    list_hit_mzprecursor.append(mz_precursor)
                    rt = spec.getRT() # seconds
                    list_hit_rt.append(rt)
                    list_hit_score.append(score_best)
                    list_hit_name.append(ref_name[I])
                    list_hit_smiles.append(ref_smiles[I])
                    list_hit_ikmona.append(ref_ikmona[I])
                    list_hit_inchikey.append(ref_inchikey[I])

        df_temp = pd.DataFrame({
            'File': filename,
            'Mix': mix_no,
            'Name': list_hit_name,
            'CSMILES': list_hit_smiles,
            'PrecursorMZ': np.round(list_hit_mzprecursor,4),
            'RT': np.round(list_hit_rt, 2),
            'CosineScore': np.round(list_hit_score,3),
            'InchiKey': list_hit_inchikey,
            'ik_MoNA': list_hit_ikmona,
        })

        # Let us now remove duplicate products based on their name. We keep the one with the highest score
        df_temp.sort_values(by=['CosineScore'], ascending=[False], inplace=True) # The sorting ensures we keep the highest score when dropping
        df_temp = df_temp.drop_duplicates(subset=['InchiKey', 'Mix'], keep='first') # We will keep the highest score for each Name

        print(f'{len(df_temp)} products identified')

        ## INTEGRATE IN THE MS1 CHANNEL (2D integration)

        list_hit_AUC = []

        t = 0.
        for j, row in df_temp.iterrows(): 

            print(f"Product {row['Name']} identified with score {row['CosineScore']}")

            rt_hit = row['RT']
            mz_hit = row['PrecursorMZ']

            AUC = []
            t_integrate = []
            mz_hit = list_hit_mzprecursor[j]
            mz_ub = mz_hit*(1+ppm_integrate/1e6)
            mz_lb = mz_hit*(1-ppm_integrate/1e6)
            rt_lb = rt_hit - rt_semiwindow
            rt_ub = rt_hit + rt_semiwindow
            for spec in exp:
                if spec.getMSLevel() == 1:
                    rt = spec.getRT()
                    if rt < rt_lb:
                        continue
                    if rt > rt_ub:
                        break


                    # Let us integrate along the m/z dimension to obtain the AUC for that time slice
                    mz_peaks, i_peaks = spec.get_peaks()
                    idx_bool = (mz_peaks > mz_lb) & (mz_peaks < mz_ub)
                    if any(idx_bool):
                        i_peaks_integrate = np.concatenate(([0.], i_peaks[idx_bool], [0.]))
                        mz_peaks_integrate = np.concatenate(([mz_lb], mz_peaks[idx_bool], [mz_ub]))
                        AUC_i = np.trapz(y=i_peaks_integrate, x=mz_peaks_integrate)

                        # Let us compute the baseline AUC to be substracted
                        #idx_where = np.where(idx_bool)[0]

                        #idx_first = idx_where[0]
                        #idx_last = idx_where[-1]
                        #i_peaks_integrate = np.array([0., i_peaks[idx_first], i_peaks[idx_last], 0.])
                        #mz_peaks_integrate = np.array([mz_lb, mz_peaks[idx_first], mz_peaks[idx_last], mz_ub])
                        #AUC_base = np.trapz(y=i_peaks_integrate, x=mz_peaks_integrate)
                        AUC_base = 0.

                        t_integrate.append(rt)
                        AUC.append(AUC_i-AUC_base)

            # Let us integrate along the time dimension
            t_integrate = np.concatenate(([rt_lb], t_integrate, [rt_ub]))
            AUC = np.concatenate(([0.], AUC, [0.]))
            AUC = np.trapz(y=AUC, x=t_integrate)
            list_hit_AUC.append(AUC)

        df_temp['AUC'] = np.round(list_hit_AUC,0)
        df = pd.concat([df,df_temp], ignore_index=True)
        mixes = set(df['Mix'])
    return df, mixes

We will run `process_enzymefree` iteratively until no more false positives arise in these controls.

We only need to repeat those mixes for which some product was identified.

In [6]:
df = pd.DataFrame()
names_drop = set()
mixes = range(1,13)

for k in range(10):
    df_tmp, mixes = process_enzymefree(df_database, mixes)
    
    if len(df_tmp) == 0:
        break
    
    tmp = set(df_tmp['Name'])
    names_drop = names_drop.union(tmp)
    
    df_database = df_database[~df_database['Name'].isin(names_drop)]
    


Processing mix 1
0 products identified

Processing mix 2
0 products identified

Processing mix 3
0 products identified

Processing mix 4
1 products identified
Product 2,3-DIHYDROXY-4-METHOXY-4'-ETHOXYBENZOPHENONE identified with score 0.857

Processing mix 5


  df = pd.concat([df,df_temp], ignore_index=True)


1 products identified
Product PHLORETIN identified with score 0.978

Processing mix 6
3 products identified
Product S-ISOCORYDINE (+) identified with score 0.975
Product NARINGENIN identified with score 0.971
Product HESPERETIN identified with score 0.971

Processing mix 7
2 products identified
Product TRYPTAMINE identified with score 0.946
Product HARPAGOSIDE identified with score 0.894

Processing mix 8
1 products identified
Product 7,8-DIHYDROXYFLAVONE identified with score 0.94

Processing mix 9
0 products identified

Processing mix 10
0 products identified

Processing mix 11
0 products identified

Processing mix 12
1 products identified
Product BIOCHANIN A identified with score 0.928

Processing mix 4
0 products identified

Processing mix 5
0 products identified

Processing mix 6
0 products identified

Processing mix 7
0 products identified

Processing mix 8
0 products identified

Processing mix 12
0 products identified


Set of hits identified in the enzyme-free controls. These would have been false positives, so we will just discard the corresponding substrates:

In [7]:
names_drop

{"2,3-DIHYDROXY-4-METHOXY-4'-ETHOXYBENZOPHENONE",
 '7,8-DIHYDROXYFLAVONE',
 'BIOCHANIN A',
 'HARPAGOSIDE',
 'HESPERETIN',
 'NARINGENIN',
 'PHLORETIN',
 'S-ISOCORYDINE (+)',
 'TRYPTAMINE'}

Let us now drop the corresponding substrates from `df_substrates` and `df_database`:

In [8]:
df_database.shape

(7211, 20)

In [9]:
df_database_clean = df_database[~df_database['Name'].isin(names_drop)]

In [10]:
df_database_clean.shape

(7211, 20)

In [11]:
df_database_clean.to_pickle(filepath_results + 'MS2_database_shifts_162_320_VB_clean.pkl')
df_database_clean.to_csv(filepath_results + 'MS2_database_shifts_162_320_VB_clean.csv', index=False)

In [12]:
df_substrates = pd.read_csv('./tmp/Substrates_VB.csv')

In [13]:
df_substrates.shape

(464, 17)

In [14]:
df_substrates_clean = df_substrates[~df_substrates['Name'].isin(names_drop)].copy()

df_substrates_clean = df_substrates[df_substrates['Name'].isin(df_database_clean['Name'])].copy()

In [15]:
df_substrates_clean.shape

(444, 17)

In [16]:
len(df_substrates_clean['Name'].unique())

434

In [19]:
## QC
all(df_substrates_clean.groupby('Name')['ik_MoNA'].nunique()==1)

True

In [20]:
df_substrates_clean.ik_MoNA.nunique()

427

In [21]:
df_substrates_clean

Unnamed: 0,Mix,Name,InchiKey,ik_MoNA,SMILES,CSMILES,M_charge,M+Glu+H,M+Glu+Na,M+Glu+2H,M+Glu+NH4,M+ACN+H,M+2Glu+H,M+2Glu+Na,M+2Glu+2H,M+2Glu+NH4,M+2Glu+ACN+H,superclass
0,1,3ALPHA-HYDROXY-3-DEOXYANGOLENSIC ACID METHYL E...,QFRUZVNPYYYLAN-BBJYNIMOSA-N,QFRUZVNPYYYLAN,C[C@@]12CCC3C(=C)[C@]1(CC(=O)O[C@H]2C4=COC=C4)...,C=C1C2CCC3(C)C(c4ccoc4)OC(=O)CC13OC1CC(O)C(C)(...,0,635.306679,657.288621,318.156978,652.333226,676.333226,797.359979,819.341921,399.183628,814.386526,838.386526,Triterpenoids
1,1,3-AMINO-BETA-PINENE,SQSDBXYJKLVZJR-UHFFFAOYSA-N,SQSDBXYJKLVZJR,CC1(C2CC1C(=C)C(C2)N)C.Cl,C=C1C(N)CC2CC1C2(C)C,0,314.196676,336.178618,157.601976,331.223223,355.223223,476.249976,498.231918,238.628626,493.276523,517.276523,Monoterpenoids
2,1,"3BETA-HYDROXY-23,24-BISNORCHOL-5-ENIC ACID",NPBNRBWMDNZEBN-YTEKVJICSA-N,NPBNRBWMDNZEBN,CC([C@H]1CCC2[C@@]1(CCC3C2CC=C4[C@@]3(CC[C@@H]...,CC(C(=O)O)C1CCC2C3CC=C4CC(O)CCC4(C)C3CCC12C,0,509.311371,531.293313,255.159323,526.337918,550.337918,671.364671,693.346613,336.185973,688.391218,712.391218,Steroids
3,1,3-HYDROXYTYRAMINE,VYFYYTLLBUKUHU-UHFFFAOYSA-N,VYFYYTLLBUKUHU,C1=CC(=C(C=C1CCN)O)O,NCCc1ccc(O)c(O)c1,0,316.139555,338.121497,158.573415,333.166102,357.166102,478.192855,500.174797,239.600065,495.219402,519.219402,Tyrosine alkaloids
4,1,AVOCADYNE ACETATE,JAKAZHIACKJNNB-UHFFFAOYSA-N,JAKAZHIACKJNNB,CC(=O)OCC(CC(CCCCCCCCCCCC#C)O)O,C#CCCCCCCCCCCCC(O)CC(O)COC(C)=O,0,489.306286,511.288228,245.156781,506.332833,530.332833,651.359586,673.341528,326.183431,668.386133,692.386133,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
459,12,SPERMIDINE,ATHGHQPFGPMSJY-UHFFFAOYSA-N,ATHGHQPFGPMSJY,C(CCNCCCN)CN,NCCCCNCCCN,0,308.218474,330.200416,154.612875,325.245021,349.245021,470.271774,492.253716,235.639525,487.298321,511.298321,Ornithine alkaloids
460,12,STEVIOL,QFVOYBUQQBFCRH-VQSWZGCSSA-N,QFVOYBUQQBFCRH,C[C@@]12CCC[C@@]([C@H]1CC[C@]34[C@H]2CC[C@](C3...,C=C1CC23CCC4C(C)(C(=O)O)CCCC4(C)C2CCC1(O)C3,0,481.280071,503.262013,241.143673,498.306618,522.306618,643.333371,665.315313,322.170323,660.359918,684.359918,Diterpenoids
461,12,STIGMASTEROL,HCXVJBMSMIARIN-PHZDYDNGSA-N,HCXVJBMSMIARIN,CC[C@H](/C=C/[C@@H](C)[C@H]1CC[C@@H]2[C@@]1(CC...,CCC(C=CC(C)C1CCC2C3CC=C4CC(O)CCC4(C)C3CCC12C)C...,0,575.431092,597.413034,288.219184,592.457639,616.457639,737.484392,759.466334,369.245834,754.510939,778.510939,Steroids
462,12,TODDALOLACTONE,GLWPLQBQHWYKRK-UHFFFAOYSA-N,GLWPLQBQHWYKRK,CC(C)(C(CC1=C(C=C2C(=C1OC)C=CC(=O)O2)OC)O)O,COc1cc2oc(=O)ccc2c(OC)c1CC(O)C(C)(C)O,0,471.186564,493.168506,236.096920,488.213111,512.213111,633.239864,655.221806,317.123570,650.266411,674.266411,Coumarins


In [23]:
assert set(df_database_clean['Name']) == set(df_substrates_clean['Name'])

In [24]:
assert set(df_database_clean['CSMILES']) == set(df_substrates_clean['CSMILES'])

In [22]:
df_substrates_clean.to_csv(filepath_results + 'Substrates_VB_clean.csv', index=False)