`Last update at 2021-03-16`

# Background

## Rounds to test
```
Round 1: 60 MAGs; 39 MS/MS-BGC links (1 true link)
Round 2: 279 genomes/MAGs; 1 MS/MS-BGC link (0 true links)
Round 3: 279 genomes/MAGs and 589 metagenomes; 16 MS/MS-BGC links (8 true links)
```

## Outline

```
    2) Creating MS/MS fingerprints:
    2.1) Compare all fragmentation spectra of a raw file to all reference spectra of a list;
    2.2) Run the search for all mzXML files of a specified directory (estimated runtime of 29:04:27);
    2.3) Creating MS/MS fingerprints matrix (ispec_mat);
    2.4) Merging columns from the same sample.
    
```

In [1]:
from pyteomics import mzxml
from pyteomics import mgf
from spectrum_alignment import *
import numpy as np
import pandas as pd
import subprocess
import os
import glob
import requests
import json
import time

# 2. Creating MS/MS fingerprints

## 2.1. Compare all fragmentation spectra of a raw file to all reference spectra of a list

In [2]:
def get_ms2df(file_name, speclist, pmztol=0.1, fragtol=0.1):
    """Score the MS2 scans of one mzXML file against a list of reference spectra.

    Parameters
    ----------
    file_name : str
        Path handed to ``mzxml.read``.
    speclist : list of dict
        Reference spectra, each ``{spec_id: [precursor_mz, peaks]}``. Each dict
        is expected to hold a single entry, because the position of a precursor
        m/z in the flattened list is reused as an index into ``speclist``.
    pmztol : float
        A reference is scored against a scan only when the absolute difference
        of their precursor m/z values is strictly below this value.
    fragtol : float
        Passed through to ``score_alignment`` as its fifth argument.

    Returns
    -------
    pandas.DataFrame
        One row per fragment peak of every (scan, reference) pair whose score is
        greater than 0, with columns ``prec_mz``, ``mz``, ``inten``, ``rt``,
        ``scan_num``, ``specids`` and ``scores``. The scan-level values are
        repeated on every peak row of that scan.
    """
    columns = {
        'prec_mz': [],
        'mz': [],
        'inten': [],
        'rt': [],
        'scan_num': [],
        'specids': [],
        'scores': [],
    }

    # Precursor m/z of every reference, in speclist order.
    ref_precursors = np.array(
        [entry[0] for ref in speclist for entry in ref.values()]
    )

    with mzxml.read(file_name) as reader:
        for scan in reader:
            if scan['msLevel'] != 2:
                continue
            query_pmz = scan['precursorMz'][0].get('precursorMz')
            # Indices of the references inside the precursor window.
            candidates = np.where(abs(ref_precursors - query_pmz) < pmztol)[0]
            for ref_idx in candidates:
                query_mz = scan['m/z array']
                query_int = scan['intensity array']
                query_peaks = list(zip(query_mz, query_int))
                ref_id, (ref_pmz, ref_peaks) = next(iter(speclist[ref_idx].items()))
                # Only the first element returned by score_alignment is used.
                match_score = score_alignment(
                    ref_peaks, query_peaks, ref_pmz, query_pmz, fragtol
                )[0]
                if not match_score > 0:
                    continue
                n_peaks = len(query_mz)
                columns['scan_num'].extend([scan['num']] * n_peaks)
                columns['rt'].extend([scan['retentionTime']] * n_peaks)
                columns['mz'].extend(query_mz)
                columns['inten'].extend(query_int)
                columns['prec_mz'].extend([query_pmz] * n_peaks)
                columns['scores'].extend([match_score] * n_peaks)
                columns['specids'].extend([ref_id] * n_peaks)

    return pd.DataFrame(columns)

def get_library(lib, peaks=False):
    """Download the spectrum records of a GNPS spectral library.

    Parameters
    ----------
    lib : str
        Library name sent as the ``library`` query parameter (the notebook
        uses ``'all'``).
    peaks : bool
        When true, ``showpeaks=true`` is added to the request.

    Returns
    -------
    The value stored under the ``'spectra'`` key of the JSON response.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status, instead of failing later
        with an unrelated JSON decoding error.
    """
    query = {'library': lib}
    if peaks:
        query['showpeaks'] = 'true'
    # Passing the query as `params` lets requests URL-encode the library name.
    response = requests.get(
        'https://gnps.ucsd.edu/ProteoSAFe/LibraryServlet', params=query
    )
    response.raise_for_status()
    return json.loads(response.text)['spectra']


In [3]:
libnames = pd.DataFrame(get_library('all'))
libnames.head()

Unnamed: 0,spectrum_id,source_file,task,scan,ms_level,library_membership,spectrum_status,peaks_json,splash,submit_user,...,Pubmed_ID,Smiles,INCHI,INCHI_AUX,Library_Class,SpectrumID,Ion_Mode,create_time,task_id,user_id
0,CCMSLIB00000001547,130618_Ger_Jenia_WT-3-Des-MCLR_MH981.4-qb.1.1....,47daa4396adb426eaa5fa54b6ce7dd5f,1,2,GNPS-LIBRARY,1,,splash10-0w2a-0001282259-0001282259,mwang87,...,,CC(C)CC1NC(=O)C(C)NC(=O)C(=C)N(C)C(=O)CCC(NC(=...,,,1,CCMSLIB00000001547,Positive,2019-10-30 21:18:25.0,aa87bf9cd0784df9956753f435c32434,
1,CCMSLIB00000001548,20111105_Anada_Ger_HoiamideB_MH940_qb.1.1..mgf,47daa4396adb426eaa5fa54b6ce7dd5f,1,2,GNPS-LIBRARY,1,,splash10-00dl-0000011189-0000011189,mwang87,...,,CCC[C@@H](C)[C@@H]([C@H](C)[C@@H]1[C@H]([C@H](...,InChI=1S/C45H73N5O10S3/c1-14-17-24(6)34(52)26(...,,1,CCMSLIB00000001548,Positive,2019-06-04 02:55:49.0,cd4ed49954b94767a54918c340d18fa1,
2,CCMSLIB00000001549,20111105_Jenia_Ger_MalyngamideC_MH_456_qb.1.1....,47daa4396adb426eaa5fa54b6ce7dd5f,1,2,GNPS-LIBRARY,1,,splash10-00di-0000900000-0000900000,mwang87,...,,,,,1,CCMSLIB00000001549,Positive,2014-02-04 17:56:43.0,47daa4396adb426eaa5fa54b6ce7dd5f,
3,CCMSLIB00000001550,20111105_Jenia_Ger_Scytonemin_MH_545_qb.1.1..mgf,47daa4396adb426eaa5fa54b6ce7dd5f,1,2,GNPS-LIBRARY,1,,splash10-0002-0000190000-0000190000,mwang87,...,,OC1=CC=C(\C=C2\C(=O)C(C3=C4C5=C(C=CC=C5)N=C4\C...,InChI=1S/C36H20N2O4/c39-21-13-9-19(10-14-21)17...,CGZKSPLDUIRCIO-RPCRKUJJSA-N,1,CCMSLIB00000001550,Positive,2019-07-23 10:38:26.0,ca48cf7bc6644f5e89f98d62f114dfea,
4,CCMSLIB00000001551,A1.mgf,d14a5843653040ba9fa2c4376f2be358,1,2,GNPS-LIBRARY,1,,splash10-03di-0910000000-0910000000,mwang87,...,,,,,1,CCMSLIB00000001551,Positive,2014-02-04 17:56:31.0,d14a5843653040ba9fa2c4376f2be358,


In [4]:
# Creates a list of reference spectra from the MGF files stored in 'selected_mgf/'.
# Each entry is {spectrum id: [precursor m/z, [(m/z, intensity), ...]]}, where the
# spectrum id is the first 18 characters of the file name.
# Only names that really end in '.mgf' are kept (a substring test would also accept
# e.g. 'x.mgf.bak'); macOS '._*' sidecar files are skipped as well.
fmgf = [x for x in os.listdir('selected_mgf/')
        if x.endswith('.mgf') and not x.startswith('._')]

speclist = []
for mgf_name in fmgf:
    with mgf.MGF('selected_mgf/%s' % mgf_name) as reader:
        for spectrum in reader:
            precursor_mz = spectrum['params']['pepmass'][0]
            peaks = list(zip(spectrum['m/z array'], spectrum['intensity array']))
            speclist.append({mgf_name[:18]: [precursor_mz, peaks]})


In [5]:
len(glob.glob('/Volumes/TFL190831/iomega_LCMS/*'))

1716

## 2.2. Run the search for all mzXML files of a specified directory

In [6]:
start = time.time()

dirlist = glob.glob('/Volumes/TFL190831/iomega_LCMS/*')

# mxlist holds one result frame per successfully searched file, in dirlist order.
# excluded_list holds the paths that failed; excluded_errors keeps the reason.
mxlist,excluded_list = [],[]
excluded_errors = {}
for dr in dirlist:
    print(dr)
    try:
        ms2_data = get_ms2df(dr, speclist)
        # For every scan keep only the row with the highest score.
        mxlist.append(ms2_data.loc[ms2_data.groupby(['scan_num'])['scores'].idxmax()])
    except Exception as exc:
        # `Exception` (not a bare except) so Ctrl-C still interrupts this long run.
        excluded_list.append(dr)
        excluded_errors[dr] = repr(exc)

end = time.time()
hours, rem = divmod(end-start, 3600)
minutes, seconds = divmod(rem, 60)
print("{:0>2}:{:0>2}:{:05.2f}".format(int(hours),int(minutes),seconds))

/Volumes/TFL190831/iomega_LCMS/ERS4341364.mzXML.1
/Volumes/TFL190831/iomega_LCMS/GCA_000240165.mzXML.1
/Volumes/TFL190831/iomega_LCMS/GCA_000240165.mzXML.2
/Volumes/TFL190831/iomega_LCMS/GCA_000240165.mzXML.3
/Volumes/TFL190831/iomega_LCMS/GCA_000240165.mzXML.4
/Volumes/TFL190831/iomega_LCMS/GCA_000240165.mzXML.5
/Volumes/TFL190831/iomega_LCMS/GCA_000240165.mzXML.6
/Volumes/TFL190831/iomega_LCMS/GCA_000240165.mzXML.7
/Volumes/TFL190831/iomega_LCMS/GCA_000240165.mzXML.8
/Volumes/TFL190831/iomega_LCMS/GCA_000240165.mzXML.9
/Volumes/TFL190831/iomega_LCMS/GCA_000240165.mzXML.10
/Volumes/TFL190831/iomega_LCMS/GCA_000240165.mzXML.11
/Volumes/TFL190831/iomega_LCMS/GCA_000240165.mzXML.12
/Volumes/TFL190831/iomega_LCMS/GCA_000240165.mzXML.13
/Volumes/TFL190831/iomega_LCMS/GCA_000240165.mzXML.14
/Volumes/TFL190831/iomega_LCMS/GCA_000240165.mzXML.15
/Volumes/TFL190831/iomega_LCMS/GCA_000240165.mzXML.16
/Volumes/TFL190831/iomega_LCMS/GCA_000240165.mzXML.17
/Volumes/TFL190831/iomega_LCMS/GCA_000240

/Volumes/TFL190831/iomega_LCMS/GCA_000466465.mzXML.1
/Volumes/TFL190831/iomega_LCMS/GCA_000414705.mzXML.1
/Volumes/TFL190831/iomega_LCMS/GCA_000414685.mzXML.1
/Volumes/TFL190831/iomega_LCMS/GCA_000414665.mzXML.1
/Volumes/TFL190831/iomega_LCMS/GCA_000414645.mzXML.1
/Volumes/TFL190831/iomega_LCMS/GCA_000414625.mzXML.1
/Volumes/TFL190831/iomega_LCMS/GCA_000414605.mzXML.1
/Volumes/TFL190831/iomega_LCMS/GCA_000414585.mzXML.1
/Volumes/TFL190831/iomega_LCMS/GCA_000414565.mzXML.1
/Volumes/TFL190831/iomega_LCMS/GCA_000414525.mzXML.1
/Volumes/TFL190831/iomega_LCMS/GCA_000414505.mzXML.1
/Volumes/TFL190831/iomega_LCMS/GCA_000414485.mzXML.1
/Volumes/TFL190831/iomega_LCMS/GCA_000414445.mzXML.1
/Volumes/TFL190831/iomega_LCMS/GCA_000414425.mzXML.1
/Volumes/TFL190831/iomega_LCMS/GCA_000218465.mzXML.1
/Volumes/TFL190831/iomega_LCMS/GCA_000712235.mzXML.1
/Volumes/TFL190831/iomega_LCMS/GCA_000244855.mzXML.1
/Volumes/TFL190831/iomega_LCMS/GCA_000233535.mzXML.1
/Volumes/TFL190831/iomega_LCMS/GCA_000214475.m

/Volumes/TFL190831/iomega_LCMS/GCA_000414685.mzXML.2
/Volumes/TFL190831/iomega_LCMS/GCA_000414665.mzXML.2
/Volumes/TFL190831/iomega_LCMS/GCA_000414645.mzXML.2
/Volumes/TFL190831/iomega_LCMS/GCA_000414625.mzXML.2
/Volumes/TFL190831/iomega_LCMS/GCA_000414605.mzXML.2
/Volumes/TFL190831/iomega_LCMS/GCA_000414585.mzXML.2
/Volumes/TFL190831/iomega_LCMS/GCA_000414565.mzXML.2
/Volumes/TFL190831/iomega_LCMS/GCA_000414525.mzXML.2
/Volumes/TFL190831/iomega_LCMS/GCA_000414505.mzXML.2
/Volumes/TFL190831/iomega_LCMS/GCA_000414485.mzXML.2
/Volumes/TFL190831/iomega_LCMS/GCA_000414445.mzXML.2
/Volumes/TFL190831/iomega_LCMS/GCA_000414425.mzXML.2
/Volumes/TFL190831/iomega_LCMS/GCA_000218465.mzXML.2
/Volumes/TFL190831/iomega_LCMS/GCA_000712235.mzXML.2
/Volumes/TFL190831/iomega_LCMS/GCA_000244855.mzXML.2
/Volumes/TFL190831/iomega_LCMS/GCA_000233535.mzXML.2
/Volumes/TFL190831/iomega_LCMS/GCA_000214475.mzXML.2
/Volumes/TFL190831/iomega_LCMS/GCA_000234075.mzXML.2
/Volumes/TFL190831/iomega_LCMS/GCA_000183585.m

/Volumes/TFL190831/iomega_LCMS/GCA_000162135.mzXML.3
/Volumes/TFL190831/iomega_LCMS/GCA_000154085.mzXML.3
/Volumes/TFL190831/iomega_LCMS/GCA_000263115.mzXML.3
/Volumes/TFL190831/iomega_LCMS/GCA_000273155.mzXML.3
/Volumes/TFL190831/iomega_LCMS/GCA_000273115.mzXML.3
/Volumes/TFL190831/iomega_LCMS/GCA_000273035.mzXML.3
/Volumes/TFL190831/iomega_LCMS/GCA_000273055.mzXML.3
/Volumes/TFL190831/iomega_LCMS/GCA_000273295.mzXML.3
/Volumes/TFL190831/iomega_LCMS/GCA_000273235.mzXML.3
/Volumes/TFL190831/iomega_LCMS/GCA_000273015.mzXML.3
/Volumes/TFL190831/iomega_LCMS/GCA_000269545.mzXML.3
/Volumes/TFL190831/iomega_LCMS/GCA_000273725.mzXML.3
/Volumes/TFL190831/iomega_LCMS/GCA_000307495.mzXML.3
/Volumes/TFL190831/iomega_LCMS/GCA_000307345.mzXML.3
/Volumes/TFL190831/iomega_LCMS/GCA_000307375.mzXML.3
/Volumes/TFL190831/iomega_LCMS/GCA_000296385.mzXML.3
/Volumes/TFL190831/iomega_LCMS/GCA_000174215.mzXML.3
/Volumes/TFL190831/iomega_LCMS/GCA_000261265.mzXML.3
/Volumes/TFL190831/iomega_LCMS/GCA_000261225.m

/Volumes/TFL190831/iomega_LCMS/GCA_000263115.mzXML.4
/Volumes/TFL190831/iomega_LCMS/GCA_000273155.mzXML.4
/Volumes/TFL190831/iomega_LCMS/GCA_000273115.mzXML.4
/Volumes/TFL190831/iomega_LCMS/GCA_000273035.mzXML.4
/Volumes/TFL190831/iomega_LCMS/GCA_000273055.mzXML.4
/Volumes/TFL190831/iomega_LCMS/GCA_000273295.mzXML.4
/Volumes/TFL190831/iomega_LCMS/GCA_000273235.mzXML.4
/Volumes/TFL190831/iomega_LCMS/GCA_000273015.mzXML.4
/Volumes/TFL190831/iomega_LCMS/GCA_000269545.mzXML.4
/Volumes/TFL190831/iomega_LCMS/GCA_000273725.mzXML.4
/Volumes/TFL190831/iomega_LCMS/GCA_000307495.mzXML.4
/Volumes/TFL190831/iomega_LCMS/GCA_000307345.mzXML.4
/Volumes/TFL190831/iomega_LCMS/GCA_000307375.mzXML.4
/Volumes/TFL190831/iomega_LCMS/GCA_000296385.mzXML.4
/Volumes/TFL190831/iomega_LCMS/GCA_000174215.mzXML.4
/Volumes/TFL190831/iomega_LCMS/GCA_000261265.mzXML.4
/Volumes/TFL190831/iomega_LCMS/GCA_000261225.mzXML.4
/Volumes/TFL190831/iomega_LCMS/GCA_000261205.mzXML.4
/Volumes/TFL190831/iomega_LCMS/GCA_000411435.m

/Volumes/TFL190831/iomega_LCMS/GCA_000317085.mzML.1
/Volumes/TFL190831/iomega_LCMS/GCA_000332075.mzML.1
/Volumes/TFL190831/iomega_LCMS/GCA_000309385.mzML.1
/Volumes/TFL190831/iomega_LCMS/GCA_000317555.mzML.1
/Volumes/TFL190831/iomega_LCMS/GCA_000147335.mzML.1
/Volumes/TFL190831/iomega_LCMS/GCA_000317615.mzML.1
/Volumes/TFL190831/iomega_LCMS/GCA_000314005.mzML.1
/Volumes/TFL190831/iomega_LCMS/GCA_000331305.mzML.1
/Volumes/TFL190831/iomega_LCMS/GCA_000315585.mzML.1
/Volumes/TFL190831/iomega_LCMS/GCA_000517105.mzML.1
/Volumes/TFL190831/iomega_LCMS/GCA_000315565.mzML.1
/Volumes/TFL190831/iomega_LCMS/GCA_000332235.mzML.1
/Volumes/TFL190831/iomega_LCMS/GCA_000010065.mzML.1
/Volumes/TFL190831/iomega_LCMS/GCA_000009705.mzML.1
/Volumes/TFL190831/iomega_LCMS/GCA_000238295.mzXML.1
/Volumes/TFL190831/iomega_LCMS/GCA_000238295.mzXML.2
/Volumes/TFL190831/iomega_LCMS/GCA_002104455.mzXML.1
/Volumes/TFL190831/iomega_LCMS/GCA_002104455.mzXML.2
/Volumes/TFL190831/iomega_LCMS/GCA_011764015.mzXML.1
/Volume

/Volumes/TFL190831/iomega_LCMS/GCA_000514715.mzXML.5
/Volumes/TFL190831/iomega_LCMS/GCA_000514715.mzXML.6
/Volumes/TFL190831/iomega_LCMS/GCA_000527195.mzXML.4
/Volumes/TFL190831/iomega_LCMS/GCA_000527195.mzXML.5
/Volumes/TFL190831/iomega_LCMS/GCA_000527195.mzXML.6
/Volumes/TFL190831/iomega_LCMS/GCA_000424825.mzXML.4
/Volumes/TFL190831/iomega_LCMS/GCA_000424825.mzXML.5
/Volumes/TFL190831/iomega_LCMS/GCA_000424825.mzXML.6
/Volumes/TFL190831/iomega_LCMS/GCA_000482585.mzXML.4
/Volumes/TFL190831/iomega_LCMS/GCA_000482585.mzXML.5
/Volumes/TFL190831/iomega_LCMS/GCA_000482585.mzXML.6
/Volumes/TFL190831/iomega_LCMS/GCA_000701285.mzXML.4
/Volumes/TFL190831/iomega_LCMS/GCA_000701285.mzXML.5
/Volumes/TFL190831/iomega_LCMS/GCA_000701285.mzXML.6
/Volumes/TFL190831/iomega_LCMS/GCA_000377105.mzXML.4
/Volumes/TFL190831/iomega_LCMS/GCA_000377105.mzXML.5
/Volumes/TFL190831/iomega_LCMS/GCA_000377105.mzXML.6
/Volumes/TFL190831/iomega_LCMS/GCA_000515055.mzXML.4
/Volumes/TFL190831/iomega_LCMS/GCA_000515055.m

/Volumes/TFL190831/iomega_LCMS/GCA_000739105.mzXML.7
/Volumes/TFL190831/iomega_LCMS/GCA_000739105.mzXML.8
/Volumes/TFL190831/iomega_LCMS/GCA_000739105.mzXML.9
/Volumes/TFL190831/iomega_LCMS/GCA_000156695.mzXML.7
/Volumes/TFL190831/iomega_LCMS/GCA_000156695.mzXML.8
/Volumes/TFL190831/iomega_LCMS/GCA_000156695.mzXML.9
/Volumes/TFL190831/iomega_LCMS/GCA_000424845.mzXML.7
/Volumes/TFL190831/iomega_LCMS/GCA_000424845.mzXML.8
/Volumes/TFL190831/iomega_LCMS/GCA_000424845.mzXML.9
/Volumes/TFL190831/iomega_LCMS/GCA_000424965.mzXML.7
/Volumes/TFL190831/iomega_LCMS/GCA_000424965.mzXML.8
/Volumes/TFL190831/iomega_LCMS/GCA_000424965.mzXML.9
/Volumes/TFL190831/iomega_LCMS/GCA_000426165.mzXML.7
/Volumes/TFL190831/iomega_LCMS/GCA_000426165.mzXML.8
/Volumes/TFL190831/iomega_LCMS/GCA_000426165.mzXML.9
/Volumes/TFL190831/iomega_LCMS/GCA_000412265.mzXML.1
/Volumes/TFL190831/iomega_LCMS/GCA_000412265.mzXML.2
/Volumes/TFL190831/iomega_LCMS/GCA_000412265.mzXML.3
/Volumes/TFL190831/iomega_LCMS/GCA_000412265.m

/Volumes/TFL190831/iomega_LCMS/ERS4341498.mzXML.1
/Volumes/TFL190831/iomega_LCMS/ERS4341499.mzXML.1
/Volumes/TFL190831/iomega_LCMS/ERS4341500.mzXML.1
/Volumes/TFL190831/iomega_LCMS/ERS4341501.mzXML.1
/Volumes/TFL190831/iomega_LCMS/ERS4341502.mzXML.1
/Volumes/TFL190831/iomega_LCMS/ERS4341503.mzXML.1
/Volumes/TFL190831/iomega_LCMS/ERS4341504.mzXML.1
/Volumes/TFL190831/iomega_LCMS/ERS4341505.mzXML.1
/Volumes/TFL190831/iomega_LCMS/ERS4341507.mzXML.1
/Volumes/TFL190831/iomega_LCMS/ERS4341508.mzXML.1
/Volumes/TFL190831/iomega_LCMS/ERS4341509.mzXML.1
/Volumes/TFL190831/iomega_LCMS/ERS4341510.mzXML.1
/Volumes/TFL190831/iomega_LCMS/ERS4341511.mzXML.1
/Volumes/TFL190831/iomega_LCMS/ERS4341512.mzXML.1
/Volumes/TFL190831/iomega_LCMS/ERS4341513.mzXML.1
/Volumes/TFL190831/iomega_LCMS/ERS4341514.mzXML.1
/Volumes/TFL190831/iomega_LCMS/ERS4341515.mzXML.1
/Volumes/TFL190831/iomega_LCMS/ERS4341516.mzXML.1
/Volumes/TFL190831/iomega_LCMS/ERS4341517.mzXML.1
/Volumes/TFL190831/iomega_LCMS/ERS4341518.mzXML.1


/Volumes/TFL190831/iomega_LCMS/ERS4341670.mzXML.1
/Volumes/TFL190831/iomega_LCMS/ERS4341671.mzXML.1
/Volumes/TFL190831/iomega_LCMS/ERS4341672.mzXML.1
/Volumes/TFL190831/iomega_LCMS/ERS4341674.mzXML.1
/Volumes/TFL190831/iomega_LCMS/ERS4341675.mzXML.1
/Volumes/TFL190831/iomega_LCMS/ERS4341676.mzXML.1
/Volumes/TFL190831/iomega_LCMS/ERS4341677.mzXML.1
/Volumes/TFL190831/iomega_LCMS/ERS4341678.mzXML.1
/Volumes/TFL190831/iomega_LCMS/ERS4341679.mzXML.1
/Volumes/TFL190831/iomega_LCMS/ERS4341680.mzXML.1
/Volumes/TFL190831/iomega_LCMS/ERS4341681.mzXML.1
/Volumes/TFL190831/iomega_LCMS/ERS4341682.mzXML.1
/Volumes/TFL190831/iomega_LCMS/ERS4341683.mzXML.1
/Volumes/TFL190831/iomega_LCMS/ERS4341684.mzXML.1
/Volumes/TFL190831/iomega_LCMS/ERS4341685.mzXML.1
/Volumes/TFL190831/iomega_LCMS/ERS4341687.mzXML.1
/Volumes/TFL190831/iomega_LCMS/ERS4341688.mzXML.1
/Volumes/TFL190831/iomega_LCMS/ERS4341689.mzXML.1
/Volumes/TFL190831/iomega_LCMS/ERS4341690.mzXML.1
/Volumes/TFL190831/iomega_LCMS/ERS4341691.mzXML.1


/Volumes/TFL190831/iomega_LCMS/ERS4346535.mzXML.1
/Volumes/TFL190831/iomega_LCMS/ERS4346536.mzXML.1
/Volumes/TFL190831/iomega_LCMS/ERS4346537.mzXML.1
/Volumes/TFL190831/iomega_LCMS/ERS4346538.mzXML.1
/Volumes/TFL190831/iomega_LCMS/ERS4346539.mzXML.1
/Volumes/TFL190831/iomega_LCMS/ERS4346540.mzXML.1
/Volumes/TFL190831/iomega_LCMS/ERS4346542.mzXML.1
/Volumes/TFL190831/iomega_LCMS/ERS4346543.mzXML.1
/Volumes/TFL190831/iomega_LCMS/ERS4346545.mzXML.1
/Volumes/TFL190831/iomega_LCMS/ERS4346546.mzXML.1
/Volumes/TFL190831/iomega_LCMS/ERS4346547.mzXML.1
/Volumes/TFL190831/iomega_LCMS/ERS4346549.mzXML.1
/Volumes/TFL190831/iomega_LCMS/ERS4346550.mzXML.1
/Volumes/TFL190831/iomega_LCMS/ERS4346551.mzXML.1
/Volumes/TFL190831/iomega_LCMS/ERS4346552.mzXML.1
/Volumes/TFL190831/iomega_LCMS/ERS4346553.mzXML.1
/Volumes/TFL190831/iomega_LCMS/ERS4346554.mzXML.1
/Volumes/TFL190831/iomega_LCMS/ERS4346555.mzXML.1
/Volumes/TFL190831/iomega_LCMS/ERS4346556.mzXML.1
/Volumes/TFL190831/iomega_LCMS/ERS4346557.mzXML.1


In [7]:
# Reference names: the MGF file names with their last four characters removed
# (the '.mgf' extension, assuming every name in fmgf ends with it).
refn = [x[:-4] for x in fmgf]
# Every entry of the LC-MS directory, including entries that are filtered out later.
specn = os.listdir("/Volumes/TFL190831/iomega_LCMS/")

In [8]:
len(excluded_list),len(specn)

(340, 2235)

In [9]:
# File names (without directory) of the paths that could not be processed.
clean_excluded_list = [os.path.basename(path) for path in excluded_list]

len(clean_excluded_list)

340

In [10]:
# Names of the successfully searched files, in the same order as their results
# in `mxlist`: the search loop appends to `mxlist` while walking `dirlist` and
# skips exactly the paths stored in `excluded_list`. Deriving the names from
# `dirlist` (instead of a separate os.listdir call) keeps the positional pairing
# used when filling the matrix. Hidden '._*' files never appear here because
# glob's '*' pattern does not match names starting with a dot.
excluded_paths = set(excluded_list)
filt_specn = [os.path.basename(path) for path in dirlist
              if path not in excluded_paths]

len(filt_specn)

1376

## 2.3. Creating MS/MS fingerprints matrix (ispec_mat)

In [11]:
# Create the score matrix, initialised to 0: one row per reference spectrum
# (refn) and one column per successfully searched file (filt_specn).
ispec_mat = pd.DataFrame(0, index=refn, columns=filt_specn)

In [None]:
# Fill the matrix with the highest score for a given reference spectrum in a given sample.
# mxlist and filt_specn are paired by position, so a length mismatch would put
# scores under the wrong sample name; fail loudly instead.
assert len(mxlist) == len(filt_specn), (
    "mxlist has %d result frames but filt_specn has %d names"
    % (len(mxlist), len(filt_specn))
)
for sample_name, sample_hits in zip(filt_specn, mxlist):
    # Best-scoring row per reference spectrum within this sample.
    mtmp = sample_hits.loc[sample_hits.groupby(['specids'])['scores'].idxmax()]
    ispec_mat.loc[mtmp['specids'], sample_name] = mtmp['scores'].tolist()

In [None]:
ispec_mat

In [None]:
# Number of matrix cells (reference spectrum x sample) with a score above 0.7.
(ispec_mat>0.7).sum().sum()

In [None]:
ispec_mat.shape

In [None]:
# Keep only the reference spectra (rows) with at least one non-zero entry.
ispec_mat = ispec_mat.loc[(ispec_mat != 0).any(axis=1)]

In [None]:
ispec_mat = ispec_mat.fillna(0)

ispec_mat

In [None]:
ispec_mat.shape

In [None]:
# Compound name / spectrum ID table for the reference spectra used in this round.
lib_df = libnames.loc[libnames['SpectrumID'].isin(refn), ['Compound_Name', 'SpectrumID']]

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs('./outputs', exist_ok=True)
lib_df.to_csv('./outputs/libnames-round3-TFL210225.tsv',sep='\t')

In [None]:
import seaborn as sns

sns.clustermap(ispec_mat)

In [None]:
# Distinct strain identifiers (the part of each column name before the first
# '.'), kept in order of first appearance.
strain_count = list(dict.fromkeys(
    column_name.split('.')[0] for column_name in ispec_mat.columns
))

len(strain_count)

## 2.4. Merging columns from the same sample

In [None]:
# Collapse the replicate files of each strain into a single column holding the
# maximum score per reference spectrum.
ispec_mat_trans = ispec_mat.T


def _strain_of(sample_name):
    """Return the strain identifier: the file name up to its first '.'."""
    return sample_name.split('.')[0]


# Group rows by the exact strain identifier. A substring test ("strain in name")
# would also pull in files of any other strain whose name merely contains this
# identifier. sort=False keeps the strains in order of first appearance.
merged_ispec_mat = ispec_mat_trans.groupby(_strain_of, sort=False).max().T

# Strains that were merged, in column order.
processed_list = list(merged_ispec_mat.columns)

In [None]:
ispec_mat_trans[ispec_mat_trans.index.map(lambda x: 'GCA_000240165' in x)]

In [None]:
merged_ispec_mat.index = list(ispec_mat.index)

merged_ispec_mat

In [None]:
len(merged_ispec_mat.columns)

In [None]:
# Write the merged matrix as a tab-separated file. index_label=False leaves the
# header field for the index column out of the first line.
merged_ispec_mat.to_csv("./outputs/mass-affinity_df-round3.5-TFL210225.txt",sep="\t",index_label=False)

In [None]:
sns.clustermap(merged_ispec_mat,figsize=[18,28])