Script to merge the outputs of all the tools into one big table also get consensus substructures for insilico-tool and propagate canopus within networks

In [1]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem.rdMolDescriptors import CalcMolFormula,CalcExactMolWt
from rdkit.Chem import MCS
from rdkit.Chem import MolFromSmarts
from collections import Counter
from rdkit.Chem.Fraggle import FraggleSim
from rdkit.Chem.rdMolDescriptors import CalcNumAtoms
from rdkit.Chem import rdMolDescriptors
from rdkit.SimDivFilters import rdSimDivPickers
from rdkit import DataStructs
from collections import defaultdict
import numpy as np
from matplotlib.pyplot import hist,xlabel
from rdkit.Chem.Draw import MolsToGridImage
from rdkit.rdBase import BlockLogs
from tqdm.notebook import tqdm
from joblib import Parallel, delayed
from joblib import wrap_non_picklable_objects

def convert(molecule): #repair InChI and convert to smiles
    try:
        if not molecule.startswith("InChI="):
            molecule="InChI="+molecule
        else:
            pass
        mol=Chem.inchi.MolFromInchi(molecule)
        return(Chem.MolToSmiles(mol,kekuleSmiles=True))
    except:
        return None
    
def smilestomf(smiles):
    try:
        m = Chem.MolFromSmiles(smiles)
        u=CalcMolFormula(m)
        return u
    except:
        return None    
    


#if CalcExactMolWt(Chem.MolFromSmiles(i))<1300



def assignPointsToClusters(picks,fps):
    clusters = defaultdict(list)
    for i,idx in enumerate(picks):
        clusters[i].append(idx)
    sims = np.zeros((len(picks),len(fps)))
    for i in range(len(picks)):
        pick = picks[i]
        sims[i,:] = DataStructs.BulkTanimotoSimilarity(fps[pick],fps)
        sims[i,i] = 0
    best = np.argmax(sims,axis=0)
    for i,idx in enumerate(best):
        if i not in picks:
            clusters[idx].append(i)
    return clusters
    
def mls(df,columns,thresh = 0.65): # this is the function to calculate the consensus substructure, takes a dataframe, and the columns to select as input
 smiles=[]
 for i in columns:
     smiles=smiles+df[i].dropna().to_list() 
 if len(smiles)>4:   
  z= [Chem.MolFromSmiles(i) for i in smiles]
  scaffolds=[FraggleSim.generate_fraggle_fragmentation(i) for i in z]
  clean=[]
  block = BlockLogs()
  for i in scaffolds:
      newlist = [word for line in i for word in line.split('.')]
      newlist=list(set(newlist))
      newlist = [w.replace('*', '') for w in newlist]
      mfs=[Chem.MolFromSmiles(i) for i in newlist ]
      clean.append(mfs)
  del block
  clean=[j for i in clean for j in i]
  clean=[i for i in clean if not i == None]
  clean=[Chem.MolToSmiles(i) for i in clean]

  most=Counter(clean).most_common()
  most= [i[0] for i in most]
  if len(most)>50:
      most=most[0:50]
  else:
      pass
  mols = [Chem.MolFromSmiles(smi) for smi in most]
  mols = [i for i in mols if CalcNumAtoms(i) >12]

  fps = [rdMolDescriptors.GetMorganFingerprintAsBitVect(m,2,2048) for m in mols]

  lp = rdSimDivPickers.LeaderPicker()
  picks = lp.LazyBitVectorPick(fps,len(fps),thresh)
  clusters = assignPointsToClusters(picks,fps)
  sort_clusters= sorted([clusters[i] for i in clusters],key=len, reverse=True)

  mls=[]
  for _,i in enumerate(range(len(sort_clusters))):
      y=[mols[x] for x in sort_clusters[i]]
      if len(y)>1:
          mls.append(Chem.MolToSmiles(MolFromSmarts(MCS.FindMCS(y,timeout=120).smarts)))
      else:
         pass
  return mls
 else:
        return []
    
columns=["smiles_cfm_db_spec2vec","smiles-sir","smiles_cfm_j","smiles_moldis_j","smiles_moldis","smiles_cfm_db_cosine"]   
    
df_quant = pd.read_csv("/Users/delser/Desktop/PhD/Phytochemistry/FBMN/alltissues/FEATURE-BASED-MOLECULAR-NETWORKING-cf822b6c-15072021/alltissues15072021-quant-py-dil.csv",index_col='row ID', sep=",") # mzmine quant table

#df_motif = pd.read_csv("/Users/delser/Desktop/PhD/Phytochemistry/FBMN/alltissues/FEATURE-BASED-MOLECULAR-NETWORKING-cf822b6c-15072021/MS2LDA-all.csv",index_col='row ID', sep=",") # MS2LDA download

#df_frag=pd.read_csv("/Users/delser/Desktop/PhD/Phytochemistry/FBMN/alltissues/FEATURE-BASED-MOLECULAR-NETWORKING-cf822b6c-15072021/MS2LDA-fragments.csv",index_col='Motif', sep=",") # MS2LDA download

df_network=pd.read_csv("/Users/delser/Desktop/PhD/Phytochemistry/FBMN/alltissues/FEATURE-BASED-MOLECULAR-NETWORKING-cf822b6c-15072021/allcyto.csv",index_col='shared name', sep=",") # curated gnps output exported from cytoscape

df_sirius=pd.read_csv("/Users/delser/Desktop/PhD/Phytochemistry/FBMN/alltissues/FEATURE-BASED-MOLECULAR-NETWORKING-cf822b6c-15072021/formula_identifications-extended.tsv",index_col='ID', sep="\t") # Sirius output

df_cmpid= pd.read_csv("/Users/delser/Desktop/PhD/Phytochemistry/FBMN/alltissues/FEATURE-BASED-MOLECULAR-NETWORKING-cf822b6c-15072021/compound_identifications-extended.tsv",index_col='ID', sep="\t") # Sirius output

#df_anno=pd.read_csv("/Users/delser/Desktop/PhD/Phytochemistry/FBMN/alltissues/FEATURE-BASED-MOLECULAR-NETWORKING-cf822b6c-15072021/MS2LDA-annotation.csv",index_col='Name', sep=",") # MS2LDA download

df_moldis= pd.read_csv("/Users/delser/Desktop/PhD/Phytochemistry/FBMN/alltissues/FEATURE-BASED-MOLECULAR-NETWORKING-cf822b6c-15072021/MOLDISCOVERY-5561c3b7-view_significant_unique-main.tsv",index_col='Scan', sep="\t") # moldiscovery gnps download

df_moldis_j= pd.read_csv("/Users/delser/Desktop/PhD/Phytochemistry/FBMN/alltissues/FEATURE-BASED-MOLECULAR-NETWORKING-cf822b6c-15072021/MOLDISCOVERY-jassbi.tsv",index_col='Scan', sep="\t") # moldiscovery gnps download

df_cfm_j= pd.read_csv("/Users/delser/Desktop/PhD/Phytochemistry/FBMN/alltissues/FEATURE-BASED-MOLECULAR-NETWORKING-cf822b6c-15072021/jassbi-results.tsv",index_col='Feature_id', sep="\t") # CFM Datase search output

df_cfm_spec2vec= pd.read_csv("/Users/delser/Desktop/PhD/Phytochemistry/FBMN/alltissues/FEATURE-BASED-MOLECULAR-NETWORKING-cf822b6c-15072021/cfm-hits-database_spec2vec.txt",index_col='Feature_id', sep="\t") # CFM Datase search output

df_cfm_cosine= pd.read_csv("/Users/delser/Desktop/PhD/Phytochemistry/FBMN/alltissues/FEATURE-BASED-MOLECULAR-NETWORKING-cf822b6c-15072021/results_cfmdb_alltissues1507.tsv",index_col='Feature_id', sep="\t")

df_em= pd.read_csv("/Users/delser/Desktop/PhD/Phytochemistry/FBMN/alltissues/FEATURE-BASED-MOLECULAR-NETWORKING-cf822b6c-15072021/hitsemmanuelfilter.tsv",index_col='Feature_id', sep="\t") # Emmanuel Datase search output

df_cano= pd.read_csv("/Users/delser/Desktop/PhD/Phytochemistry/FBMN/alltissues/FEATURE-BASED-MOLECULAR-NETWORKING-cf822b6c-15072021/canopus_summary.tsv", sep="\t")  #canopus data

#df_molnet= pd.read_csv("/Users/delser/Git/phd/molnet_v1.tsv",index_col='cluster index', sep="\t") # molnetenhancher output


  from rdkit.Chem import MCS
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  elif issubclass(data.dtype.type, np.bool) or is_bool_dtype(data):
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  data = self._reader.read(nrows)
  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [2]:
df= df_cano['name'].str.split('_',expand=True)
#df.head()
df_cano=df_cano.drop(['name','molecularFormula','adduct'], axis=1)
df_cano=df_cano.join(pd.to_numeric(df[2]))
df_cano=df_cano.rename(columns={'most specific class':"cano_most_specific_class","level 5":"cano_level_5","subclass":"cano_subclass","class":"cano_class","superclass":"cano_superclass","all classifications":"cano_all_classifications",2:"Feature_id"})
df_cano=df_cano.set_index("Feature_id")
#f_cano=df_cano.to_frame()
df_cano.head()


  and should_run_async(code)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  if dtype != object and dtype != np.object:


Unnamed: 0_level_0,cano_most_specific_class,cano_level_5,cano_subclass,cano_class,cano_superclass,cano_all_classifications
Feature_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
29699,Glycosyldiacylglycerols,Glycosyldiradylglycerols,Glycosylglycerols,Glycerolipids,Lipids and lipid-like molecules,Organic compounds; Organoheterocyclic compound...
56096,Amino acids and derivatives,Amino acids and derivatives,"Amino acids, peptides, and analogues",Carboxylic acids and derivatives,Organic acids and derivatives,Organic compounds; Organoheterocyclic compound...
34752,Polyethylene glycols,Dialkyl ethers,Ethers,Organooxygen compounds,Organic oxygen compounds,Organic compounds; Alcohols and polyols; Ether...
34264,Indoles,,Indoles,Indoles and derivatives,Organoheterocyclic compounds,Organic compounds; Organoheterocyclic compound...
26290,Carbonyl compounds,,Carbonyl compounds,Organooxygen compounds,Organic oxygen compounds,Organic compounds; Organooxygen compounds; Car...


In [3]:

df_em=df_em.drop(["m/z","m/z-1","cousine-score-Hit-1","numoffpeaks-1"], axis=1)
df_em=df_em.rename(columns={'Smiles-1':"Smiles","Library-Hit-1":"Compound_Name"})
#df_em=df_em['Level'] = 'N'

df_em.head()

  and should_run_async(code)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  elif issubclass(data.dtype.type, np.bool) or is_bool_dtype(data):


Unnamed: 0_level_0,Compound_Name,Smiles
Feature_id,Unnamed: 1_level_1,Unnamed: 2_level_1
43355,"N,N`-(4-Azaoctane-1,8-diyl)bis(3-methoxy-4-hyd...",COc1cc(/C=C/C(O)=NCCCCNCCCN=C(O)/C=C/c2ccc(O)c...
43349,"N,N`-(4-Azaoctane-1,8-diyl)bis(3-methoxy-4-hyd...",COc1cc(/C=C/C(O)=NCCCCNCCCN=C(O)/C=C/c2ccc(O)c...
46861,"N,N`-(4-Azaoctane-1,8-diyl)bis(3-methoxy-4-hyd...",COc1cc(/C=C/C(O)=NCCCCNCCCN=C(O)/C=C/c2ccc(O)c...
43345,"N,N`-(4-Azaoctane-1,8-diyl)bis(3-methoxy-4-hyd...",COc1cc(/C=C/C(O)=NCCCCNCCCN=C(O)/C=C/c2ccc(O)c...
43343,"N,N`-(4-Azaoctane-1,8-diyl)bis(3-methoxy-4-hyd...",COc1cc(/C=C/C(O)=NCCCCNCCCN=C(O)/C=C/c2ccc(O)c...


Pick the hits of the jassbi cfm cousine score search if above 0.5 and min 5 peaks, then get their smiles and extract the sum formulas

In [4]:

df_cfm_j=df_cfm_j[['m/z',"Library-Hit-1","m/z-1","cousine-score-Hit-1","numoffpeaks-1","Smiles-1"]]

df_cfm_j=df_cfm_j.rename(columns={"cousine-score-Hit-1": "cosine_cfm_j",'Smiles-1':"smiles_cfm_j","Library-Hit-1":"Library-Hit-CFM-jassbi","numoffpeaks-1" :"numoffpeaks__cfm_j"})

df_cfm_j = df_cfm_j[(df_cfm_j.cosine_cfm_j > 0.5) & (df_cfm_j.numoffpeaks__cfm_j > 5)]

"""
sm=df_cfm_j[("smiles_cfm_j")]
l=sm.tolist()

from rdkit import Chem
from rdkit.Chem.rdMolDescriptors import CalcMolFormula
def smilestomf(smiles):
    m = Chem.MolFromSmiles(smiles)
    u=CalcMolFormula(m)
    return u

MF=[]
for i in l:
    MF.append(smilestomf(i))
"""
df_cfm_j["MF_j_cfm"]=df_cfm_j["smiles_cfm_j"].apply(lambda x: smilestomf(x))
#df_cfm_j= df_cfm_j.assign(MF_j_cfm=MF)
#df_cfm_j=df_cfm_j.rename(columns={"MF_j_cfm": "Compound_Name"})
df_cfm_j= df_cfm_j.assign(delta_mz_j_cfm=df_cfm_j["m/z"]-df_cfm_j["m/z-1"])
df_cfm_j




  and should_run_async(code)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  elif issubclass(data.dtype.type, np.bool) or is_bool_dtype(data):


Unnamed: 0_level_0,m/z,Library-Hit-CFM-jassbi,m/z-1,cosine_cfm_j,numoffpeaks__cfm_j,smiles_cfm_j,MF_j_cfm,delta_mz_j_cfm
Feature_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
860,228.1959,neophytadiene,278.297351,0.511559,7,C=CC(=C)CCCC(C)CCCC(C)CCCC(C)C,C20H38,-50.101451
930,226.1803,farnesyl_acetone,262.229666,0.561156,18,CC(=O)CC/C=C(\C)CC/C=C(\C)CCC=C(C)C,C18H30O,-36.049366
1093,209.1538,cupalene,202.172151,0.532549,9,Cc1ccc(C2(C)CCCC2(C)C)cc1,C15H22,6.981649
1255,226.1804,farnesyl_acetone,262.229666,0.570614,11,CC(=O)CC/C=C(\C)CC/C=C(\C)CCC=C(C)C,C18H30O,-36.049266
1447,209.1537,cupalene,202.172151,0.538105,9,Cc1ccc(C2(C)CCCC2(C)C)cc1,C15H22,6.981549
...,...,...,...,...,...,...,...,...
56817,367.1754,N-E-caffeoyl_tyramine_,299.115758,0.540279,12,OC(/C=C/c1ccc(O)c(O)c1)=NCCc1ccc(O)cc1,C17H17NO4,68.059642
57105,203.1795,farnesyl_acetone,262.229666,0.539936,11,CC(=O)CC/C=C(\C)CC/C=C(\C)CCC=C(C)C,C18H30O,-59.050166
57152,404.2644,occidol_acetate,260.177630,0.505505,10,CC(=O)OC(C)(C)C1CCc2c(C)ccc(C)c2C1,C17H24O2,144.086770
57239,203.1794,farnesyl_acetone,262.229666,0.525324,12,CC(=O)CC/C=C(\C)CC/C=C(\C)CCC=C(C)C,C18H30O,-59.050266


In [5]:
df_cfm_j=df_cfm_j.drop(["m/z","m/z-1"], axis=1)
df_cfm_j

  and should_run_async(code)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  elif issubclass(data.dtype.type, np.bool) or is_bool_dtype(data):


Unnamed: 0_level_0,Library-Hit-CFM-jassbi,cosine_cfm_j,numoffpeaks__cfm_j,smiles_cfm_j,MF_j_cfm,delta_mz_j_cfm
Feature_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
860,neophytadiene,0.511559,7,C=CC(=C)CCCC(C)CCCC(C)CCCC(C)C,C20H38,-50.101451
930,farnesyl_acetone,0.561156,18,CC(=O)CC/C=C(\C)CC/C=C(\C)CCC=C(C)C,C18H30O,-36.049366
1093,cupalene,0.532549,9,Cc1ccc(C2(C)CCCC2(C)C)cc1,C15H22,6.981649
1255,farnesyl_acetone,0.570614,11,CC(=O)CC/C=C(\C)CC/C=C(\C)CCC=C(C)C,C18H30O,-36.049266
1447,cupalene,0.538105,9,Cc1ccc(C2(C)CCCC2(C)C)cc1,C15H22,6.981549
...,...,...,...,...,...,...
56817,N-E-caffeoyl_tyramine_,0.540279,12,OC(/C=C/c1ccc(O)c(O)c1)=NCCc1ccc(O)cc1,C17H17NO4,68.059642
57105,farnesyl_acetone,0.539936,11,CC(=O)CC/C=C(\C)CC/C=C(\C)CCC=C(C)C,C18H30O,-59.050166
57152,occidol_acetate,0.505505,10,CC(=O)OC(C)(C)C1CCc2c(C)ccc(C)c2C1,C17H24O2,144.086770
57239,farnesyl_acetone,0.525324,12,CC(=O)CC/C=C(\C)CC/C=C(\C)CCC=C(C)C,C18H30O,-59.050266


In [6]:


df_moldis.head()
df_moldis=df_moldis.drop(["SpecFile","LocalSpecIdx","LocalPeptideIdx","Retention","SpectrumMass","PeptideMass","Score","FDR"], axis=1)

df_moldis=df_moldis.rename(columns={"MassDiff": "delta_mz_moldis", "Name": "Hit_moldis", "SMILES": "smiles_moldis","Adduct": "MoldisAdduct","Charge": "Charge_moldis"})

df_moldis.head()

  and should_run_async(code)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  elif issubclass(data.dtype.type, np.bool) or is_bool_dtype(data):


Unnamed: 0_level_0,Hit_moldis,delta_mz_moldis,MoldisAdduct,Charge_moldis,smiles_moldis
Scan,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
29090,,-0.00062,M+H,1,CC(C)[C@@H]1NC(=O)[C@H](Nn2cc(C[C@H](C=O)NC(=O...
31432,Lyciumins_Lyciumin_C,-0.00182,M+H,1,CC(C)C1NC(=O)C(NC(=O)C(Cc2ccc(O)cc2)NC(=O)C2CC...
31922,"7,7',8,8',11,12-Hexahydrolycopene_1,2-Epoxide",-0.00032,M+H,1,CC(C)=CCC/C(C)=C\CC/C(C)=C\CC/C(C)=C\C=C\C=C(C...
7753,"15,7',8',11',12',15'-Hexahydro-beta,psi-",-2e-05,M+H,1,CC(C)=CCC/C(C)=C/CC/C(C)=C/CC/C(C)=C/CC/C=C(C)...
7763,Polyprenol_Octaprenol,-0.00032,M+H,1,CC(C)=CCC/C(C)=C\CC/C(C)=C\CC/C(C)=C\CC/C(C)=C...


In [7]:

df_moldis_j.head()
df_moldis_j=df_moldis_j.drop(["SpecFile","LocalSpecIdx","LocalPeptideIdx","Retention","SpectrumMass","PeptideMass","Score","FDR"], axis=1)

df_moldis_j=df_moldis_j.rename(columns={"MassDiff": "delta_mz_moldis_j", "Name": "Hit_moldis_j", "SMILES": "smiles_moldis_j","Adduct": "MoldisAdduct_j","Charge": "Charge_moldis_j"})

df_moldis_j.head()

  and should_run_async(code)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  elif issubclass(data.dtype.type, np.bool) or is_bool_dtype(data):


Unnamed: 0_level_0,Hit_moldis_j,delta_mz_moldis_j,MoldisAdduct_j,Charge_moldis_j,smiles_moldis_j
Scan,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
43411,nicotianoside_VII,-0.00222,M+H,1,C=C[C@](C)(CC/C=C(\C)CC/C=C(\C)CC/C=C(/C)CO[C@...
36878,9/9_-cis-neoxanthin,0.00178,M+H,1,CC(=C/C=C/C(C)=C/C=C/C=C(C)/C=C/C=C(\C)C=C=C1C...
35284,3-O-alpha-L-rhamnopyranosyl_(1-4)-beta-D-gluco...,-0.00132,M+H,1,C=C[C@](C)(CC/C=C(/C)CC/C=C(/C)CC/C=C(/C)CO[C@...
49796,nicotianoside_VI,-0.00102,M+H,1,C=C[C@](C)(CC/C=C(/C)CC/C=C(/C)CC/C=C(\C)CO[C@...
41297,nicotianoside_V,-0.00262,M+H,1,C=C[C@](C)(CC/C=C(/C)CC/C=C(/C)CC/C=C(/C)CO[C@...


Pick sirius compound annotations and keep only the ones with confidence above 0.65

In [8]:

df_cmpid.head()

df_cmpid=df_cmpid.drop(["rank","formulaRank","#adducts","#predictedFPs","CSI:FingerIDScore","ZodiacScore","SiriusScore","molecularFormula","adduct","InChIkey2D","xlogp","dbflags","ionMass","retentionTimeInSeconds","id","pubchemids","links","InChI"], axis=1)
df_cmpid = df_cmpid[(df_cmpid.ConfidenceScore > 0.65) ]
df_cmpid=df_cmpid.rename(columns={ "name": "name-sir", "smiles": "smiles-sir"})

df_cmpid.head()

  and should_run_async(code)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  elif issubclass(data.dtype.type, np.bool) or is_bool_dtype(data):


Unnamed: 0_level_0,ConfidenceScore,name-sir,smiles-sir
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
51061,0.999992,Pentapeptide-4,CC(C(C(=O)NC(C(C)O)C(=O)NC(CCCCN)C(=O)NC(CO)C(...
33119,0.999957,(4S)-4-[[(2S)-5-(diaminomethylideneamino)-2-[[...,C1=CC=C(C=C1)CC(C(C(=O)NC(CCCN=C(N)N)C(=O)NC(C...
20057,0.999955,,CC1=CN(C(=O)NC1=O)C2CC(C(O2)CNC(=O)CCCCCCCC(=O...
49422,0.999804,,CC(C)CC(C(=O)NC(C)(C)C(=O)NC(CC(C)C)C(=O)NC(C)...
14394,0.999629,,CC(C)CC(=O)OC1C2C(CCC2C(=CO1)COC3C(C(C(C(O3)CO...


df_frag.head()


In [9]:


df_sirius.head()
#df_motif_red=df_motif.drop(["Precursor Mass","Retention Time","Document Annotation","Document","Probability","Overlap Score"], axis=1)
df_sirius=df_sirius.drop(["rank","ZodiacScore","SiriusScore","TreeScore","IsotopeScore","numExplainedPeaks","explainedIntensity","medianMassErrorFragmentPeaks(ppm)","medianAbsoluteMassErrorFragmentPeaks(ppm)","massErrorPrecursor(ppm)","ionMass","retentionTimeInSeconds"], axis=1)
#df_motif_red.head()

df_sirius.head()

  and should_run_async(code)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  elif issubclass(data.dtype.type, np.bool) or is_bool_dtype(data):


Unnamed: 0_level_0,molecularFormula,adduct,precursorFormula,id,degree
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
29312,C29H46O2,[M + H]+,C29H46O2,6422_altissues15072021-sirius-py_29312,7.0
29190,C28H46O2,[M + H]+,C28H46O2,6350_altissues15072021-sirius-py_29190,6.0
29298,C29H48O2,[M + H]+,C29H48O2,6417_altissues15072021-sirius-py_29298,6.0
29188,C28H44O2,[M + H]+,C28H44O2,6349_altissues15072021-sirius-py_29188,7.0
50689,C20H32O3,[M + H]+,C20H32O3,14102_altissues15072021-sirius-py_50689,5.0


Get CFM big DB search by spec2vec, keep only ic score above 0.5

In [10]:



df_cfm_spec2vec=df_cfm_spec2vec.drop(["hit2","hit2_m/z","hit2_score","hit3","hit3_m/z","hit3_score","hit4","hit4_m/z","hit4_score","hit5","hit5_m/z","hit5_score","hit6","hit6_m/z","hit6_score"], axis=1)
df_cfm_spec2vec=df_cfm_spec2vec.rename(columns={"hit1": "cfm_db_spec2vec_hit",'hit1_SMILES':"smiles_cfm_db_spec2vec","hit1_MolecularFormula":"MolecularFormula_cfm_db_spec2vec","hit1_score" :"score_cfm_db_spec2vec"})
df_cfm_spec2vec = df_cfm_spec2vec[(df_cfm_spec2vec.score_cfm_db_spec2vec > 0.5)]
df_cfm_spec2vec = df_cfm_spec2vec .assign(delta_cfm_spec2vec=df_cfm_spec2vec ["m/z"]-df_cfm_spec2vec ["hit1_m/z"])
#df_cfm_spec2vec = df_cfm_spec2vecj[(df_cfm_j.cosine_cfm_j > 0.5) & (df_cfm_j.numoffpeaks__cfm_j > 5)]
df_cfm_spec2vec

  and should_run_async(code)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  elif issubclass(data.dtype.type, np.bool) or is_bool_dtype(data):


Unnamed: 0_level_0,m/z,cfm_db_spec2vec_hit,smiles_cfm_db_spec2vec,MolecularFormula_cfm_db_spec2vec,hit1_m/z,score_cfm_db_spec2vec,delta_cfm_spec2vec
Feature_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
38,526.4316,62551,C(COCCOCCOCCOCCO)O,C10H22O6,238.141638,0.579498,288.289962
40,570.4579,62551,C(COCCOCCOCCOCCO)O,C10H22O6,238.141638,0.553819,332.316262
45,540.4472,62551,C(COCCOCCOCCOCCO)O,C10H22O6,238.141638,0.594717,302.305562
48,526.4316,62551,C(COCCOCCOCCOCCO)O,C10H22O6,238.141638,0.563774,288.289962
49,614.4842,62551,C(COCCOCCOCCOCCO)O,C10H22O6,238.141638,0.516715,376.342562
...,...,...,...,...,...,...,...
56292,579.4995,5312697,CCCC#CCCCCCCCCCCCCC(=O)O,C18H32O2,280.240230,0.588280,299.259270
57074,404.2640,PMA_119625,CC1=C2CCCC[C@@]2(C)C=CC1=O,C12H16O,176.120115,0.515460,228.143885
57076,420.2593,PMA_054517,CC1=CCC(CC1=O)C(=C)CCCC(C)(C)O,C15H24O2,236.177630,0.536186,184.081670
57177,413.3779,CNP0287127,C=C1C2CCC32CC(CCC13C)C(C)C,C15H24,204.187801,0.516802,209.190099


In [11]:
df_cfm_spec2vec=df_cfm_spec2vec.drop(["m/z","hit1_m/z"], axis=1)
df_cfm_spec2vec

  and should_run_async(code)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  elif issubclass(data.dtype.type, np.bool) or is_bool_dtype(data):


Unnamed: 0_level_0,cfm_db_spec2vec_hit,smiles_cfm_db_spec2vec,MolecularFormula_cfm_db_spec2vec,score_cfm_db_spec2vec,delta_cfm_spec2vec
Feature_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
38,62551,C(COCCOCCOCCOCCO)O,C10H22O6,0.579498,288.289962
40,62551,C(COCCOCCOCCOCCO)O,C10H22O6,0.553819,332.316262
45,62551,C(COCCOCCOCCOCCO)O,C10H22O6,0.594717,302.305562
48,62551,C(COCCOCCOCCOCCO)O,C10H22O6,0.563774,288.289962
49,62551,C(COCCOCCOCCOCCO)O,C10H22O6,0.516715,376.342562
...,...,...,...,...,...
56292,5312697,CCCC#CCCCCCCCCCCCCC(=O)O,C18H32O2,0.588280,299.259270
57074,PMA_119625,CC1=C2CCCC[C@@]2(C)C=CC1=O,C12H16O,0.515460,228.143885
57076,PMA_054517,CC1=CCC(CC1=O)C(=C)CCCC(C)(C)O,C15H24O2,0.536186,184.081670
57177,CNP0287127,C=C1C2CCC32CC(CCC13C)C(C)C,C15H24,0.516802,209.190099


Get CFM big DB search by modified cosine score, keep only ic score above 0.5 and more than 5 peaks

In [12]:
df_cfm_cosine

df_cfm_cosine=df_cfm_cosine.rename(columns={"cosine": "cosine_cfm_db_cosine",'Smiles 1':"smiles_cfm_db_cosine","Library Hit 1":"Library_Hit_cfm_db_cosine","numoffpeaks 1" :"numoffpeaks_cfm_db_cosine"})

df_cfm_cosine = df_cfm_cosine[(df_cfm_cosine.cosine_cfm_db_cosine > 0.5) & (df_cfm_cosine.numoffpeaks_cfm_db_cosine > 5)]




df_cfm_cosine["smiles_cfm_db_cosine"]=df_cfm_cosine["smiles_cfm_db_cosine"].apply(lambda x: convert(x))





df_cfm_cosine["MF_cfm_db_cosine"]=df_cfm_cosine["smiles_cfm_db_cosine"].apply(lambda x: smilestomf(x))

    
#df_cfm_cosine= df_cfm_cosine.assign(MF_cfm_db_cosine=MF)
#df_cfm_j=df_cfm_j.rename(columns={"MF_j_cfm": "Compound_Name"})
df_cfm_cosine= df_cfm_cosine.assign(delta_mz_cfm_db_cosine=df_cfm_cosine["m/z"]-df_cfm_cosine["m/z 1"])
df_cfm_cosine=df_cfm_cosine.drop(["m/z","m/z 1"], axis=1)
df_cfm_cosine



  and should_run_async(code)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  elif issubclass(data.dtype.type, np.bool) or is_bool_dtype(data):


Unnamed: 0_level_0,cosine_cfm_db_cosine,Library_Hit_cfm_db_cosine,numoffpeaks_cfm_db_cosine,smiles_cfm_db_cosine,MF_cfm_db_cosine,delta_mz_cfm_db_cosine
Feature_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
3,0.667835,CNP0179408,7,CC(C)CCCCCCCCC(=O)N1CCCC1C1=CN=CC=C1,C21H34N2O,1.007686
5,0.674954,PMA_080810,7,CC(C)CCCCCCCCC(=O)N1CCC[C@H]1C1=CN=CC=C1,C21H34N2O,1.007686
6,0.560974,ZINC000245174543,14,C[C@]12CCCC[C@@H]1CC[C@H]1[C@H]2CC[C@@]2(C)[C@...,C20H34O,48.081134
10,0.697262,PMA_037304,6,CC[C@H](C)CCCCCCCCC(=O)N1CCC[C@H]1C1=CN=CC=C1,C22H36N2O,1.007736
20,0.536068,ZINC000253499747,13,C[C@]12CCCC[C@H]1CC[C@H]1[C@H]2CC[C@]2(C)[C@@H...,C20H34O,48.081034
...,...,...,...,...,...,...
57547,0.594086,ZINC000036369833,6,COC1=C(OC)C(OC)=CC(/C=C/C(=O)N(CC2=CC3=C(C=CN3...,C24H26N2O4,-243.066357
57597,0.597434,PMA_134177,8,CNCCC1=CN(C)C2=CC=CC=C12,C12H16N2,-25.008548
57602,0.583535,PMA_134177,7,CNCCC1=CN(C)C2=CC=CC=C12,C12H16N2,-25.008548
57606,0.583535,PMA_134177,7,CNCCC1=CN(C)C2=CC=CC=C12,C12H16N2,-25.008449


Do first merges first prio to Emmanuel Database Hits

In [13]:
df_em.loc[:,'Level'] = 'N'
netw_quant = df_quant.join(df_em, how="outer")
#result = netw_quant.join(df_motif, how="outer")
result= netw_quant.join(df_sirius, how="outer")
result= result.join(df_cfm_cosine, how="outer")
result= result.join(df_cfm_spec2vec, how="outer")
result= result.join(df_cfm_j, how="outer")
result= result.join(df_moldis_j, how="outer")
result= result.join(df_moldis, how="outer")
result= result.join(df_cmpid, how="outer")
result= result.join(df_cano, how="outer")

result

  and should_run_async(code)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  elif issubclass(data.dtype.type, np.bool) or is_bool_dtype(data):
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  elif issubclass(data.dtype.type, np.bool) or is_bool_dtype(data):
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  elif issubclass(data.dtype.type, np.bool) or is_bool_dtype(data):


Unnamed: 0,row m/z,row retention time,07 Nrep,07 Nnes,07 Nrxs,07 Nsto,07 Ntab,08 Nglu,10 Nben,10 Ncle,...,smiles_moldis,ConfidenceScore,name-sir,smiles-sir,cano_most_specific_class,cano_level_5,cano_subclass,cano_class,cano_superclass,cano_all_classifications
1,375.300874,26.591875,0.000000e+00,0.000000e+00,0.000,0.000000e+00,0.0000,0.000,0.0000,0.0000,...,CCCCCCCCCCC[C@H](O)CC(=O)N1CCC[C@H]1c1cccnc1,,,,Heteroaromatic compounds,,,Heteroaromatic compounds,Organoheterocyclic compounds,Organic compounds; Organoheterocyclic compound...
2,375.300905,26.708085,1.399826e+08,7.982489e+07,0.000,1.135378e+08,60783.3560,0.000,0.0000,2594.5600,...,,,,,Benzenoids,,,,Benzenoids,Organic compounds; Organoheterocyclic compound...
3,331.274781,27.032871,1.456369e+08,1.578602e+08,5095030.375,1.636117e+06,12148.4585,390.456,0.0000,401.3875,...,,,,,Azacyclic compounds,,,Azacyclic compounds,Organoheterocyclic compounds,Organic compounds; Organoheterocyclic compound...
4,375.300859,27.204353,1.853471e+08,1.131553e+08,8789294.723,2.054237e+08,7070.3310,0.000,0.0000,2070.4005,...,,,,,Pyridines and derivatives,,,Pyridines and derivatives,Organoheterocyclic compounds,Organic compounds; Organoheterocyclic compound...
5,331.274799,27.624355,1.225282e+08,4.539143e+07,3250219.092,2.809773e+06,4730.8535,217.019,148.2960,0.0000,...,,,,,Azacyclic compounds,,,Azacyclic compounds,Organoheterocyclic compounds,Organic compounds; Alcohols and polyols; Organ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57596,163.122893,24.635752,3.057750e+03,0.000000e+00,0.000,0.000000e+00,0.0000,0.000,1517.4250,5592.0165,...,,0.703361,Habitrol,CN1CCCC1C2=CN=CC=C2,Aralkylamines,Aralkylamines,Amines,Organonitrogen compounds,Organic nitrogen compounds,Organic compounds; Organoheterocyclic compound...
57597,163.122798,22.623363,5.059680e+03,7.624705e+03,358.600,2.810580e+03,3549.6620,0.000,6276.6915,12382.8120,...,,0.712005,Habitrol,CN1CCCC1C2=CN=CC=C2,Aralkylamines,Aralkylamines,Amines,Organonitrogen compounds,Organic nitrogen compounds,Organic compounds; Organoheterocyclic compound...
57602,163.122847,26.424650,0.000000e+00,0.000000e+00,176.704,0.000000e+00,0.0000,0.000,1222.3840,5906.0730,...,,0.728219,Habitrol,CN1CCCC1C2=CN=CC=C2,Aralkylamines,Aralkylamines,Amines,Organonitrogen compounds,Organic nitrogen compounds,Organic compounds; Organoheterocyclic compound...
57606,163.122859,26.160844,5.060950e+03,0.000000e+00,204.249,3.038480e+03,0.0000,0.000,1921.6025,9111.0390,...,,,,,Aralkylamines,Aralkylamines,Amines,Organonitrogen compounds,Organic nitrogen compounds,Organic compounds; Organoheterocyclic compound...


add gnps second prio

In [14]:

df_network['Level'] = np.where(pd.isnull(df_network['Compound_Name'])!= True, "GNPS",None)



#clean=result.drop(["Precursor Mass","Retention Time","Document Annotation","Document"], axis=1)
clean= result

cmpd_name= clean.Compound_Name.fillna(df_network.Compound_Name).to_frame() #Compound_Name', 'MassDiff', 'Smiles
level=clean.Level.fillna(df_network.Level).to_frame()
smiles= clean.Smiles.fillna(df_network.Smiles).to_frame()
clean=clean.drop(["Compound_Name","Smiles","Level"], axis=1)


clean=pd.concat([clean, cmpd_name,smiles,level ], axis=1)
df_network_red=df_network.drop(["Compound_Name","Smiles","Level"], axis=1)
clean = clean.join(df_network_red, how="outer")
clean.head()

  and should_run_async(code)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  elif issubclass(data.dtype.type, np.bool) or is_bool_dtype(data):
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  elif issubclass(data.dtype.type, np.bool) or is_bool_dtype(data):


Unnamed: 0,row m/z,row retention time,07 Nrep,07 Nnes,07 Nrxs,07 Nsto,07 Ntab,08 Nglu,10 Nben,10 Ncle,...,Ion_Source,IonMode,Library_Class,MassDiff,MQScore,MZErrorPPM,number of spectra,PI,SpectrumID,sum(precursor intensity)
1,375.300874,26.591875,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,LC-ESI,Positive,3.0,9.2e-05,0.764151,0.243945,34,Emmanuel Gaquerel,CCMSLIB00005788032,65100000.0
2,375.300905,26.708085,139982614.0,79824890.0,0.0,113537800.0,60783.356,0.0,0.0,2594.56,...,LC-ESI,Positive,3.0,9.2e-05,0.76373,0.243945,43,Emmanuel Gaquerel,CCMSLIB00005788032,79200000.0
3,331.274781,27.032871,145636857.5,157860200.0,5095030.375,1636117.0,12148.4585,390.456,0.0,401.3875,...,LC-ESI,Positive,3.0,0.000183,0.949112,0.55273,37,Gaquerel,CCMSLIB00005788033,150000000.0
4,375.300859,27.204353,185347092.7,113155300.0,8789294.723,205423700.0,7070.331,0.0,0.0,2070.4005,...,LC-ESI,Positive,3.0,9.2e-05,0.762207,0.243945,50,Emmanuel Gaquerel,CCMSLIB00005788032,117000000.0
5,331.274799,27.624355,122528191.3,45391430.0,3250219.092,2809773.0,4730.8535,217.019,148.296,0.0,...,LC-ESI,Positive,3.0,0.000183,0.953081,0.55273,38,Gaquerel,CCMSLIB00005788033,50100000.0


In [15]:
canopus_freq=clean.groupby('componentindex')['cano_subclass'].apply(lambda x: x.mode()).reset_index(level=1, drop=True).reset_index()
canopus_freq=canopus_freq.drop_duplicates(subset=['componentindex'], keep='first')
canopus_freq

  and should_run_async(code)


Unnamed: 0,componentindex,cano_subclass
0,-1,"Amino acids, peptides, and analogues"
1,1,Ethers
2,2,Carboxylic acid derivatives
3,4,Carbohydrates and carbohydrate conjugates
4,6,Thiadiazoles
...,...,...
1602,3880,Alcohols and polyols
1603,3902,Triterpenoids
1604,3987,Diterpenoids
1606,3995,Alcohols and polyols


In [16]:

canopus_freq=canopus_freq.set_index("componentindex")
canopus_freq=canopus_freq.rename(columns={"cano_subclass" : "canopus_enhancer_subclass"})


  and should_run_async(code)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  elif issubclass(data.dtype.type, np.bool) or is_bool_dtype(data):


In [17]:
canopus_freq1=clean.groupby('componentindex')['cano_superclass'].apply(lambda x: x.mode()).reset_index(level=1, drop=True).reset_index()
canopus_freq1=canopus_freq1.drop_duplicates(subset=['componentindex'], keep='first')
canopus_freq1

Unnamed: 0,componentindex,cano_superclass
0,-1,Lipids and lipid-like molecules
1,1,Organic oxygen compounds
2,2,Lipids and lipid-like molecules
4,4,Benzenoids
6,6,Organoheterocyclic compounds
...,...,...
1647,3902,Lipids and lipid-like molecules
1648,3987,Lipids and lipid-like molecules
1649,3995,Organic oxygen compounds
1651,3999,Lipids and lipid-like molecules


In [18]:
canopus_freq1=canopus_freq1.set_index("componentindex")
canopus_freq1=canopus_freq1.rename(columns={"cano_superclass" : "canopus_enhancer_superclass"})

  and should_run_async(code)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  elif issubclass(data.dtype.type, np.bool) or is_bool_dtype(data):


In [19]:
#clean

#canopus_freq=clean.groupby('componentindex')['cano_superclass'].apply(lambda x: x.value_counts().index[0]).reset_index()
canopus_freq2=clean.groupby('componentindex')['cano_most_specific_class'].apply(lambda x: x.mode()).reset_index(level=1, drop=True).reset_index()
canopus_freq2=canopus_freq2.drop_duplicates(subset=['componentindex'], keep='first')
canopus_freq2

Unnamed: 0,componentindex,cano_most_specific_class
0,-1,Diterpenoids
1,1,Polyethylene glycols
2,2,Lipids and lipid-like molecules
4,4,Benzenoids
8,6,Thiadiazoles
...,...,...
2066,3902,Triterpenoids
2067,3987,Diterpene glycosides
2069,3995,Macrolactams
2071,3999,Glycosphingolipids


In [20]:


canopus_freq2=canopus_freq2.set_index("componentindex")
canopus_freq2=canopus_freq2.rename(columns={"cano_most_specific_class" : "canopus_enhancer_cano_most_specific_class"})


  and should_run_async(code)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  elif issubclass(data.dtype.type, np.bool) or is_bool_dtype(data):


In [21]:

clean=clean.reset_index()
clean=clean.set_index("componentindex")

final= clean.join(canopus_freq, how="left")
final= final.join(canopus_freq1, how="left")
final= final.join(canopus_freq2, how="left")
final

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  elif issubclass(data.dtype.type, np.bool) or is_bool_dtype(data):
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  elif issubclass(data.dtype.type, np.bool) or is_bool_dtype(data):


Unnamed: 0_level_0,index,row m/z,row retention time,07 Nrep,07 Nnes,07 Nrxs,07 Nsto,07 Ntab,08 Nglu,10 Nben,...,MassDiff,MQScore,MZErrorPPM,number of spectra,PI,SpectrumID,sum(precursor intensity),canopus_enhancer_subclass,canopus_enhancer_superclass,canopus_enhancer_cano_most_specific_class
componentindex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-1,13,149.023222,21.229848,1.359427e+08,1.382029e+08,1.374533e+07,9.175219e+07,1.229461e+07,0.000000e+00,0.000000e+00,...,,,,26,,,1.120000e+08,"Amino acids, peptides, and analogues",Lipids and lipid-like molecules,Diterpenoids
-1,30,155.974172,0.178462,1.412411e+07,1.123981e+07,1.083707e+06,9.555183e+06,1.584404e+06,8.962862e+05,1.131018e+06,...,,,,112,,,8.600000e+07,"Amino acids, peptides, and analogues",Lipids and lipid-like molecules,Diterpenoids
-1,49,614.484237,37.134686,1.386053e+07,2.588403e+07,2.191637e+06,2.755502e+07,1.530908e+06,1.751402e+07,1.737014e+07,...,,,,112,,,5.210000e+08,"Amino acids, peptides, and analogues",Lipids and lipid-like molecules,Diterpenoids
-1,53,540.447250,38.339231,8.149429e+06,8.719363e+06,7.947326e+05,7.730808e+06,8.900204e+05,7.117477e+06,7.104920e+06,...,,,,63,,,1.720000e+08,"Amino acids, peptides, and analogues",Lipids and lipid-like molecules,Diterpenoids
-1,54,584.473543,38.358006,1.393637e+07,1.512933e+07,2.144942e+06,2.135812e+07,8.331676e+05,1.825117e+07,1.460736e+07,...,,,,112,,,3.670000e+08,"Amino acids, peptides, and analogues",Lipids and lipid-like molecules,Diterpenoids
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3999,37351,621.473344,43.192733,1.705200e+03,1.377360e+03,0.000000e+00,1.506320e+03,0.000000e+00,0.000000e+00,0.000000e+00,...,,,,56,,,4.072771e+05,Glycosphingolipids,Lipids and lipid-like molecules,Glycosphingolipids
4011,5695,340.285063,24.927804,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,1.326134e+05,0.000000e+00,0.000000e+00,...,,,,32,,,1.575059e+05,,Lipids and lipid-like molecules,Prenol lipids
4011,55465,340.284967,24.272761,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,,,,16,,,2.247056e+05,,Lipids and lipid-like molecules,Prenol lipids
4030,34501,197.677215,14.732328,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,,,,6,,,2.238860e+05,,,


In [22]:
#in_silico=[(df_cfm_cosine.Compound_Name == None) & (df_cfm_cosine.numoffpeaks_cfm_db_cosine > 5)]
#final=final.reset_index()
#in_silico=final[(final.Compound_Name == None).groupby(final.componentindex).transform('all')]
in_silico = final.groupby("componentindex").filter(lambda x : (x['Compound_Name'].isna()==True).all())
#df.groupby('name').filter(lambda g: (g != np.inf).all().all()
in_silico=in_silico.groupby("componentindex").filter(lambda x : x.shape[0]>2)

In [24]:
#in_silico = in_silico[(in_silico.componentindex	 != -1)]
in_silico["mls0"]=np.nan
in_silico["mls1"]=np.nan
in_silico["mls2"]=np.nan
in_silico["mls3"]=np.nan
in_silico

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  elif issubclass(data.dtype.type, np.bool) or is_bool_dtype(data):
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  elif issubclass(data.dtype.type, np.bool) or is_bool_dtype(data):


Unnamed: 0_level_0,index,row m/z,row retention time,07 Nrep,07 Nnes,07 Nrxs,07 Nsto,07 Ntab,08 Nglu,10 Nben,...,number of spectra,PI,SpectrumID,sum(precursor intensity),canopus_enhancer_subclass,canopus_enhancer_superclass,mls0,mls1,mls2,mls3
componentindex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1588,556.442615,45.424172,751833.240,720504.930,396032.5140,1901164.820,105188.1060,4.076539e+06,3.954588e+06,...,108,,,8.240000e+07,Ethers,Organic oxygen compounds,,,,
1,3617,567.446983,38.359141,384120.275,435020.250,38724.7150,429678.890,33036.6115,4.977797e+05,4.875659e+05,...,109,,,1.370000e+07,Ethers,Organic oxygen compounds,,,,
1,6302,584.473786,40.985788,983004.720,809695.020,81940.3050,1783053.850,134931.8340,1.667994e+06,1.522296e+06,...,109,,,1.710000e+08,Ethers,Organic oxygen compounds,,,,
1,6337,584.473804,40.810077,1548722.070,2602771.230,220239.3265,3314226.155,170469.3695,1.548675e+07,6.826625e+06,...,109,,,2.070000e+08,Ethers,Organic oxygen compounds,,,,
1,7897,584.473758,40.902964,975260.660,1616528.095,143831.6100,2576272.015,87234.3580,2.640714e+06,1.220538e+06,...,111,,,1.520000e+08,Ethers,Organic oxygen compounds,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3501,52445,287.237373,37.382913,0.000,5014.070,3995.4435,4766.580,281132.8370,2.734013e+04,0.000000e+00,...,91,,,1.327703e+06,Diterpenoids,Lipids and lipid-like molecules,,,,
3560,5658,715.315352,33.359346,0.000,0.000,248.0000,0.000,290386.2130,0.000000e+00,2.576600e+02,...,52,,,3.423057e+06,,Lipids and lipid-like molecules,,,,
3560,6167,701.299717,31.587500,0.000,0.000,0.0000,0.000,127085.4165,3.627460e+02,8.460599e+03,...,55,,,2.395697e+06,,Lipids and lipid-like molecules,,,,
3560,13513,673.268467,26.381440,0.000,0.000,326.3565,0.000,2431.0880,1.719120e+02,2.969734e+03,...,60,,,2.741134e+06,,Lipids and lipid-like molecules,,,,


In [25]:
in_silico.groupby("componentindex").size()

  and should_run_async(code)


componentindex
1        9
4        4
6       26
13      19
14      32
        ..
3353     3
3422     4
3469     3
3501     3
3560     4
Length: 829, dtype: int64

In [26]:
grouped = in_silico.groupby('componentindex')
dataframes = [group for _, group in grouped]
len(dataframes)

829

In [28]:

for i in tqdm(dataframes):
    #print("Network ",n," from ",len(dataframes))
    mls_results=mls(i,columns)
 
    if len(mls_results)>4:
        mls_results=mls_results[:3]
    else:
        pass
    try:
        for number,smiles in enumerate(mls_results):
            i["mls{}".format(number)]=smiles
    except:
        pass


  0%|          | 0/829 [00:00<?, ?it/s]

  mls.append(Chem.MolToSmiles(MolFromSmarts(MCS.FindMCS(y,timeout=120).smarts)))


In [29]:
mls_df=pd.concat(dataframes)
mls_df=mls_df.set_index("index")
mls_df=mls_df[["mls0","mls1","mls2","mls3"]]
mls_df

Unnamed: 0_level_0,mls0,mls1,mls2,mls3
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1588,COCCO,CCCC,,
3617,COCCO,CCCC,,
6302,COCCO,CCCC,,
6337,COCCO,CCCC,,
7897,COCCO,CCCC,,
...,...,...,...,...
52445,CC1=C(C)C(C)(C)C(=O)CC1,CC1=CC(=O)OC1O,CC(C)=CCO,
5658,,,,
6167,,,,
13513,,,,


mls_df= pd.read_csv("/Users/delser/Git/phd/mls_networks.tsv", sep="\t",index_col='index') 
mls_df # use to reload consensus substructure table

In [24]:
final_df=final.reset_index()
final_df=final_df.set_index("index")
final_df=final_df.join(mls_df, how="outer")
final_df

  and should_run_async(code)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  elif issubclass(data.dtype.type, np.bool) or is_bool_dtype(data):
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  elif issubclass(data.dtype.type, np.bool) or is_bool_dtype(data):
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  elif issubclass(data.dtype.type, np.bool) or is_bool_dtype(data):


Unnamed: 0_level_0,componentindex,row m/z,row retention time,07 Nrep,07 Nnes,07 Nrxs,07 Nsto,07 Ntab,08 Nglu,10 Nben,...,PI,SpectrumID,sum(precursor intensity),canopus_enhancer_subclass,canopus_enhancer_superclass,canopus_enhancer_cano_most_specific_class,mls0,mls1,mls2,mls3
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1298,375.300874,26.591875,0.000000e+00,0.000000e+00,0.000,0.000000e+00,0.0000,0.000,0.0000,...,Emmanuel Gaquerel,CCMSLIB00005788032,65100000.0,,Organoheterocyclic compounds,Pyridines and derivatives,,,,
2,1298,375.300905,26.708085,1.399826e+08,7.982489e+07,0.000,1.135378e+08,60783.3560,0.000,0.0000,...,Emmanuel Gaquerel,CCMSLIB00005788032,79200000.0,,Organoheterocyclic compounds,Pyridines and derivatives,,,,
3,214,331.274781,27.032871,1.456369e+08,1.578602e+08,5095030.375,1.636117e+06,12148.4585,390.456,0.0000,...,Gaquerel,CCMSLIB00005788033,150000000.0,Amines,Organic nitrogen compounds,Aralkylamines,,,,
4,1298,375.300859,27.204353,1.853471e+08,1.131553e+08,8789294.723,2.054237e+08,7070.3310,0.000,0.0000,...,Emmanuel Gaquerel,CCMSLIB00005788032,117000000.0,,Organoheterocyclic compounds,Pyridines and derivatives,,,,
5,214,331.274799,27.624355,1.225282e+08,4.539143e+07,3250219.092,2.809773e+06,4730.8535,217.019,148.2960,...,Gaquerel,CCMSLIB00005788033,50100000.0,Amines,Organic nitrogen compounds,Aralkylamines,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57596,125,163.122893,24.635752,3.057750e+03,0.000000e+00,0.000,0.000000e+00,0.0000,0.000,1517.4250,...,,,47200000.0,Amines,Alkaloids and derivatives,Alkaloids and derivatives,,,,
57597,1132,163.122798,22.623363,5.059680e+03,7.624705e+03,358.600,2.810580e+03,3549.6620,0.000,6276.6915,...,,,53000000.0,Amines,Organic nitrogen compounds,Aralkylamines,,,,
57602,1161,163.122847,26.424650,0.000000e+00,0.000000e+00,176.704,0.000000e+00,0.0000,0.000,1222.3840,...,,,51900000.0,Amines,Organic nitrogen compounds,Aralkylamines,,,,
57606,1161,163.122859,26.160844,5.060950e+03,0.000000e+00,204.249,3.038480e+03,0.0000,0.000,1921.6025,...,,,52800000.0,Amines,Organic nitrogen compounds,Aralkylamines,,,,


In [25]:
final_df.to_csv("mls_test.tsv",sep="\t")

  and should_run_async(code)
