In [1]:
import glob
import json
import os
import re
import time
import wget
import urllib.parse
import argparse


import numpy as np
import pandas as pd
import pubchempy as pcp


from pybatchclassyfire import *
from pandas import json_normalize
from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import AllChem
from rdkit.Chem import rdFMCS
from rdkit.Chem import PandasTools
def isNaN(string):
    return string != string

INFO:rdkit:Enabling RDKit 2021.09.4 jupyter extensions


In [17]:
def gnpsMNvsgnpsMAW(entry, mn_dir, name):
    
    """gnpsMNvsgnpsMAW checks with tanimoto similarity score, whether
    results from MAW GNPS and GNPS MN Masst results give same candidate

    Parameters:
    input_dir = input directory where you have stored the cytoscape file
    from GNPS MN results and have exported edge and node tables from cytoscape
    These two csv egde and node files must have "edge" and "node" in their name

    Returns:
    GNPS results with cluster index named
    GNPS MN results with a confirmation column if MAW detected same candidate,
    file named:

    Usage:
    gnpsMNvsgnpsMAW(mn_dir)
    """
    # extract files with edges from MN results
    GMNfile_edge = [f for f in os.listdir(mn_dir) if "edge" in f]
    # extract files with nodes from MN results
    GMNfile_node = [f for f in os.listdir(mn_dir) if "node" in f]
    # read the files
    GMNdf_node = pd.read_csv(mn_dir + "/" + GMNfile_node[0])
    GMNdf_edge = pd.read_csv(mn_dir + "/" + GMNfile_edge[0])
    # extract only important columns from both csv files
    GMNdf_node = GMNdf_node[
        [
            "precursor mass",
            "RTMean",
            "UniqueFileSources",
            "charge",
            "cluster index",
            "componentindex",
            "Compound_Name",
            "Smiles",
            "SpectrumID",
        ]
    ]
    GMNdf_edge = GMNdf_edge[
        ["cosine_score", "EdgeAnnotation", "node1", "node2", "mass_difference"]
    ]
    # rename node1 to cluster index to merge nodes and edges results from MN
    GMNdf_edge = GMNdf_edge.rename(columns={"node1": "cluster index"})
    GMNdf = pd.merge(GMNdf_node, GMNdf_edge, on="cluster index")
    GMNdf.to_csv(mn_dir + "/mergedN&E.csv")
    # Read results obtained from scoring_spec, named input_dir/MetabolomicsResults/scoredSpecDB.csv
    SDB = pd.read_csv(entry + "/mergedResults-with-one-Candidates.csv")
    # from GNPS MAW results and GNPS MN results, calculate how many MAW results are same as MN:
    for i, row in SDB.iterrows():
        for j, row in GMNdf.iterrows():
             if not isNaN(SDB["SMILES"][i]) and not isNaN(GMNdf["Smiles"][j]):
                if name in GMNdf["UniqueFileSources"][j]: 
                    SKms = [
                        Chem.MolFromSmiles(SDB["SMILES"][i]),
                        Chem.MolFromSmiles(GMNdf["Smiles"][j]),
                    ]
                    SKfps = [
                        AllChem.GetMorganFingerprintAsBitVect(x, 2, nBits=2048)
                        for x in SKms
                    ]
                    SKtn = DataStructs.FingerprintSimilarity(SKfps[0], SKfps[1])
                    if SKtn >= 0.99:
                        print(SKtn)
                        print(GMNdf["Smiles"][j])
                        SDB['AnnotationSources'][i] = SDB['AnnotationSources'][i] + "|GNPSMN"
                        SDB['MSILevel'][i] = 2.0
                        print(SDB['AnnotationSources'][i])
                    #else:

    SDB.to_csv(entry + "/mergedResults-with-one-Candidates.csv")

In [18]:
path = "/Users/mahnoorzulfiqar/OneDriveUNI/MAW-Diatom/SmarinoiRun1"
file = os.listdir(path)
folders = [x for x in os.listdir(path) if x.startswith('DS')]
folders2 = [x for x in folders if not '.mzML' in x]
folders2

['DS_201124_SC_full_PRM_pos_03',
 'DS_201124_SC_full_PRM_pos_04',
 'DS200309_Scost_QC_70k_neg_PRM',
 'DS_201124_SC_full_PRM_pos_05',
 'DS_201124_SC_full_PRM_pos_02',
 'DS_201124_SC_full_PRM_neg_03',
 'DS_201124_SC_full_PRM_neg_04',
 'DS200309_Scost_QC_70k_pos_PRM',
 'DS_201124_SC_full_PRM_neg_05',
 'DS_201124_SC_full_PRM_neg_02',
 'DS_201124_SC_full_PRM_neg_10',
 'DS_201124_SC_full_PRM_pos_10',
 'DS_201124_SC_full_PRM_pos_07',
 'DS_201124_SC_full_PRM_pos_09',
 'DS_201124_SC_full_PRM_pos_08',
 'DS_201124_SC_full_PRM_pos_01',
 'DS_201124_SC_full_PRM_pos_06',
 'DS_201124_SC_full_PRM_neg_07',
 'DS_201124_SC_full_PRM_neg_09',
 'DS_201124_SC_full_PRM_neg_08',
 'DS_201124_SC_full_PRM_neg_01',
 'DS_201124_SC_full_PRM_neg_06']

In [21]:
mn_dir = "/Users/mahnoorzulfiqar/OneDriveUNI/MAW-Diatom/Molecular-Networking/GNPS/First-Run"

In [22]:
for i in folders2:
    entry = path + "/" + i
    name = i
    gnpsMNvsgnpsMAW(entry, mn_dir, name)

1.0
OC(=O)C(N)Cc(c1)ccc(O)c1
SIRIUS|MassBank|GNPS|GNPSMN|GNPSMN|GNPSMN|GNPSMN|GNPSMN|GNPSMN
1.0
CCCC(=O)O[C@H](CC([O-])=O)C[N+](C)(C)C
GNPS|GNPSMN|GNPSMN|GNPSMN
1.0
C(C[C@@H](C(=O)O)N)CN=C(N)N
SIRIUS|GNPS|GNPSMN|GNPSMN|GNPSMN|GNPSMN|GNPSMN|GNPSMN|GNPSMN
1.0
C(C[C@@H](C(=O)O)N)CN=C(N)N
SIRIUS|GNPS|GNPSMN|GNPSMN|GNPSMN|GNPSMN|GNPSMN|GNPSMN|GNPSMN|GNPSMN
1.0
C(C[C@@H](C(=O)O)N)CN=C(N)N
SIRIUS|GNPS|GNPSMN|GNPSMN|GNPSMN|GNPSMN|GNPSMN|GNPSMN|GNPSMN|GNPSMN|GNPSMN
1.0
CCCCCC/C=C\\CCCCCCCC(=O)OCC(CO)O
GNPS|GNPSMN|GNPSMN|GNPSMN
1.0
CCC(=O)O[C@H](CC(O)=O)C[N+](C)(C)C
SIRIUS|GNPS|GNPSMN|GNPSMN|GNPSMN|GNPSMN|GNPSMN
1.0
CCC(=O)O[C@H](CC(O)=O)C[N+](C)(C)C
SIRIUS|GNPS|GNPSMN|GNPSMN|GNPSMN|GNPSMN|GNPSMN|GNPSMN
1.0
O=C(O)[C@H](O)[C@@H](O)[C@H](O)[C@H](O)CO
SIRIUS|GNPS|GNPSMN|GNPSMN|GNPSMN
1.0
OC(=O)C(N)Cc(c1)ccc(O)c1
MassBank|GNPS|GNPSMN|GNPSMN|GNPSMN
1.0
CCCCCCCCCCCCCCCC(=O)OC[C@H](COP(=O)([O-])OCC[N+](C)(C)C)O
GNPS|GNPSMN|GNPSMN|GNPSMN|GNPSMN|GNPSMN|GNPSMN|GNPSMN|GNPSMN|GNPSMN|GNPSMN|GNPSMN|GNPSMN|GNP