In [1]:
from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import AllChem
from rdkit.Chem import rdFMCS
from rdkit.Chem import PandasTools

import sys

import pandas as pd

import pubchempy as pcp
import numpy as np


import os
import glob
import re

In [9]:
input_dir = '/Users/mahnoorzulfiqar/OneDriveUNI/CondaCheck/'
# read the suspect list
slistcsv = "/Users/mahnoorzulfiqar/OneDriveUNI/CondaCheck/SkeletonemaSuspectListV1.csv"
# read input_table.csv generated in the R workflow
input_tablecsv = input_dir + "input_table.csv"
sl = True

In [21]:
    heavy_atoms = ['C', 'N', 'P', 'O', 'S']

    input_table = pd.read_csv(input_tablecsv)

        for m, row in input_table.iterrows():

        # Result directory
        result = input_dir + (input_table['ResultFileNames'][m] + 
                                 '/insilico/MetFrag').replace("./", "")

        # list of all the csv files in the result directory result_dir/inislico/MetFrag/
        files_met = (glob.glob(result+'/*.csv'))

        # read the csv file that contains all the features from the input .mzml file
        file1  = pd.read_csv(input_dir + (input_table['ResultFileNames'][m] + '/insilico/MS1DATA.csv').replace("./", ""))

        # for each feature in the MS1DATA.csv file
        for i, row in file1.iterrows():

            # take id as a pattern to differentiate between different ids
            pattern = file1.loc[i, "id_X"]

            #check which of the csv result files have the same pattern in their names
            results = [i for i in files_met if pattern in i]

            # find which of the files with that id have KEGG in their names,
            KEGG = [i for i in results if "KEGG" in i]

            # if kegg present in the name
            if KEGG:

                # read the KEGG csv file for that feature
                KEGG_file = pd.read_csv((KEGG)[0])

                # if the KEGG file isn't empty
                if len(KEGG_file) >= 1:

                    # extract only the columns with >0.75 score
                    KEGG_file = KEGG_file.drop(KEGG_file[KEGG_file.Score < 0.75].index)
                    print(KEGG_file)

                    if len(KEGG_file) >= 1:

                        # add the relavnt information to the original MS1DATA csv
                        file1.loc[i, 'KG_ID'] = KEGG_file.loc[0, 'Identifier']
                        file1.loc[i, 'KG_Name'] = KEGG_file.loc[0, 'CompoundName']
                        file1.loc[i, 'KG_Formula'] = KEGG_file.loc[0, 'MolecularFormula']
                        file1.loc[i, 'KG_expPeaks'] = KEGG_file.loc[0, 'NoExplPeaks']
                        file1.loc[i, 'KG_SMILES'] = Chem.MolToSmiles(Chem.MolFromInchi(KEGG_file["InChI"][0]))
                        file1.loc[i, 'KG_file'] = KEGG

                        #create empty list of KEGG top smiles
                        Kegg_smiles = []

                        # extract only the InChI of the top 5
                        for j in KEGG_file["InChI"][0:5].tolist():
                            # convert the InChI to SMILES
                            mol = Chem.MolToSmiles(Chem.MolFromInchi(j))
                            if sl:
                                # read the suspect list
                                slist = pd.read_csv(slistcsv)

                                # Add columns 
                                file1['KG_Top_can_SL'] = np.nan # top candidate among the top 5 candidates, according to similarity with a compound in suspect list
                                file1['KG_tanimotoSLvsCAN'] = np.nan # tanimoto score
                                file1['KG_SL_comp'] = np.nan # Smiles of the suspect listr compund with  high similairity with the one of the top 5 candidates

                                # for each smiles in suspect list
                                for k, row in slist.iterrows():
                                    # Calculate the tanimoto score
                                    SSms = [Chem.MolFromSmiles(mol), Chem.MolFromSmiles(slist['SMILES'][k])]
                                    SSfps = [AllChem.GetMorganFingerprintAsBitVect(x,2, nBits=2048) for x in SSms]
                                    SStn = DataStructs.FingerprintSimilarity(SSfps[0],SSfps[1])
                                    if SStn >= 0.8:
                                        file1.loc[i, 'KG_Top_can_SL'] = j
                                        file1.loc[i, 'KG_tanimotoSLvsCAN'] = SStn
                                        file1.loc[i, 'KG_SL_comp'] = slist['SMILES'][k]
                            mol2 = Chem.MolFromSmiles(mol)
                            Kegg_smiles.append(mol2)
                        # if there are more than 1 top smiles
                        if len(Kegg_smiles) > 1:
                            #calculate the MCSS
                            res = rdFMCS.FindMCS(Kegg_smiles)
                            sm_res = Chem.MolToSmiles(Chem.MolFromSmarts(res.smartsString))
                            # if there are atleast 3 heavy atoms in the MCSS, then add it to the result file
                            elem = [ele for ele in heavy_atoms if(ele in sm_res)]
                            if elem and len(sm_res)>=3:
                                file1.loc[i, 'KG_MCSSstring'] = res.smartsString
                                file1.loc[i, 'KG_MCSS_SMILES'] = Chem.MolToSmiles(Chem.MolFromSmarts(res.smartsString))

            #start here for PubChem; find which of the files with that id have PubChem in their names,
            PubChem = [i for i in results if "PubChem" in i]

            if PubChem:

                PubChem_file = pd.read_csv(PubChem[0])

                # if more candidates
                if len(PubChem_file) >= 1:

                    # take the ones with more than 0.75 score
                    PubChem_file = PubChem_file.drop(PubChem_file[PubChem_file.Score < 0.75].index)

                    if len(PubChem_file) >= 1:

                        # add the relavnt information to the original MS1DATA csv
                        file1.loc[i, 'PC_ID'] = PubChem_file.loc[0, 'Identifier']
                        file1.loc[i, 'PC_Name'] = PubChem_file.loc[0, 'IUPACName']
                        file1.loc[i, 'PC_Formula'] = PubChem_file.loc[0, 'MolecularFormula']
                        file1.loc[i, 'PC_expPeaks'] = PubChem_file.loc[0, 'NoExplPeaks']
                        file1.loc[i, 'PC_SMILES'] = PubChem_file["SMILES"][0]
                        file1.loc[i, 'PC_file'] = PubChem

                        # empty object
                        Pubchem_smiles = []

                        # extract only the SMILES of the top 5
                        for j in PubChem_file["SMILES"][0:5].tolist():

                            # if sl = True
                            if sl:

                                # read the suspect list
                                slist = pd.read_csv(slistcsv)

                                # Add columns 
                                file1['PC_Top_can_SL'] = np.nan # top candidate among the top 5 candidates, according to similarity with a compound in suspect list
                                file1['PC_tanimotoSLvsCAN'] = np.nan # tanimoto score
                                file1['PC_SL_comp'] = np.nan # Smiles of the suspect listr compund with  high similairity with the one of the top 5 candidates
                                # calculate tanimoto
                                for n, row in slist.iterrows():

                                    SSms = [Chem.MolFromSmiles(j), Chem.MolFromSmiles(slist['SMILES'][n])]
                                    SSfps = [AllChem.GetMorganFingerprintAsBitVect(x,2, nBits=2048) for x in SSms]
                                    SStn2 = DataStructs.FingerprintSimilarity(SSfps[0],SSfps[1])

                                    if SStn2 >= 0.8:
                                        file1.loc[i, 'PC_Top_can_SL'] = j
                                        file1.loc[i, 'PC_tanimotoSLvsCAN'] = SStn2
                                        file1.loc[i, 'PC_SL_comp'] = slist['SMILES'][n]

                            # Concert smiles to mol
                            sm2 = Chem.MolFromSmiles(j)
                            # store mol in Pubchem_smiles
                            Pubchem_smiles.append(sm2)

                        if len(Pubchem_smiles) > 1:
                            # calculate MCSS
                            res2 = rdFMCS.FindMCS(Pubchem_smiles)
                            sm_res = Chem.MolToSmiles(Chem.MolFromSmarts(res2.smartsString))
                            # If atleast 3 heavy atoms present
                            elem = [ele for ele in heavy_atoms if(ele in sm_res)]
                            if elem and len(sm_res)>=3:
                                file1.loc[i, 'PC_MCSSstring']= res2.smartsString
                                file1.loc[i, 'PC_MCSS_SMILES'] = Chem.MolToSmiles(Chem.MolFromSmarts(res2.smartsString))
        file1.to_csv(input_dir + (input_table['ResultFileNames'][m] + '/insilico/MetFragResults.csv').replace("./", ""))

   Score                                              InChI  \
0    1.0  InChI=1S/C19H14ClF5N4O2/c20-14-7-6-13(8-12(14)...   

  FragmenterScore_Values  SuspectListScore  MaximumTreeDepth  \
0          1324.0;1572.0               0.0                 2   

   MonoisotopicMass CompoundName Identifier MolecularFormula  \
0         460.07257     Flupoxam     C18543   C19H14ClF5N4O2   

                                 FormulasOfExplPeaks   InChIKey2  \
0  66.0469703041635:[C5H5+H]-;193.997356261979:[C...  UHFFFAOYSA   

        InChIKey1  FragmenterScore  \
0  AOQMRUTZEYVDIL        26.477126   

                                       ExplPeaks                     InChIKey  \
0  66.0469703041635_818.3;193.997356261979_804.5  AOQMRUTZEYVDIL-UHFFFAOYSA-N   

   NoExplPeaks  NumberPeaksUsed  
0            2               48  
Empty DataFrame
Columns: [Score, InChI, FragmenterScore_Values, SuspectListScore, MaximumTreeDepth, MonoisotopicMass, CompoundName, Identifier, MolecularFormula, Formulas

   Score                                              InChI  \
0    1.0  InChI=1S/C12H23NO4/c1-9(2)6-12(16)17-10(7-11(1...   

                              FragmenterScore_Values  SuspectListScore  \
0  708.0;708.0;708.0;708.0;708.0;708.0;448.0;708....               0.0   

   MaximumTreeDepth  MonoisotopicMass         CompoundName Identifier  \
0                 2          245.1628  Isovalerylcarnitine     C20826   

  MolecularFormula                                FormulasOfExplPeaks  \
0        C12H23NO4  56.049413342519:[C3H8N-2H]+;56.0495128523064:[...   

    InChIKey2       InChIKey1  FragmenterScore  \
0  UHFFFAOYSA  IGQBPDJNUXPEMT       439.580771   

                                           ExplPeaks  \
0  56.049413342519_0.0;56.0495128523064_0.0;56.04...   

                      InChIKey  NoExplPeaks  NumberPeaksUsed  
0  IGQBPDJNUXPEMT-UHFFFAOYSA-N          909             7515  
   Score                                              InChI  \
0    1.0  InChI=1S/C11H21NO

      Score                                              InChI  \
0  1.997571      InChI=1S/C5H11NO2/c1-6(2,3)4-5(7)8/h4H2,1-3H3   
1  1.760120  InChI=1S/C5H11NO2/c1-3(2)4(6)5(7)8/h3-4H,6H2,1...   
2  1.000000  InChI=1S/C5H11NO2/c1-6-4-2-3-5(7)8/h6H,2-4H2,1...   
3  0.978095  InChI=1S/C5H11NO2/c6-4-2-1-3-5(7)8/h1-4,6H2,(H...   
4  0.769270  InChI=1S/C5H11NO2/c1-2-3-4(6)5(7)8/h4H,2-3,6H2...   
5  0.767898  InChI=1S/C5H11NO2/c1-3-5(2,6)4(7)8/h3,6H2,1-2H...   

                              FragmenterScore_Values  SuspectListScore  \
0  448.0;448.0;448.0;448.0;448.0;448.0;448.0;448....               1.0   
1  348.0;348.0;348.0;348.0;348.0;348.0;348.0;348....               1.0   
2  348.0;348.0;348.0;348.0;348.0;348.0;348.0;348....               0.0   
3  348.0;348.0;348.0;348.0;348.0;348.0;348.0;348....               0.0   
4  348.0;348.0;348.0;348.0;348.0;348.0;348.0;348....               0.0   
5  348.0;348.0;348.0;348.0;348.0;348.0;348.0;348....               0.0   

   MaximumTreeDept