In [19]:
import mummichog as mg
import pandas as pd
import numpy as np
import os
import re

# Alternative run names (commented out); enable exactly one `run_name`.
# run_name = "1734535932.3620632.ttest_p0_05"
# run_name = "trans_omic_covid_data.run_1_default"
run_name = "trans_omic_covid_data.rsd_1_default_p"

# Text after the last "." of run_name; used as a tag in the analysis output file names.
end_name = run_name.rsplit(".", 1)[-1]

# p-value threshold below which a matched feature is counted as significant.
cut_off_p = 0.05


In [20]:
## MUMMICHOG OUTPUT: COMPARING NAME OF METABOLITE IN GT DATASET TO MUMMICHOG ANNOTATION ##
print("Mummichog Output - User input to emperical compounds\n")

# Load mummichog's "user input -> empirical compound" table for the selected run.
file_path = f"../runs/{run_name}/tables/userInput_to_EmpiricalCompounds.tsv"
og_ui_cpd = pd.read_csv(file_path, sep="\t")
print(f"Number of original ouput rows: {len(og_ui_cpd)}")

# Rows without any compound annotation cannot be matched, so drop them.
ui_cpd_nan = og_ui_cpd.dropna(subset=['compounds']).copy()
print(f"Dropped {len(og_ui_cpd)-len(ui_cpd_nan)} nan compounds rows, {len(ui_cpd_nan)} rows")
print(f"Number of unique features: {len(np.unique(ui_cpd_nan['CompoundID_from_user']))}")

# A metabolite feature can have multiple annotations, separated by ";" in `compounds`
# and by "$" in `compound_names`. Only `compounds` is split here: one row per compound ID.
ui_cpd_nan['compound_split'] = ui_cpd_nan['compounds'].str.split(';')
ui_cpd = ui_cpd_nan.explode('compound_split')
ui_cpd = ui_cpd.rename(columns={'compound_split': 'compound'})
ui_cpd = ui_cpd.drop(columns=['compounds'])

# Store the stripped ID (not just test it): an ID that keeps padding from the ";" split
# would otherwise never full-match a ground-truth KEGG ID in the matching step.
ui_cpd['compound'] = ui_cpd['compound'].str.strip()
ui_cpd = ui_cpd[ui_cpd['compound'] != ""]
print(f"Number of rows after ; split: {len(ui_cpd)}")


ui_cpd.to_csv(f"../runs/{run_name}/tables/edited_emperical_compounds_no_symbols_kegg.csv", index=False)
# explode() repeats the original row label for every split compound; renumber the rows.
ui_cpd = ui_cpd.reset_index(drop=True)
ui_cpd.head()


Mummichog Output - User input to emperical compounds

Number of original ouput rows: 1559
Dropped 0 nan compounds rows, 1559 rows
Number of unique features: 1391
Number of rows after ; split: 3078


Unnamed: 0,input_row,EID,str_row_ion,compound_names,input_row.1,m/z,retention_time,p_value,statistic,CompoundID_from_user,compound
0,row1,E1,row1_M+H[1+];row1754_M+HCOONa[1+];row18_M(C13)...,1-pyrroline,row1,70.065243,44.397226,0.149661,-2.008318,M70.065243T44.3972,CE1944
1,row13,E2,row13_M+2H[2+];row1791_M+H[1+];row1792_M+H[1+],Methylarsonate; Methylarsonic acid,row13,70.980029,34.213457,0.868648,0.308109,M70.980029T34.2135_1,C07294
2,row18,E1,row1_M+H[1+];row1754_M+HCOONa[1+];row18_M(C13)...,1-pyrroline,row18,71.068645,44.667389,0.246868,-1.674244,M71.068645T44.6674,CE1944
3,row29,E6,row29_M+H[1+],Acrylamide; 2-Propenamide,row29,72.044165,582.519073,0.005951,3.525478,M72.044165T582.5191,C01659
4,row56,E12,row1922_M+H[1+];row56_M+2H[2+],Octanoic acid; Caprylic acid; Octylic acid,row56,73.06487,474.584274,0.062903,2.494126,M73.06487T474.5843,C06423


In [21]:
# NOTE: this whole cell is commented out, so it adds neither a 'KEGG' nor a
# 'compound_name' column to ui_cpd. Code that reads ui_cpd must rely on the columns
# that come from the mummichog table itself.
# Adding KEGG to ui_cpd df
# KEGG_map_file = "name_map_on_mummichog_output_cpds.csv"
# KEGG_map = pd.read_csv(KEGG_map_file)
# KEGG_map['compound_name'] = KEGG_map['Query']
# KEGG_map = KEGG_map.drop(columns=['Query'])

# ui_cpd = ui_cpd.merge(KEGG_map[['compound_name', 'KEGG']], on='compound_name', how='left')
# # print(len(ui_cpd))
# # ui_cpd.to_csv(f"../runs/{run_name}/tables/edited_emperical_compounds_KEGG.csv")
# ui_cpd.head()

In [22]:
# Ground truth (gt): true metabolite identities from the COVID dataset.
# Columns the later cells require: metabolite_identification, retention_time
file_path = "m_MTBLS2542_Metabo_LC-MS_positive_reverse-phase_v2_maf.tsv"
gt = pd.read_csv(file_path, sep="\t")

# Rescale retention_time by 60 (presumably minutes -> seconds, to be comparable with the
# mummichog table's retention_time; TODO confirm the units of the source file).
gt = gt.assign(retention_time=gt['retention_time'] * 60)

print(
    "Ground Truth - True Metabolite info from COVID dataset",
    f"No of rows: {gt.shape[0]}",
    f"No of unique ChEBI IDs {np.unique(gt['database_identifier']).size}",
    sep="\n",
)
gt.head()

Ground Truth - True Metabolite info from COVID dataset
No of rows: 515
No of unique ChEBI IDs 503


Unnamed: 0,database_identifier,chemical_formula,smiles,inchi,metabolite_identification,mass_to_charge,fragmentation,modifications,charge,retention_time,...,20P02820143_BEHC18_POS,20P02820140_BEHC18_POS,20P02820141_BEHC18_POS,20P02820209_BEHC18_POS,20P02820208_BEHC18_POS,20P02820075_BEHC18_POS,20P02820073_BEHC18_POS,20P02820072_BEHC18_POS,20P02820079_BEHC18_POS,20P02820078_BEHC18_POS
0,CHEBI:16610,C7H19N3,NCCCCNCCCN,"InChI=1S/C7H19N3/c8-4-1-2-6-10-7-3-5-9/h10H,1-9H2",Spermidine,,,,,36.78,...,752800.5,2562570.0,2398701.0,4761392.0,1558479.0,1824217.0,987750.2,1373717.0,2332980.0,1791080.0
1,CHEBI:18019,C6H14N2O2,NCCCC[C@H](N)C(O)=O,"InChI=1S/C6H14N2O2/c7-4-2-1-3-5(8)6(9)10/h5H,1...",L-lysine,,,,,36.78,...,207079900.0,269723200.0,263210500.0,191198200.0,169237900.0,247593400.0,211914800.0,175770300.0,166759800.0,283303400.0
2,CHEBI:17964,C6H11NO2,OC(=O)C1CCCCN1,"InChI=1S/C6H11NO2/c8-6(9)5-3-1-2-4-7-5/h5,7H,1...",Pipecolate,,,,,36.84,...,202035800.0,262874000.0,257367300.0,186656800.0,165223200.0,241940200.0,206904200.0,171339200.0,163171500.0,277729400.0
3,CHEBI:16176,C5H12N2O2,NCCC[C@@H](N)C(O)=O,"InChI=1S/C5H12N2O2/c6-3-1-2-4(7)5(8)9/h4H,1-3,...",D-ornithine,,,,,36.84,...,52776200.0,66081780.0,68861630.0,83346550.0,118974700.0,34957160.0,35879610.0,47774180.0,29969900.0,52020930.0
4,CHEBI:17311,C9H21N2O2+,OC(=O)[C@@H](N)CCCC[N+](C)(C)C,"InChI=1S/C9H20N2O2/c1-11(2,3)7-5-4-6-8(10)9(12...","Nepsilon,nepsilon,nepsilon-trimethyllysine",,,,,40.92,...,17985430.0,19152680.0,14170200.0,10175680.0,4711219.0,57363160.0,21301670.0,23808810.0,13248860.0,12168430.0


In [None]:
# Adding KEGG to gt df from chebi id
KEGG_map_file = "name_map_gt_chebi_to_kegg.csv"
KEGG_map = pd.read_csv(KEGG_map_file)
# Prefix 'Query' with "CHEBI:" so it has the same form as gt['database_identifier']
# (assumes 'Query' holds the bare ChEBI number; confirm against the map file).
KEGG_map['Query'] = "CHEBI:" + KEGG_map['Query'].astype(str)
KEGG_map['database_identifier'] = KEGG_map['Query']
KEGG_map = KEGG_map.drop(columns=['Query'])

# Identical (identifier, KEGG) rows carry no information and would only duplicate gt rows.
kegg_lookup = KEGG_map[['database_identifier', 'KEGG']].drop_duplicates()

# Drop a 'KEGG' column left by a previous run of this cell; otherwise the merge would
# produce KEGG_x / KEGG_y and gt['KEGG'] would no longer exist.
n_gt_rows_before = len(gt)
gt = gt.drop(columns=['KEGG'], errors='ignore').merge(kegg_lookup, on='database_identifier', how='left')
if len(gt) != n_gt_rows_before:
    # A ChEBI ID that maps to several KEGG IDs yields one gt row per mapping.
    print(f"WARNING: merge changed the number of gt rows from {n_gt_rows_before} to {len(gt)}")

print("Ground Truth - True Metabolite info from COVID dataset")
print(f"No of rows: {len(gt)}")
print(f"No of unique ChEBI IDs: {len(np.unique(gt['database_identifier']))}")
print(f"No of unique KEGG IDs: {gt[gt['KEGG'].notna()]['KEGG'].nunique()}")
gt.head()


254
Ground Truth - True Metabolite info from COVID dataset
No of rows: 515
No of unique ChEBI IDs 503


Unnamed: 0,database_identifier,chemical_formula,smiles,inchi,metabolite_identification,mass_to_charge,fragmentation,modifications,charge,retention_time,...,20P02820140_BEHC18_POS,20P02820141_BEHC18_POS,20P02820209_BEHC18_POS,20P02820208_BEHC18_POS,20P02820075_BEHC18_POS,20P02820073_BEHC18_POS,20P02820072_BEHC18_POS,20P02820079_BEHC18_POS,20P02820078_BEHC18_POS,KEGG
0,CHEBI:16610,C7H19N3,NCCCCNCCCN,"InChI=1S/C7H19N3/c8-4-1-2-6-10-7-3-5-9/h10H,1-9H2",Spermidine,,,,,36.78,...,2562570.0,2398701.0,4761392.0,1558479.0,1824217.0,987750.2,1373717.0,2332980.0,1791080.0,C00315
1,CHEBI:18019,C6H14N2O2,NCCCC[C@H](N)C(O)=O,"InChI=1S/C6H14N2O2/c7-4-2-1-3-5(8)6(9)10/h5H,1...",L-lysine,,,,,36.78,...,269723200.0,263210500.0,191198200.0,169237900.0,247593400.0,211914800.0,175770300.0,166759800.0,283303400.0,C00047
2,CHEBI:17964,C6H11NO2,OC(=O)C1CCCCN1,"InChI=1S/C6H11NO2/c8-6(9)5-3-1-2-4-7-5/h5,7H,1...",Pipecolate,,,,,36.84,...,262874000.0,257367300.0,186656800.0,165223200.0,241940200.0,206904200.0,171339200.0,163171500.0,277729400.0,C00408
3,CHEBI:16176,C5H12N2O2,NCCC[C@@H](N)C(O)=O,"InChI=1S/C5H12N2O2/c6-3-1-2-4(7)5(8)9/h4H,1-3,...",D-ornithine,,,,,36.84,...,66081780.0,68861630.0,83346550.0,118974700.0,34957160.0,35879610.0,47774180.0,29969900.0,52020930.0,C00515
4,CHEBI:17311,C9H21N2O2+,OC(=O)[C@@H](N)CCCC[N+](C)(C)C,"InChI=1S/C9H20N2O2/c1-11(2,3)7-5-4-6-8(10)9(12...","Nepsilon,nepsilon,nepsilon-trimethyllysine",,,,,40.92,...,19152680.0,14170200.0,10175680.0,4711219.0,57363160.0,21301670.0,23808810.0,13248860.0,12168430.0,C03793


In [24]:
# Preview the first rows of ui_cpd (one row per feature / compound ID pair).
ui_cpd.head()

Unnamed: 0,input_row,EID,str_row_ion,compound_names,input_row.1,m/z,retention_time,p_value,statistic,CompoundID_from_user,compound
0,row1,E1,row1_M+H[1+];row1754_M+HCOONa[1+];row18_M(C13)...,1-pyrroline,row1,70.065243,44.397226,0.149661,-2.008318,M70.065243T44.3972,CE1944
1,row13,E2,row13_M+2H[2+];row1791_M+H[1+];row1792_M+H[1+],Methylarsonate; Methylarsonic acid,row13,70.980029,34.213457,0.868648,0.308109,M70.980029T34.2135_1,C07294
2,row18,E1,row1_M+H[1+];row1754_M+HCOONa[1+];row18_M(C13)...,1-pyrroline,row18,71.068645,44.667389,0.246868,-1.674244,M71.068645T44.6674,CE1944
3,row29,E6,row29_M+H[1+],Acrylamide; 2-Propenamide,row29,72.044165,582.519073,0.005951,3.525478,M72.044165T582.5191,C01659
4,row56,E12,row1922_M+H[1+];row56_M+2H[2+],Octanoic acid; Caprylic acid; Octylic acid,row56,73.06487,474.584274,0.062903,2.494126,M73.06487T474.5843,C06423


In [25]:
def check_cpd_synonyms(compound_name, index):
    '''Check that the synonyms of one compound do not appear as separate ground-truth entries.

    `compound_name` is a ";"-separated list of synonyms for a single compound. Each
    synonym is compared (case-insensitively, whole-string) with
    gt['metabolite_identification']. This is not an exhaustive check of gt: only the
    synonyms passed in are tested.

    Parameters
    ----------
    compound_name : str
        Synonyms of one compound, separated by ";" (surrounding whitespace is ignored).
    index : hashable
        Row label of the compound; only used in the printed warning.

    Returns
    -------
    bool
        False if two synonyms match different rows of the global `gt` frame,
        True otherwise (including when there is at most one synonym).
    '''
    # Strip each synonym: the names are written as "A; B", and the leading space
    # would otherwise prevent every synonym after the first from ever matching.
    synonyms = [part.strip() for part in compound_name.split(";")]
    synonyms = [name for name in synonyms if name]
    if len(synonyms) <= 1:
        return True

    gt_names = gt['metabolite_identification']
    first_hit_rows = None
    for synonym in synonyms:
        hit_mask = gt_names.str.fullmatch(re.escape(synonym), case=False, na=False)
        hit_rows = set(gt_names.index[hit_mask.to_numpy()])
        if not hit_rows:
            continue
        if first_hit_rows is None:
            first_hit_rows = hit_rows
        elif not hit_rows <= first_hit_rows:
            # A later synonym points at a gt row the first matching synonym did not hit.
            print("oh no the same compound with a different name exists in more than one ground truth entry", index, compound_name)
            return False

    return True

def check_duplicate_features(matches_df):
    """Warn when one user feature ID appears in more than one match row.

    Parameters
    ----------
    matches_df : pandas.DataFrame
        Match table with a "compoundID_from_user" column.

    Returns
    -------
    pandas.DataFrame
        `matches_df` itself, unchanged. It is returned in every case so callers that
        assign the result never receive None. Duplicates are reported, not removed.
    """
    # An empty match table (no rows, possibly no columns) has nothing to check.
    if matches_df.empty or "compoundID_from_user" not in matches_df.columns:
        return matches_df

    if matches_df["compoundID_from_user"].is_unique:
        return matches_df  # No duplicates

    print("The same compound id feature is pointing to different compounds. Fix it")
    # TODO: resolve the duplicates instead of only reporting them. The drafted approach:
    # per compoundID_from_user group, keep the row whose annotation name equals the
    # ground-truth name (smallest rt_difference if several do), else the row with the
    # smallest rt_difference.
    return matches_df

In [26]:
def run_rt_name_matches():
    """Match mummichog annotations to ground-truth metabolites by name.

    For every row of the global `ui_cpd` frame, each annotated name is compared with
    gt['metabolite_identification']: first as a case-insensitive whole-string match,
    then, if no name matched, as a case-insensitive substring match. When several
    ground-truth rows match, the one with the smallest retention-time difference is kept.

    Reads the globals `ui_cpd`, `gt`, `run_name` and `cut_off_p`; prints a summary and
    makes sure ../runs/{run_name}/analysis exists. Returns None.
    """
    # Older versions of ui_cpd carried a 'compound_name' column; the mummichog table
    # itself provides 'compound_names'. Use whichever is present.
    name_col = 'compound_name' if 'compound_name' in ui_cpd.columns else 'compound_names'
    gt_names = gt['metabolite_identification']
    match_columns = [
        "compound_name", "gt_compound_name", "ui_retention_time", "gt_retention_time",
        "rt_difference", "p_value", "compoundID_from_user", "gt_index", "ui_index",
    ]
    matches = []

    for index, row in ui_cpd.iterrows():
        compound_name = row[name_col]
        if pd.isna(compound_name):
            continue
        ui_retention_time = row['retention_time']

        # check_cpd_synonyms(compound_name, index)
        # Names are separated by ";" (synonyms) and "$" (different compounds); strip
        # each one and escape it so it is matched literally, not as a regex.
        patterns = [
            re.escape(name.strip())
            for name in re.split(r"[;$]", str(compound_name))
            if name.strip()
        ]

        match = gt.iloc[0:0]
        for pattern in patterns:
            match = gt[gt_names.str.fullmatch(pattern, case=False, na=False)]
            if not match.empty:
                break

        # Include partial matches
        if match.empty:
            for pattern in patterns:
                match = gt[gt_names.str.contains(pattern, case=False, na=False)]
                if not match.empty:
                    break

        if match.empty:
            continue

        # Pick the ground-truth row closest in retention time and report that same row
        # in every gt_* field (including gt_index).
        rt_diffs = (match['retention_time'] - ui_retention_time).abs().to_numpy(dtype=float)
        best_pos = 0 if np.isnan(rt_diffs).all() else int(np.nanargmin(rt_diffs))
        best_row = match.iloc[best_pos]

        matches.append({
            "compound_name": compound_name,
            "gt_compound_name": best_row['metabolite_identification'],
            "ui_retention_time": ui_retention_time,
            "gt_retention_time": best_row['retention_time'],
            "rt_difference": rt_diffs[best_pos],
            "p_value": row['p_value'],
            "compoundID_from_user": row['CompoundID_from_user'],
            "gt_index": match.index[best_pos],
            "ui_index": index,
        })

    # Passing the columns keeps "p_value" etc. present even when nothing matched.
    matches_df = pd.DataFrame(matches, columns=match_columns)
    checked_df = check_duplicate_features(matches_df)
    if checked_df is not None:  # tolerate a checker that only reports and returns nothing
        matches_df = checked_df
    sig_df = matches_df[matches_df["p_value"] < cut_off_p]

    print(f"\n{len(matches_df)} matches found of which {len(sig_df)} are significant features")
    # print(sig_df.head())

    os.makedirs(f'../runs/{run_name}/analysis', exist_ok=True)

    # matches_df.to_csv(f'../runs/{run_name}/analysis/matches_{end_name}.csv')
    # sig_df.to_csv(f'../runs/{run_name}/analysis/sig_matches_{end_name}.csv')


In [27]:
# KEGG matches
def run_KEGG_matches():
    """Match mummichog annotations to ground-truth metabolites by KEGG ID.

    For every row of the global `ui_cpd` frame, the ID in its 'compound' column is
    compared (case-insensitively, ignoring surrounding whitespace) with gt['KEGG'].
    One match row is recorded per ui_cpd row that has a hit.

    Reads the globals `ui_cpd`, `gt`, `run_name`, `end_name` and `cut_off_p`; prints a
    summary and writes two CSV files (all matches, and matches with p_value below
    `cut_off_p`) to ../runs/{run_name}/analysis/. Returns None.
    """
    match_columns = [
        "compound_names", "gt_compound_name", "ui_retention_time", "gt_retention_time",
        "rt_difference", "p_value", "compoundID_from_user", "gt_KEGG", "ui_KEGG",
        "gt_index", "ui_index", "eid",
    ]
    # Normalise the ground-truth IDs once instead of running a regex over gt per row.
    gt_kegg_norm = gt['KEGG'].str.strip().str.upper()
    matches = []

    for index, row in ui_cpd.iterrows():
        KEGG_id = row['compound']
        if pd.isna(KEGG_id):
            continue

        ui_kegg = str(KEGG_id).strip()
        # Plain equality: the ID is compared literally, never interpreted as a regex.
        match = gt[gt_kegg_norm == ui_kegg.upper()]
        if match.empty:
            continue

        # NOTE(review): when several gt rows share this KEGG ID, the first one is used
        # (not the closest in retention time). Confirm this is intended.
        gt_row = match.iloc[0]
        matches.append({
            "compound_names": row['compound_names'],
            "gt_compound_name": gt_row['metabolite_identification'],
            "ui_retention_time": row['retention_time'],
            "gt_retention_time": gt_row['retention_time'],
            "rt_difference": abs(row['retention_time'] - gt_row['retention_time']),
            "p_value": row['p_value'],
            "compoundID_from_user": row['CompoundID_from_user'],
            "gt_KEGG": gt_row['KEGG'],
            "ui_KEGG": ui_kegg,
            "gt_index": match.index[0],
            "ui_index": index,
            "eid": row["EID"]
        })

    # Passing the columns keeps "p_value" etc. present even when nothing matched.
    matches_df = pd.DataFrame(matches, columns=match_columns)
    # Duplicated feature IDs are only reported here; all rows are kept in the output.
    check_duplicate_features(matches_df)
    sig_df = matches_df[matches_df["p_value"] < cut_off_p]

    print(f"\n{len(matches_df)} matches found of which {len(sig_df)} are significant features")
    # print(sig_df.head())

    os.makedirs(f'../runs/{run_name}/analysis', exist_ok=True)

    cut_off_tag = str(cut_off_p).replace(".", "_")
    matches_df.to_csv(f'../runs/{run_name}/analysis/matches_{end_name}_KEGG_v2_dups_{cut_off_tag}.csv')
    sig_df.to_csv(f'../runs/{run_name}/analysis/sig_matches_{end_name}_KEGG_v2_dups_{cut_off_tag}.csv')


In [28]:
# Run the KEGG-ID matching: prints the match summary and writes the matches and
# significant-matches CSV files under ../runs/{run_name}/analysis/.
run_KEGG_matches()

The same compound id feature is pointing to different compounds. Fix it

374 matches found of which 96 are significant features
