In [1]:
# Update species_annotation, so that result returns sorted, CHEBI candidates & match score

In [2]:
import editdistance
import libsbml
import numpy as np
import operator
import os
import pickle
import pandas as pd
import sys
import matplotlib.pyplot as plt
%matplotlib inline  

BIOMD_12 = 'BIOMD0000000012.xml'
BASE_DIR = '/Users/woosubs/Desktop/AutomateAnnotation/'
DATA_DIR = os.path.join(BASE_DIR, "DATA")
ALGO_DIR = os.path.join(DATA_DIR, "algo")
CHEBI_DIR = os.path.join(DATA_DIR, "chebi")
RHEA_DIR = os.path.join(DATA_DIR, "rhea")
BIOMODEL_DIR = os.path.join(DATA_DIR, "biomodels/curated_biomodels_31mar2021")
BIGG_DIR = '/Users/woosubs/Desktop/AutomateAnnotation/DATA/bigg'
ecoli_fpath = os.path.join(BIGG_DIR, "e_coli_core.xml")

PROJ_DIR = os.path.join(os.getcwd(), os.pardir)
AMAS_DIR = os.path.join(PROJ_DIR, "AMAS")
sys.path.append(PROJ_DIR)

from AMAS import species_annotation as sa
from AMAS import reaction_annotation as ra
from AMAS import constants as cn
from AMAS import iterator as it
from AMAS import tools

In [3]:
prev_notebook_dir = '/Users/woosubs/Desktop/AutomateAnnotation/AnnotationRecommender/annotation_recommender/notebook'
with open(os.path.join(prev_notebook_dir, 'chebi_models.pickle'), 'rb') as handle:
  chebi_models = pickle.load(handle)

with open(os.path.join(cn.REF_DIR, 'chebi_shortened_formula_30apr2022.pickle'), 'rb') as f:
  ref_shortened_chebi_to_formula = pickle.load(f)
with open(os.path.join(cn.REF_DIR, 'chebi_low_synonyms.pickle'), 'rb') as f:
  chebi_low_synonyms = pickle.load(f)

one_biomd = 'BIOMD0000000634.xml'
one_biomd_fpath = os.path.join(BIOMODEL_DIR, one_biomd)
species_an = sa.SpeciesAnnotation(libsbml_fpath=one_biomd_fpath)
model_itm = chebi_models[one_biomd]
pred_species = species_an.predictAnnotationByName(inp_spec_list=list(model_itm.keys()))

reader = libsbml.SBMLReader()
document = reader.readSBML(one_biomd_fpath)
model = document.getModel()

In [4]:
pred_species

{'ATP': {'chebi': ['CHEBI:30616', 'CHEBI:15422'],
  'match_score': [('CHEBI:30616', 0.51), ('CHEBI:15422', 0.34)],
  'formula': ['C10N5O13P3']},
 'ADP': {'chebi': ['CHEBI:73342', 'CHEBI:456216', 'CHEBI:16761'],
  'match_score': [('CHEBI:73342', 0.39),
   ('CHEBI:456216', 0.37),
   ('CHEBI:16761', 0.34)],
  'formula': ['C10N5O10P2', 'C12N3O6']},
 'AMP': {'chebi': ['CHEBI:456215', 'CHEBI:28971', 'CHEBI:16027'],
  'match_score': [('CHEBI:456215', 0.29),
   ('CHEBI:28971', 0.23),
   ('CHEBI:16027', 0.15)],
  'formula': ['C10N5O7P', 'C16N3O4S']}}

In [5]:
pred_species['ADP']['chebi']

['CHEBI:73342', 'CHEBI:456216', 'CHEBI:16761']

In [7]:
url_default = 'https://www.ebi.ac.uk/chebi/searchId.do?chebiId=CHEBI%3A'
for one_k in pred_species.keys():
  urls = [url_default+val for val in pred_species[one_k]['chebi']]
  print(urls)

['https://www.ebi.ac.uk/chebi/searchId.do?chebiId=CHEBI%3ACHEBI:30616', 'https://www.ebi.ac.uk/chebi/searchId.do?chebiId=CHEBI%3ACHEBI:15422']
['https://www.ebi.ac.uk/chebi/searchId.do?chebiId=CHEBI%3ACHEBI:73342', 'https://www.ebi.ac.uk/chebi/searchId.do?chebiId=CHEBI%3ACHEBI:456216', 'https://www.ebi.ac.uk/chebi/searchId.do?chebiId=CHEBI%3ACHEBI:16761']
['https://www.ebi.ac.uk/chebi/searchId.do?chebiId=CHEBI%3ACHEBI:456215', 'https://www.ebi.ac.uk/chebi/searchId.do?chebiId=CHEBI%3ACHEBI:28971', 'https://www.ebi.ac.uk/chebi/searchId.do?chebiId=CHEBI%3ACHEBI:16027']


In [70]:
inp_str = 'glucose'
# For now, choose the terms that are included in the CHEBI-formula mapping reference
dist_dict_min = {one_k:np.min([editdistance.eval(inp_str.lower(), val) for val in chebi_low_synonyms[one_k]]) \
                 for one_k in chebi_low_synonyms.keys() if one_k in ref_shortened_chebi_to_formula.keys()}
min_min_dist = np.min([dist_dict_min[val] for val in dist_dict_min.keys()])
min_min_chebis = [one_k for one_k in dist_dict_min.keys() \
                  if dist_dict_min[one_k]==min_min_dist and one_k in ref_shortened_chebi_to_formula.keys()]
# Results are sorted based on match_score (average of 1 - (editdistance/len_synonyms)
res_tuple = [(one_chebi,
              np.round(np.mean([1.0-editdistance.eval(inp_str.lower(), val)/len(val) \
                                for val in chebi_low_synonyms[one_chebi]]), 2)) \
             for one_chebi in min_min_chebis] 
res_tuple.sort(key=operator.itemgetter(1), reverse=True)
one_result[cn.CHEBI] = res_tup
min_min_formula = list(set([ref_shortened_chebi_to_formula[val] for val in min_min_chebis]))
one_result[cn.FORMULA] = min_min_formula


# one_match_score = 1 - min_min_dist/len(inp_str)
# one_result[cn.MATCH_SCORE] = one_match_score

# # predicted formula of the species
# one_result[cn.CHEBI] = min_min_chebis
# min_min_formula = list(set([ref_shortened_chebi_to_formula[val] for val in min_min_chebis]))
# one_result[cn.FORMULA] = min_min_formula

In [47]:
min_min_chebis

['CHEBI:4167', 'CHEBI:42758', 'CHEBI:17234']

In [73]:
for one_chebi in min_min_chebis:
  print(chebi_low_synonyms[one_chebi])
  print(np.mean([1.0-editdistance.eval(inp_str.lower(), val)/len(val) for val in chebi_low_synonyms[one_chebi]]))
  print()

['glc-oh', 'd-glucose', 'd-glcp', 'grape sugar', 'dextrose', 'd-glc', 'd-glucopyranose', 'wurcs=2.0/1,1,0/[a2122h-1x_1-5]/1/', 'glucose']
0.32933419158909355

['aldehydo-d-gluco-hexose', 'd-glucose', 'd(+)-glucose', 'aldehydo-d-glucose', 'dextrose', 'd-glucose in linear form', '(2r,3s,4r,5r)-2,3,4,5,6-pentahydroxyhexanal', 'wurcs=2.0/1,1,0/[o2122h]/1/', 'glucose']
0.4268444660816304

['glukose', 'glc', 'gluco-hexose', 'dl-glucose', 'glucose']
0.5614285714285714



In [63]:
res_tup = [(one_chebi,
            np.round(np.mean([1.0-editdistance.eval(inp_str.lower(), val)/len(val) for val in chebi_low_synonyms[one_chebi]]), 2)) \
          for one_chebi in min_min_chebis] 

In [64]:
res_tup

[('CHEBI:4167', 0.33), ('CHEBI:42758', 0.43), ('CHEBI:17234', 0.56)]

In [65]:
res_tup.sort(key=operator.itemgetter(1), reverse=True)

In [66]:
res_tup

[('CHEBI:17234', 0.56), ('CHEBI:42758', 0.43), ('CHEBI:4167', 0.33)]