In [1]:
# Update the iterator algorithm and make recommendations.
import collections
import copy
import itertools
import libsbml
import numpy as np
import os
import pickle
import pandas as pd
import sys
import matplotlib.pyplot as plt
%matplotlib inline  

BIOMD_12 = 'BIOMD0000000012.xml'
# NOTE: machine-specific absolute path. Every data directory below is derived
# from it, so this is the only data path to edit when running elsewhere.
BASE_DIR = '/Users/woosubshin/Desktop/AutomateAnnotation/'
DATA_DIR = os.path.join(BASE_DIR, "DATA")
ALGO_DIR = os.path.join(DATA_DIR, "algo")
CHEBI_DIR = os.path.join(DATA_DIR, "chebi")
RHEA_DIR = os.path.join(DATA_DIR, "rhea")
BIOMODEL_DIR = os.path.join(DATA_DIR, "biomodels/curated_biomodels_31mar2021")
# Derived from DATA_DIR like its siblings instead of repeating the absolute
# path (resolves to the same string as the former literal on POSIX).
BIGG_DIR = os.path.join(DATA_DIR, "bigg")
ecoli_fpath = os.path.join(BIGG_DIR, "e_coli_core.xml")

# The project root is taken to be the parent of the current working directory.
PROJ_DIR = os.path.join(os.getcwd(), os.pardir)
AMAS_DIR = os.path.join(PROJ_DIR, "AMAS")
# Put the project root on sys.path so the `from AMAS import ...` lines resolve.
sys.path.append(PROJ_DIR)

SUPPL_DIR = os.path.join(PROJ_DIR, os.pardir, "AMAS_suppl")
ACCURACY_DIR = os.path.join(SUPPL_DIR, "data_for_credibility")

from AMAS import species_annotation as sa
from AMAS import reaction_annotation as ra
from AMAS import recommender
from AMAS import constants as cn
from AMAS import iterator as it
from AMAS import tools

# TODO: save and load the predicted species and reaction annotations.

In [2]:
# Build a Recommender for one model file from the curated BioModels directory.
one_fpath = os.path.join(BIOMODEL_DIR, 'BIOMD0000000190.xml')
# The same file is also parsed directly with libsbml so the model object is
# available alongside the recommender.
reader = libsbml.SBMLReader()
document = reader.readSBML(one_fpath)
model = document.getModel()
recom = recommender.Recommender(libsbml_fpath=one_fpath)

In [None]:
# two options: 
# 1. MSSC best with cutoff=0.0 (default option)
# 2. MSSC above (cutoff=0.0?)

# In short, the species annotator should return a ranked list of candidates with their scores.

In [4]:
# Query the species annotator with a plain name. The displayed result is a
# dict holding the name used, the candidate ChEBI IDs, their match scores,
# and the associated formulas.
spec_res = recom.species.predictAnnotationByEditDistance('hydrogen')
spec_res

{'name_used': 'hydrogen',
 'chebi': ['CHEBI:18276', 'CHEBI:49637'],
 'match_score': [('CHEBI:18276', 1.0), ('CHEBI:49637', 1.0)],
 'formula': ['H']}

In [47]:
import editdistance
import operator
from AMAS.species_annotation import CHEBI_LOW_SYNONYMS

In [13]:
s = 'a'

# Ad-hoc check of a score formula: 1 - (edit distance / length of the second
# string), with the ratio capped at 1.0 so the score cannot go negative.
1.0 - min(1.0, editdistance.eval('a', 'ab')/len('ab'))

0.5

In [53]:
# Requires the cell that defines getOneEScore (further down) to have been run.
getOneEScore('a', 'ab')

0.5

In [50]:
def getOneEScore(one_s, two_s):
  """
  Compute the eScore of a pair of strings
  using the formula below:
  1.0 - (editdistance(one_s, two_s) / max(len(one_s), len(two_s)))

  The edit distance never exceeds the length
  of the longer string, so the value lies
  between 0.0 and 1.0. Identical strings
  (including two empty strings) score 1.0.

  Parameters
  ----------
  one_s: str
  two_s: str

  Returns
  -------
  : float (0.0-1.0)
  """
  # Identical strings are a perfect match. Returning early also covers the
  # case of two empty strings, where the denominator below would be zero.
  if one_s == two_s:
    return 1.0
  longest = max(len(one_s), len(two_s))
  return 1.0 - editdistance.eval(one_s, two_s) / longest

def getEScores(inp_str):
  """
  Compute the eScores of a query string
  against every ChEBI term that is present
  in the ChEBI-to-formula reference.
  For each term, the eScore is the best
  (maximum) score over its synonyms.
  Terms with no synonyms are skipped.

  Parameters
  ----------
  inp_str: str
      Query name; compared in lower case.

  Returns
  -------
  :list-tuple
      (CHEBI:XXXXX, eScore) tuples sorted
      by eScore, highest first.
  """
  # Lower-case the query once instead of once per synonym comparison.
  query = inp_str.lower()
  escores = []
  for one_k in CHEBI_LOW_SYNONYMS.keys():
    if one_k not in cn.REF_CHEBI2FORMULA.keys():
      continue
    synonyms = CHEBI_LOW_SYNONYMS[one_k]
    # np.max raises ValueError on an empty sequence; a term without
    # synonyms has nothing to compare against, so leave it out.
    if len(synonyms) == 0:
      continue
    best = np.max([getOneEScore(query, val) for val in synonyms])
    escores.append((one_k, best))
  # list.sort is stable, so ties keep the dictionary's iteration order.
  escores.sort(key=operator.itemgetter(1), reverse=True)
  return escores

In [48]:
inp_str = 'hydrogen'
# Ranked list of (ChEBI ID, eScore) tuples, best match first. Call the
# getEScores helper rather than keeping a second copy of its comprehension.
escores = getEScores(inp_str)

In [49]:
# Show the (ChEBI ID, eScore) tuples, sorted with the highest score first.
escores

[('CHEBI:18276', 1.0),
 ('CHEBI:49637', 1.0),
 ('CHEBI:29236', 0.8),
 ('CHEBI:29237', 0.8),
 ('CHEBI:29238', 0.8),
 ('CHEBI:15378', 0.75),
 ('CHEBI:29235', 0.7272727272727273),
 ('CHEBI:39234', 0.7),
 ('CHEBI:4876', 0.7),
 ('CHEBI:27510', 0.6666666666666667),
 ('CHEBI:5818', 0.6666666666666667),
 ('CHEBI:5779', 0.6666666666666667),
 ('CHEBI:18099', 0.6363636363636364),
 ('CHEBI:142541', 0.6363636363636364),
 ('CHEBI:180544', 0.6363636363636364),
 ('CHEBI:5776', 0.6363636363636364),
 ('CHEBI:15571', 0.625),
 ('CHEBI:17997', 0.625),
 ('CHEBI:24556', 0.625),
 ('CHEBI:43176', 0.625),
 ('CHEBI:29191', 0.625),
 ('CHEBI:25555', 0.625),
 ('CHEBI:29239', 0.625),
 ('CHEBI:29306', 0.625),
 ('CHEBI:29308', 0.625),
 ('CHEBI:44423', 0.625),
 ('CHEBI:5124', 0.625),
 ('CHEBI:53767', 0.625),
 ('CHEBI:83101', 0.625),
 ('CHEBI:150012', 0.6153846153846154),
 ('CHEBI:155891', 0.6153846153846154),
 ('CHEBI:43451', 0.6153846153846154),
 ('CHEBI:175350', 0.6153846153846154),
 ('CHEBI:29305', 0.615384615384615

In [30]:
# Raw (unnormalized) edit distance between the query and the synonym 'x-trp'.
[editdistance.eval(inp_str.lower(), val) for val in ['x-trp']]

[7]

In [22]:
# Synonyms stored for one ChEBI ID (presumably lower-cased, going by the
# mapping's name; confirm in AMAS.species_annotation).
CHEBI_LOW_SYNONYMS['CHEBI:10047']

['x-trp']

In [44]:
# Sanity check: every eScore must lie in [0.0, 1.0].
elist_err = [val for val in escores \
             if val[1]<0 or val[1]>1.0]
# The out-of-range list was previously built but never inspected; fail loudly
# (showing a few offenders) if any score falls outside the expected range.
assert not elist_err, "eScores outside [0.0, 1.0]: %s" % elist_err[:5]
elist = [k[1] for k in escores]
print(np.min(elist))
print(np.max(elist))
print(len(elist))

0.0
1.0
148331


In [None]:
# Scratch version of the edit-distance species prediction, run at cell level:
# keep the ChEBI terms whose closest synonym has the smallest edit distance to
# the query, then rank those terms by match score.
one_result = dict()
# Lower-case the query once instead of once per synonym comparison.
inp_low = inp_str.lower()
# For now, choose the terms that are included in the CHEBI-formula mapping reference
dist_dict_min = {one_k:np.min([editdistance.eval(inp_low, val) for val in CHEBI_LOW_SYNONYMS[one_k]]) \
                 for one_k in CHEBI_LOW_SYNONYMS.keys() if one_k in cn.REF_CHEBI2FORMULA.keys()}
min_min_dist = np.min(list(dist_dict_min.values()))
# dist_dict_min is already restricted to terms in the reference, so no
# second membership test is needed here.
min_min_chebis = [one_k for one_k in dist_dict_min.keys() \
                  if dist_dict_min[one_k]==min_min_dist]
# Results are sorted by match_score: the maximum over a term's synonyms of
# 1 - (editdistance / len(synonym)), rounded to cn.ROUND_DIGITS.
res_tuple = [(one_chebi,
              np.round(np.max([1.0-editdistance.eval(inp_low, val)/len(val) \
                                for val in CHEBI_LOW_SYNONYMS[one_chebi]]), cn.ROUND_DIGITS)) \
             for one_chebi in min_min_chebis] 
res_tuple.sort(key=operator.itemgetter(1), reverse=True)
#  CHEBI part is added, because we want a sorted list after computing res_tuple
one_result[cn.NAME_USED] = inp_str
one_result[cn.CHEBI] = [val[0] for val in res_tuple]
one_result[cn.MATCH_SCORE] = res_tuple
min_min_formula = list(set([cn.REF_CHEBI2FORMULA[val] for val in min_min_chebis]))
one_result[cn.FORMULA] = min_min_formula
# A bare `return` is a SyntaxError outside a function; end the cell with the
# value so the notebook displays it instead.
one_result