In [1]:
# Update iterator algorithm
import collections
import itertools
import libsbml
import numpy as np
import os
import pickle
import pandas as pd
import sys
import matplotlib.pyplot as plt
%matplotlib inline  

# File name of the BioModels entry BIOMD0000000012.
BIOMD_12 = 'BIOMD0000000012.xml'
# Root of the local data checkout; every data directory below derives from it.
BASE_DIR = '/Users/woosubs/Desktop/AutomateAnnotation/'
DATA_DIR = os.path.join(BASE_DIR, "DATA")
ALGO_DIR = os.path.join(DATA_DIR, "algo")
CHEBI_DIR = os.path.join(DATA_DIR, "chebi")
RHEA_DIR = os.path.join(DATA_DIR, "rhea")
BIOMODEL_DIR = os.path.join(DATA_DIR, "biomodels/curated_biomodels_31mar2021")
# Derived from DATA_DIR (instead of repeating the absolute path) so that
# changing BASE_DIR relocates every data directory consistently.
BIGG_DIR = os.path.join(DATA_DIR, "bigg")
ecoli_fpath = os.path.join(BIGG_DIR, "e_coli_core.xml")

# The project root is the parent of the current working directory;
# it is put on sys.path so that the AMAS package can be imported.
PROJ_DIR = os.path.join(os.getcwd(), os.pardir)
AMAS_DIR = os.path.join(PROJ_DIR, "AMAS")
# Guard keeps re-running this cell from appending duplicate entries.
if PROJ_DIR not in sys.path:
  sys.path.append(PROJ_DIR)

from AMAS import species_annotation as sa
from AMAS import reaction_annotation as ra
from AMAS import recommender
from AMAS import constants as cn
from AMAS import iterator as it
from AMAS import tools

In [2]:
# Read the E. coli core SBML file with libsbml and keep a handle to its model.
reader = libsbml.SBMLReader()
document = reader.readSBML(ecoli_fpath)
model = document.getModel()
# Build the AMAS recommender from the same SBML file.
recom = recommender.Recommender(libsbml_fpath=ecoli_fpath)

In [3]:
# Pick one reaction to work on and list its component species;
# both ADP (M_adp_c) and ATP (M_atp_c) should be among them.
one_reaction = 'R_PFK'
print(recom.reactions.reaction_components[one_reaction])

['M_h_c', 'M_atp_c', 'M_f6p_c', 'M_adp_c', 'M_fdp_c']


In [34]:
# Current recommendation for the chosen reaction
# (the printed object carries id, credibility, candidates and urls).
reac_recom = recom.getReactionAnnotation(pred_id = one_reaction)
print(reac_recom)

Recommendation(id='R_PFK', credibility=0.91, candidates=[('RHEA:12423', 0.8), ('RHEA:13380', 0.8), ('RHEA:14216', 0.8), ('RHEA:15656', 0.8), ('RHEA:16112', 0.8), ('RHEA:20108', 0.8)], urls=['https://www.rhea-db.org/rhea/12423', 'https://www.rhea-db.org/rhea/13380', 'https://www.rhea-db.org/rhea/14216', 'https://www.rhea-db.org/rhea/15656', 'https://www.rhea-db.org/rhea/16112', 'https://www.rhea-db.org/rhea/20108'])


In [35]:
# Candidate list of the reaction as (Rhea ID, score) tuples;
# the Rhea ID of the first candidate is used as the working example.
one_rhea_tup = recom.reactions.candidates[one_reaction]
one_rhea = one_rhea_tup[0][0]
print(one_rhea_tup)

[('RHEA:12423', 0.8), ('RHEA:13380', 0.8), ('RHEA:14216', 0.8), ('RHEA:15656', 0.8), ('RHEA:16112', 0.8), ('RHEA:20108', 0.8)]


In [6]:
def getDictOfRheaComponentFormula(inp_rhea):
  """
  Get a dictionary {chebi_id: formula}
  from a given rhea term.
  Rhea term -> CheBI IDs -> Formulas
  ChEBI IDs that have no entry in
  cn.REF_CHEBI2FORMULA are left out.
  
  Parameters
  ----------
  inp_rhea: str
      Rhea term to look up
      in cn.REF_RHEA2CHEBI
  
  Returns
  -------
  : dict
      {chebi_id: formula}
  """
  # Look up the requested term itself; the previous version read the
  # notebook-level variable `one_rhea` and ignored the argument.
  chebis = cn.REF_RHEA2CHEBI[inp_rhea]
  return {val: cn.REF_CHEBI2FORMULA[val] for val in chebis
          if val in cn.REF_CHEBI2FORMULA}

In [17]:
# match_dict = {one_k:[spec_id for spec_id in pred_spec_formulas.keys() \
#                      if one_rhea2formula[one_k] in pred_spec_formulas[spec_id]
#                     ] \
#               for one_k in one_rhea2formula.keys()}
# print(match_dict)
# unmatched_species = [val for val in pred_spec_formulas.keys() \
#                     if val not in list(itertools.chain(*match_dict.values()))]
# unmatched_chebi = [val for val in match_dict.keys() if not match_dict[val]]
# if len(unmatched_species) == 1 and len(unmatched_chebi) == 1:
#   print({unmatched_species[0]: unmatched_chebi})

{'CHEBI:456216': ['M_adp_c'], 'CHEBI:15378': ['M_h_c'], 'CHEBI:58694': ['M_fdp_c'], 'CHEBI:30616': [], 'CHEBI:58695': ['M_f6p_c']}
{'M_atp_c': ['CHEBI:30616']}


In [7]:
def getDictMatchByItem(chebi2ref_formula, spec2pred_formula):
  """
  Match ChEBI terms to species by formula and
  return the result as {species_id: [chebi_term]}.
  A ChEBI term matches a species if its reference
  formula is among the predicted formulas
  of that species.

  Three outcomes are possible:
  1. Exactly one species and exactly one ChEBI term
     are left unmatched: they are paired with each
     other (i.e., a new annotation is inferred).
  2. Otherwise, if every ChEBI term matches exactly
     one species, return the full match
     (i.e., improve precision).
  3. If neither, return None
     (i.e., nothing to update).

  Parameters
  ----------
  chebi2ref_formula: dict
      {chebi_term: a_species_formula(string)}
  spec2pred_formula: dict
      {species_id: [predicted_formulas]}

  Returns
  -------
  dict/None
      {species_id: [chebi_term]}
  """
  # For every ChEBI term, the species whose predicted formulas contain it.
  match_dict = {chebi: [spec_id for spec_id, formulas in spec2pred_formula.items()
                        if ref_formula in formulas]
                for chebi, ref_formula in chebi2ref_formula.items()}
  # Built once (as a set) instead of re-flattening the lists for every species.
  matched_species = set(itertools.chain.from_iterable(match_dict.values()))
  unmatched_species = [spec_id for spec_id in spec2pred_formula
                       if spec_id not in matched_species]
  unmatched_chebi = [chebi for chebi, specs in match_dict.items() if not specs]
  if len(unmatched_species) == 1 and len(unmatched_chebi) == 1:
    return {unmatched_species[0]: unmatched_chebi}
  if all(len(specs) == 1 for specs in match_dict.values()):
    # match_dict is keyed by ChEBI term; invert it so that this branch
    # returns the documented {species_id: [chebi_term]} orientation too.
    spec2chebi = {}
    for chebi, specs in match_dict.items():
      spec2chebi.setdefault(specs[0], []).append(chebi)
    return spec2chebi
  return None

In [37]:
# The actual process: 1. Getting information to update
pred_spec_formulas = recom.species.formula
one_rhea2formula = getDictOfRheaComponentFormula(one_rhea)

# Proposed update as {species_id: [chebi_term]}, or None if nothing to update.
match_res = getDictMatchByItem(chebi2ref_formula=one_rhea2formula,
                               spec2pred_formula=pred_spec_formulas)
print(match_res)
# Translate the proposed ChEBI terms into formulas. getDictMatchByItem returns
# None when there is nothing to update; treat that as an empty update instead
# of failing on `None.keys()`.
match_res_formula = {} if match_res is None else \
    {k: [cn.REF_CHEBI2FORMULA[chebi] for chebi in match_res[k]] for k in match_res.keys()}
print(match_res_formula)

{'M_atp_c': ['CHEBI:30616']}
{'M_atp_c': ['C10N5O13P3']}


In [None]:
# Process 2: test whether this update makes sense;
# i.e., check whether it increases the confidence or the match score of the reaction.


# Process 3: based on the result of Process 2, actually update the annotation.
# The match score should probably be updated in this step as well (to be decided).


In [58]:
# NOTE(review): this call omits the required `inp_recom` argument and raises
# the TypeError recorded in the output; pass the recommendation to apply.
recom.species.updateSpeciesWithRecommendation()

TypeError: updateSpeciesWithRecommendation() missing 1 required positional argument: 'inp_recom'

In [52]:
# TODO: before updating the species, test whether it increases reaction scores
# it may proceed if match score does not decrease (i.e., best match score is same or increasing)
import copy
def checkWhetherUpdateImprovesMatchScore(cur_spec_formulas, inp_spec2formula_dict):
  """
  Check whether applying a species-formula update
  keeps or improves the reaction match score.
  Relies on the notebook-level variables
  `recom` and `one_reaction`.

  The reaction annotation is predicted twice:
  once with the updated formulas and once with
  recom.species.formula. For each result, the
  match scores of the first candidate of every
  reaction are summed and the sums are compared.

  NOTE(review): the recorded run of this function
  raised `KeyError: 0`; presumably the value under
  cn.MATCH_SCORE was not positionally indexable
  at that time - confirm the structure returned
  by predictAnnotation.

  Parameters
  ----------
  cur_spec_formulas: dict
      {species_id: [formulas]};
      it is not modified by this function
  inp_spec2formula_dict: dict
      {species_id: [formulas]} entries
      that override cur_spec_formulas

  Returns
  -------
  bool
      True if the summed match score
      with the update is greater than or
      equal to the one without it
  """
  # Work on a copy so that the caller's dictionary is left untouched
  # (the previous version updated the argument in place).
  updated_spec_formulas = dict(cur_spec_formulas)
  updated_spec_formulas.update(inp_spec2formula_dict)

  new_pred_res = recom.reactions.predictAnnotation(inp_spec_dict = updated_spec_formulas,
                                                   inp_reac_list = [one_reaction])
  old_pred_res = recom.reactions.predictAnnotation(inp_spec_dict = recom.species.formula,
                                                   inp_reac_list = [one_reaction])
  # since candidates are already sorted,
  # just check the match score (index '1') of the very first element (index '0')
  # of each reaction in the result
  new_pred_val = np.sum([cands[0][1] for cands in new_pred_res[cn.MATCH_SCORE].values()])
  old_pred_val = np.sum([cands[0][1] for cands in old_pred_res[cn.MATCH_SCORE].values()])
  # bool() turns the numpy comparison result into a plain Python bool.
  return bool(new_pred_val >= old_pred_val)

In [59]:
# Sum of the first-candidate match scores over the reactions in the result.
# Relies on `new_pred_res`, which is assigned at module level in another cell.
np.sum([new_pred_res[cn.MATCH_SCORE][k][0][1] for k in new_pred_res[cn.MATCH_SCORE].keys()])

1.0

In [53]:
# Run the check on a deep copy so that the formulas held by the recommender
# (recom.species.formula) are left untouched.
checkWhetherUpdateImprovesMatchScore(cur_spec_formulas = copy.deepcopy(recom.species.formula),
                                     inp_spec2formula_dict = match_res_formula)

KeyError: 0

In [58]:


# Step-by-step, module-level version of checkWhetherUpdateImprovesMatchScore,
# so that the intermediate results can be inspected in later cells.
cur_spec_formulas = copy.deepcopy(recom.species.formula)
inp_spec2formula_dict = match_res_formula
# Overlay the proposed formulas on the copied species formulas.
cur_spec_formulas.update(inp_spec2formula_dict)

# Predict the reaction annotation with (new) and without (old) the update.
new_pred_res = recom.reactions.predictAnnotation(inp_spec_dict = cur_spec_formulas,
                                                 inp_reac_list = [one_reaction])
old_pred_res = recom.reactions.predictAnnotation(inp_spec_dict = recom.species.formula,
                                                 inp_reac_list = [one_reaction])
# Match score (index 1) of the first candidate (index 0) of the reaction.
new_pred_val = new_pred_res[cn.MATCH_SCORE][one_reaction][0][1]
old_pred_val = old_pred_res[cn.MATCH_SCORE][one_reaction][0][1]

# Decision rule used by the function version (not executable at module level):
# if new_pred_val >= old_pred_val:
#   return True
# else:
#   return False

In [56]:
# Inspect the 'MAT' entry of the prediction result.
# NOTE(review): other cells index this result with cn.MATCH_SCORE directly;
# confirm which structure predictAnnotation currently returns.
new_pred_res['MAT']

{'candidates': {'R_PFK': Index(['RHEA:12423', 'RHEA:13380', 'RHEA:14216', 'RHEA:15656', 'RHEA:16112'], dtype='object')},
 'match_score': {'R_PFK': [('RHEA:12423', 1.0),
   ('RHEA:13380', 1.0),
   ('RHEA:14216', 1.0),
   ('RHEA:15656', 1.0),
   ('RHEA:16112', 1.0)]},
 'query_df':              R_PFK
 C10N2O11P        0
 C45CoN4O14       0
 C17N4O10P        0
 C45O             0
 C20O2            0
 ...            ...
 C10N2O3          0
 C17N5O17P3R      0
 C32N5O8P         0
 C40N9O13         0
 C22N2O2          0
 
 [3790 rows x 1 columns]}

In [47]:
# First-candidate match score with the proposed update applied.
# Uses `new_pred_res`; the former name `new_reac_res` is not defined anywhere
# in this notebook and would raise NameError on a fresh run.
new_pred_res[cn.MATCH_SCORE][one_reaction][0][1]

1.0

In [48]:
# First-candidate match score without the proposed update.
# Uses `old_pred_res`; the former name `old_reac_res` is not defined anywhere
# in this notebook and would raise NameError on a fresh run.
old_pred_res[cn.MATCH_SCORE][one_reaction][0][1]

0.8

In [None]:
# The match score increased from 0.8 to 1.0 (the maximum possible match score),
# so the proposed update does not decrease the best match score.

In [39]:
# Formulas of ATP in the updated copy of the species formulas.
cur_spec_formulas['M_atp_c']

['C10N5O13P3']

In [40]:
# Formulas of ATP as currently held by the recommender, for comparison.
recom.species.formula['M_atp_c']

['C20O4', 'C18ClN2O6S2']

In [None]:
# Below is for species class
def updateSpeciesWithDict(inp_dict, inp_match_score=None):
  """
  Placeholder for updating species candidates
  from a dictionary.
  Not implemented yet: both arguments are
  ignored and None is returned.

  Parameters
  ----------
  inp_dict: dict
      {species_id: [chebi terms]}
  inp_match_score: optional
      Match score to use for the update.
      Still open how to choose it, since
      nothing is matched for these species.

  Returns
  -------
  None
  """
  # TODO: implement the update
  return None

In [45]:
# Flatten the proposed update into one list holding the formula of every
# ChEBI term, in the order the species and their terms appear.
inp_dict = match_res
[cn.REF_CHEBI2FORMULA[chebi] for chebis in inp_dict.values() for chebi in chebis]

['C10N5O13P3']

In [59]:
# Proposed update: species ID -> list of ChEBI terms.
match_res

{'M_atp_c': ['CHEBI:30616']}

In [40]:
# Reference formula of the first ChEBI term proposed for ATP.
cn.REF_CHEBI2FORMULA[inp_dict['M_atp_c'][0]]

'C10N5O13P3'

In [41]:
# inp_dict was assigned from match_res, so this shows the same dictionary.
inp_dict

{'M_atp_c': ['CHEBI:30616']}

In [50]:
# Scratch example: count how often each space-separated token occurs in a
# string using collections.Counter (punctuation stays attached to the tokens).
one_str = 'I have two pets; one cute cat, and one ugly dog.'
dict(collections.Counter(one_str.split(' ')))

{'I': 1,
 'have': 1,
 'two': 1,
 'pets;': 1,
 'one': 2,
 'cute': 1,
 'cat,': 1,
 'and': 1,
 'ugly': 1,
 'dog.': 1}

In [36]:
# Predicted formulas per species as currently held by the recommender.
recom.species.formula

{'M_fdp_c': ['C6O12P2'],
 'M_atp_c': ['C18ClN2O6S2', 'C20O4'],
 'M_h_c': ['H'],
 'M_f6p_c': ['C6O9P'],
 'M_adp_c': ['C9N4O5',
  'C20N2O5S',
  'C19O9P',
  'C17Cl2F3N7O2S',
  'C19O2',
  'C28N6OS',
  'C115N8O85',
  'C27O5',
  'C16F3IN2O4',
  'C18ClN2O6S2',
  'C18N4O11',
  'C35N4O4',
  'C26N7O2S',
  'C16NO6',
  'C10N5O10P2',
  'C14N2O',
  'C29N6O4S',
  'C101N7O75',
  'C20O4',
  'C21ClN3O2',
  'C26FIN5O4',
  'C19N2O2S',
  'C8NO6',
  'C18O2',
  'C12']}

In [6]:
# iterator algorithm compares both 