In [1]:
# Update iterator algorithm
import collections
import copy
import itertools
import libsbml
import numpy as np
import os
import pickle
import pandas as pd
import sys
import matplotlib.pyplot as plt
%matplotlib inline  

# File name of the BioModels entry BIOMD0000000012.
BIOMD_12 = 'BIOMD0000000012.xml'
# Root of the local AutomateAnnotation data checkout. Set the AMAS_BASE_DIR
# environment variable to run this notebook on another machine; the default
# is the original hard-coded location.
BASE_DIR = os.environ.get('AMAS_BASE_DIR',
                          '/Users/woosubs/Desktop/AutomateAnnotation/')
DATA_DIR = os.path.join(BASE_DIR, "DATA")
ALGO_DIR = os.path.join(DATA_DIR, "algo")
CHEBI_DIR = os.path.join(DATA_DIR, "chebi")
RHEA_DIR = os.path.join(DATA_DIR, "rhea")
BIOMODEL_DIR = os.path.join(DATA_DIR, "biomodels/curated_biomodels_31mar2021")
# Derived from DATA_DIR like its siblings; this used to be a second
# hard-coded copy of the same absolute path.
BIGG_DIR = os.path.join(DATA_DIR, "bigg")
ecoli_fpath = os.path.join(BIGG_DIR, "e_coli_core.xml")

# The AMAS package is imported from the parent of the working directory.
PROJ_DIR = os.path.join(os.getcwd(), os.pardir)
AMAS_DIR = os.path.join(PROJ_DIR, "AMAS")
# Guarded so that re-running this cell does not keep growing sys.path.
if PROJ_DIR not in sys.path:
  sys.path.append(PROJ_DIR)

from AMAS import species_annotation as sa
from AMAS import reaction_annotation as ra
from AMAS import recommender
from AMAS import constants as cn
from AMAS import iterator as it
from AMAS import tools

# Should save & load predicted species & reaction annotations

In [2]:
# Parse the E. coli core model with libsbml and keep the raw SBML model.
reader = libsbml.SBMLReader()
document = reader.readSBML(ecoli_fpath)
model = document.getModel()
# readSBML() does not raise for a missing or invalid file: it logs the errors
# on the returned document, and getModel() then returns None. Stop here with
# the offending path rather than failing obscurely in a later cell.
if model is None:
  raise ValueError("Could not read an SBML model from %s" % ecoli_fpath)
# The recommender is built from the same file path.
recom = recommender.Recommender(libsbml_fpath=ecoli_fpath)

In [3]:
# Reaction IDs used throughout the rest of this notebook.
# (recom.reactions.exist_annotation can be inspected for other choices.)
two_reactions = [
    'R_PFK',
    'R_PFL',
]

In [4]:
# # make sure both adp and atp are included
# one_reaction = 'R_PFK'
# print(recom.reactions.reaction_components[one_reaction])

In [5]:
# Request annotations for the selected reactions only and dump the raw result.
reac_recom = recom.getReactionListAnnotation(pred_ids=two_reactions)
print(reac_recom)

[Recommendation(id='R_PFK', credibility=0.91, candidates=[('RHEA:12423', 0.8), ('RHEA:13380', 0.8), ('RHEA:14216', 0.8), ('RHEA:15656', 0.8), ('RHEA:16112', 0.8), ('RHEA:20108', 0.8)], urls=['https://www.rhea-db.org/rhea/12423', 'https://www.rhea-db.org/rhea/13380', 'https://www.rhea-db.org/rhea/14216', 'https://www.rhea-db.org/rhea/15656', 'https://www.rhea-db.org/rhea/16112', 'https://www.rhea-db.org/rhea/20108']), Recommendation(id='R_PFL', credibility=0.12, candidates=[('RHEA:11847', 1.0), ('RHEA:17428', 0.8), ('RHEA:22991', 0.8), ('RHEA:22995', 0.8), ('RHEA:28045', 0.8), ('RHEA:12768', 0.6666666666666666), ('RHEA:21915', 0.6666666666666666), ('RHEA:44143', 0.6666666666666666)], urls=['https://www.rhea-db.org/rhea/11847', 'https://www.rhea-db.org/rhea/17428', 'https://www.rhea-db.org/rhea/22991', 'https://www.rhea-db.org/rhea/22995', 'https://www.rhea-db.org/rhea/28045', 'https://www.rhea-db.org/rhea/12768', 'https://www.rhea-db.org/rhea/21915', 'https://www.rhea-db.org/rhea/44143'

In [6]:
# one_rhea_tup = recom.reactions.candidates[one_reaction]
# one_rhea = one_rhea_tup[0][0]
# print(one_rhea_tup)

In [7]:
# def getDictOfRheaComponentFormula(inp_rhea):
#   """
#   Get a dictionary {chebi_id: formula}
#   from a given rhea term.
#   Rhea term -> ChEBI IDs -> Formulas
  
#   Parameters
#   ----------
#   inp_rhea: str
  
#   Returns
#   -------
#   : dict
#   """
#   chebis = cn.REF_RHEA2CHEBI[inp_rhea]
#   return {val:cn.REF_CHEBI2FORMULA[val] for val in chebis \
#           if val in cn.REF_CHEBI2FORMULA.keys()}

In [8]:
# def getDictMatchByItem(chebi2ref_formula, spec2pred_formula):
#   """
#   Get match between two keys,
#   where there is exactly
#   one matching item.
#   If all items are matched by 1-1
#   (i.e., one species - one chebi),
#   return the fully matched dictionary.
#   (i.e., improve precision)
#   If neither, return None.
#   (i.e., nothing to update)
  
#   Parameters
#   ----------
#   chebi2ref_formula: dict
#       {chebi_term: a_species_formula(string)}
#   spec2pred_formula: dict
#       {species_id: [predicted_formulas]}
  
#   Returns
#   -------
#   dict/None
#       {species_id: [chebi_term]}
#   """
#   match_dict = {one_k:[spec_id for spec_id in spec2pred_formula.keys() \
#                        if chebi2ref_formula[one_k] in spec2pred_formula[spec_id]
#                       ] \
#                 for one_k in chebi2ref_formula.keys()}
#   unmatched_species = [val for val in spec2pred_formula.keys() \
#                       if val not in list(itertools.chain(*match_dict.values()))]
#   unmatched_chebi = [val for val in match_dict.keys() if not match_dict[val]]
#   if len(unmatched_species) == 1 and len(unmatched_chebi) == 1:
#     return {unmatched_species[0]: unmatched_chebi} 
#   elif all([len(val[1])==1 for val in list(match_dict.items())]):
#     return match_dict
#   else:
#     return None

In [12]:
# Build the iterator from the recommender's current species formulas and
# reaction candidates; it also receives the reaction annotation object itself.
anot_iter = it.Iterator(
    cur_spec_formula=recom.species.formula,
    cur_reac_candidates=recom.reactions.candidates,
    reaction_cl=recom.reactions,
)

In [16]:
# Display the most recent species -> ChEBI match. `match_res` is assigned
# inside the per-reaction loop cell further down, so that cell has to be run
# before this one.
match_res

{'M_for_c': ['CHEBI:15740'],
 'M_accoa_c': ['CHEBI:57288'],
 'M_pyr_c': ['CHEBI:15361'],
 'M_coa_c': ['CHEBI:57287']}

In [26]:
# Iterator.match() is meant to take over this entire procedure and return the
# appropriate values; until then the steps are spelled out here.
combine_spec2update = dict()
for one_reaction in two_reactions:
  print("Working with %s" % one_reaction)
  # First candidate tuple, first field: the Rhea term at the head of the
  # candidate list for this reaction.
  one_rhea_tup = recom.reactions.candidates[one_reaction]
  one_rhea = one_rhea_tup[0][0]
  pred_spec_formulas = recom.species.formula
  one_rhea2formula = anot_iter.getDictOfRheaComponentFormula(inp_rhea=one_rhea)
  print("Arguments", pred_spec_formulas, one_rhea2formula, sep="\n")

  # getDictsToUpdate() stands in for the earlier manual steps
  # (getDictMatchByItem followed by a cn.REF_CHEBI2FORMULA lookup).
  match_res, match_res_formula = anot_iter.getDictsToUpdate(reaction_id=one_reaction)
  print("Returns", match_res, match_res_formula, sep="\n")
  print("\n\n")
  # Score the proposal against a deep copy of the iterator's original formulas.
  upd_val = anot_iter.getUpdatedMatchScore(
      cur_spec_formulas=copy.deepcopy(anot_iter.orig_spec_formula),
      inp_spec2formula_dict=match_res_formula,
  )
  print("Testing updated score", upd_val)
  print("\n\n")
  if not upd_val['is_increased']:
    continue
  # The score went up: fold this reaction's matches into the running result,
  # concatenating the term lists of species that were already recorded.
  for spec_id, chebi_terms in match_res.items():
    combine_spec2update[spec_id] = (
        combine_spec2update[spec_id] + chebi_terms
        if spec_id in combine_spec2update
        else chebi_terms
    )

print("Final updated species dict", combine_spec2update)


# TODO (last step): for each match, calculate the match score difference and decide whether to apply the update.
# If it is applied, include it in the updated spec/reac dictionaries, and finally incorporate it into recom.

Working with R_PFK
Arguments
{'M_adp_c': ['C20O4', 'C28N6OS', 'C101N7O75', 'C17Cl2F3N7O2S', 'C18N4O11', 'C18O2', 'C20N2O5S', 'C26N7O2S', 'C26FIN5O4', 'C16NO6', 'C14N2O', 'C29N6O4S', 'C10N5O10P2', 'C12', 'C19N2O2S', 'C27O5', 'C19O2', 'C9N4O5', 'C19O9P', 'C21ClN3O2', 'C115N8O85', 'C16F3IN2O4', 'C18ClN2O6S2', 'C8NO6', 'C35N4O4'], 'M_h_c': ['H'], 'M_fdp_c': ['C6O12P2'], 'M_for_c': ['CO2'], 'M_accoa_c': ['C23N7O17P3S'], 'M_coa_c': ['C21N7O16P3S'], 'M_f6p_c': ['C6O9P'], 'M_atp_c': ['C20O4', 'C18ClN2O6S2'], 'M_pyr_c': ['C3O3']}
{'CHEBI:456216': 'C10N5O10P2', 'CHEBI:15378': 'H', 'CHEBI:58694': 'C6O12P2', 'CHEBI:30616': 'C10N5O13P3', 'CHEBI:58695': 'C6O9P'}
Returns
{'M_atp_c': ['CHEBI:30616']}
{'M_atp_c': ['C10N5O13P3']}



Testing updated score {'new_score': 1.0, 'old_score': 0.9, 'is_increased': True}



Working with R_PFL
Arguments
{'M_adp_c': ['C20O4', 'C28N6OS', 'C101N7O75', 'C17Cl2F3N7O2S', 'C18N4O11', 'C18O2', 'C20N2O5S', 'C26N7O2S', 'C26FIN5O4', 'C16NO6', 'C14N2O', 'C29N6O4S', 'C10N5O10

In [15]:
# Inspect the reaction candidates held by the iterator.
# NOTE(review): presumably the `cur_reac_candidates` passed at construction;
# confirm against AMAS.iterator.
anot_iter.orig_reac_candidates

{'R_PFK': [('RHEA:12423', 0.8),
  ('RHEA:13380', 0.8),
  ('RHEA:14216', 0.8),
  ('RHEA:15656', 0.8),
  ('RHEA:16112', 0.8),
  ('RHEA:20108', 0.8)],
 'R_PFL': [('RHEA:11847', 1.0),
  ('RHEA:17428', 0.8),
  ('RHEA:22991', 0.8),
  ('RHEA:22995', 0.8),
  ('RHEA:28045', 0.8),
  ('RHEA:12768', 0.6666666666666666),
  ('RHEA:21915', 0.6666666666666666),
  ('RHEA:44143', 0.6666666666666666)]}

In [None]:
# Rebuild the iterator from the recommender's current state (same call as the
# earlier construction cell), replacing the previous `anot_iter` object.
anot_iter = it.Iterator(
    cur_spec_formula=recom.species.formula,
    cur_reac_candidates=recom.reactions.candidates,
    reaction_cl=recom.reactions,
)

In [10]:
# # Before updating the species, test whether it increases reaction scores
# # it may proceed if match score increases
# import copy
# def checkWhetherUpdateImprovesMatchScore(cur_spec_formulas, inp_spec2formula_dict):
#   """
#   Check whether the update improves reaction measures;
#   if the new value (mean of the top match score per reaction)
#   increased, return the new value; otherwise return False.
  
#   Parameters
#   ----------
#   cur_spec_formulas: dict
#       {'species_id': [formula-str]}
#       Dictionary to be updated
      
#   inp_spec2formula_dict: dict
#       {'species_id': [formula-str]}
#       Dictionary to update
      
#   Returns
#   -------
#   : float/bool
#   """
#   cur_spec_formulas.update(inp_spec2formula_dict)

#   new_pred_res = recom.reactions.predictAnnotation(inp_spec_dict = cur_spec_formulas,
#                                                    inp_reac_list = [one_reaction])
#   old_pred_res = recom.reactions.predictAnnotation(inp_spec_dict = recom.species.formula,
#                                                    inp_reac_list = [one_reaction])
#   # since candidates are already sorted, 
#   # just check the match score (index '1') of the very first candidate tuple (index '0')
#   new_pred_val = np.mean([new_pred_res[cn.MATCH_SCORE][k][0][1] \
#                          for k in new_pred_res[cn.MATCH_SCORE].keys()])
#   old_pred_val = np.mean([old_pred_res[cn.MATCH_SCORE][k][0][1] \
#                          for k in old_pred_res[cn.MATCH_SCORE].keys()])
#   if new_pred_val > old_pred_val:
#     return new_pred_val
#   else:
#     return False

In [11]:
# Process 2: check whether this update is worthwhile, i.e. whether it raises
# the confidence or match score of the reaction. The call is given a deep
# copy, so anything it changes in place cannot reach
# anot_iter.orig_spec_formula. `match_res_formula` comes from the
# per-reaction loop cell.
upd_val = anot_iter.getUpdatedMatchScore(
    cur_spec_formulas=copy.deepcopy(anot_iter.orig_spec_formula),
    inp_spec2formula_dict=match_res_formula,
)
print(upd_val)

{'new_score': 1.0, 'old_score': 0.8, 'is_increased': True}


In [None]:
# TODO: repeat until threshold; update both species and reaction annotation

In [None]:
# Process 3: based on the result of process 2, actually update the annotation.
# In this step the match score should probably be updated as well (to be decided).
# Done!... for species

In [None]:
# Process 3.5: make sure the iterator doesn't depend on recom

# Process 4; report final result (create recom.getRecommendation??)

# Process 5: add this option to the recommender.

In [79]:
# # Below is for species class (maybe a species_annotation method)
# def updateSpeciesWithDict(self, inp_dict):
#   """
#   Update species with candidates
#   using a dictionary.
#   Match score should be
#   wisely chosen as 
#   nothing is matched.
#   As match scores are given
#   when exact matches are found, 
#   match scores should be 1.0. 
  
#   Parameters
#   ----------
#   inp_dict: dict
#       {species_id: [chebi terms]}
  
#   Returns
#   -------
#   None
#   """
#   # TODO:
#   info2upd_candidates = {k:[(val, 1.0) for val in inp_dict[k]] for k in inp_dict.keys()}
#   info2upd_formula = {k:[cn.REF_CHEBI2FORMULA[chebi] for chebi in inp_dict[k]] for k in inp_dict.keys()}
#   self.candidates.update(info2upd_candidates)
#   self.formula.update(info2upd_formula)

In [50]:
# Scratch example: tally the space-separated tokens of a sentence.
# Punctuation stays attached to its token because the split is on ' ' only.
one_str = 'I have two pets; one cute cat, and one ugly dog.'
{token: count for token, count in collections.Counter(one_str.split(' ')).items()}

{'I': 1,
 'have': 1,
 'two': 1,
 'pets;': 1,
 'one': 2,
 'cute': 1,
 'cat,': 1,
 'and': 1,
 'ugly': 1,
 'dog.': 1}

In [36]:
# Display the recommender's current species formulas: a mapping from species
# ID to a list of formula strings (the same object that is handed to
# it.Iterator as `cur_spec_formula`).
recom.species.formula

{'M_fdp_c': ['C6O12P2'],
 'M_atp_c': ['C18ClN2O6S2', 'C20O4'],
 'M_h_c': ['H'],
 'M_f6p_c': ['C6O9P'],
 'M_adp_c': ['C9N4O5',
  'C20N2O5S',
  'C19O9P',
  'C17Cl2F3N7O2S',
  'C19O2',
  'C28N6OS',
  'C115N8O85',
  'C27O5',
  'C16F3IN2O4',
  'C18ClN2O6S2',
  'C18N4O11',
  'C35N4O4',
  'C26N7O2S',
  'C16NO6',
  'C10N5O10P2',
  'C14N2O',
  'C29N6O4S',
  'C101N7O75',
  'C20O4',
  'C21ClN3O2',
  'C26FIN5O4',
  'C19N2O2S',
  'C8NO6',
  'C18O2',
  'C12']}

In [6]:
# iterator algorithm compares both 