In [1]:
# Update the iterator algorithm and make recommendations.
import collections
import copy
import itertools
import libsbml
import numpy as np
import os
import pickle
import pandas as pd
import sys
import matplotlib.pyplot as plt
%matplotlib inline  

BIOMD_12 = 'BIOMD0000000012.xml'
# Root of the local data checkout. The default is the path this notebook was
# originally written against; set the AMAS_BASE_DIR environment variable to
# run the notebook from another machine without editing the cell.
BASE_DIR = os.environ.get('AMAS_BASE_DIR',
                          '/Users/woosubshin/Desktop/AutomateAnnotation/')
DATA_DIR = os.path.join(BASE_DIR, "DATA")
ALGO_DIR = os.path.join(DATA_DIR, "algo")
CHEBI_DIR = os.path.join(DATA_DIR, "chebi")
RHEA_DIR = os.path.join(DATA_DIR, "rhea")
BIOMODEL_DIR = os.path.join(DATA_DIR, "biomodels/curated_biomodels_31mar2021")
# Derived from DATA_DIR like the sibling directories instead of repeating the
# absolute path; with the default BASE_DIR this yields the same string as before.
BIGG_DIR = os.path.join(DATA_DIR, "bigg")
ecoli_fpath = os.path.join(BIGG_DIR, "e_coli_core.xml")

# Parent of the current working directory; added to sys.path so that the
# AMAS package can be imported below.
PROJ_DIR = os.path.join(os.getcwd(), os.pardir)
AMAS_DIR = os.path.join(PROJ_DIR, "AMAS")
# Guarded so that re-running this cell does not keep appending duplicates.
if PROJ_DIR not in sys.path:
  sys.path.append(PROJ_DIR)

from AMAS import species_annotation as sa
from AMAS import reaction_annotation as ra
from AMAS import recommender
from AMAS import constants as cn
from AMAS import iterator as it
from AMAS import tools

# Should save & load predicted species & reaction annotations

In [2]:
# Parse the E. coli core SBML file directly with libsbml and keep the model handle.
reader = libsbml.SBMLReader()
document = reader.readSBML(ecoli_fpath)
model = document.getModel()
# Build the AMAS recommender from the same file path.
recom = recommender.Recommender(libsbml_fpath=ecoli_fpath)

In [3]:
# Make sure both adp and atp are included among the components of this reaction.
# NOTE: `one_reaction` is a notebook-level global that later cells read and
# a later loop reassigns.
one_reaction = 'R_ACKr'
comps_ackrs = recom.reactions.reaction_components[one_reaction]
print(comps_ackrs)

['M_atp_c', 'M_adp_c', 'M_actp_c', 'M_ac_c']


In [4]:
# Ask the recommender for species recommendations for the components of the
# reaction selected above, then print each result's id and candidate list,
# with a blank line after every entry.
spec_res = recom.getSpeciesListRecommendation(pred_ids=comps_ackrs, method='edist')
for val in spec_res:
  print(val.id, val.candidates, "", sep="\n")

M_atp_c
[('CHEBI:182955', 0.231), ('CHEBI:135736', 0.059)]

M_adp_c
[('CHEBI:77390', 0.312), ('CHEBI:32411', 0.28), ('CHEBI:36331', 0.259), ('CHEBI:31899', 0.158), ('CHEBI:182955', 0.154), ('CHEBI:68840', 0.15), ('CHEBI:42870', 0.147), ('CHEBI:90695', 0.138), ('CHEBI:456216', 0.125), ('CHEBI:28498', 0.111), ('CHEBI:167004', 0.105), ('CHEBI:147398', 0.1), ('CHEBI:72990', 0.088), ('CHEBI:93296', 0.085), ('CHEBI:151629', 0.083), ('CHEBI:152534', 0.083), ('CHEBI:153980', 0.083), ('CHEBI:78443', 0.08), ('CHEBI:167672', 0.067), ('CHEBI:89713', 0.062), ('CHEBI:90217', 0.061), ('CHEBI:88249', 0.06), ('CHEBI:135736', 0.059), ('CHEBI:63450', 0.056), ('CHEBI:63452', 0.05), ('CHEBI:75998', 0.048), ('CHEBI:90304', 0.046)]

M_actp_c
[('CHEBI:22191', 1.0), ('CHEBI:15350', 1.0)]

M_ac_c
[('CHEBI:47622', 1.0), ('CHEBI:30089', 1.0)]



In [4]:
# Reactions whose annotations the iterator is asked to update.
mult_reactions = ['R_PFK', 'R_PFL']
# Start from a fresh recommender (rebinds `recom`, built earlier from the same file).
recom = recommender.Recommender(libsbml_fpath=ecoli_fpath)
reac_recom = recom.getReactionListAnnotation(pred_ids = mult_reactions, spec_method='edist')
# Hand the recommender's species formulas and reaction object to the iterator.
anot_iter = it.Iterator(cur_spec_formula=recom.species.formula,
                        reaction_cl=recom.reactions,
                        reactions_to_update = mult_reactions)

In [5]:
# Copy of the E. coli core model located under the AMAS constants' TEST_DIR.
E_COLI_PATH = os.path.join(cn.TEST_DIR, 'e_coli_core.xml')
# Reaction IDs used by the hand-built fixtures below.
R_PFK = 'R_PFK'
R_PFL = 'R_PFL'
REACTIONS = [R_PFK, R_PFL]

# Fixed starting point for the iterator: {species_id: [candidate formulas]}.
INIT_SPEC_FORMULA = {
    'M_accoa_c': ['C23N7O17P3S'],
    'M_for_c': ['CO2'],
    'M_adp_c': [
        'C115N8O85', 'C21ClN3O2', 'C19N2O2S', 'C17Cl2F3N7O2S', 'C16NO6',
        'C19O2', 'C26N7O2S', 'C19O9P', 'C28N6OS', 'C16F3IN2O4',
        'C35N4O4', 'C18N4O11', 'C26FIN5O4', 'C20O4', 'C12',
        'C10N5O10P2', 'C8NO6', 'C29N6O4S', 'C18ClN2O6S2', 'C27O5',
        'C18O2', 'C9N4O5', 'C20N2O5S', 'C101N7O75', 'C14N2O',
    ],
    'M_pyr_c': ['C3O3'],
    'M_coa_c': ['C21N7O16P3S'],
    'M_fdp_c': ['C6O12P2'],
    'M_f6p_c': ['C6O9P'],
    'M_atp_c': ['C20O4', 'C18ClN2O6S2'],
    'M_h_c': ['H'],
}

# Fixed reaction candidates: {reaction_id: [(Rhea ID, score), ...]},
# grouped by score in the same order as the original literal.
REACTION_CANDIDATES = {
    R_PFK: [(rhea, 0.8) for rhea in ('RHEA:12423', 'RHEA:13380',
                                     'RHEA:14216', 'RHEA:15656',
                                     'RHEA:16112', 'RHEA:20108')],
    R_PFL: ([('RHEA:11847', 1.0)]
            + [(rhea, 0.8) for rhea in ('RHEA:17428', 'RHEA:22991',
                                        'RHEA:22995', 'RHEA:28045')]
            + [(rhea, 0.667) for rhea in ('RHEA:12768', 'RHEA:21915',
                                          'RHEA:44143')]),
}

In [6]:
# Build a reaction-annotation object from the test model and overwrite its
# candidates with the hand-written fixture, so the iterator runs on known inputs.
reac_cl = ra.ReactionAnnotation(libsbml_fpath = E_COLI_PATH)
reac_cl.candidates = REACTION_CANDIDATES
# NOTE: rebinds `anot_iter`, replacing the iterator built from `recom` earlier.
anot_iter = it.Iterator(cur_spec_formula=INIT_SPEC_FORMULA,
                             reaction_cl=reac_cl,
                             reactions_to_update=REACTIONS)

In [7]:
# Run one match cycle; the recorded result maps a species ID to a list of ChEBI terms.
anot_iter.runOneMatchCycle()

{'M_atp_c': ['CHEBI:30616']}

In [6]:
# Let the recommender update its annotations by iteration; the next cell
# inspects recom.species and recom.reactions afterwards.
recom.updateAnnotationsByIteration()

In [7]:
# Inspect the recommender's current ATP candidates/formulas and the R_PFK candidates.
print(recom.species.candidates['M_atp_c'])
print(recom.species.formula['M_atp_c'])
print(recom.reactions.candidates['R_PFK'])

[('CHEBI:30616', 0.231)]
['C10N5O13P3']
[('RHEA:12423', 1.0), ('RHEA:13380', 1.0), ('RHEA:14216', 1.0), ('RHEA:15656', 1.0), ('RHEA:16112', 1.0)]


In [8]:
# Take the first candidate Rhea ID of `one_reaction` and look up the formulas
# of its components.
# NOTE(review): `one_reaction` is whatever an earlier cell left behind. On a
# top-to-bottom run it is 'R_ACKr', which is not a key of the REACTION_CANDIDATES
# fixture assigned above -- confirm this cell survives Restart & Run All.
one_rhea_tup = anot_iter.reactions.candidates[one_reaction]
one_rhea = one_rhea_tup[0][0]
pred_spec_formulas = anot_iter.orig_spec_formula
one_rhea2formula = anot_iter.getDictOfRheaComponentFormula(inp_rhea=one_rhea)

In [18]:
# Ask the iterator which species to update for R_PFK; the call is unpacked into
# a species->ChEBI dict and a species->formula dict, and the latter is printed.
upd_spec2chebi, upd_spec2formula = anot_iter.getDictsToUpdate(reaction_id='R_PFK')
print(upd_spec2formula)

{'M_atp_c': ['C10N5O13P3']}


In [19]:
# Score the proposed update against a deep copy of the iterator's original
# formulas, so the iterator's own dict is not the object handed to the scorer.
upd_val = anot_iter.getUpdatedMatchScore(cur_spec_formulas = copy.deepcopy(anot_iter.orig_spec_formula),
                                         inp_spec2formula_dict = upd_spec2formula)
upd_val

{'new_score': 1.0, 'old_score': 0.9, 'is_increased': True}

In [21]:
# Work through the matching step by hand for one Rhea term and one reaction.
one_rhea = 'RHEA:12423'
reaction_id = 'R_PFK'
# The match result will look like {species_id: [ChEBI term]}.
# Restrict the species formulas to the components of this one reaction.
filt_spec_formula = {k:anot_iter.orig_spec_formula[k] \
                       for k in anot_iter.reactions.reaction_components[reaction_id]}

In [22]:
# Match the Rhea term's component formulas against the filtered species formulas.
upd_spec_chebi = anot_iter.getDictMatchByItem(chebi2ref_formula=anot_iter.getDictOfRheaComponentFormula(one_rhea),
                                              spec2pred_formula=filt_spec_formula)

In [26]:
# Display the match computed by hand above.
upd_spec_chebi

{'M_atp_c': ['CHEBI:30616']}

In [5]:
# Same call as an earlier cell; the recorded result is also the same.
anot_iter.runOneMatchCycle()

{'M_atp_c': ['CHEBI:30616']}

In [6]:
# Inspect the iterator's orig_spec_formula mapping ({species_id: [formulas]}).
anot_iter.orig_spec_formula

{'M_h_c': ['H'],
 'M_coa_c': ['C21N7O16P3S'],
 'M_for_c': ['CO2'],
 'M_accoa_c': ['C23N7O17P3S'],
 'M_adp_c': ['C10N5O10P2',
  'C35N4O4',
  'C19O9P',
  'C12',
  'C18O2',
  'C19O2',
  'C18N4O11',
  'C19N2O2S',
  'C9N4O5',
  'C20O4',
  'C26N7O2S',
  'C16F3IN2O4',
  'C8NO6',
  'C28N6OS',
  'C27O5',
  'C26FIN5O4',
  'C17Cl2F3N7O2S',
  'C16NO6',
  'C14N2O',
  'C101N7O75',
  'C29N6O4S',
  'C115N8O85',
  'C20N2O5S',
  'C21ClN3O2',
  'C18ClN2O6S2'],
 'M_pyr_c': ['C3O3'],
 'M_atp_c': ['C18ClN2O6S2', 'C20O4'],
 'M_f6p_c': ['C6O9P'],
 'M_fdp_c': ['C6O12P2']}

In [22]:
# Inspect the recommender's species formulas ({species_id: [formulas]}).
recom.species.formula

{'M_fdp_c': ['C6O12P2'],
 'M_h_c': ['[3He]', 'C6N3O', 'H', 'C12N6O3', 'C6N3O2'],
 'M_coa_c': ['C21N7O16P3S'],
 'M_for_c': ['CO2', 'C25NO3.Cl'],
 'M_pyr_c': ['C3O3'],
 'M_f6p_c': ['C6O9P'],
 'M_atp_c': ['C30N4O29P3'],
 'M_adp_c': ['C30O8P', 'C39O8P'],
 'M_accoa_c': ['C23N7O17P3S']}

In [30]:
# recom.reactions.exist_annotation


In [21]:
# Display the recommender's reactions attribute (a ReactionAnnotation instance
# in the recorded output).
recom.reactions

<AMAS.reaction_annotation.ReactionAnnotation at 0x7f82a7f88ee0>

In [14]:
# Repeat of the earlier lookup cell with getDictsToUpdate added, for whichever
# reaction `one_reaction` currently names.
# NOTE(review): depends on the leftover global `one_reaction`; an identical
# copy of this cell appears again further down.
one_rhea_tup = anot_iter.reactions.candidates[one_reaction]
one_rhea = one_rhea_tup[0][0]
pred_spec_formulas = anot_iter.orig_spec_formula
one_rhea2formula = anot_iter.getDictOfRheaComponentFormula(inp_rhea=one_rhea)
upd_spec2chebi, upd_spec2formula = anot_iter.getDictsToUpdate(reaction_id=one_reaction)

In [17]:
# Score the update from the previous cell against a deep copy of the original formulas.
upd_val = anot_iter.getUpdatedMatchScore(cur_spec_formulas = copy.deepcopy(anot_iter.orig_spec_formula),
                                            inp_spec2formula_dict = upd_spec2formula)

In [18]:
# Display the score comparison (recorded keys: new_score, old_score, is_increased).
upd_val

{'new_score': 0.8, 'old_score': 0.8, 'is_increased': False}

In [22]:
# NOTE(review): exact duplicate of an earlier cell; it gives a different result
# only because the global `one_reaction` changed in between.
one_rhea_tup = anot_iter.reactions.candidates[one_reaction]
one_rhea = one_rhea_tup[0][0]
pred_spec_formulas = anot_iter.orig_spec_formula
one_rhea2formula = anot_iter.getDictOfRheaComponentFormula(inp_rhea=one_rhea)
upd_spec2chebi, upd_spec2formula = anot_iter.getDictsToUpdate(reaction_id=one_reaction)

In [25]:
# Display the species->formula updates returned by the previous cell.
upd_spec2formula

{'M_for_c': ['CO2'],
 'M_accoa_c': ['C23N7O17P3S'],
 'M_pyr_c': ['C3O3'],
 'M_coa_c': ['C21N7O16P3S']}

In [15]:
# Score each reaction's proposed species update on its own, always starting
# from a fresh copy of the original formulas. Updates are deliberately not
# cumulated across reactions, so no combined spec2chebi dict is used here.
# Results are collected per reaction instead of being overwritten each pass.
upd_vals = dict()
for one_reaction in ['R_PFK', 'R_PFL']:
  update_res = anot_iter.getDictsToUpdate(reaction_id=one_reaction)
  # NOTE(review): the recorded run of this cell stopped with
  # "TypeError: 'NoneType' object is not iterable". Presumably None comes back
  # when there is nothing to update for a reaction -- TODO confirm where in the
  # iterator the None originates. Such reactions are skipped here.
  if update_res is None:
    continue
  upd_spec2chebi, upd_spec2formula = update_res
  if upd_spec2formula is None:
    continue
  upd_val = anot_iter.getUpdatedMatchScore(cur_spec_formulas = copy.deepcopy(anot_iter.orig_spec_formula),
                                           inp_spec2formula_dict = upd_spec2formula)
  upd_vals[one_reaction] = upd_val
# {reaction_id: score comparison} for every reaction that had an update.
upd_vals

TypeError: 'NoneType' object is not iterable

In [16]:
# Check which reaction the loop above was processing when it stopped.
one_reaction

'R_PFK'

In [7]:
# Inspect the iterator's list of reactions to update.
anot_iter.r2upd

['R_PFK', 'R_PFL']

In [6]:
# Inspect the reaction candidates held by the iterator (no output was recorded
# for this cell).
anot_iter.reactions.candidates

In [6]:
# one_rhea_tup = recom.reactions.candidates[one_reaction]
# one_rhea = one_rhea_tup[0][0]
# print(one_rhea_tup)

In [7]:
# def getDictOfRheaComponentFormula(inp_rhea):
#   """
#   Get a dictionary {chebi_id: formula}
#   from a given rhea term.
#   Rhea term -> CheBI IDs -> Formulas
  
#   Parameters
#   ----------
#   str: inp_rhea
  
#   Returns
#   -------
#   : dict
#   """
#   chebis = cn.REF_RHEA2CHEBI[inp_rhea]
#   return {val:cn.REF_CHEBI2FORMULA[val] for val in chebis \
#           if val in cn.REF_CHEBI2FORMULA.keys()}

In [8]:
# def getDictMatchByItem(chebi2ref_formula, spec2pred_formula):
#   """
#   Get the match between two keys
#   where there is exactly
#   one matching item.
#   If all items are matched by 1-1
#   (i.e., one species - one chebi),
#   return the fully matched dictionary.
#   (i.e., improve precision)
#   If neither, return None.
#   (i.e., nothing to update)
  
#   Parameters
#   ----------
#   chebi2ref_formula: dict
#       {chebi_term: a_species_formula(string)}
#   spec2pred_formula: dict
#       {species_id: [predicted_formulas]}
  
#   Returns
#   -------
#   dict/None
#       {species_id: [chebi_term]}
#   """
#   match_dict = {one_k:[spec_id for spec_id in spec2pred_formula.keys() \
#                        if chebi2ref_formula[one_k] in spec2pred_formula[spec_id]
#                       ] \
#                 for one_k in chebi2ref_formula.keys()}
#   unmatched_species = [val for val in spec2pred_formula.keys() \
#                       if val not in list(itertools.chain(*match_dict.values()))]
#   unmatched_chebi = [val for val in match_dict.keys() if not match_dict[val]]
#   if len(unmatched_species) == 1 and len(unmatched_chebi) == 1:
#     return {unmatched_species[0]: unmatched_chebi} 
#   elif all([len(val[1])==1 for val in list(match_dict.items())]):
#     return match_dict
#   else:
#     return None

In [6]:
# Rebuild the iterator from the recommender's current species formulas and
# reaction object. The original cell passed `two_reactions`, a name that no
# live cell defines (it only appears in commented-out code), so it raised a
# NameError on a fresh kernel. `mult_reactions`, defined earlier as
# ['R_PFK', 'R_PFL'], is used instead.
anot_iter = it.Iterator(cur_spec_formula=recom.species.formula,
                        reaction_cl=recom.reactions,
                        reactions_to_update = mult_reactions)

In [8]:
# Run one match cycle on the iterator rebuilt from the recommender.
anot_iter.runOneMatchCycle()

{'M_atp_c': ['CHEBI:30616']}

In [None]:
# # Iterator.match() will take care of this entire process and return appropriate values.
# combine_spec2update = dict()
# for one_reaction in two_reactions:
#   print("Working with %s" % one_reaction)
#   one_rhea_tup = recom.reactions.candidates[one_reaction]
#   one_rhea = one_rhea_tup[0][0]
#   pred_spec_formulas = recom.species.formula
#   one_rhea2formula = anot_iter.getDictOfRheaComponentFormula(inp_rhea=one_rhea)
#   print("Arguments")
#   print(pred_spec_formulas)
#   print(one_rhea2formula)

#   match_res, match_res_formula = anot_iter.getDictsToUpdate(reaction_id=one_reaction)
# # match_res = anot_iter.getDictMatchByItem(chebi2ref_formula=one_rhea2formula,
# #                                          spec2pred_formula=pred_spec_formulas)
#   print("Returns")
#   print(match_res)
# # match_res_formula = {k:[cn.REF_CHEBI2FORMULA[chebi] for chebi in match_res[k]] for k in match_res.keys()}
#   print(match_res_formula)
#   print("\n\n")
#   upd_val = anot_iter.getUpdatedMatchScore(cur_spec_formulas = copy.deepcopy(anot_iter.orig_spec_formula),
#                                            inp_spec2formula_dict = match_res_formula)
#   print("Testing updated score", upd_val)
#   print("\n\n")
#   if upd_val['is_increased']:
#     # update combine_spec2update;  but by combining the elements.
#     for k in match_res.keys():
#       if k in combine_spec2update.keys():
#         combine_spec2update[k] = combine_spec2update[k] + match_res[k]
#       else:
#         combine_spec2update[k] = match_res[k] 
#   # => if true, include to updated formula status, and then continue; 

# print("Final updated species dict", combine_spec2update)


# # TODO (last step): for each match, calculate the match-score difference and decide whether to apply the update.
# # If it is applied, include it in the updated spec/reac dictionaries, and finally incorporate it into recom.

In [11]:
# Inspect the iterator's list of reactions to update.
anot_iter.r2upd

['R_PFK', 'R_PFL']

In [50]:
# Scratch check of collections.Counter: count space-separated tokens
# (punctuation stays attached to the token) and show the counts as a plain dict.
one_str = 'I have two pets; one cute cat, and one ugly dog.'
{token: count for token, count in collections.Counter(one_str.split(' ')).items()}

{'I': 1,
 'have': 1,
 'two': 1,
 'pets;': 1,
 'one': 2,
 'cute': 1,
 'cat,': 1,
 'and': 1,
 'ugly': 1,
 'dog.': 1}

In [36]:
# Inspect the recommender's species formulas ({species_id: [formulas]}).
recom.species.formula

{'M_fdp_c': ['C6O12P2'],
 'M_atp_c': ['C18ClN2O6S2', 'C20O4'],
 'M_h_c': ['H'],
 'M_f6p_c': ['C6O9P'],
 'M_adp_c': ['C9N4O5',
  'C20N2O5S',
  'C19O9P',
  'C17Cl2F3N7O2S',
  'C19O2',
  'C28N6OS',
  'C115N8O85',
  'C27O5',
  'C16F3IN2O4',
  'C18ClN2O6S2',
  'C18N4O11',
  'C35N4O4',
  'C26N7O2S',
  'C16NO6',
  'C10N5O10P2',
  'C14N2O',
  'C29N6O4S',
  'C101N7O75',
  'C20O4',
  'C21ClN3O2',
  'C26FIN5O4',
  'C19N2O2S',
  'C8NO6',
  'C18O2',
  'C12']}

In [38]:
# Toy customer table for the regressor experiment below.
# numpy and pandas are already imported in the notebook's first cell, so the
# redundant mid-notebook imports were dropped.
data = {
        'age':[11, 24, 37],
        'size':[50, 100, 105],
        'salary':[10, 200, 800] }

# Convert the dictionary into a DataFrame indexed by customer label.
df = pd.DataFrame(data, index=['customer1', 'customer2', 'customer3'])
print("Original DataFrame:\n", df)

Original DataFrame:
            age  size  salary
customer1   11    50      10
customer2   24   100     200
customer3   37   105     800


In [39]:
# NOTE(review): import sits mid-notebook; consider moving it to the first cell
# with the other imports.
from sklearn.ensemble import RandomForestRegressor
# random_state is fixed so repeated fits give the same forest.
regr = RandomForestRegressor(max_depth=2,  random_state=0)

In [43]:
# Fit salary as a function of age and size on the toy table.
regr.fit(X=df[['age', 'size']], y=df.loc[:, 'salary'])

RandomForestRegressor(max_depth=2, random_state=0)

In [44]:
# Unseen feature rows (same columns as the training features, default integer index).
# NOTE(review): no later cell in this notebook reads `new_df`; the predict
# call below uses `df` instead.
new_data = {
        'age':[10, 40, 50],
        'size':[50, 100, 105]}
new_df = pd.DataFrame(new_data)
new_df

Unnamed: 0,age,size
0,10,50
1,40,100
2,50,105


In [46]:
# Predict on the training features.
# NOTE(review): `new_df`, built in the cell above, is not used here -- confirm
# whether the prediction was meant to run on new_df[['age', 'size']].
regr.predict(df[['age', 'size']])

array([102.4, 243.5, 592.5])

In [24]:
# NOTE(review): the `df` built in this notebook has columns age/size/salary and
# no column 'C'; the recorded output must come from a different, earlier `df`.
# On a fresh top-to-bottom run this lookup is expected to fail -- confirm and
# remove or repoint this cell.
np.array(df[['C']])

array([[7],
       [8],
       [9]])

In [31]:
# Select the salary column as a Series.
# NOTE(review): the recorded output shows a 0..2 index, whereas the `df` built
# in this notebook is indexed customer1..customer3, so the output is stale.
df.loc[:, 'salary']

0     10
1    200
2    800
Name: salary, dtype: int64