In [18]:
# create species annotation class
import numpy as np
import os
import pickle
import pandas as pd
import sys
# Single root of the local checkout. Every other location is derived from it,
# so the paths cannot drift apart if the project is moved.
BASE_DIR = '/Users/woosubs/Desktop/AutomateAnnotation/'
PROJ_DIR = os.path.join(BASE_DIR, "AnnotationRecommender/")
MOD_DIR = os.path.join(PROJ_DIR, "annotation_recommender")
# Add the package directory to the module search path.
sys.path.append(MOD_DIR)

BIOMD_12 = 'BIOMD0000000012.xml'
# Data sub-directories, all located under DATA_DIR.
DATA_DIR = os.path.join(BASE_DIR, "DATA")
ALGO_DIR = os.path.join(DATA_DIR, "algo")
CHEBI_DIR = os.path.join(DATA_DIR, "chebi")
RHEA_DIR = os.path.join(DATA_DIR, "rhea")
BIOMODEL_DIR = os.path.join(DATA_DIR, "biomodels/curated_biomodels_31mar2021")
BIGG_DIR = os.path.join(DATA_DIR, "bigg")
# Model file used by the annotation objects created later in the notebook.
ecoli_fpath = os.path.join(BIGG_DIR, "e_coli_core.xml")


from annotation_recommender import species_annotation as sa
from annotation_recommender import reaction_annotation as ra
from annotation_recommender import constants as cn
from annotation_recommender import iterator as it
from annotation_recommender import tools

def _read_pickle(dir_path, file_name):
  """Unpickle and return the object stored in dir_path/file_name."""
  with open(os.path.join(dir_path, file_name), 'rb') as pkl_file:
    return pickle.load(pkl_file)

# ChEBI -> shortened formula
ref_shortened_chebi_to_formula = _read_pickle(CHEBI_DIR, 'chebi_shortened_formula_30apr2022.pickle')
# shortened formula -> ChEBI
ref_shortened_formula_to_chebi = _read_pickle(CHEBI_DIR, 'shortened_formula_to_chebis_20jul2022.pickle')
ref_rhea_to_chebi = _read_pickle(RHEA_DIR, 'rhea2chebi_reference.pkl')

chebi_synonyms = _read_pickle(CHEBI_DIR, 'chebi_synonyms.pickle')
# Lower-case every synonym and drop duplicates within each key.
chebi_low_synonyms = {
    one_key: list({one_name.lower() for one_name in one_names})
    for one_key, one_names in chebi_synonyms.items()
}

ref_kegg2rhea_master = _read_pickle(RHEA_DIR, 'kegg2rhea_master.pickle')
ref_kegg2rhea_bi = _read_pickle(RHEA_DIR, 'kegg2rhea_bi.pickle')

# Reference matrix passed to the reaction-annotation calls as inp_ref_mat.
ref_mat = _read_pickle(ALGO_DIR, 'binary_ref_df.pickle')
# Sanity check: show its dimensions.
print(ref_mat.shape)

(13651, 3790)


In [2]:
# Build one species-annotation and one reaction-annotation object from the
# same model file (ecoli_fpath).
specanot = sa.SpeciesAnnotation(libsbml_fpath=ecoli_fpath)
reacanot = ra.ReactionAnnotation(libsbml_fpath=ecoli_fpath)

In [3]:
# Print reacanot.exist_annotation (reaction ID -> list of RHEA terms, as the
# output shows) followed by the ID of every species in the model.
print(reacanot.exist_annotation)
print([val.getId() for val in specanot.model.getListOfSpecies()])

{'R_PFK': ['RHEA:16112'], 'R_PFL': ['RHEA:11847'], 'R_PGK': ['RHEA:14804'], 'R_PGL': ['RHEA:12559'], 'R_ACALD': ['RHEA:23291'], 'R_AKGt2r': ['RHEA:29014'], 'R_PGM': ['RHEA:15904'], 'R_PIt2r': ['RHEA:29942'], 'R_ALCD2x': ['RHEA:25293'], 'R_ACKr': ['RHEA:11355'], 'R_PPC': ['RHEA:23075'], 'R_ACONTa': ['RHEA:10231'], 'R_ACONTb': ['RHEA:22147'], 'R_ATPM': ['RHEA:13068', 'RHEA:20855'], 'R_PPCK': ['RHEA:18620'], 'R_PPS': ['RHEA:11367'], 'R_ADK1': ['RHEA:12976'], 'R_AKGDH': ['RHEA:27789'], 'R_PTAr': ['RHEA:19524'], 'R_PYK': ['RHEA:18160'], 'R_RPE': ['RHEA:13680'], 'R_CS': ['RHEA:16848'], 'R_SUCCt2_2': ['RHEA:29306'], 'R_ENO': ['RHEA:10167'], 'R_SUCDi': ['RHEA:29190'], 'R_SUCOAS': ['RHEA:17664'], 'R_TALA': ['RHEA:17056'], 'R_TKT2': ['RHEA:27629'], 'R_TPI': ['RHEA:18588'], 'R_EX_ac_e': ['RHEA:27817'], 'R_EX_etoh_e': ['RHEA:35270'], 'R_EX_for_e': ['RHEA:29682'], 'R_EX_h_e': ['RHEA:34982'], 'R_EX_h2o_e': ['RHEA:29670'], 'R_EX_nh4_e': ['RHEA:28750'], 'R_EX_pi_e': ['RHEA:32826'], 'R_FBA': ['RHEA:147

In [4]:
# getReactionComponents is called first with a reaction ID and then with the
# reaction object itself; both calls print the same list of species IDs.
one_comps = reacanot.getReactionComponents('R_PFK')
print(one_comps)
two_comps = reacanot.getReactionComponents(reacanot.model.getReaction('R_PFK'))
print(two_comps)

['M_atp_c', 'M_h_c', 'M_fdp_c', 'M_adp_c', 'M_f6p_c']
['M_atp_c', 'M_h_c', 'M_fdp_c', 'M_adp_c', 'M_f6p_c']


In [5]:
# Look up the name of each R_PFK component species, in the order of one_comps.
comp_names = [reacanot.model.getSpecies(val).name for val in one_comps]
print(comp_names)

['ATP C10H12N5O13P3', 'H+', 'D-Fructose 1,6-bisphosphate', 'ADP C10H12N5O10P2', 'D-Fructose 6-phosphate']


In [6]:
# Predict annotations by name for the R_PFK component species only.
# NOTE(review): the call presumably also fills specanot.match_score / .chebi /
# .formula, which the following cell prints — confirm in species_annotation.
spec_pred_annotations = specanot.predictAnnotationByName(inp_spec_list=one_comps)

In [7]:
# Per-species prediction results, each a dict keyed by species ID:
# match score, candidate ChEBI terms, and candidate formulas.
print(specanot.match_score)
print(specanot.chebi)
print(specanot.formula)

{'M_atp_c': 0.4117647058823529, 'M_h_c': 1.0, 'M_fdp_c': 1.0, 'M_adp_c': 0.3529411764705882, 'M_f6p_c': 1.0}
{'M_atp_c': ['CHEBI:135736', 'CHEBI:182955'], 'M_h_c': ['CHEBI:15378'], 'M_fdp_c': ['CHEBI:16905', 'CHEBI:49299'], 'M_adp_c': ['CHEBI:135736', 'CHEBI:456216', 'CHEBI:147398', 'CHEBI:151629', 'CHEBI:152534', 'CHEBI:153980', 'CHEBI:167004', 'CHEBI:167672', 'CHEBI:182955', 'CHEBI:28498', 'CHEBI:31899', 'CHEBI:32411', 'CHEBI:36331', 'CHEBI:42870', 'CHEBI:63450', 'CHEBI:63452', 'CHEBI:68840', 'CHEBI:72990', 'CHEBI:75998', 'CHEBI:77390', 'CHEBI:78443', 'CHEBI:88249', 'CHEBI:89713', 'CHEBI:90217', 'CHEBI:90304', 'CHEBI:90695', 'CHEBI:93296'], 'M_f6p_c': ['CHEBI:15946', 'CHEBI:57579', 'CHEBI:61553', 'CHEBI:61527']}
{'M_atp_c': ['C20O4', 'C18ClN2O6S2'], 'M_h_c': ['H'], 'M_fdp_c': ['C6O12P2'], 'M_adp_c': ['C18N4O11', 'C20N2O5S', 'C18ClN2O6S2', 'C29N6O4S', 'C115N8O85', 'C35N4O4', 'C19O2', 'C21ClN3O2', 'C27O5', 'C12', 'C20O4', 'C16F3IN2O4', 'C101N7O75', 'C18O2', 'C16NO6', 'C26FIN5O4', 'C8NO

In [8]:
# Predict annotations for the single reaction R_PFK, feeding in the species
# formulas predicted above together with the reference matrix.
reac_pred_annotations = reacanot.predictAnnotation(inp_spec_dict=specanot.formula,
                                                   inp_reac_list=['R_PFK'],
                                                   inp_ref_mat=ref_mat)

In [9]:
# Display the candidate RHEA terms for R_PFK with their match scores.
reacanot.match_score

{'R_PFK': {'RHEA:12423': 0.8,
  'RHEA:13380': 0.8,
  'RHEA:14216': 0.8,
  'RHEA:15656': 0.8,
  'RHEA:16112': 0.8,
  'RHEA:20108': 0.8}}

In [13]:
# Ask the reaction annotator which species annotations would follow from
# assigning RHEA:12423 to R_PFK.
chebi2update = reacanot.updateSpeciesByAReaction(inp_rid='R_PFK', inp_spec_dict=specanot.formula,
                                                 inp_rhea='RHEA:12423', inp_ref_mat=ref_mat)
# Translate every suggested ChEBI term into its shortened formula.
formula2update = {one_k:[ref_shortened_chebi_to_formula[val] for val in chebi2update[one_k]] \
                  for one_k in chebi2update.keys()}
# Snapshot the current predictions before they are changed. The dicts are
# copied because a later cell calls .update() on specanot.chebi and
# specanot.formula in place; keeping the bare references would make
# orig_values silently follow the updated values instead of the originals.
# A shallow copy is enough: .update() rebinds keys, it does not edit the lists.
orig_values = [dict(specanot.chebi), dict(specanot.formula)]

In [14]:
# Update species annotation
# specanot.match_score = ?
# dict.update overwrites, in place, the entries of specanot.chebi and
# specanot.formula for the species present in chebi2update / formula2update.
# specanot.match_score is not touched here (open question above).
specanot.chebi.update(chebi2update)
specanot.formula.update(formula2update)

In [17]:
# Re-run the R_PFK prediction with the updated species formulas and display
# the resulting match scores.
new_pred_annotations = reacanot.predictAnnotation(inp_spec_dict=specanot.formula,
                                                   inp_reac_list=['R_PFK'],
                                                   inp_ref_mat=ref_mat)
reacanot.match_score

{'R_PFK': {'RHEA:12423': 1.0,
  'RHEA:13380': 1.0,
  'RHEA:14216': 1.0,
  'RHEA:15656': 1.0,
  'RHEA:16112': 1.0}}

In [20]:
# Shortened formulas of the ChEBI terms that ref_rhea_to_chebi lists for RHEA:12423.
[ref_shortened_chebi_to_formula[val] for val in  ref_rhea_to_chebi['RHEA:12423']]

['C10N5O10P2', 'H', 'C6O12P2', 'C10N5O13P3', 'C6O9P']

In [2]:
# Start over with fresh annotation objects built from the same model file.
specanot = sa.SpeciesAnnotation(libsbml_fpath=ecoli_fpath)
reacanot = ra.ReactionAnnotation(libsbml_fpath=ecoli_fpath)

# NOTE(review): inp_spec_list=None / inp_reac_list=None presumably mean
# "all species" / "all reactions" — confirm in the package.
# NOTE(review): update=True presumably stores the results on reacanot
# (candidates, one_candidates, sum_match_score are read later) — confirm.
spec_pred_annotations = specanot.predictAnnotationByName(inp_spec_list=None)
reac_pred_annotations = reacanot.predictAnnotation(inp_spec_dict=specanot.formula,
                                                   inp_reac_list=None,
                                                   inp_ref_mat=ref_mat,
                                                   update=True)

In [21]:
# ranked_one_cands = dict()
# for one_k in reacanot.match_score.keys():
#   one_itm = pd.DataFrame.from_dict(reacanot.match_score[one_k], orient='index', columns=['match_score'])
#   one_itm.sort_values(ascending=False, by='match_score', inplace=True)
#   ranked_one_cands[one_k] = [one_itm.index[0]]

In [15]:
# Starting state for the iterative refinement in the following cell, read
# from the current attributes of reacanot and specanot.
cur_candidates_dict = reacanot.candidates
cur_spec_formula_dict = specanot.formula
cur_reac_match_score = reacanot.sum_match_score
cur_one_cands = reacanot.one_candidates
# Upper bound on the number of refinement iterations.
num_iter=10
# When True, progress messages are printed.
show_message=True

In [16]:
# Iteratively refine species formulas from the chosen reaction candidates and
# keep a refinement only while the summed reaction match score increases.
flag = False  # set once an iteration fails to improve the score
if show_message:
  print("Initial match score: %.02f" % cur_reac_match_score)
  print("*************************")
for rep in range(num_iter):
  if flag:
    break
  if show_message:
    print("Iteration %d" % (rep+1))

  # Gather the species updates implied by the first candidate of each reaction.
  all_upd_spec = {}
  for one_rid in cur_one_cands.keys():
    one_upd_spec = reacanot.updateSpeciesByAReaction(
        inp_rid=one_rid,
        inp_spec_dict=cur_spec_formula_dict,
        inp_rhea=cur_one_cands[one_rid][0],
        inp_ref_mat=ref_mat)
    all_upd_spec = tools.updateDictKeyToList(all_upd_spec, one_upd_spec)

  # Species with an update get the de-duplicated formulas of their new ChEBI
  # terms; all other species keep their current formulas.
  upd_spec_formula_dict = {
      one_sid: (list({ref_shortened_chebi_to_formula[val] for val in all_upd_spec[one_sid]})
                if one_sid in all_upd_spec else one_formulas)
      for one_sid, one_formulas in cur_spec_formula_dict.items()
  }

  # Predict all reactions again with the proposed formulas, without storing
  # the result on reacanot (update=False).
  new_res_df = reacanot.predictAnnotation(inp_spec_dict=upd_spec_formula_dict,
                                          inp_reac_list=None,
                                          inp_ref_mat=ref_mat,
                                          update=False)
  upd_reac_match_score = new_res_df['sum_match_score']

  # Check whether to continue: accept the proposal only on a strict improvement.
  if upd_reac_match_score > cur_reac_match_score:
    cur_candidates_dict = new_res_df['candidates']
    cur_one_cands = new_res_df['one_candidates']
    cur_spec_formula_dict = upd_spec_formula_dict
    cur_reac_match_score = upd_reac_match_score
  else:
    flag = True
  if show_message:
    print("Updated match score: %.02f" % cur_reac_match_score)
    if flag:
      print("Score not increasing. Quitting iteration...")
    else:
      print("*************************")
# if show_message:
# print("\nCalculation finished.")
# return cur_candidates_dict, cur_spec_formula_dict, cur_reac_match_score, new_query_df, rep

Initial match score: 82.94
*************************
Iteration 1
Updated match score: 89.97
*************************
Iteration 2
Updated match score: 90.17
*************************
Iteration 3
Updated match score: 90.17
Score not increasing. Quitting iteration...


# Get Accuracy for both classes

In [41]:
# def getAccuracy(ref_annotation=None):
#   """
#   Compute accuracy of species annotation.
#   A list of annotations of 
#   a single species (identified by each ID) 
#   is considered accurate if it includes
#   the corresponding value of ref_annotation.
#   (More precisely, if there is at least one
#   intersection).
  
#   Parameters
#   ----------
#   ref_annotation: dict
#       {species_id: [str-annotation]}

#   Returns
#   -------
#   : float
#   """
#   accuracy = []
#   if ref_annotation is None:
#     ref = specanot.exist_annotation
#   else:
#     ref = ref_annotation
#   species_to_test = set(ref.keys()).intersection(set(specanot.formula.keys()))
#   for one_k in species_to_test:
#     if set(ref[one_k]).intersection(specanot.formula[one_k]):
#       accuracy.append(True)
#     else:
#       accuracy.append(False)
#   return np.mean(accuracy)