In [1]:
# Checking biomodels with reaction accuracy 0

import libsbml
import numpy as np
import os
import pandas as pd
import pickle
import sys

# Root folder of the AutomateAnnotation workspace. The default is the original
# hard-coded location; set the AUTOMATE_ANNOTATION_DIR environment variable to
# run the notebook from another machine without editing this cell.
BASE_DIR = (os.environ.get("AUTOMATE_ANNOTATION_DIR")
            or '/Users/woosubs/Desktop/AutomateAnnotation/')

# Project checkout and its package folder, derived from the single root above
# (with the default root these resolve to exactly the original paths).
PROJ_DIR = os.path.join(BASE_DIR, "AnnotationRecommender/")
MOD_DIR = os.path.join(PROJ_DIR, "annotation_recommender")
# Guard the append so that re-running this cell does not pile up duplicates.
if MOD_DIR not in sys.path:
    sys.path.append(MOD_DIR)

# Data folders.
DATA_DIR = os.path.join(BASE_DIR, "DATA")
ALGO_DIR = os.path.join(DATA_DIR, "algo")
CHEBI_DIR = os.path.join(DATA_DIR, "chebi")
RHEA_DIR = os.path.join(DATA_DIR, "rhea")
BIOMODEL_DIR = os.path.join(DATA_DIR, "biomodels/curated_biomodels_31mar2021")

# SECURITY: pickle.load can execute arbitrary code while unpickling; only load
# pickle files that were produced locally / come from a trusted source.

# Load the reference matrix.
with open(os.path.join(ALGO_DIR, 'binary_ref_df.pickle'), 'rb') as handle:
    ref_mat = pickle.load(handle)
# Sanity check: print its shape.
print(ref_mat.shape)

# Load the Rhea-to-formula reference object.
with open(os.path.join(RHEA_DIR, 'rhea2formula_reference.pkl'), 'rb') as handle:
    rhea2formula = pickle.load(handle)

# Table of Rhea equation strings; the first TSV column is used as the index.
rhea_df = pd.read_csv(os.path.join(RHEA_DIR, 'rhea_string_equation.tsv'),
                      sep='\t', index_col=0)


from annotation_recommender import species_annotation as sa
from annotation_recommender import reaction_annotation as ra
from annotation_recommender import constants as cn
from annotation_recommender import iterator as it
from annotation_recommender import tools

(13651, 3790)


In [2]:
# Preview the first rows of the Rhea equation table; in the output below the
# index holds Rhea ids and the single column holds the equation string.
rhea_df.head()

Unnamed: 0_level_0,equation
id,Unnamed: 1_level_1
RHEA:10000,H2O + pentanamide = NH4(+) + pentanoate
RHEA:10004,benzyl isothiocyanate = benzyl thiocyanate
RHEA:10008,[protein]-dithiol + a hydroperoxide = [protein...
RHEA:10012,(R)-6-hydroxynicotine + H2O + O2 = 6-hydroxyps...
RHEA:10016,H2O + O-sinapoylcholine = choline + E-sinapate...


In [3]:
# Load the per-model result table; the first CSV column (model file names in
# the output below) becomes the index. The path is relative, so the file is
# looked up in the current working directory.
# NOTE(review): the table saved below shows 10 rows, while the next cell prints
# 12 file names from this same index — the saved outputs may come from
# different runs; confirm by restarting the kernel and re-running.
null_df = pd.read_csv("null_res.csv", index_col=0)
null_df

Unnamed: 0,spec_accuracy,reac_accuracy,iter_num,species_match_score,reaction_match_score,num_species,num_eval_species,num_reactions,num_eval_reactions,total_match_score
BIOMD0000000691.xml,0.0,0.0,2,0.768421,0.473545,19,12,18,6,8.52381
BIOMD0000000094.xml,0.0,0.0,2,0.487289,0.702063,34,2,45,2,31.592857
BIOMD0000000171.xml,0.0,0.0,2,0.613315,0.493132,12,2,26,4,12.821429
BIOMD0000000248.xml,0.285714,0.0,2,0.785185,0.883333,9,7,5,1,4.416667
BIOMD0000000088.xml,0.294118,0.0,2,0.484809,0.751861,105,17,110,18,82.704762
BIOMD0000000108.xml,0.6,0.0,1,0.704125,0.64902,9,5,17,1,11.033333
BIOMD0000000137.xml,0.666667,0.0,1,0.511878,0.281667,21,3,20,1,5.633333
BIOMD0000000038.xml,0.75,0.0,2,0.683613,0.681429,17,4,10,2,6.814286
BIOMD0000000143.xml,0.875,0.0,1,0.916506,0.745833,20,16,20,3,14.916667
BIOMD0000000292.xml,1.0,0.0,1,0.736111,0.854167,6,4,4,1,3.416667


In [4]:
# File names of the models under investigation, taken from the result table's
# index, printed one per line.
null_models = null_df.index.tolist()
for one_biomd in null_models:
    print(one_biomd)

BIOMD0000000691.xml
BIOMD0000000094.xml
BIOMD0000000171.xml
BIOMD0000000248.xml
BIOMD0000000088.xml
BIOMD0000000108.xml
BIOMD0000000137.xml
BIOMD0000000038.xml
BIOMD0000000143.xml
BIOMD0000000292.xml
BIOMD0000000123.xml
BIOMD0000000122.xml


In [5]:
# Caution: BIOMD94 and, even more so, BIOMD88 are large models - handle with care.

# Model file to inspect (one of the entries of the result table loaded earlier).
one_biomd = 'BIOMD0000000292.xml'
one_biomd_fpath = os.path.join(BIOMODEL_DIR, one_biomd)

# Build the species and the reaction annotation objects from the same SBML file.
species_an = sa.SpeciesAnnotation(libsbml_fpath=one_biomd_fpath)
reaction_an = ra.ReactionAnnotation(libsbml_fpath=one_biomd_fpath)

# Model size: number of species, then number of reactions (one per line).
model = species_an.model
print(model.getNumSpecies(), model.getNumReactions(), sep="\n")

# Species are predicted first; the reaction prediction is then given
# species_an.formula as its input (presumably filled in by the species
# prediction step - TODO confirm).
pred_species = species_an.predictAnnotationByName()
pred_reaction = reaction_an.predictAnnotation(inp_spec_dict=species_an.formula)

# Run the iterative update on both annotation objects.
res = it.iterateAndGetUpdatedResults(
    spec_cl=species_an,
    reac_cl=reaction_an,
    num_iter=5,
    show_message=False,
)

6
4


In [6]:
# Show species_an.exist_annotation; in the output below it maps species ids to
# lists of formula strings (presumably the annotations already present in the
# model - confirm against the SpeciesAnnotation class).
print(species_an.exist_annotation)

{'NADPH': ['C21N7O17P3'], 'ADP': ['C10N5O10P2'], 'ATP': ['C10N5O13P3'], 'NADP': ['C21N7O17P3']}


In [7]:
# Show the per-species match score (species id -> score; the values in the
# output below lie between 0.0 and 1.0).
print(species_an.match_score)

{'NADPH': 1.0, 'ADP': 1.0, 'ATP': 1.0, 'X': 0.0, 'Y': 1.0, 'NADP': 0.41666666666666663}


In [8]:
# Show the candidate Rhea ids per reaction (reaction id -> pandas Index of
# Rhea ids in the output below).
print(reaction_an.candidates)

{'v4': Index(['RHEA:18412', 'RHEA:18504', 'RHEA:20676', 'RHEA:33450', 'RHEA:33466',
       'RHEA:33470', 'RHEA:34014', 'RHEA:34018', 'RHEA:36382', 'RHEA:46831',
       'RHEA:46835', 'RHEA:54419', 'RHEA:60099'],
      dtype='object'), 'v3': Index(['RHEA:10155', 'RHEA:10195', 'RHEA:10219', 'RHEA:10227', 'RHEA:10263',
       'RHEA:10275', 'RHEA:10279', 'RHEA:10351', 'RHEA:10379', 'RHEA:10491',
       ...
       'RHEA:66771', 'RHEA:66775', 'RHEA:66779', 'RHEA:66823', 'RHEA:67031',
       'RHEA:67083', 'RHEA:67087', 'RHEA:67091', 'RHEA:67095', 'RHEA:67139'],
      dtype='object', length=727), 'v2': Index(['RHEA:12231', 'RHEA:61595'], dtype='object'), 'v1': Index(['RHEA:12231', 'RHEA:61595'], dtype='object')}


In [12]:
# Show the match score of each candidate Rhea id for reaction 'v1'.
print(reaction_an.match_score['v1'])

{'RHEA:12231': 0.7777777777777778, 'RHEA:61595': 0.875}


In [11]:
# Show reaction_an.exist_annotation (reaction id -> list of Rhea ids in the
# output below). In the saved outputs the only entry, 'v1' -> RHEA:19368, is
# not among the candidates printed for 'v1' (RHEA:12231, RHEA:61595).
print(reaction_an.exist_annotation)

{'v1': ['RHEA:19368']}


In [120]:
# Describe the participants of reaction 'R5'. For the reactants and then for
# the products, print "<stoichiometry>*<species id>" followed by
# "<species id> (<species name>)".
# NOTE(review): 'R5' and its species (e.g. 'O2minus_p') are not among the
# reactions (v1-v4) / species shown elsewhere in this notebook for
# BIOMD0000000292, and the execution counts are out of order - this cell was
# presumably run with a different model loaded; confirm on a fresh kernel.
one_r = model.getReaction('R5')
for participants in (one_r.getListOfReactants(), one_r.getListOfProducts()):
    stoich_labels = [str(ref.stoichiometry)+"*"+ref.species
                     for ref in participants]
    name_labels = ["%s (%s)" % (ref.species, model.getSpecies(ref.species).name)
                   for ref in participants]
    print(stoich_labels)
    print(name_labels)

['2.0*O2minus_p', '2.0*H_p']
['O2minus_p (Superoxide)', 'H_p (Hydrogen)']
['1.0*O2_p', '1.0*H2O2_p']
['O2_p (Oxygen)', 'H2O2_p (H2O2)']


In [130]:
# Sum the 'R5' entry of reaction_an.query_df (3 in the saved output).
# NOTE(review): 'R5' is not one of the reaction ids (v1-v4) shown for the model
# loaded earlier in this notebook - presumably a different model was loaded
# when this cell ran; confirm.
np.sum(reaction_an.query_df['R5'])

3

In [112]:
# Print the raw annotation XML of reaction 'R5'. In the saved output it carries
# an EC-code (isVersionOf) and a KEGG reaction (is) resource, and no Rhea id.
print(model.getReaction('R5').getAnnotationString())

<annotation>
  <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:bqmodel="http://biomodels.net/model-qualifiers/" xmlns:bqbiol="http://biomodels.net/biology-qualifiers/">
    <rdf:Description rdf:about="#metaid_0000057">
      <bqbiol:isVersionOf>
        <rdf:Bag>
          <rdf:li rdf:resource="http://identifiers.org/ec-code/1.15.1.1"/>
        </rdf:Bag>
      </bqbiol:isVersionOf>
      <bqbiol:is>
        <rdf:Bag>
          <rdf:li rdf:resource="http://identifiers.org/kegg.reaction/R00275"/>
        </rdf:Bag>
      </bqbiol:is>
    </rdf:Description>
  </rdf:RDF>
</annotation>


In [114]:
# Look up the display name of species 'H_p' ('Hydrogen' in the saved output).
model.getSpecies('H_p').name

'Hydrogen'

In [105]:
# Print the raw annotation XML of species 'X'; the saved output lists several
# ChEBI resources under a bqbiol:hasPart qualifier.
print(model.getSpecies('X').getAnnotationString())

<annotation>
  <celldesigner:extension>
    <celldesigner:positionToCompartment>inside</celldesigner:positionToCompartment>
    <celldesigner:speciesIdentity>
      <celldesigner:class>SIMPLE_MOLECULE</celldesigner:class>
      <celldesigner:name>X</celldesigner:name>
    </celldesigner:speciesIdentity>
  </celldesigner:extension>
  <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:bqbiol="http://biomodels.net/biology-qualifiers/">
    <rdf:Description rdf:about="#X">
      <bqbiol:hasPart>
        <rdf:Bag>
          <rdf:li rdf:resource="http://identifiers.org/chebi/CHEBI:16108"/>
          <rdf:li rdf:resource="http://identifiers.org/chebi/CHEBI:17797"/>
          <rdf:li rdf:resource="http://identifiers.org/chebi/CHEBI:17363"/>
          <rdf:li rdf:resource="http://identifiers.org/chebi/CHEBI:16905"/>
          <rdf:li rdf:resource="http://identifiers.org/chebi/CHEBI:15946"/>
          <rdf:li rdf:resource="http://identifiers.org/chebi/CHEBI:48153"/>
         

In [59]:
# Map species id -> species name for every species listed in
# species_an.exist_annotation (each species object is looked up once).
{spec.getId(): spec.name
 for spec in map(model.getSpecies, species_an.exist_annotation)}

{'NADPH': 'NADPH', 'ADP': 'ADP', 'ATP': 'ATP', 'NADP': 'NADP_super_+'}

In [60]:
# Map species id -> species name for every species in the model.
dict((spec.getId(), spec.name) for spec in model.getListOfSpecies())

{'NADPH': 'NADPH',
 'ADP': 'ADP',
 'ATP': 'ATP',
 'X': 'X',
 'Y': 'Y',
 'NADP': 'NADP_super_+'}