In [1]:
# Test all testable biomodels (i.e., models with existing annotation)
import libsbml
import numpy as np
import os
import pickle
import pandas as pd
import sys
import matplotlib.pyplot as plt
%matplotlib inline  

# Root folder that holds both the project checkout and the DATA folder.
# Set the AUTOMATE_ANNOTATION_DIR environment variable to run the notebook on
# another machine; the default keeps the original location.
BASE_DIR = os.environ.get('AUTOMATE_ANNOTATION_DIR',
                          '/Users/woosubs/Desktop/AutomateAnnotation/')
PROJ_DIR = os.path.join(BASE_DIR, "AnnotationRecommender/")
MOD_DIR = os.path.join(PROJ_DIR, "annotation_recommender")
# Guarded so that re-running this cell does not add the same entry twice.
if MOD_DIR not in sys.path:
  sys.path.append(MOD_DIR)

BIOMD_12 = 'BIOMD0000000012.xml'
# Every data folder is derived from DATA_DIR so the layout is stated once.
DATA_DIR = os.path.join(BASE_DIR, "DATA")
ALGO_DIR = os.path.join(DATA_DIR, "algo")
CHEBI_DIR = os.path.join(DATA_DIR, "chebi")
RHEA_DIR = os.path.join(DATA_DIR, "rhea")
BIOMODEL_DIR = os.path.join(DATA_DIR, "biomodels/curated_biomodels_31mar2021")
BIGG_DIR = os.path.join(DATA_DIR, "bigg")
ecoli_fpath = os.path.join(BIGG_DIR, "e_coli_core.xml")


from annotation_recommender import species_annotation as sa
from annotation_recommender import reaction_annotation as ra
from annotation_recommender import constants as cn
from annotation_recommender import iterator as it
from annotation_recommender import tools

def _read_pickle(folder, fname):
  """Return the object unpickled from the file `fname` inside `folder`."""
  with open(os.path.join(folder, fname), 'rb') as pkl_file:
    return pickle.load(pkl_file)

# chebi to shortened formula
ref_shortened_chebi_to_formula = _read_pickle(
    CHEBI_DIR, 'chebi_shortened_formula_30apr2022.pickle')
# shortened formula to chebi
ref_shortened_formula_to_chebi = _read_pickle(
    CHEBI_DIR, 'shortened_formula_to_chebis_20jul2022.pickle')

# synonyms per key, plus a lower-cased and de-duplicated copy of each list
chebi_synonyms = _read_pickle(CHEBI_DIR, 'chebi_synonyms.pickle')
chebi_low_synonyms = {
    one_k: list(set([val.lower() for val in synonyms]))
    for one_k, synonyms in chebi_synonyms.items()
}

ref_kegg2rhea_master = _read_pickle(RHEA_DIR, 'kegg2rhea_master.pickle')
ref_kegg2rhea_bi = _read_pickle(RHEA_DIR, 'kegg2rhea_bi.pickle')

# mapping rhea terms to BI
ref_rhea2bi = _read_pickle(RHEA_DIR, 'rhea_all2bi.pkl')

# load reference matrix
ref_mat = _read_pickle(ALGO_DIR, 'binary_ref_df.pickle')
# check its shape
print(ref_mat.shape)

(13651, 3790)


In [2]:
with open(os.path.join(os.getcwd(), 'eckegg2rhea.pickle'), 'rb') as handle:
  eckegg2rhea = pickle.load(handle)

In [3]:
len(eckegg2rhea)

131

In [4]:
# total number of reactions, summed over every model in the reference dict
all_reactions_to_test = sum(len(model_reactions)
                            for model_reactions in eckegg2rhea.values())
print(all_reactions_to_test)

2188


In [141]:
# NOTE(review): this cell reads `reaction_an`, which this notebook only creates
# inside the per-model loop of the "Testing individual reactions" cell further
# down. On a fresh kernel it therefore has to be run after that loop, and it
# then describes whichever model `reaction_an` was last built for.
# Product of the reference matrix and the query matrix of the current model.
multi_mat = ra.ref_mat.dot(reaction_an.query_df)
# Column-wise maximum of that product (indexed by reaction ID below).
maxes = multi_mat.max()

In [144]:
maxes['HXT']

2

In [None]:
## Testing individual reactions
# One record (dict) is collected per reaction and the DataFrame is built once
# at the end. The previous version pre-filled an all-integer frame and wrote
# strings / floats into it one cell at a time through .loc, which is slow,
# relies on silent dtype upcasting, and needed the row count from another cell.
REAC_RES_COLUMNS = ['model', 'reaction_id', 'num_candidates',
                    'max_match', 'mean_match_score', 'var_match_score',
                    'accuracy']
reac_records = []
for idx, (one_biomd, one_model_ref) in enumerate(eckegg2rhea.items()):
  if idx % 10 == 0:
    print("We are at", idx)
  one_biomd_fpath = os.path.join(BIOMODEL_DIR, one_biomd)
  species_an = sa.SpeciesAnnotation(libsbml_fpath=one_biomd_fpath)
  reaction_an = ra.ReactionAnnotation(libsbml_fpath=one_biomd_fpath)
  # Return values are not used below; the calls are kept for their effect on
  # species_an / reaction_an (presumably they fill .formula and .query_df).
  pred_species = species_an.predictAnnotationByName()
  pred_reaction = reaction_an.predictAnnotation(inp_spec_dict=species_an.formula)
  # Per-reaction maximum of the reference/query product, computed before the
  # iterative update that follows.
  multi_mat = ra.ref_mat.dot(reaction_an.query_df)
  maxes = multi_mat.max()
  res = it.iterateAndGetUpdatedResults(spec_cl=species_an,
                                       reac_cl=reaction_an,
                                       num_iter=5,
                                       show_message=False)
  # one_model_ref maps each reaction ID to its reference terms
  for one_r, ref_terms in one_model_ref.items():
    # reference terms translated through ref_rhea2bi; unmapped terms are skipped
    existing_annotation = [ref_rhea2bi[val] for val in ref_terms
                           if val in ref_rhea2bi]
    updated_candidates = res['candidates'][one_r]
    # correct when at least one reference term is among the candidates
    is_accurate = int(any(val in updated_candidates
                          for val in existing_annotation))
    match_score_list = list(reaction_an.match_score[one_r].values())
    reac_records.append({
        'model': one_biomd,
        'reaction_id': one_r,
        'num_candidates': len(reaction_an.candidates[one_r]),
        'max_match': maxes[one_r],
        'mean_match_score': np.mean(match_score_list),
        'var_match_score': np.var(match_score_list),
        'accuracy': is_accurate,
    })
reac_res_df = pd.DataFrame(reac_records, columns=REAC_RES_COLUMNS)

We are at 0
We are at 10


In [None]:
reac_res_df.tail()

In [128]:
reac_res_df.to_csv('individual_reaction_accuracy.csv')

In [127]:
np.sum(reac_res_df['accuracy'])

1352

In [10]:
# Now, do the same thing for CHEBI; 
with open(os.path.join(os.getcwd(), 'chebi_models.pickle'), 'rb') as handle:
  chebi_models = pickle.load(handle)

In [20]:
# total number of species, summed over every model in chebi_models
all_species_to_test = sum(len(model_species)
                          for model_species in chebi_models.values())
print(all_species_to_test)

4902


In [48]:
# One record (dict) is collected per species and the DataFrame is built once
# at the end, instead of writing strings / floats cell by cell into a frame
# pre-filled with integer zeros (slow, and relies on silent dtype upcasting).
SPEC_RES_COLUMNS = ['model', 'species_id', 'name_used', 'name_length',
                    'num_candidates', 'match_score', 'accuracy']
spec_records = []
for idx, (one_biomd, model_itm) in enumerate(chebi_models.items()):
  if idx % 30 == 0:
    print("We are at", idx)
  one_biomd_fpath = os.path.join(BIOMODEL_DIR, one_biomd)
  species_an = sa.SpeciesAnnotation(libsbml_fpath=one_biomd_fpath)
  pred_species = species_an.predictAnnotationByName(inp_spec_list=list(model_itm.keys()))
  for one_spec in model_itm.keys():
    # use the display name when there is one, otherwise fall back to the ID
    spec_name = species_an.model.getSpecies(one_spec).name
    spec_name_used = spec_name if len(spec_name) > 0 else one_spec
    predicted = species_an.formula[one_spec]
    referenced = [ref_shortened_chebi_to_formula[val]
                  for val in model_itm[one_spec]]
    # Correct when predicted and reference formulas share at least one entry.
    # The size of the intersection is tested: any() on the intersection would
    # test the truthiness of its elements instead of whether it is non-empty.
    is_accurate = int(len(set(referenced).intersection(predicted)) > 0)
    spec_records.append({
        'model': one_biomd,
        'species_id': one_spec,
        'name_used': spec_name_used,
        'name_length': len(spec_name_used),
        'num_candidates': len(species_an.candidates[one_spec]),
        'match_score': species_an.match_score[one_spec],
        'accuracy': is_accurate,
    })
spec_res_df = pd.DataFrame(spec_records, columns=SPEC_RES_COLUMNS)

We are at 0
We are at 30
We are at 60
We are at 90
We are at 120
We are at 150
We are at 180
We are at 210
We are at 240
We are at 270
We are at 300


In [51]:
spec_res_df.tail()

Unnamed: 0,model,species_id,name_used,name_length,num_candidates,match_score,accuracy
4897,BIOMD0000000177.xml,EtOH,EtOH,4,1,1.0,1
4898,BIOMD0000000177.xml,Glycerol,Glycerol,8,1,1.0,1
4899,BIOMD0000000177.xml,Trehalose,Trehalose,9,2,1.0,1
4900,BIOMD0000000177.xml,Succinate,Succinate,9,1,1.0,1
4901,BIOMD0000000177.xml,CO2mito,CO2mito,7,7,0.571429,0


In [52]:
spec_res_df.to_csv('individual_species_accuracy.csv')

In [36]:
# spec_res_df.to_csv('individual_species_accuracy.csv')

# Next step: construct two regression models (one for species, one for reactions) that predict whether an individual annotation is accurate.

In [38]:
from sklearn.linear_model import LogisticRegressionCV

In [65]:
# First, regression model for species
species_feature_cols = ['name_length', 'num_candidates', 'match_score']
X = spec_res_df[species_feature_cols]
y = spec_res_df['accuracy']
# 10-fold cross-validated logistic regression with balanced class weights
clf = LogisticRegressionCV(cv=10, random_state=0, class_weight='balanced')
clf = clf.fit(X, y)
# mean accuracy on the same rows the model was fitted on
fit_accuracy = clf.score(X, y)
print(fit_accuracy)

0.7876376988984088


In [60]:
print(spec_res_df.shape)
print(np.sum(spec_res_df['accuracy']))

(4902, 7)
3641


In [80]:
clf.predict_proba(X.loc[:10, :])

array([[0.36818348, 0.63181652],
       [0.36760611, 0.63239389],
       [0.36760611, 0.63239389],
       [0.37837246, 0.62162754],
       [0.77636439, 0.22363561],
       [0.35925305, 0.64074695],
       [0.70364392, 0.29635608],
       [0.36760611, 0.63239389],
       [0.35925305, 0.64074695],
       [0.33030819, 0.66969181],
       [0.57380944, 0.42619056]])

In [81]:
clf.predict(X.loc[:10, :])

array([1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0])

In [89]:
# save fitted ML object
filename_spec_res_cv = 'trained_cv_species.sav'
# with open(filename_spec_res_cv, 'wb') as model_file:
#   pickle.dump(clf, model_file)

# Test if saving was done correctly
# The file is opened in a `with` block so the handle is closed after loading
# (the previous pickle.load(open(...)) never closed it).
# NOTE: pickle can execute code while loading; only load files you created.
with open(filename_spec_res_cv, 'rb') as model_file:
  loaded_model = pickle.load(model_file)
result = loaded_model.predict_proba(X.loc[:10, :])
print(result)

[[0.36818348 0.63181652]
 [0.36760611 0.63239389]
 [0.36760611 0.63239389]
 [0.37837246 0.62162754]
 [0.77636439 0.22363561]
 [0.35925305 0.64074695]
 [0.70364392 0.29635608]
 [0.36760611 0.63239389]
 [0.35925305 0.64074695]
 [0.33030819 0.66969181]
 [0.57380944 0.42619056]]


In [96]:
one_biomd = list(chebi_models.keys())[0]
one_biomd_fpath = os.path.join(BIOMODEL_DIR, one_biomd)
species_an = sa.SpeciesAnnotation(libsbml_fpath=one_biomd_fpath)
model_itm = chebi_models[one_biomd]
pred_species = species_an.predictAnnotationByName(inp_spec_list=list(model_itm.keys()))

In [92]:
species_an.candidates

{'ATP': ['CHEBI:15422', 'CHEBI:30616'],
 'ADP': ['CHEBI:16761', 'CHEBI:456216', 'CHEBI:73342'],
 'AMP': ['CHEBI:16027', 'CHEBI:28971', 'CHEBI:456215']}

In [106]:
np.array(list(data2prediction))

array([[3., 2., 1.],
       [3., 3., 1.]])

In [110]:
specs = ['ATP', 'ADP']
# Name used for each species: its display name, or its ID when the name is
# empty. This is written inline because `getName`, which this cell used to
# call, is not defined anywhere in the notebook.
names_used = [species_an.model.getSpecies(val).name or val for val in specs]
name_lengths = [len(val) for val in names_used]
nums_candidates = [len(species_an.candidates[val]) for val in specs]
match_scores = [species_an.match_score[val] for val in specs]
# one (name_length, num_candidates, match_score) row per species, i.e. the
# column order the species classifier was fitted with
data2prediction = list(zip(name_lengths, nums_candidates, match_scores))
res_preds = loaded_model.predict_proba(data2prediction)

In [111]:
res_preds

array([[0.36818348, 0.63181652],
       [0.36760611, 0.63239389]])

In [114]:
list(zip(specs, [val[1] for val in res_preds]))

[('ATP', 0.631816517626513), ('ADP', 0.6323938919261204)]

In [118]:
def getNameToUse(inp_id, inp_species_an=None):
  """
  Get name to use;
  If .name is not '', use it;
  otherwise use ID

  Parameters
  ----------
  inp_id: str
      ID of model element
  inp_species_an: object with a .model attribute, optional
      Object whose .model.getSpecies(ID) returns the species.
      When omitted, the notebook-level `species_an` is
      looked up at call time.

  Returns
  -------
  res_name: str
  """
  if inp_species_an is None:
    inp_species_an = species_an
  species_name = inp_species_an.model.getSpecies(inp_id).name
  if len(species_name) > 0:
    return species_name
  return inp_id


# Develop a method to evaluate results using fitted model
def evaluatePredictedSpeciesAnnotation(inp_list,
                                       fitted_model=None,
                                       inp_species_an=None):
  """
  Evaluate the quality of annotation;
  for each individual species.

  One feature row is built per species, in the order
  (name_length, num_candidates, match_score), and passed
  to fitted_model.predict_proba.

  Parameters
  ---------
  inp_list: str-list (or a single str)
      List of species to evaluate (one or more).
      A single ID given as a str is treated as a
      one-element list.
  fitted_model: object with predict_proba, optional
      When omitted, the notebook-level `loaded_model` is
      looked up at call time (a default bound in the
      signature would keep pointing at whatever object
      existed when this cell was run).
  inp_species_an: optional
      Object providing .model, .candidates and
      .match_score. When omitted, the notebook-level
      `species_an` is used.

  Returns
  -------
  res: dict {species_id: probability-of-species-prediction-being-correct}
      Second column of predict_proba for each species;
      empty dict for empty input.
  """
  if isinstance(inp_list, str):
    inp_list = [inp_list]
  # predict_proba cannot be called on zero rows, so answer directly
  if len(inp_list) == 0:
    return dict()
  if fitted_model is None:
    fitted_model = loaded_model
  if inp_species_an is None:
    inp_species_an = species_an
  name_lengths = [len(getNameToUse(inp_id=val, inp_species_an=inp_species_an))
                  for val in inp_list]
  nums_candidates = [len(inp_species_an.candidates[val]) for val in inp_list]
  match_scores = [inp_species_an.match_score[val] for val in inp_list]
  data2prediction = list(zip(name_lengths, nums_candidates, match_scores))
  # keep only the second probability column of each prediction
  pred_probs = [val[1] for val in fitted_model.predict_proba(data2prediction)]
  # Collect probability to be correct
  res = dict(zip(inp_list, pred_probs))
  return res

In [116]:
evaluatePredictedSpeciesAnnotation(inp_list=['ATP', 'ADP'])

{'ATP': 0.631816517626513, 'ADP': 0.6323938919261204}

In [63]:
# re-run using statsmodels;
import statsmodels.api as sm
# NOTE(review): X has no constant column, so this fit presumably has no
# intercept term - confirm that is intended (see sm.add_constant).
species_predictor_cols = ['name_length', 'num_candidates', 'match_score']
X = spec_res_df[species_predictor_cols]
y = spec_res_df['accuracy']
species_logit = sm.Logit(endog=y, exog=X)
log_reg = species_logit.fit()

Optimization terminated successfully.
         Current function value: 0.505515
         Iterations 6


In [64]:
print(log_reg.summary())

                           Logit Regression Results                           
Dep. Variable:               accuracy   No. Observations:                 4902
Model:                          Logit   Df Residuals:                     4899
Method:                           MLE   Df Model:                            2
Date:                Thu, 01 Sep 2022   Pseudo R-squ.:                  0.1134
Time:                        15:02:29   Log-Likelihood:                -2478.0
converged:                       True   LL-Null:                       -2794.9
Covariance Type:            nonrobust   LLR p-value:                2.476e-138
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
name_length        0.0088      0.002      3.531      0.000       0.004       0.014
num_candidates    -0.0154      0.003     -5.221      0.000      -0.021      -0.010
match_score        1.5267      0.061

In [129]:
reac_res_df.head()

Unnamed: 0,model,reaction_id,num_candidates,mean_match_score,var_match_score,accuracy
0,BIOMD0000000152.xml,vcat1,65,0.250263,0.022159,0
1,BIOMD0000000152.xml,vcat2,53,0.233109,0.015749,0
2,BIOMD0000000152.xml,vcat3,54,0.360273,0.00773,0
3,BIOMD0000000152.xml,vcat4,89,0.258619,0.024271,0
4,BIOMD0000000152.xml,vcat5,188,0.214104,0.009476,0


In [135]:
# Second, regression model for reactions
reaction_feature_cols = ['num_candidates', 'mean_match_score']
X = reac_res_df[reaction_feature_cols]
y = reac_res_df['accuracy']
# 10-fold cross-validated logistic regression with balanced class weights
clf = LogisticRegressionCV(cv=10, random_state=0, class_weight='balanced')
clf = clf.fit(X, y)
# mean accuracy on the same rows the model was fitted on
fit_accuracy = clf.score(X, y)
print(fit_accuracy)

0.6951553930530164


In [None]:
# predict probability
clf.predict_proba(X.loc[:5, :])

In [136]:
# re-run using statsmodels;
import statsmodels.api as sm
# NOTE(review): X has no constant column, so this fit presumably has no
# intercept term - confirm that is intended (see sm.add_constant).
reaction_predictor_cols = ['num_candidates', 'mean_match_score', 'var_match_score']
X = reac_res_df[reaction_predictor_cols]
y = reac_res_df['accuracy']
reaction_logit = sm.Logit(endog=y, exog=X)
log_reg = reaction_logit.fit()
print(log_reg.summary())

Optimization terminated successfully.
         Current function value: 0.619364
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:               accuracy   No. Observations:                 2188
Model:                          Logit   Df Residuals:                     2185
Method:                           MLE   Df Model:                            2
Date:                Fri, 02 Sep 2022   Pseudo R-squ.:                 0.06873
Time:                        16:36:38   Log-Likelihood:                -1355.2
converged:                       True   LL-Null:                       -1455.2
Covariance Type:            nonrobust   LLR p-value:                 3.663e-44
                       coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------
num_candidates       0.0002   6.42e-05      2.736      0.006    4.98e-05       0.000
mean_match_

In [None]:
# Develop a method to evaluate results using fitted model
def evaluatePredictedReactionAnnotation(inp_list,
                                        fitted_model=None,
                                        inp_reaction_an=None,
                                        inp_ref_mat=None):
  """
  Evaluate the quality of annotation;
  for each individual reaction.

  One feature row is built per reaction, in the order
  (num_candidates, max_match, mean_match_score). Each value
  is computed the same way as the column of that name in
  reac_res_df.

  Parameters
  ---------
  inp_list: str-list (or a single str)
      List of reactions to evaluate (one or more).
      A single ID given as a str is treated as a
      one-element list.
  fitted_model: object with predict_proba, optional
      Must accept the three features above, in that order.
      When omitted, the notebook-level `loaded_model` is
      used, as before.
      NOTE(review): in this notebook `loaded_model` is read
      from 'trained_cv_species.sav' and the reaction
      classifier is fitted on two columns only - pass a
      model fitted on these three features.
  inp_reaction_an: optional
      Object providing .candidates, .match_score and
      .query_df. When omitted, the notebook-level
      `reaction_an` is used.
  inp_ref_mat: optional
      Reference matrix multiplied with .query_df to get
      the per-reaction maximum. When omitted, `ra.ref_mat`
      is used.

  Returns
  -------
  res: dict {reaction_id: probability-of-reaction-prediction-being-correct}
      Second column of predict_proba for each reaction;
      empty dict for empty input.
  """
  if isinstance(inp_list, str):
    inp_list = [inp_list]
  # predict_proba cannot be called on zero rows, so answer directly
  if len(inp_list) == 0:
    return dict()
  if fitted_model is None:
    fitted_model = loaded_model
  if inp_reaction_an is None:
    inp_reaction_an = reaction_an
  if inp_ref_mat is None:
    inp_ref_mat = ra.ref_mat
  # column-wise maximum of (reference matrix x query matrix), by reaction ID
  maxes = inp_ref_mat.dot(inp_reaction_an.query_df).max()
  nums_candidates = [len(inp_reaction_an.candidates[val]) for val in inp_list]
  max_matches = [maxes[val] for val in inp_list]
  mean_match_scores = [np.mean(list(inp_reaction_an.match_score[val].values()))
                       for val in inp_list]
  data2prediction = list(zip(nums_candidates, max_matches, mean_match_scores))
  # keep only the second probability column of each prediction
  pred_probs = [val[1] for val in fitted_model.predict_proba(data2prediction)]
  # Collect probability to be correct
  res = dict(zip(inp_list, pred_probs))
  return res
