In [1]:
# Update species, this time for applying cosine similarity
import collections
import editdistance
import itertools
import libsbml
import numpy as np
import operator
import os
import compress_pickle
import pickle
import pandas as pd
import re
import sys
import matplotlib.pyplot as plt
%matplotlib inline  

# Data locations. Every directory below DATA_DIR is derived from BASE_DIR,
# so pointing the notebook at another checkout means editing one line.
BIOMD_12 = 'BIOMD0000000012.xml'
BASE_DIR = '/Users/woosubs/Desktop/AutomateAnnotation/'
DATA_DIR = os.path.join(BASE_DIR, "DATA")
ALGO_DIR = os.path.join(DATA_DIR, "algo")
CHEBI_DIR = os.path.join(DATA_DIR, "chebi")
RHEA_DIR = os.path.join(DATA_DIR, "rhea")
BIOMODEL_DIR = os.path.join(DATA_DIR, "biomodels/curated_biomodels_31mar2021")
# Built from DATA_DIR like its siblings (was a second absolute-path literal
# that had to be kept in sync with BASE_DIR by hand).
BIGG_DIR = os.path.join(DATA_DIR, "bigg")
ecoli_fpath = os.path.join(BIGG_DIR, "e_coli_core.xml")

# The project root is taken to be the parent of the current working directory.
PROJ_DIR = os.path.join(os.getcwd(), os.pardir)
AMAS_DIR = os.path.join(PROJ_DIR, "AMAS")
# Guarded so that re-running this cell does not keep growing sys.path.
if PROJ_DIR not in sys.path:
  sys.path.append(PROJ_DIR)

from AMAS import species_annotation as sa
from AMAS import reaction_annotation as ra
from AMAS import recommender
from AMAS import constants as cn
from AMAS import tools

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [2]:
# Load the pickled object stored as 'chebi_models.pickle' in prev_notebook_dir.
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load pickles that come from a trusted source.
prev_notebook_dir = '/Users/woosubs/Desktop/AutomateAnnotation/AnnotationRecommender/annotation_recommender/notebook'
with open(os.path.join(prev_notebook_dir, 'chebi_models.pickle'), 'rb') as handle:
  chebi_models = pickle.load(handle)

# Load two compressed files from cn.REF_DIR. The files are opened in binary
# mode and the open handles are passed to compress_pickle.load.
with open(os.path.join(cn.REF_DIR, 'chebi_shortened_formula_comp.lzma'), 'rb') as f:
  ref_shortened_chebi_to_formula = compress_pickle.load(f)
with open(os.path.join(cn.REF_DIR, 'chebi_low_synonyms_comp.lzma'), 'rb') as f:
  chebi_low_synonyms = compress_pickle.load(f)

In [3]:
# Load 'charcount_df_scaled.lzma' from cn.REF_DIR; here a path (not an open
# handle) is passed and the compression format is given explicitly.
combs_charcount = compress_pickle.load(os.path.join(cn.REF_DIR, 'charcount_df_scaled.lzma'),
                                       compression="lzma")

In [4]:
# Split the loaded table by column position:
# - all columns except the last two -> charcount_df, the default reference
#   matrix of predictAnnotationByCosineSimilarity
# - the last two columns -> chebi_charcount_df, whose 'chebi' column is read
#   to translate matching rows into ChEBI IDs
charcount_df = combs_charcount.iloc[:, :-2]
chebi_charcount_df = combs_charcount.iloc[:, -2:]

In [5]:
def getCountOfIndividualCharacters(inp_str):
  """
  Count how often each alphanumeric
  character occurs in a string.
  The string is lower-cased first, and only
  characters in a-z and 0-9 are counted;
  everything else is ignored.

  Parameters
  ----------
  inp_str: str

  Returns
  -------
  : collections.Counter
      {character: number of occurrences}
  """
  lowered = inp_str.lower()
  # Each match is a run of consecutive a-z/0-9 characters
  alnum_runs = re.findall('[a-z0-9]+', lowered)
  return collections.Counter(''.join(alnum_runs))
#
# def getTupleFromDict(inp_dict):
#   """
#   Get a list of tuples,
#   where each tuple has an item and a key
#   from a dictionary. 
#   e.g.) {'key': ['x', 'y']}
#   will be transformed into
#   [('x', 'key'), ('y', 'key')]
   
#   Parameters
#   ----------
#   inp_dict: dict
  
#   Returns
#   -------
#   : list-tuple
#   """
#   res_list = []
#   for one_k in inp_dict.keys():
#     one_itm = inp_dict[one_k]
#     res_list.append([(val, one_k) for val in one_itm])
#   return list(itertools.chain(*res_list))

def prepareCounterQuery(specs,
                        ref_cols,
                        use_id=True):
  """
  Build the query matrix of character-count
  vectors, one column per item of specs,
  with every column scaled by its
  Euclidean length.

  The string that is counted depends on use_id:
  if True, it is spec_cl.getNameToUse(item),
  where spec_cl is a module-level object that
  must already exist when this function is called;
  if False, the item itself is used.
  Counting is done by
  getCountOfIndividualCharacters(), which
  lower-cases its input.

  A string without any a-z/0-9 character gives
  an all-zero column, so dividing by its
  length leaves NaN in that column.

  Parameters
  ----------
  specs: list-str
      IDs of species
      (or raw strings if use_id is False)
  ref_cols: list-str
      Column names to use; they form
      the row index of the query matrix
  use_id: bool
      If False, directly use the string
      If True, use getNameToUse

  Returns
  -------
  : pandas.DataFrame
      Scaled query matrix
      (rows: ref_cols, columns: specs)
  : dict
      {item of specs: string that was counted}
  """
  names_by_spec = dict()
  counts_mat = pd.DataFrame(0, index=ref_cols, columns=specs)
  for spec in specs:
    # The two modes differ only in where the string comes from
    query_str = spec_cl.getNameToUse(spec) if use_id else spec
    counts = getCountOfIndividualCharacters(query_str)
    names_by_spec[spec] = query_str
    for char, n_char in counts.items():
      counts_mat.loc[char, spec] = n_char

  def _euclideanLength(column):
    # Square root of the sum of squared counts of one column
    return np.sqrt(np.sum([entry**2 for entry in column]))

  col_lengths = counts_mat.apply(_euclideanLength, axis=0)
  scaled_mat = counts_mat.divide(col_lengths, axis=1)
  return scaled_mat, names_by_spec

def predictAnnotationByCosineSimilarity(inp_strs=None, inp_ids=None, ref_df=charcount_df):
  """
  Predict annotations by comparing
  character-count vectors.
  Each query vector (scaled in
  prepareCounterQuery) is multiplied with
  every row of ref_df, and the rows whose
  product is within 0.00001 of the best
  product are kept as candidates.
  NOTE(review): the product is a cosine
  similarity only if the rows of ref_df are
  already scaled to unit length - confirm.

  If both arguments are given,
  inp_ids takes precedence.

  Parameters
  ----------
  inp_strs: list-str
      Strings that will be directly used
      for prediction
  inp_ids: list-str
      IDs with which name2use will be
      determined
  ref_df: DataFrame
      Reference database

  Returns
  -------
  : dict/None
      One entry per given string/ID:
        {string or ID:
          {cn.NAME_USED: str,
           cn.CHEBI: [list-ChEBI],
           cn.MATCH_SCORE: [(ChEBI, float)],
           cn.FORMULA: [list-formula]}}
      if no name/ID is given, return None
  """
  if inp_ids:
    specs, use_id = inp_ids, True
  elif inp_strs:
    specs, use_id = inp_strs, False
  else:
    return None
  one_query, name_used = prepareCounterQuery(specs=specs,
                                             ref_cols=ref_df.columns,
                                             use_id=use_id)
  # rows: entries of ref_df, columns: queried species
  scores = ref_df.dot(one_query)
  best_scores = scores.max()
  result = dict()
  for spec in one_query.columns:
    best = best_scores[spec]
    # all rows tied with the best score (up to a small tolerance)
    is_best = abs(scores[spec] - best) < 0.00001
    best_rows = scores[is_best].index
    chebi_ids = list(set(chebi_charcount_df.loc[best_rows, 'chebi']))
    rounded_best = np.round(best, 2)
    # ChEBI IDs without a known formula are skipped
    formulas = {cn.REF_CHEBI2FORMULA[chebi] for chebi in chebi_ids
                if chebi in cn.REF_CHEBI2FORMULA}
    result[spec] = {
        cn.NAME_USED: name_used[spec],
        cn.CHEBI: chebi_ids,
        cn.MATCH_SCORE: [(chebi, rounded_best) for chebi in chebi_ids],
        cn.FORMULA: list(formulas),
    }
  return result

In [6]:
# Spot check: distinct 'chebi' values stored at three row labels of
# chebi_charcount_df (same lookup that the prediction function performs).
cand_index = [0, 100, 2000]
set(chebi_charcount_df.loc[cand_index, 'chebi'])

{'CHEBI:18357', 'CHEBI:28309', 'CHEBI:36683'}

In [7]:
# spec_cl is the module-level object that prepareCounterQuery reads when
# use_id=True, so it has to be created before the call below.
spec_cl = sa.SpeciesAnnotation(libsbml_fpath = ecoli_fpath)
one_query, one_name = prepareCounterQuery(['M_glc__D_e'],charcount_df.columns) 
print(one_name)
# Scaled weight of the character 'g' in the query vector of this species
one_query.loc['g', 'M_glc__D_e']

{'M_glc__D_e': 'D-Glucose'}


0.35355339059327373

In [8]:
# Query with a raw string: inp_strs is used as is (use_id=False path)
predictAnnotationByCosineSimilarity(inp_strs=['atp'])

{'atp': {'name_used': 'atp',
  'chebi': ['CHEBI:53394',
   'CHEBI:15422',
   'CHEBI:74926',
   'CHEBI:37537',
   'CHEBI:61432',
   'CHEBI:30616',
   'CHEBI:15702'],
  'match_score': [('CHEBI:53394', 1.0),
   ('CHEBI:15422', 1.0),
   ('CHEBI:74926', 1.0),
   ('CHEBI:37537', 1.0),
   ('CHEBI:61432', 1.0),
   ('CHEBI:30616', 1.0),
   ('CHEBI:15702', 1.0)],
  'formula': ['C10N5O13P3', 'O40PW12', 'C8O4', 'C7O4', 'C36O8']}}

In [10]:
# Build a Recommender from the model file at ecoli_fpath
recom = recommender.Recommender(libsbml_fpath=ecoli_fpath)

In [11]:
# Recommendation for a single string passed as pred_str
recom.getSpeciesAnnotation(pred_str='amp')

Recommendation(id='amp', credibility=0.97, candidates=[('CHEBI:50070', 1.0), ('CHEBI:60755', 1.0), ('CHEBI:456215', 1.0), ('CHEBI:53780', 1.0), ('CHEBI:6716', 1.0), ('CHEBI:44387', 1.0), ('CHEBI:45021', 1.0), ('CHEBI:37537', 1.0), ('CHEBI:78509', 1.0), ('CHEBI:28971', 1.0), ('CHEBI:176804', 1.0), ('CHEBI:16027', 1.0), ('CHEBI:60882', 1.0), ('CHEBI:64305', 1.0), ('CHEBI:138016', 1.0), ('CHEBI:51135', 1.0)], urls=['https://www.ebi.ac.uk/chebi/searchId.do?chebiId=CHEBI%3A50070', 'https://www.ebi.ac.uk/chebi/searchId.do?chebiId=CHEBI%3A60755', 'https://www.ebi.ac.uk/chebi/searchId.do?chebiId=CHEBI%3A456215', 'https://www.ebi.ac.uk/chebi/searchId.do?chebiId=CHEBI%3A53780', 'https://www.ebi.ac.uk/chebi/searchId.do?chebiId=CHEBI%3A6716', 'https://www.ebi.ac.uk/chebi/searchId.do?chebiId=CHEBI%3A44387', 'https://www.ebi.ac.uk/chebi/searchId.do?chebiId=CHEBI%3A45021', 'https://www.ebi.ac.uk/chebi/searchId.do?chebiId=CHEBI%3A37537', 'https://www.ebi.ac.uk/chebi/searchId.do?chebiId=CHEBI%3A78509',

In [13]:
# Recommendations for several strings at once, passed as pred_strs
recom.getSpeciesListAnnotation(pred_strs=['atp', 'amp'])

[Recommendation(id='atp', credibility=0.97, candidates=[('CHEBI:74926', 1.0), ('CHEBI:15702', 1.0), ('CHEBI:61432', 1.0), ('CHEBI:53394', 1.0), ('CHEBI:15422', 1.0), ('CHEBI:37537', 1.0), ('CHEBI:30616', 1.0)], urls=['https://www.ebi.ac.uk/chebi/searchId.do?chebiId=CHEBI%3A74926', 'https://www.ebi.ac.uk/chebi/searchId.do?chebiId=CHEBI%3A15702', 'https://www.ebi.ac.uk/chebi/searchId.do?chebiId=CHEBI%3A61432', 'https://www.ebi.ac.uk/chebi/searchId.do?chebiId=CHEBI%3A53394', 'https://www.ebi.ac.uk/chebi/searchId.do?chebiId=CHEBI%3A15422', 'https://www.ebi.ac.uk/chebi/searchId.do?chebiId=CHEBI%3A37537', 'https://www.ebi.ac.uk/chebi/searchId.do?chebiId=CHEBI%3A30616']),
 Recommendation(id='amp', credibility=0.97, candidates=[('CHEBI:50070', 1.0), ('CHEBI:60755', 1.0), ('CHEBI:456215', 1.0), ('CHEBI:53780', 1.0), ('CHEBI:6716', 1.0), ('CHEBI:44387', 1.0), ('CHEBI:45021', 1.0), ('CHEBI:37537', 1.0), ('CHEBI:78509', 1.0), ('CHEBI:28971', 1.0), ('CHEBI:176804', 1.0), ('CHEBI:16027', 1.0), ('CHE

In [3]:
# Fixture values for the checks below.
# NOTE(review): these look like they were copied from the project's unit
# tests - confirm and import them from there if so.

# Model files located in cn.TEST_DIR
BIOMD_190_PATH = os.path.join(cn.TEST_DIR, 'BIOMD0000000190.xml')
BIOMD_634_PATH = os.path.join(cn.TEST_DIR, 'BIOMD0000000634.xml')
# Species candidates as (ChEBI ID, score) tuples, each with its ChEBI URL
ONE_SPEC_CAND = ('CHEBI:15414', 1.0)
ONE_SPEC_URL = 'https://www.ebi.ac.uk/chebi/searchId.do?chebiId=CHEBI%3A15414'
TWO_SPEC_CAND = ('CHEBI:15729', 1.0)
TWO_SPEC_URL = 'https://www.ebi.ac.uk/chebi/searchId.do?chebiId=CHEBI%3A15729'

# Reaction candidate as (Rhea ID, score) tuple, with its Rhea URL
ONE_REAC_CAND = ('RHEA:28830', 1.0)
ONE_REAC_URL = 'https://www.rhea-db.org/rhea/28830'

# Species / reaction identifiers
SPECIES_SAM = 'SAM'
SPECIES_SAM_NAME = 'S-adenosyl-L-methionine'
SPECIES_ORN = 'ORN'
REACTION_ODC = 'ODC'
REACTION_SAMDC = 'SAMdc'

# Same ChEBI ID as in ONE_SPEC_CAND; used to look up cn.REF_CHEBI2FORMULA
ONE_CHEBI = 'CHEBI:15414'

In [8]:
# Rebind recom to a Recommender built from BIOMD_634_PATH; the cells that
# follow read recom.species / recom.reactions of this object.
recom = recommender.Recommender(libsbml_fpath=BIOMD_634_PATH)
# Disabled snippet kept for reference. It refers to `self`, so it cannot run
# in a notebook cell as written.
# two_specs =  recom.getSpeciesListAnnotation(pred_ids=[SPECIES_SAM, SPECIES_ORN],
#                                             update=True, method='cdist')
# self.assertTrue((ONE_CHEBI, 1.0) in self.recom.species.candidates[SPECIES_SAM])
# one_formula = cn.REF_CHEBI2FORMULA[ONE_CHEBI]
# self.assertTrue(one_formula in self.recom.species.formula[SPECIES_SAM])    

In [10]:
# Inspect the exist_annotation attribute of recom.reactions
recom.reactions.exist_annotation

{}

In [24]:
# Reference set for evaluation: species IDs mapped to their existing formula
# annotation, leaving out species whose entry is empty.
refs = {spec_id: formulas
        for spec_id, formulas in recom.species.exist_annotation_formula.items()
        if formulas}
# Only species that have a reference are evaluated
specs2eval = list(refs)

In [25]:
# Species IDs that have a non-empty existing formula annotation
specs2eval

['ATP', 'ADP', 'AMP']

In [29]:
# Predict for the species under evaluation, then keep only the predicted
# formulas so the result has the same layout as refs ({species ID: formulas}).
preds_comb = recom.species.predictAnnotationByCosineSimilarity(inp_ids=specs2eval)
preds = {spec_id: one_pred[cn.FORMULA]
         for spec_id, one_pred in preds_comb.items()}

In [38]:
# Compare predicted formulas (preds) against existing annotations (refs);
# mean=True is passed to both helpers.
recall = tools.getRecall(ref=refs, pred=preds, mean=True)
precision = tools.getPrecision(ref=refs, pred=preds, mean=True)

In [39]:
# Display the recall returned by tools.getRecall
recall

1.0

In [40]:
# Display the precision returned by tools.getPrecision.
# NOTE(review): a low value here presumably reflects the many tied candidate
# formulas returned per species - confirm against preds.
precision

0.12474747474747473

In [34]:
# Display the predicted formulas per species
preds

{'ATP': ['C7O4', 'C8O4', 'O40PW12', 'C36O8', 'C10N5O13P3'],
 'ADP': ['C10N5O10P2',
  'C12N3O6',
  'C9N2O2',
  'C22O2',
  'C12N',
  'C20N10O11P2',
  'C26',
  'C3N2O2',
  'C8O2',
  'C5N3',
  'C9Cl2NO'],
 'AMP': ['C16O',
  'C24O4',
  '(C4O2)n',
  'C10N2O2',
  'C6O4',
  'C16N3O4S',
  '(C3NO)n',
  'C36O8',
  'C10ClN7O',
  'C7NO4',
  'C10N5O7P',
  'C79N20O16']}

In [20]:
# Look up the formula stored for ONE_CHEBI in cn.REF_CHEBI2FORMULA
one_formula = cn.REF_CHEBI2FORMULA[ONE_CHEBI]

In [21]:
# Display the formula looked up for ONE_CHEBI
one_formula

'C15N6O5S'

In [13]:
# Edit-distance prediction for the same string, for comparison with the
# cosine-similarity results shown earlier
recom.species.predictAnnotationByEditDistance('amp')

{'name_used': 'amp',
 'chebi': ['CHEBI:16027', 'CHEBI:28971', 'CHEBI:456215'],
 'match_score': [('CHEBI:16027', 1.0),
  ('CHEBI:28971', 1.0),
  ('CHEBI:456215', 1.0)],
 'formula': ['C16N3O4S', 'C10N5O7P']}