In [1]:
# developing (and testing) Alternative methods for species
import collections
import compress_pickle
import editdistance
import itertools
import libsbml
import numpy as np
import os
import pickle
import pandas as pd
import re
import sys
import time
import matplotlib.pyplot as plt
%matplotlib inline  

BIOMD_12 = 'BIOMD0000000012.xml'
# Root of the local AutomateAnnotation checkout. Set the AMAS_BASE_DIR
# environment variable to run this notebook on another machine; every
# other data path below is derived from this single root.
BASE_DIR = os.environ.get('AMAS_BASE_DIR', '/Users/woosubs/Desktop/AutomateAnnotation/')
DATA_DIR = os.path.join(BASE_DIR, "DATA")
ALGO_DIR = os.path.join(DATA_DIR, "algo")
CHEBI_DIR = os.path.join(DATA_DIR, "chebi")
RHEA_DIR = os.path.join(DATA_DIR, "rhea")
BIOMODEL_DIR = os.path.join(DATA_DIR, "biomodels/curated_biomodels_31mar2021")
BIGG_DIR = os.path.join(DATA_DIR, "bigg")

# dir for alternative methods for species
ALT_DIR = os.path.join(BASE_DIR, "AMAS_suppl", "alt_methods_species_data")
ecoli_fpath = os.path.join(BIGG_DIR, "e_coli_core.xml")

# The AMAS package is imported from the parent of the working directory.
PROJ_DIR = os.path.join(os.getcwd(), os.pardir)
AMAS_DIR = os.path.join(PROJ_DIR, "AMAS")
# Re-running this cell must not keep appending the same entry.
if PROJ_DIR not in sys.path:
  sys.path.append(PROJ_DIR)

from AMAS import species_annotation as sa
from AMAS import reaction_annotation as ra
from AMAS import recommender
from AMAS import constants as cn
from AMAS import iterator as it
from AMAS import tools

In [2]:
# Load the compressed synonym dictionary from the AMAS reference directory.
# NOTE(review): presumably maps a ChEBI ID to its lower-cased synonyms - confirm.
with open(os.path.join(cn.REF_DIR, 'chebi_low_synonyms_comp.lzma'), 'rb') as f:
  chebi_low_synonyms = compress_pickle.load(f)
# Every distinct character that occurs in any of the synonym strings.
all_chars = {
    one_char
    for one_synonyms in chebi_low_synonyms.values()
    for one_synonym in one_synonyms
    for one_char in one_synonym
}

In [3]:
def getCountOfIndividualCharacters(inp_str):
  """
  Count how often each character between
  a-z and 0-9 occurs in a string.
  Every other character (upper-case letters,
  whitespace, punctuation, ...) is ignored,
  so lower-case the input first if upper-case
  letters should be counted as well.

  Parameters
  ----------
  inp_str: str

  Returns
  -------
  : collections.Counter
      Maps each matched character to its count
  """
  # Matching one character at a time yields the characters directly,
  # so multi-character matches never have to be flattened.
  return collections.Counter(re.findall('[a-z0-9]', inp_str))
#
def getTupleFromDict(inp_dict):
  """
  Get a list of tuples,
  where each tuple has an item and a key
  from a dictionary.
  e.g.) {'key': ['x', 'y']}
  will be transformed as
  [('x', 'key'), ('y', 'key')]

  Parameters
  ----------
  inp_dict: dict
      Maps each key to an iterable of items

  Returns
  -------
  : list-tuple
      (item, key) pairs, in the iteration
      order of the dictionary and of its items
  """
  return [(one_itm, one_k)
          for one_k, one_itms in inp_dict.items()
          for one_itm in one_itms]

In [5]:
# Unpickle the precomputed (name, ChEBI ID) tuples.
# NOTE(review): pickle.load executes arbitrary code from the file - only load
# pickles produced locally.
with open(os.path.join(ALT_DIR, "chebi_name_id_tuples.pickle"), "rb") as fp:
  chebi_str_tups = pickle.load(fp)
# chebi_str_tups = getTupleFromDict(chebi_low_synonyms)
# Pair the character counts of every name with the ID stored next to it.
chebi_count_tups = [
    (getCountOfIndividualCharacters(one_tup[0]), one_tup[1])
    for one_tup in chebi_str_tups
]

In [6]:
# [val for val in chebi_str_tups if editdistance.eval('atp', val[0]) <= 1]
# Show every stored name of the two ChEBI terms below.
[one_tup for one_tup in chebi_str_tups
 if one_tup[1] in ('CHEBI:15422', 'CHEBI:16027')]

[("adenosine 5'-triphosphate", 'CHEBI:15422'),
 ("adenosine-5'-triphosphate", 'CHEBI:15422'),
 ('atp', 'CHEBI:15422'),
 ('h4atp', 'CHEBI:15422'),
 ("adenosine 5'-(tetrahydrogen triphosphate)", 'CHEBI:15422'),
 ('adenosine triphosphate', 'CHEBI:15422'),
 ('adenosine phosphate', 'CHEBI:16027'),
 ('pa', 'CHEBI:16027'),
 ('adenosine monophosphate', 'CHEBI:16027'),
 ('adenylate', 'CHEBI:16027'),
 ("5'-adenosine monophosphate", 'CHEBI:16027'),
 ('amp', 'CHEBI:16027'),
 ("ado5'p", 'CHEBI:16027'),
 ("adenosine 5'-phosphate", 'CHEBI:16027'),
 ("5'-o-phosphonoadenosine", 'CHEBI:16027'),
 ('adenosini phosphas', 'CHEBI:16027'),
 ("5'-amp", 'CHEBI:16027'),
 ("5'-adenylic acid", 'CHEBI:16027'),
 ('adenylic acid', 'CHEBI:16027'),
 ("adenosine-5'p", 'CHEBI:16027'),
 ("adenosine-5'-monophosphoric acid", 'CHEBI:16027'),
 ('fosfato de adenosina', 'CHEBI:16027'),
 ("adenosine 5'-(dihydrogen phosphate)", 'CHEBI:16027'),
 ('pado', 'CHEBI:16027'),
 ("phosphate d'adenosine", 'CHEBI:16027'),
 ("adenosine 5'-mo

In [7]:
# Reference table of character counts: one column per character and one row
# per entry of chebi_str_tups (row i is used for chebi_str_tups[i]).
# NOTE(review): going by the file name the rows are already normalized,
# presumably to unit length - confirm against the script that wrote the CSV.
norm_ref_df = pd.read_csv(os.path.join(ALT_DIR, 'charcount_df_normalized.csv'))
norm_ref_df

Unnamed: 0,a,b,c,d,e,f,g,h,i,j,...,0,1,2,3,4,5,6,7,8,9
0,0.000000,0.000000,0.000000,0.000000,0.530330,0.000000,0.0,0.176777,0.353553,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,0.200000,0.000000,0.000000,0.000000,0.400000,0.200000,0.0,0.000000,0.400000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,0.277350,0.000000,0.000000,0.000000,0.554700,0.000000,0.0,0.000000,0.000000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,0.000000,0.000000,0.000000,0.000000,0.371391,0.000000,0.0,0.185695,0.371391,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,0.113228,0.113228,0.000000,0.226455,0.452911,0.000000,0.0,0.226455,0.226455,0.0,...,0.000000,0.226455,0.226455,0.000000,0.113228,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
466443,0.375558,0.037556,0.112667,0.075112,0.338002,0.000000,0.0,0.262891,0.112667,0.0,...,0.000000,0.187779,0.375558,0.075112,0.112667,0.075112,0.037556,0.000000,0.037556,0.000000
466444,0.341593,0.136637,0.034159,0.170797,0.307434,0.068319,0.0,0.239115,0.136637,0.0,...,0.000000,0.136637,0.102478,0.170797,0.136637,0.068319,0.034159,0.000000,0.000000,0.068319
466445,0.338241,0.075165,0.150329,0.075165,0.300658,0.037582,0.0,0.225494,0.112747,0.0,...,0.000000,0.150329,0.338241,0.112747,0.112747,0.075165,0.037582,0.000000,0.037582,0.000000
466446,0.257396,0.032174,0.128698,0.193047,0.289570,0.032174,0.0,0.225221,0.257396,0.0,...,0.096523,0.386094,0.225221,0.096523,0.064349,0.032174,0.032174,0.032174,0.032174,0.064349


In [8]:
# Make sure the lengths match,
# i.e., row i of norm_ref_df describes the ChEBI term stored in chebi_str_tups[i].
# Fail loudly on a mismatch: row positions of norm_ref_df are used to index
# chebi_str_tups, so merely printing False would let wrong annotations through.
assert len(chebi_str_tups) == norm_ref_df.shape[0], \
    "chebi_str_tups has %d entries but norm_ref_df has %d rows" % (len(chebi_str_tups), norm_ref_df.shape[0])
print(len(chebi_str_tups) == norm_ref_df.shape[0])

True


In [9]:
# TODO: construct each query for prediction; and we're ready to build the system!

In [10]:
# Build the recommender from the e_coli_core.xml model file (ecoli_fpath).
recom = recommender.Recommender(libsbml_fpath=ecoli_fpath)

In [11]:
# Species IDs that have an entry in the recommender's exist_annotation mapping;
# these IDs are the pool from which the timing experiments pick their species.
sids = list(recom.species.exist_annotation.keys())
print("Number of Species IDs with Annotation: %d" % len(sids))
print(sids)

Number of Species IDs with Annotation: 72
['M_glc__D_e', 'M_gln__L_c', 'M_gln__L_e', 'M_glu__L_c', 'M_glu__L_e', 'M_glx_c', 'M_h2o_c', 'M_h2o_e', 'M_h_c', 'M_h_e', 'M_icit_c', 'M_lac__D_c', 'M_lac__D_e', 'M_mal__L_c', 'M_mal__L_e', 'M_nad_c', 'M_nadh_c', 'M_nadp_c', 'M_nadph_c', 'M_nh4_c', 'M_13dpg_c', 'M_nh4_e', 'M_o2_c', 'M_2pg_c', 'M_o2_e', 'M_3pg_c', 'M_oaa_c', 'M_pep_c', 'M_6pgc_c', 'M_pi_c', 'M_6pgl_c', 'M_pi_e', 'M_ac_c', 'M_pyr_c', 'M_pyr_e', 'M_q8_c', 'M_q8h2_c', 'M_r5p_c', 'M_ru5p__D_c', 'M_ac_e', 'M_acald_c', 'M_s7p_c', 'M_acald_e', 'M_accoa_c', 'M_succ_c', 'M_succ_e', 'M_succoa_c', 'M_acon_C_c', 'M_xu5p__D_c', 'M_actp_c', 'M_adp_c', 'M_akg_c', 'M_akg_e', 'M_amp_c', 'M_atp_c', 'M_cit_c', 'M_co2_c', 'M_co2_e', 'M_coa_c', 'M_dhap_c', 'M_e4p_c', 'M_etoh_c', 'M_etoh_e', 'M_f6p_c', 'M_fdp_c', 'M_for_c', 'M_for_e', 'M_fru_e', 'M_fum_c', 'M_fum_e', 'M_g3p_c', 'M_g6p_c']


In [12]:
# First goal is to get annotations of species and compute the (total & average) processing time.
# First of all, the original method (edit distance):
one_spec = 'M_glc__D_e'
multiple_specs = sids[:2]
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() can jump when the system clock is adjusted.
begin_time = time.perf_counter()
# res = recom.species.predictAnnotationByEditDistance(inp_str=recom.species.names[one_spec])
res_editdistance = {val: recom.species.predictAnnotationByEditDistance(inp_str=recom.species.names[val])
                    for val in multiple_specs}
end_time = time.perf_counter()
# print(res_editdistance)
time_diff = end_time - begin_time
print("Total processing time: %.02f seconds" % time_diff)
print("Average processing time per species: %.02f seconds" % (time_diff/len(multiple_specs)))

Total processing time: 7.70 seconds
Average processing time per species: 3.85 seconds


In [13]:
def prepareCounterQuery(specs,
                        ref_cols=norm_ref_df.columns,
                        use_id=True):
  """
  Prepare query vectors of character counts, which
  will be used as vectors of predictor variables.
  Each element of specs becomes one column of the
  result; the rows are the reference characters
  (ref_cols) and the values are the character counts
  of the lower-cased name, scaled so that every
  column has a Euclidean length of one.

  Parameters
  ----------
  specs: list-str
      IDs of species (use_id=True), or the
      strings to count directly (use_id=False)
  ref_cols: list-str
      Column names to use; they become
      the row index of the query matrix
  use_id: bool
      If True, use recom.species.getNameToUse
      to get the name of each species ID
      If False, directly use the string

  Returns
  -------
  : pandas.DataFrame
      Scaled query matrix (index: ref_cols, columns: specs).
      A column whose name contains none of the
      reference characters is all NaN (0 divided by 0).
  : dict
      Maps each element of specs to the name that was counted
  """
  name_used = dict()
  query_mat = pd.DataFrame(0, index=ref_cols, columns=specs)
  for one_spec in specs:
    # The two modes only differ in where the name comes from
    name2use = recom.species.getNameToUse(one_spec) if use_id else one_spec
    name_used[one_spec] = name2use
    char_counts = getCountOfIndividualCharacters(name2use.lower())
    for one_char, one_count in char_counts.items():
      # Skip characters without a reference row: assigning them through .loc
      # would silently append a new row and break the alignment with ref_cols
      if one_char in query_mat.index:
        query_mat.loc[one_char, one_spec] = one_count
  # Now, scale each column by its Euclidean length
  div_row = np.sqrt((query_mat ** 2).sum(axis=0))
  norm_query = query_mat.divide(div_row, axis=1)
  return norm_query, name_used

In [14]:
def predictAnnotationByCosineDistance(inp_specs, tol=0.00001):
  """
  Predict annotation by comparing character count vectors.
  The scaled query vector of each species is multiplied
  (dot product) with every row of norm_ref_df, and the
  ChEBI terms of the best-scoring rows are reported.
  NOTE(review): the score is a cosine similarity only if the
  rows of norm_ref_df have unit length - confirm.

  Parameters
  ----------
  inp_specs: list-str
      IDs of species
  tol: float
      Rows whose score is within tol of the
      best score are all kept as candidates

  Returns
  -------
  : dict
      {species ID: {cn.NAME_USED: name that was counted,
                    cn.CHEBI: list of candidate ChEBI IDs,
                    cn.MATCH_SCORE: list of (ChEBI ID, rounded best score),
                    cn.FORMULA: list of formulas found in
                                cn.REF_CHEBI2FORMULA for the candidates}}
  """
  one_query, name_used = prepareCounterQuery(specs=inp_specs, use_id=True)
  # Rows: reference entries, columns: species
  multi_mat = norm_ref_df.dot(one_query)
  max_val = multi_mat.max()
  result = dict()
  for one_spec in one_query.columns:
    best_score = max_val[one_spec]
    # Reference rows tied (within tol) for the best score
    is_best = (multi_mat[one_spec] - best_score).abs() < tol
    cand_tuples = [chebi_str_tups[val] for val in multi_mat.index[is_best.values]]
    one_res = dict()
    one_res[cn.NAME_USED] = name_used[one_spec]
    one_res[cn.CHEBI] = list(set([val[1] for val in cand_tuples]))
    one_res[cn.MATCH_SCORE] = [(val, np.round(best_score, 2))
                               for val in one_res[cn.CHEBI]]
    # Candidates without a known formula are skipped
    one_res[cn.FORMULA] = list(set([cn.REF_CHEBI2FORMULA[val] for val in one_res[cn.CHEBI]
                                    if val in cn.REF_CHEBI2FORMULA]))
    result[one_spec] = one_res
  return result

In [15]:
# Time the character-count (cosine) method on multiple_specs.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() can jump when the system clock is adjusted.
begin_time = time.perf_counter()
res_cosdistance = predictAnnotationByCosineDistance(inp_specs=multiple_specs)
end_time = time.perf_counter()
# print(res_cosdistance)
time_diff = end_time - begin_time
print("Total processing time: %.02f seconds" % time_diff)
print("Average processing time per species: %.02f seconds" % (time_diff/len(multiple_specs)))

Total processing time: 0.06 seconds
Average processing time per species: 0.03 seconds


In [16]:
# Third, we'll use a boosting algorithm to do this.


ModuleNotFoundError: No module named 'catboost'

In [17]:
import catboost

ModuleNotFoundError: No module named 'catboost'

In [None]:
# Match quality might be a little bit weak, as this doesn't consider the ordering of characters or a lot of special characters

In [None]:
# TODO: COMPARE/EVALUATE 1. speed 2. recall 3. precision