In [24]:
def getCountOfIndividualCharacters(inp_str):
  """
  Count how often each alphanumeric character
  (a-z and 0-9) occurs in a string.
  The string is lower-cased first, and
  all other characters are ignored.

  Parameters
  ----------
  inp_str: str

  Returns
  -------
  : collections.Counter
      Maps each character to its number of occurrences
  """
  char_counts = collections.Counter()
  # findall yields runs of alphanumeric characters;
  # updating with a string counts its individual characters
  for one_run in re.findall('[a-z0-9]+', inp_str.lower()):
    char_counts.update(one_run)
  return char_counts
#
def getTupleFromDict(inp_dict):
  """
  Flatten a dictionary of iterables into
  a list of (item, key) tuples.
  e.g.) {'key': ['x', 'y']}
  will be transformed into
  [('x', 'key'), ('y', 'key')]

  Parameters
  ----------
  inp_dict: dict
      Each value is an iterable of items

  Returns
  -------
  : list-tuple
  """
  return [(one_val, one_key)
          for one_key, one_vals in inp_dict.items()
          for one_val in one_vals]


def prepareCounterQuery(specs,
                        ref_cols,
                        use_id=True):
  """
  Prepare query vectors, which will be used
  as predictor variables.
  Each species gets one column holding the counts
  of its characters (see getCountOfIndividualCharacters),
  scaled by the Euclidean length of that column.

  If 'use_id' is True, each element of specs is treated
  as a species ID and the name to use is looked up with
  recom.species.getNameToUse; 'recom' is not an argument,
  it is the notebook-level Recommender object.
  If False, the element itself is used as the name.

  Characters that are not in ref_cols are ignored, so the
  result always has exactly the rows given by ref_cols.
  A name without any character in ref_cols has length zero,
  so its column cannot be scaled to a unit vector.

  Parameters
  ----------
  specs: list-str
      IDs of species (or names, if use_id is False)
  ref_cols: list-str
      Column names of the reference data;
      used as the row index of the query
  use_id: bool
      If False, directly use the string
      If True, use getNameToUse

  Returns
  -------
  : pandas.DataFrame
      Scaled query; rows are ref_cols, columns are specs
  : dict
      {element of specs: name used to count characters}
  """
  name_used = dict()
  query_mat = pd.DataFrame(0, index=ref_cols, columns=specs)
  for one_spec in specs:
    name2use = recom.species.getNameToUse(one_spec) if use_id else one_spec
    name_used[one_spec] = name2use
    # characters are lowered in getCountOfIndividualCharacters()
    char_counts = getCountOfIndividualCharacters(name2use)
    for one_char, one_count in char_counts.items():
      # Assigning to a label that is missing from the index would make
      # .loc append a new, NaN-filled row; skip such characters instead
      if one_char in query_mat.index:
        query_mat.loc[one_char, one_spec] = one_count
  # Now, scale each column by its vector length
  div_row = np.sqrt((query_mat ** 2).sum(axis=0))
  norm_query = query_mat.divide(div_row, axis=1)
  return norm_query, name_used

def predictAnnotationByCosineDistance(inp_specs, ref_df):
  """
  Predict annotation by comparing character count vectors.
  The query vector of each species is multiplied with every
  row of ref_df, and the rows whose product ties with the
  maximum are taken as candidates.
  (The product is a cosine similarity only if the rows of
  ref_df are scaled to unit length - to be confirmed by the caller.)

  Uses notebook-level names: chebi_str_tups, which is indexed
  with the row labels of ref_df, and the constants module cn.

  Parameters
  ----------
  inp_specs: list-str
      IDs of species
  ref_df: pandas.DataFrame
      Reference vectors, one row per candidate;
      its columns define the rows of the query

  Returns
  -------
  : dict
      {species ID: dict with keys cn.NAME_USED, cn.CHEBI,
                   cn.MATCH_SCORE and cn.FORMULA}
  """
  query_df, spec_names = prepareCounterQuery(specs=inp_specs,
                                             ref_cols=ref_df.columns,
                                             use_id=True)
  score_df = ref_df.dot(query_df)
  best_scores = score_df.max()
  predictions = dict()
  for spec_id in query_df.columns:
    top_score = best_scores[spec_id]
    # rows whose score equals the best one, up to a small tolerance
    is_top = abs(score_df[spec_id]-top_score)<0.00001
    top_tuples = [chebi_str_tups[idx] for idx in score_df[is_top].index]
    chebis = list(set([tup[1] for tup in top_tuples]))
    scores = [(one_chebi, np.round(top_score, 2)) for one_chebi in chebis]
    # candidates without a known formula are dropped
    formulas = list(set([cn.REF_CHEBI2FORMULA[one_chebi] for one_chebi in chebis \
                         if one_chebi in cn.REF_CHEBI2FORMULA.keys()]))
    predictions[spec_id] = {cn.NAME_USED: spec_names[spec_id],
                            cn.CHEBI: chebis,
                            cn.MATCH_SCORE: scores,
                            cn.FORMULA: formulas}
  return predictions

In [25]:
# developing (and testing) Alternative methods for species, evaluating them
import collections
import copy
import compress_pickle
import editdistance
import itertools
import libsbml
import numpy as np
import os
import pickle
import pandas as pd
import re
import sys
import time
import matplotlib.pyplot as plt
%matplotlib inline  

BIOMD_12 = 'BIOMD0000000012.xml'
# Root of the local working copy; every other data location is derived from it,
# so this is the only line to edit when running on a different machine
BASE_DIR = '/Users/woosubs/Desktop/AutomateAnnotation/'
DATA_DIR = os.path.join(BASE_DIR, "DATA")
ALGO_DIR = os.path.join(DATA_DIR, "algo")
CHEBI_DIR = os.path.join(DATA_DIR, "chebi")
RHEA_DIR = os.path.join(DATA_DIR, "rhea")
BIOMODEL_DIR = os.path.join(DATA_DIR, "biomodels/curated_biomodels_31mar2021")
BIGG_DIR = os.path.join(DATA_DIR, "bigg")

# Is for this purpose only
SAUROLAB_DIR = os.path.join(BASE_DIR, "presentation", "sauro_lab", "dec8_2022")

# dir for alternative methods for species
ALT_DIR = os.path.join(BASE_DIR, "AMAS_suppl", "alt_methods_species_data")
ecoli_fpath = os.path.join(BIGG_DIR, "e_coli_core.xml")

# Make the AMAS package (one level above the working directory) importable
PROJ_DIR = os.path.join(os.getcwd(), os.pardir)
AMAS_DIR = os.path.join(PROJ_DIR, "AMAS")
# re-running this cell must not keep adding the same entry
if PROJ_DIR not in sys.path:
  sys.path.append(PROJ_DIR)

from AMAS import species_annotation as sa
from AMAS import reaction_annotation as ra
from AMAS import recommender
from AMAS import constants as cn
from AMAS import iterator as it
from AMAS import tools

# Load the compressed ChEBI synonym dictionary from the AMAS reference directory.
# presumably {ChEBI ID: collection of lower-cased synonym strings} - TODO confirm
with open(os.path.join(cn.REF_DIR, 'chebi_low_synonyms_comp.lzma'), 'rb') as f:
  chebi_low_synonyms = compress_pickle.load(f)
# Flatten every value of the dictionary twice and collect the distinct elements
# (the distinct characters, if the values are collections of strings)
all_chars = set(itertools.chain(*[itertools.chain(*chebi_low_synonyms[val]) for val in chebi_low_synonyms]))

# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source
with open(os.path.join(ALT_DIR, "chebi_name_id_tuples.pickle"), "rb") as fp:   # Unpickling
  chebi_str_tups = pickle.load(fp)
# Second element of each tuple (named after ChEBI IDs); used below as class labels
ref_df_chebis = [val[1] for val in chebi_str_tups]
# The pickled tuples were presumably produced by the following call - TODO confirm
# chebi_str_tups = getTupleFromDict(chebi_low_synonyms)
# Character counts of the first element of each tuple, paired with the second element
chebi_count_tups = [(getCountOfIndividualCharacters(val[0]), val[1]) for val in chebi_str_tups]

# Pre-computed reference matrix; the check below expects one row per tuple in chebi_str_tups
norm_ref_df = pd.read_csv(os.path.join(ALT_DIR, 'charcount_df_normalized.csv'))
print(len(chebi_str_tups) == norm_ref_df.shape[0])

True


In [30]:
# Attach the two elements of each tuple in chebi_str_tups as extra columns.
# NOTE(review): this mutates norm_ref_df in place and adds two non-numeric columns.
# Cells that use norm_ref_df as a numeric matrix (ref_df.dot(...), rfc.fit, km_model.fit)
# will presumably fail if they run after this one - confirm the intended execution order.
norm_ref_df['synonym'] = [val[0] for val in chebi_str_tups]
norm_ref_df['chebi'] = [val[1] for val in chebi_str_tups]

In [33]:
# compress_pickle.dump(norm_ref_df,
#                      os.path.join(cn.REF_DIR, 'charcount_df_scaled.lzma'), 
#                      compression="lzma", set_default_extension=False)

In [34]:
# dum_df = compress_pickle.load(os.path.join(cn.REF_DIR, 'charcount_df_scaled.lzma'), 
#                      compression="lzma")

In [21]:
# Read the E. coli core model with libsbml
reader = libsbml.SBMLReader()
document = reader.readSBML(ecoli_fpath)
# NOTE(review): the name 'model' is rebound to a CatBoostClassifier in later cells
model = document.getModel()
# Recommender built from the same file; the functions defined above read it as the global 'recom'
recom = recommender.Recommender(libsbml_fpath=ecoli_fpath)
#
# IDs of the species that already carry an annotation
sids = list(recom.species.exist_annotation.keys())
print("Number of Species IDs with Annotation: %d" % len(sids))
print(sids)
# small sample for quick experiments
multiple_specs = sids[:2]
print("First two species:", multiple_specs)

Number of Species IDs with Annotation: 72
['M_glc__D_e', 'M_gln__L_c', 'M_gln__L_e', 'M_glu__L_c', 'M_glu__L_e', 'M_glx_c', 'M_h2o_c', 'M_h2o_e', 'M_h_c', 'M_h_e', 'M_icit_c', 'M_lac__D_c', 'M_lac__D_e', 'M_mal__L_c', 'M_mal__L_e', 'M_nad_c', 'M_nadh_c', 'M_nadp_c', 'M_nadph_c', 'M_nh4_c', 'M_13dpg_c', 'M_nh4_e', 'M_o2_c', 'M_2pg_c', 'M_o2_e', 'M_3pg_c', 'M_oaa_c', 'M_pep_c', 'M_6pgc_c', 'M_pi_c', 'M_6pgl_c', 'M_pi_e', 'M_ac_c', 'M_pyr_c', 'M_pyr_e', 'M_q8_c', 'M_q8h2_c', 'M_r5p_c', 'M_ru5p__D_c', 'M_ac_e', 'M_acald_c', 'M_s7p_c', 'M_acald_e', 'M_accoa_c', 'M_succ_c', 'M_succ_e', 'M_succoa_c', 'M_acon_C_c', 'M_xu5p__D_c', 'M_actp_c', 'M_adp_c', 'M_akg_c', 'M_akg_e', 'M_amp_c', 'M_atp_c', 'M_cit_c', 'M_co2_c', 'M_co2_e', 'M_coa_c', 'M_dhap_c', 'M_e4p_c', 'M_etoh_c', 'M_etoh_e', 'M_f6p_c', 'M_fdp_c', 'M_for_c', 'M_for_e', 'M_fru_e', 'M_fum_c', 'M_fum_e', 'M_g3p_c', 'M_g6p_c']
First two species: ['M_glc__D_e', 'M_gln__L_c']


In [4]:
# First goal: predict annotations of all species and measure the (total & average) processing time.
# Baseline is the original method, recom.species.predictAnnotationByEditDistance.
# For a quick run, use a subset instead:
# one_spec = 'M_glc__D_e'
# multiple_specs = sids[:2]

spec_list = sids
# only the prediction itself is timed
begin_time = time.time()
res_editdistance = {val:recom.species.predictAnnotationByEditDistance(inp_str=recom.species.names[val]) \
                    for val in spec_list}
end_time = time.time()
# print(res_editdistance)
time_diff = end_time - begin_time
print("Total processing time for %s species: %.02f seconds" % (len(spec_list), time_diff))
print("Average processing time per species: %.02f seconds" % (time_diff/len(spec_list)))

Total processing time for 72 species: 275.16 seconds
Average processing time per species: 3.82 seconds


In [8]:
# Keep only the predicted formulas of each species
form_editdist = {one_sid: one_res[cn.FORMULA]
                 for one_sid, one_res in res_editdistance.items()}
form_editdist['M_glc__D_e']

['C6O6']

In [22]:
# Compare the formulas predicted by edit distance with the existing formula annotations
recall_edist = tools.getRecall(
    ref=recom.species.exist_annotation_formula,
    pred=form_editdist)
precision_edist = tools.getPrecision(
    ref=recom.species.exist_annotation_formula,
    pred=form_editdist)
# recall first, then precision, one per line
print(np.round(recall_edist, 3), np.round(precision_edist, 3), sep='\n')

0.903
0.84


In [14]:
# compress_pickle.dump(res_editdistance,
#                      os.path.join(SAUROLAB_DIR, "res_editdistance.lzma"),
#                      compression="lzma",
#                      set_default_extension=False)

In [4]:
# Same measurement as for the edit-distance method, now for the character-count method
spec_list = sids

# only the prediction itself is timed
begin_time = time.time()
res_cosdistance = predictAnnotationByCosineDistance(inp_specs=spec_list, ref_df=norm_ref_df)
end_time = time.time()
# print(res_cosdistance)
time_diff = end_time - begin_time
print("Total processing time for %s species: %.02f seconds" % (len(spec_list), time_diff))
print("Average processing time per species: %.02f seconds" % (time_diff/len(spec_list)))

Total processing time for 72 species: 1.35 seconds
Average processing time per species: 0.02 seconds


In [None]:
# compress_pickle.dump(res_cosdistance,
#                      os.path.join(SAUROLAB_DIR, "res_cosdistance.lzma"),
#                      compression="lzma",
#                      set_default_extension=False)

In [5]:
# Keep only the predicted formulas of each species
form_cosdist = {one_sid: one_res[cn.FORMULA]
                for one_sid, one_res in res_cosdistance.items()}
form_cosdist['M_glc__D_e']

['C6O6']

In [6]:
# Compare the formulas predicted by the character-count method with the existing formula annotations
recall_cdist = tools.getRecall(
    ref=recom.species.exist_annotation_formula,
    pred=form_cosdist)
precision_cdist = tools.getPrecision(
    ref=recom.species.exist_annotation_formula,
    pred=form_cosdist)
# recall first, then precision, one per line
print(np.round(recall_cdist, 3), np.round(precision_cdist, 3), sep='\n')

0.931
0.834


In [7]:
# Independent copy of the reference matrix with the class label appended as 'outcome'
comb_df = norm_ref_df.assign(outcome=ref_df_chebis)
comb_df

Unnamed: 0,a,b,c,d,e,f,g,h,i,j,...,1,2,3,4,5,6,7,8,9,outcome
0,0.000000,0.000000,0.000000,0.000000,0.530330,0.000000,0.0,0.176777,0.353553,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,CHEBI:18357
1,0.200000,0.000000,0.000000,0.000000,0.400000,0.200000,0.0,0.000000,0.400000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,CHEBI:18357
2,0.277350,0.000000,0.000000,0.000000,0.554700,0.000000,0.0,0.000000,0.000000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,CHEBI:18357
3,0.000000,0.000000,0.000000,0.000000,0.371391,0.000000,0.0,0.185695,0.371391,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,CHEBI:18357
4,0.113228,0.113228,0.000000,0.226455,0.452911,0.000000,0.0,0.226455,0.226455,0.0,...,0.226455,0.226455,0.000000,0.113228,0.000000,0.000000,0.000000,0.000000,0.000000,CHEBI:18357
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
466443,0.375558,0.037556,0.112667,0.075112,0.338002,0.000000,0.0,0.262891,0.112667,0.0,...,0.187779,0.375558,0.075112,0.112667,0.075112,0.037556,0.000000,0.037556,0.000000,CHEBI:99995
466444,0.341593,0.136637,0.034159,0.170797,0.307434,0.068319,0.0,0.239115,0.136637,0.0,...,0.136637,0.102478,0.170797,0.136637,0.068319,0.034159,0.000000,0.000000,0.068319,CHEBI:99996
466445,0.338241,0.075165,0.150329,0.075165,0.300658,0.037582,0.0,0.225494,0.112747,0.0,...,0.150329,0.338241,0.112747,0.112747,0.075165,0.037582,0.000000,0.037582,0.000000,CHEBI:99997
466446,0.257396,0.032174,0.128698,0.193047,0.289570,0.032174,0.0,0.225221,0.257396,0.0,...,0.386094,0.225221,0.096523,0.064349,0.032174,0.032174,0.032174,0.032174,0.064349,CHEBI:99998


In [8]:
comb_df.iloc[:, -1:]

Unnamed: 0,outcome
0,CHEBI:18357
1,CHEBI:18357
2,CHEBI:18357
3,CHEBI:18357
4,CHEBI:18357
...,...
466443,CHEBI:99995
466444,CHEBI:99996
466445,CHEBI:99997
466446,CHEBI:99998


In [18]:
# Toy table of element counts per species
dummy_df = pd.DataFrame({
    'species': ['acetate', 'glucose'],
    'C': [2, 6],
    'H': [3, 12],
    'O': [2, 6],
})
# displayed only; dummy_df itself keeps 'species' as a regular column
dummy_df.set_index('species')

Unnamed: 0_level_0,C,H,O
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
acetate,2,3,2
glucose,6,12,6


In [4]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Train a random forest on the full reference matrix, with the ChEBI IDs as class labels.
# NOTE(review): this cell has no execution count or output, so the fit apparently never finished.
rfc = RandomForestClassifier(random_state=0, n_jobs=-1)
# only the fit itself is timed
begin_time = time.time()
rfc.fit(norm_ref_df, ref_df_chebis)
end_time = time.time()
time_diff = end_time - begin_time
print("Total training time %.02f seconds" % time_diff)

In [1]:
from sklearn.cluster import KMeans

In [None]:
# Cluster the reference vectors.
# random_state is fixed (same seed as the random forest above) because KMeans
# starts from random centroids; without it the cluster labels change between runs
km_model = KMeans(n_clusters=10000, random_state=0)
km_model.fit(norm_ref_df)

In [9]:
norm_ref_df.iloc[:1, :]

Unnamed: 0,a,b,c,d,e,f,g,h,i,j,...,0,1,2,3,4,5,6,7,8,9
0,0.0,0.0,0.0,0.0,0.53033,0.0,0.0,0.176777,0.353553,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
km_model.predict(norm_ref_df.iloc[:1, :])

array([7], dtype=int32)

In [28]:
getCountOfIndividualCharacters('H2O')

Counter({'h': 1, '2': 1, 'o': 1})

In [23]:
norm_ref_df.shape

(466448, 36)

In [32]:
editdistance.eval('atp', 'medicarpin')

8

In [16]:
pd.__version__

'1.4.2'

In [27]:
from catboost import CatBoostClassifier

In [14]:
# Small CatBoost trial on the first 1000 reference rows.
# NOTE(review): this rebinds 'model', which held the libsbml model loaded earlier
model = CatBoostClassifier(iterations=2,
                           learning_rate=1,
                           depth=2)
# Fit model
model.fit(X=norm_ref_df.iloc[:1000,:], y=ref_df_chebis[:1000])

0:	learn: 6.1206135	total: 238ms	remaining: 238ms
1:	learn: 5.7727271	total: 420ms	remaining: 0us


<catboost.core.CatBoostClassifier at 0x7f9d336d7dc0>

In [15]:
# ref_cols is a required argument of prepareCounterQuery;
# the query must use the same columns the classifier was fitted on
one_query, name_used = prepareCounterQuery(specs=multiple_specs,
                                           ref_cols=norm_ref_df.columns,
                                           use_id=True)

In [18]:
# Get predicted classes
# one_query has one column per species, so transpose it to one row per species
preds_class = model.predict(one_query.T)

In [19]:
preds_class

array([['CHEBI:28934'],
       ['CHEBI:50750']], dtype=object)

In [32]:

# Toy CatBoost example with hand-written data.
# NOTE(review): 'model' and 'preds_class' are rebound here, replacing the objects
# created in earlier cells.
# Initialize data

# index of categorical columns
cat_features = [0, 1]
train_data = [["a", "b", 1, 4, 5, 6],
              ["a", "b", 4, 5, 6, 7],
              ["c", "d", 30, 40, 50, 60]]
train_labels = ['one', 'two', 'three']
eval_data = [["a", "b", 2, 4, 6, 8],
             ["a", "d", 1, 4, 50, 60]]

# Initialize CatBoostClassifier
model = CatBoostClassifier(iterations=2,
                           learning_rate=1,
                           depth=2)
# Fit model
model.fit(train_data, train_labels, cat_features)
# Get predicted classes
preds_class = model.predict(eval_data)
# Get predicted probabilities for each class
preds_proba = model.predict_proba(eval_data)
# Get predicted RawFormulaVal
preds_raw = model.predict(eval_data, prediction_type='RawFormulaVal')

0:	learn: 0.9089182	total: 246us	remaining: 246us
1:	learn: 0.7584922	total: 630us	remaining: 0us


In [33]:
pd.DataFrame(train_data)

Unnamed: 0,0,1,2,3,4,5
0,a,b,1,4,5,6
1,a,b,4,5,6,7
2,c,d,30,40,50,60


In [34]:
pd.DataFrame(eval_data)

Unnamed: 0,0,1,2,3,4,5
0,a,b,2,4,6,8
1,a,d,1,4,50,60


In [35]:
# Refit on the same toy data wrapped in a DataFrame instead of a nested list
model.fit(pd.DataFrame(train_data), train_labels, cat_features)

0:	learn: 0.9089182	total: 165us	remaining: 165us
1:	learn: 0.7584922	total: 470us	remaining: 0us


<catboost.core.CatBoostClassifier at 0x7f8dfcb40df0>

In [36]:
# Predicted classes for the evaluation rows, passed as a DataFrame
preds_class = model.predict(pd.DataFrame(eval_data))
preds_class

array([['one'],
       ['one']], dtype=object)

In [37]:
# Predicted class probabilities for the evaluation rows, passed as a nested list
preds_proba = model.predict_proba(eval_data)
preds_proba

array([[0.46837208, 0.26581396, 0.26581396],
       [0.46837208, 0.26581396, 0.26581396]])

In [38]:
preds_raw

array([[ 0.37764425, -0.18882212, -0.18882212],
       [ 0.37764425, -0.18882212, -0.18882212]])

In [37]:
# use_id=False: 'glucose' is used as the name itself instead of being looked up as a species ID
one_query, name_used = prepareCounterQuery(specs=['glucose'],
                                             ref_cols=norm_ref_df.columns,
                                             use_id=False)
# one row per query, one column per reference column
one_query.T

Unnamed: 0,a,b,c,d,e,f,g,h,i,j,...,0,1,2,3,4,5,6,7,8,9
glucose,0.0,0.0,0.377964,0.0,0.377964,0.0,0.377964,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
# 'model' is rebound to a CatBoostClassifier in earlier cells,
# so take the SBML model from the libsbml document again
document.getModel().getAnnotationString()

''